xref: /dragonfly/sys/vm/vm_object.c (revision f2187f0a)
1 /*
2  * Copyright (c) 1991, 1993, 2013
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * The Mach Operating System project at Carnegie-Mellon University.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
33  *
34  *
35  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36  * All rights reserved.
37  *
38  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39  *
40  * Permission to use, copy, modify and distribute this software and
41  * its documentation is hereby granted, provided that both the copyright
42  * notice and this permission notice appear in all copies of the
43  * software, derivative works or modified versions, and any portions
44  * thereof, and that both notices appear in supporting documentation.
45  *
46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49  *
50  * Carnegie Mellon requests users of this software to return to
51  *
52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53  *  School of Computer Science
54  *  Carnegie Mellon University
55  *  Pittsburgh PA 15213-3890
56  *
57  * any improvements or extensions that they make and grant Carnegie the
58  * rights to redistribute these changes.
59  *
60  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61  */
62 
63 /*
64  *	Virtual memory object module.
65  */
66 
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h>		/* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/sysctl.h>
77 #include <sys/refcount.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pageout.h>
86 #include <vm/vm_pager.h>
87 #include <vm/swap_pager.h>
88 #include <vm/vm_kern.h>
89 #include <vm/vm_extern.h>
90 #include <vm/vm_zone.h>
91 
92 #include <vm/vm_page2.h>
93 
94 #include <machine/specialreg.h>
95 
96 #define EASY_SCAN_FACTOR	8
97 
98 static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
99 					     int pagerflags);
100 static void	vm_object_lock_init(vm_object_t);
101 
102 /*
103  *	Virtual memory objects maintain the actual data
104  *	associated with allocated virtual memory.  A given
105  *	page of memory exists within exactly one object.
106  *
107  *	An object is only deallocated when all "references"
108  *	are given up.  Only one "reference" to a given
109  *	region of an object should be writeable.
110  *
111  *	Associated with each object is a list of all resident
112  *	memory pages belonging to that object; this list is
113  *	maintained by the "vm_page" module, and locked by the object's
114  *	lock.
115  *
116  *	Each object also records a "pager" routine which is
117  *	used to retrieve (and store) pages to the proper backing
118  *	storage.  In addition, objects may be backed by other
119  *	objects from which they were virtual-copied.
120  *
121  *	The only items within the object structure which are
122  *	modified after time of creation are:
123  *		reference count		locked by object's lock
124  *		pager routine		locked by object's lock
125  *
126  */
127 
128 struct vm_object kernel_object;
129 
130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
131 
132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");
133 
134 #define VMOBJ_HASH_PRIME1	66555444443333333ULL
135 #define VMOBJ_HASH_PRIME2	989042931893ULL
136 
137 int vm_object_debug;
138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
139 
140 static __inline
141 struct vm_object_hash *
142 vmobj_hash(vm_object_t obj)
143 {
144 	uintptr_t hash1;
145 	uintptr_t hash2;
146 
147 	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
148 	hash1 %= VMOBJ_HASH_PRIME1;
149 	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
150 	hash2 %= VMOBJ_HASH_PRIME2;
151 	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
152 }
153 
154 #if defined(DEBUG_LOCKS)
155 
156 #define vm_object_vndeallocate(obj, vpp)	\
157                 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
158 
159 /*
160  * Debug helper to track hold/drop/ref/deallocate calls.
161  */
162 static void
163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
164 {
165 	int i;
166 
167 	i = atomic_fetchadd_int(&obj->debug_index, 1);
168 	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
169 	ksnprintf(obj->debug_hold_thrs[i],
170 		  sizeof(obj->debug_hold_thrs[i]),
171 		  "%c%d:(%d):%s",
172 		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
173 		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
174 		  obj->ref_count,
175 		  curthread->td_comm);
176 	obj->debug_hold_file[i] = file;
177 	obj->debug_hold_line[i] = line;
178 #if 0
179 	/* Uncomment for debugging obj refs/derefs in reproducible cases */
180 	if (strcmp(curthread->td_comm, "sshd") == 0) {
181 		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
182 			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
183 			obj, obj->ref_count, addrem, file, line);
184 	}
185 #endif
186 }
187 
188 #endif
189 
190 /*
191  * Misc low level routines
192  */
193 static void
194 vm_object_lock_init(vm_object_t obj)
195 {
196 #if defined(DEBUG_LOCKS)
197 	int i;
198 
199 	obj->debug_index = 0;
200 	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
201 		obj->debug_hold_thrs[i][0] = 0;
202 		obj->debug_hold_file[i] = NULL;
203 		obj->debug_hold_line[i] = 0;
204 	}
205 #endif
206 }
207 
208 void
209 vm_object_lock_swap(void)
210 {
211 	lwkt_token_swap();
212 }
213 
214 void
215 vm_object_lock(vm_object_t obj)
216 {
217 	lwkt_gettoken(&obj->token);
218 }
219 
220 /*
221  * Returns TRUE on success
222  */
223 static int
224 vm_object_lock_try(vm_object_t obj)
225 {
226 	return(lwkt_trytoken(&obj->token));
227 }
228 
229 void
230 vm_object_lock_shared(vm_object_t obj)
231 {
232 	lwkt_gettoken_shared(&obj->token);
233 }
234 
235 void
236 vm_object_unlock(vm_object_t obj)
237 {
238 	lwkt_reltoken(&obj->token);
239 }
240 
241 void
242 vm_object_upgrade(vm_object_t obj)
243 {
244 	lwkt_reltoken(&obj->token);
245 	lwkt_gettoken(&obj->token);
246 }
247 
248 void
249 vm_object_downgrade(vm_object_t obj)
250 {
251 	lwkt_reltoken(&obj->token);
252 	lwkt_gettoken_shared(&obj->token);
253 }
254 
255 static __inline void
256 vm_object_assert_held(vm_object_t obj)
257 {
258 	ASSERT_LWKT_TOKEN_HELD(&obj->token);
259 }
260 
261 int
262 vm_quickcolor(void)
263 {
264 	globaldata_t gd = mycpu;
265 	int pg_color;
266 
267 	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
268 	pg_color += gd->gd_quick_color;
269 	gd->gd_quick_color += PQ_PRIME2;
270 
271 	return pg_color;
272 }
273 
274 void
275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
276 {
277 	KKASSERT(obj != NULL);
278 
279 	/*
280 	 * Object must be held (object allocation is stable due to the caller's
281 	 * context, typically already holding the token on a parent object)
282 	 * prior to potentially blocking on the lock, otherwise the object
283 	 * can get ripped away from us.
284 	 */
285 	refcount_acquire(&obj->hold_count);
286 	vm_object_lock(obj);
287 
288 #if defined(DEBUG_LOCKS)
289 	debugvm_object_add(obj, file, line, 1);
290 #endif
291 }
292 
293 int
294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
295 {
296 	KKASSERT(obj != NULL);
297 
298 	/*
299 	 * Object must be held (object allocation is stable due to the caller's
300 	 * context, typically already holding the token on a parent object)
301 	 * prior to potentially blocking on the lock, otherwise the object
302 	 * can get ripped away from us.
303 	 */
304 	refcount_acquire(&obj->hold_count);
305 	if (vm_object_lock_try(obj) == 0) {
306 		if (refcount_release(&obj->hold_count)) {
307 			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
308 				kfree(obj, M_VM_OBJECT);
309 		}
310 		return(0);
311 	}
312 
313 #if defined(DEBUG_LOCKS)
314 	debugvm_object_add(obj, file, line, 1);
315 #endif
316 	return(1);
317 }
318 
319 void
320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
321 {
322 	KKASSERT(obj != NULL);
323 
324 	/*
325 	 * Object must be held (object allocation is stable due to the caller's
326 	 * context, typically already holding the token on a parent object)
327 	 * prior to potentially blocking on the lock, otherwise the object
328 	 * can get ripped away from us.
329 	 */
330 	refcount_acquire(&obj->hold_count);
331 	vm_object_lock_shared(obj);
332 
333 #if defined(DEBUG_LOCKS)
334 	debugvm_object_add(obj, file, line, 1);
335 #endif
336 }
337 
338 /*
339  * Drop the token and hold_count on the object.
340  *
341  * WARNING! Token might be shared.
342  */
343 void
344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
345 {
346 	if (obj == NULL)
347 		return;
348 
349 	/*
350 	 * No new holders should be possible once we drop hold_count 1->0 as
351 	 * there is no longer any way to reference the object.
352 	 */
353 	KKASSERT(obj->hold_count > 0);
354 	if (refcount_release(&obj->hold_count)) {
355 #if defined(DEBUG_LOCKS)
356 		debugvm_object_add(obj, file, line, -1);
357 #endif
358 
359 		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
360 			vm_object_unlock(obj);
361 			kfree(obj, M_VM_OBJECT);
362 		} else {
363 			vm_object_unlock(obj);
364 		}
365 	} else {
366 #if defined(DEBUG_LOCKS)
367 		debugvm_object_add(obj, file, line, -1);
368 #endif
369 		vm_object_unlock(obj);
370 	}
371 }
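
/*
 * Example (illustrative sketch only, not compiled): a hold/drop pair
 * brackets any operation that may block while using an object whose
 * pointer is only stable by virtue of the caller's context.  The helper
 * name below is hypothetical.
 */
#if 0
static void
example_touch_object(vm_object_t obj)
{
	vm_object_hold(obj);	/* acquire the token and a hold_count ref */
	/* ... safely access or modify the object here ... */
	vm_object_drop(obj);	/* release the token and the hold */
}
#endif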
372 
373 /*
374  * Initialize a freshly allocated object, returning a held object.
375  *
376  * Used only by vm_object_allocate(), zinitna() and vm_object_init().
377  *
378  * No requirements.
379  */
380 void
381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
382 {
383 	struct vm_object_hash *hash;
384 
385 	RB_INIT(&object->rb_memq);
386 	lwkt_token_init(&object->token, "vmobj");
387 
388 	object->type = type;
389 	object->size = size;
390 	object->ref_count = 1;
391 	object->memattr = VM_MEMATTR_DEFAULT;
392 	object->hold_count = 0;
393 	object->flags = 0;
394 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
395 		vm_object_set_flag(object, OBJ_ONEMAPPING);
396 	object->paging_in_progress = 0;
397 	object->resident_page_count = 0;
398 	/* cpu localization twist */
399 	object->pg_color = vm_quickcolor();
400 	object->handle = NULL;
401 
402 	atomic_add_int(&object->generation, 1);
403 	object->swblock_count = 0;
404 	RB_INIT(&object->swblock_root);
405 	vm_object_lock_init(object);
406 	pmap_object_init(object);
407 
408 	vm_object_hold(object);
409 
410 	hash = vmobj_hash(object);
411 	lwkt_gettoken(&hash->token);
412 	TAILQ_INSERT_TAIL(&hash->list, object, object_list);
413 	lwkt_reltoken(&hash->token);
414 }
415 
416 /*
417  * Initialize a VM object.
418  */
419 void
420 vm_object_init(vm_object_t object, vm_pindex_t size)
421 {
422 	_vm_object_allocate(OBJT_DEFAULT, size, object);
423 	vm_object_drop(object);
424 }
425 
426 /*
427  * Initialize the VM objects module.
428  *
429  * Called from the low level boot code only.  Note that this occurs before
430  * kmalloc is initialized so we cannot allocate any VM objects.
431  */
432 void
433 vm_object_init1(void)
434 {
435 	int i;
436 
437 	for (i = 0; i < VMOBJ_HSIZE; ++i) {
438 		TAILQ_INIT(&vm_object_hash[i].list);
439 		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
440 	}
441 
442 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
443 			    &kernel_object);
444 	vm_object_drop(&kernel_object);
445 }
446 
447 void
448 vm_object_init2(void)
449 {
450 	kmalloc_set_unlimited(M_VM_OBJECT);
451 }
452 
453 /*
454  * Allocate and return a new object of the specified type and size.
455  *
456  * No requirements.
457  */
458 vm_object_t
459 vm_object_allocate(objtype_t type, vm_pindex_t size)
460 {
461 	vm_object_t obj;
462 
463 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
464 	_vm_object_allocate(type, size, obj);
465 	vm_object_drop(obj);
466 
467 	return (obj);
468 }
469 
470 /*
471  * This version returns a held object, allowing further atomic initialization
472  * of the object.
473  */
474 vm_object_t
475 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
476 {
477 	vm_object_t obj;
478 
479 	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
480 	_vm_object_allocate(type, size, obj);
481 
482 	return (obj);
483 }
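
/*
 * Example (illustrative sketch only, not compiled): vm_object_allocate()
 * returns an unheld object while vm_object_allocate_hold() leaves the
 * object held, letting the caller finish initialization before dropping
 * it.  The helper name and "npages" argument are hypothetical.
 */
#if 0
static vm_object_t
example_make_anon_object(vm_pindex_t npages)
{
	vm_object_t obj;

	obj = vm_object_allocate_hold(OBJT_DEFAULT, npages);
	/* ... finish initialization while the object is still held ... */
	vm_object_drop(obj);

	return (obj);
}
#endif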
484 
485 /*
486  * Add an additional reference to a vm_object.  The object must already be
487  * held.  The original non-lock version is no longer supported.  The object
488  * must NOT be chain locked by anyone at the time the reference is added.
489  *
490  * The object must be held, but may be held shared if desired (hence why
491  * we use an atomic op).
492  */
493 void
494 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
495 {
496 	KKASSERT(object != NULL);
497 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
498 	atomic_add_int(&object->ref_count, 1);
499 	if (object->type == OBJT_VNODE) {
500 		vref(object->handle);
501 		/* XXX what if the vnode is being destroyed? */
502 	}
503 #if defined(DEBUG_LOCKS)
504 	debugvm_object_add(object, file, line, 1);
505 #endif
506 }
507 
508 /*
509  * This version is only allowed for vnode objects.
510  */
511 void
512 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
513 {
514 	KKASSERT(object->type == OBJT_VNODE);
515 	atomic_add_int(&object->ref_count, 1);
516 	vref(object->handle);
517 #if defined(DEBUG_LOCKS)
518 	debugvm_object_add(object, file, line, 1);
519 #endif
520 }
521 
522 /*
523  * Dereference an object and its underlying vnode.  The object may be
524  * held shared.  On return the object will remain held.
525  *
526  * This function may return a vnode in *vpp which the caller must release
527  * after the caller drops its own lock.  If vpp is NULL, we assume that
528  * the caller was holding an exclusive lock on the object and we vrele()
529  * the vp ourselves.
530  */
531 static void
532 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
533 				   VMOBJDBARGS)
534 {
535 	struct vnode *vp = (struct vnode *) object->handle;
536 
537 	KASSERT(object->type == OBJT_VNODE,
538 	    ("vm_object_vndeallocate: not a vnode object"));
539 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
540 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
541 #ifdef INVARIANTS
542 	if (object->ref_count == 0) {
543 		vprint("vm_object_vndeallocate", vp);
544 		panic("vm_object_vndeallocate: bad object reference count");
545 	}
546 #endif
547 	for (;;) {
548 		int count = object->ref_count;
549 		cpu_ccfence();
550 		if (count == 1) {
551 			vm_object_upgrade(object);
552 			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
553 				vclrflags(vp, VTEXT);
554 				break;
555 			}
556 		} else {
557 			if (atomic_cmpset_int(&object->ref_count,
558 					      count, count - 1)) {
559 				break;
560 			}
561 		}
562 		/* retry */
563 	}
564 #if defined(DEBUG_LOCKS)
565 	debugvm_object_add(object, file, line, -1);
566 #endif
567 
568 	/*
569 	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
570 	 * if the object was locked exclusively.  But there are two races
571 	 * here.
572 	 *
573 	 * We had to upgrade the object above to safely clear VTEXT
574 	 * but the alternative path where the shared lock is retained
575 	 * can STILL race to 0 in other paths and cause our own vrele()
576 	 * to terminate the vnode.  We can't allow that if the VM object
577 	 * is still locked shared.
578 	 */
579 	if (vpp)
580 		*vpp = vp;
581 	else
582 		vrele(vp);
583 }
584 
585 /*
586  * Release a reference to the specified object, gained either through a
587  * vm_object_allocate or a vm_object_reference call.  When all references
588  * are gone, storage associated with this object may be relinquished.
589  *
590  * The caller does not have to hold the object locked but must have control
591  * over the reference in question in order to guarantee that the object
592  * does not get ripped out from under us.
593  *
594  * XXX Currently all deallocations require an exclusive lock.
595  */
596 void
597 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
598 {
599 	struct vnode *vp;
600 	int count;
601 
602 	if (object == NULL)
603 		return;
604 
605 	for (;;) {
606 		count = object->ref_count;
607 		cpu_ccfence();
608 
609 		/*
610 		 * If decrementing the count enters into special handling
611 		 * territory (0, 1, or 2) we have to do it the hard way.
612 		 * Fortunately though, objects with only a few refs like this
613 		 * are not likely to be heavily contended anyway.
614 		 *
615 		 * For vnode objects we only care about 1->0 transitions.
616 		 */
617 		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
618 #if defined(DEBUG_LOCKS)
619 			debugvm_object_add(object, file, line, 0);
620 #endif
621 			vm_object_hold(object);
622 			vm_object_deallocate_locked(object);
623 			vm_object_drop(object);
624 			break;
625 		}
626 
627 		/*
628 		 * Try to decrement ref_count without acquiring a hold on
629 		 * the object.  This is particularly important for the exec*()
630 		 * and exit*() code paths because the program binary may
631 		 * have a great deal of sharing and an exclusive lock will
632 		 * crowbar performance in those circumstances.
633 		 */
634 		if (object->type == OBJT_VNODE) {
635 			vp = (struct vnode *)object->handle;
636 			if (atomic_cmpset_int(&object->ref_count,
637 					      count, count - 1)) {
638 #if defined(DEBUG_LOCKS)
639 				debugvm_object_add(object, file, line, -1);
640 #endif
641 
642 				vrele(vp);
643 				break;
644 			}
645 			/* retry */
646 		} else {
647 			if (atomic_cmpset_int(&object->ref_count,
648 					      count, count - 1)) {
649 #if defined(DEBUG_LOCKS)
650 				debugvm_object_add(object, file, line, -1);
651 #endif
652 				break;
653 			}
654 			/* retry */
655 		}
656 		/* retry */
657 	}
658 }
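
/*
 * Example (illustrative sketch only, not compiled): a reference taken
 * with vm_object_reference_locked() while the object is held is later
 * released with vm_object_deallocate(), which requires no lock of its
 * own.  The helper name is hypothetical.
 */
#if 0
static void
example_take_and_release_ref(vm_object_t object)
{
	vm_object_hold(object);
	vm_object_reference_locked(object);
	vm_object_drop(object);

	/* ... use the reference ... */

	vm_object_deallocate(object);
}
#endif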
659 
660 void
661 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
662 {
663 	/*
664 	 * Degenerate case
665 	 */
666 	if (object == NULL)
667 		return;
668 
669 	/*
670 	 * vnode case, caller either locked the object exclusively
671 	 * or this is a recursion with must_drop != 0 and the vnode
672 	 * object will be locked shared.
673 	 *
674 	 * If locked shared we have to drop the object before we can
675 	 * call vrele() or risk a shared/exclusive livelock.
676 	 */
677 	if (object->type == OBJT_VNODE) {
678 		ASSERT_LWKT_TOKEN_HELD(&object->token);
679 		vm_object_vndeallocate(object, NULL);
680 		return;
681 	}
682 	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);
683 
684 	/*
685 	 * Normal case (object is locked exclusively)
686 	 */
687 	if (object->ref_count == 0) {
688 		panic("vm_object_deallocate: object deallocated "
689 		      "too many times: %d", object->type);
690 	}
691 	if (object->ref_count > 2) {
692 		atomic_add_int(&object->ref_count, -1);
693 #if defined(DEBUG_LOCKS)
694 		debugvm_object_add(object, file, line, -1);
695 #endif
696 		return;
697 	}
698 
699 	/*
700 	 * Drop the ref and handle termination on the 1->0 transition.
701 	 * We may have blocked above so we have to recheck.
702 	 */
703 	KKASSERT(object->ref_count != 0);
704 	if (object->ref_count >= 2) {
705 		atomic_add_int(&object->ref_count, -1);
706 #if defined(DEBUG_LOCKS)
707 		debugvm_object_add(object, file, line, -1);
708 #endif
709 		return;
710 	}
711 
712 	atomic_add_int(&object->ref_count, -1);
713 	if ((object->flags & OBJ_DEAD) == 0)
714 		vm_object_terminate(object);
715 }
716 
717 /*
718  * Destroy the specified object, freeing up related resources.
719  *
720  * The object must have zero references.
721  *
722  * The object must be held.  The caller is responsible for dropping the object
723  * after terminate returns.  Terminate does NOT drop the object.
724  */
725 static int vm_object_terminate_callback(vm_page_t p, void *data);
726 
727 void
728 vm_object_terminate(vm_object_t object)
729 {
730 	struct rb_vm_page_scan_info info;
731 	struct vm_object_hash *hash;
732 
733 	/*
734 	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
735 	 * able to safely block.
736 	 */
737 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
738 	KKASSERT((object->flags & OBJ_DEAD) == 0);
739 	vm_object_set_flag(object, OBJ_DEAD);
740 
741 	/*
742 	 * Wait for the pageout daemon to be done with the object
743 	 */
744 	vm_object_pip_wait(object, "objtrm1");
745 
746 	KASSERT(!object->paging_in_progress,
747 		("vm_object_terminate: pageout in progress"));
748 
749 	/*
750 	 * Clean and free the pages, as appropriate. All references to the
751 	 * object are gone, so we don't need to lock it.
752 	 */
753 	if (object->type == OBJT_VNODE) {
754 		struct vnode *vp;
755 
756 		/*
757 		 * Clean pages and flush buffers.
758 		 *
759 		 * NOTE!  TMPFS buffer flushes do not typically flush the
760 		 *	  actual page to swap as this would be highly
761 		 *	  inefficient, and normal filesystems usually wrap
762 		 *	  page flushes with buffer cache buffers.
763 		 *
764 		 *	  To deal with this we have to call vinvalbuf() both
765 		 *	  before and after the vm_object_page_clean().
766 		 */
767 		vp = (struct vnode *) object->handle;
768 		vinvalbuf(vp, V_SAVE, 0, 0);
769 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
770 		vinvalbuf(vp, V_SAVE, 0, 0);
771 	}
772 
773 	/*
774 	 * Wait for any I/O to complete, after which there had better not
775 	 * be any references left on the object.
776 	 */
777 	vm_object_pip_wait(object, "objtrm2");
778 
779 	if (object->ref_count != 0) {
780 		panic("vm_object_terminate: object with references, "
781 		      "ref_count=%d", object->ref_count);
782 	}
783 
784 	/*
785 	 * Cleanup any shared pmaps associated with this object.
786 	 */
787 	pmap_object_free(object);
788 
789 	/*
790 	 * Now free any remaining pages. For internal objects, this also
791 	 * removes them from paging queues. Don't free wired pages, just
792 	 * remove them from the object.
793 	 */
794 	info.count = 0;
795 	info.object = object;
796 	do {
797 		info.error = 0;
798 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
799 					vm_object_terminate_callback, &info);
800 	} while (info.error);
801 
802 	/*
803 	 * Let the pager know object is dead.
804 	 */
805 	vm_pager_deallocate(object);
806 
807 	/*
808 	 * Wait for the object hold count to hit 1, clean out pages as
809 	 * we go.  vmobj_token interlocks any race conditions that might
810 	 * pick the object up from the vm_object_list after we have cleared
811 	 * rb_memq.
812 	 */
813 	for (;;) {
814 		if (RB_ROOT(&object->rb_memq) == NULL)
815 			break;
816 		kprintf("vm_object_terminate: Warning, object %p "
817 			"still has %ld pages\n",
818 			object, object->resident_page_count);
819 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
820 					vm_object_terminate_callback, &info);
821 	}
822 
823 	/*
824 	 * There had better not be any pages left
825 	 */
826 	KKASSERT(object->resident_page_count == 0);
827 
828 	/*
829 	 * Remove the object from the global object list.
830 	 */
831 	hash = vmobj_hash(object);
832 	lwkt_gettoken(&hash->token);
833 	TAILQ_REMOVE(&hash->list, object, object_list);
834 	lwkt_reltoken(&hash->token);
835 
836 	if (object->ref_count != 0) {
837 		panic("vm_object_terminate2: object with references, "
838 		      "ref_count=%d", object->ref_count);
839 	}
840 
841 	/*
842 	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
843 	 *	 the object here.  See vm_object_drop().
844 	 */
845 }
846 
847 /*
848  * The caller must hold the object.
849  */
850 static int
851 vm_object_terminate_callback(vm_page_t p, void *data)
852 {
853 	struct rb_vm_page_scan_info *info = data;
854 	vm_object_t object;
855 
856 	object = p->object;
857 	KKASSERT(object == info->object);
858 	if (vm_page_busy_try(p, TRUE)) {
859 		vm_page_sleep_busy(p, TRUE, "vmotrm");
860 		info->error = 1;
861 		return 0;
862 	}
863 	if (object != p->object) {
864 		/* XXX remove once we determine it can't happen */
865 		kprintf("vm_object_terminate: Warning: page %p changed "
866 			"objects during busy, queue %d\n", p, p->queue);
867 		vm_page_wakeup(p);
868 		info->error = 1;
869 	} else if (p->wire_count == 0) {
870 		/*
871 		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
872 		 */
873 		vm_page_free(p);
874 		mycpu->gd_cnt.v_pfree++;
875 	} else {
876 		if (p->queue != PQ_NONE) {
877 			kprintf("vm_object_terminate: Warning: Encountered "
878 				"wired page %p on queue %d\n", p, p->queue);
879 			if (vm_object_debug > 0) {
880 				--vm_object_debug;
881 				print_backtrace(10);
882 			}
883 		}
884 		vm_page_remove(p);
885 		vm_page_wakeup(p);
886 	}
887 
888 	/*
889 	 * Must be at end to avoid SMP races, caller holds object token
890 	 */
891 	if ((++info->count & 63) == 0)
892 		lwkt_user_yield();
893 	return(0);
894 }
895 
896 /*
897  * Clean all dirty pages in the specified range of object.  Leaves page
898  * on whatever queue it is currently on.   If NOSYNC is set then do not
899  * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
900  * leaving the object dirty.
901  *
902  * When stuffing pages asynchronously, allow clustering.  XXX we need a
903  * synchronous clustering mode implementation.
904  *
905  * Odd semantics: if end == 0 we clean the entire object.
906  *
907  * The object must be locked? XXX
908  */
909 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
910 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
911 
912 void
913 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
914 		     int flags)
915 {
916 	struct rb_vm_page_scan_info info;
917 	struct vnode *vp;
918 	int wholescan;
919 	int pagerflags;
920 	int generation;
921 
922 	vm_object_hold(object);
923 	if (object->type != OBJT_VNODE ||
924 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
925 		vm_object_drop(object);
926 		return;
927 	}
928 
929 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
930 			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
931 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
932 
933 	vp = object->handle;
934 
935 	/*
936 	 * Interlock other major object operations.  This allows us to
937 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
938 	 */
939 	vm_object_set_flag(object, OBJ_CLEANING);
940 
941 	/*
942 	 * Handle 'entire object' case
943 	 */
944 	info.start_pindex = start;
945 	if (end == 0) {
946 		info.end_pindex = object->size - 1;
947 	} else {
948 		info.end_pindex = end - 1;
949 	}
950 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
951 	info.limit = flags;
952 	info.pagerflags = pagerflags;
953 	info.object = object;
954 
955 	/*
956 	 * If cleaning the entire object do a pass to mark the pages read-only.
957 	 * If everything worked out ok, clear OBJ_WRITEABLE and
958 	 * OBJ_MIGHTBEDIRTY.
959 	 */
960 	if (wholescan) {
961 		info.error = 0;
962 		info.count = 0;
963 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
964 					vm_object_page_clean_pass1, &info);
965 		if (info.error == 0) {
966 			vm_object_clear_flag(object,
967 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
968 			if (object->type == OBJT_VNODE &&
969 			    (vp = (struct vnode *)object->handle) != NULL) {
970 				/*
971 				 * Use new-style interface to clear VISDIRTY
972 				 * because the vnode is not necessarily removed
973 				 * from the syncer list(s) as often as it was
974 				 * under the old interface, which can leave
975 				 * the vnode on the syncer list after reclaim.
976 				 */
977 				vclrobjdirty(vp);
978 			}
979 		}
980 	}
981 
982 	/*
983 	 * Do a pass to clean all the dirty pages we find.
984 	 */
985 	do {
986 		info.error = 0;
987 		info.count = 0;
988 		generation = object->generation;
989 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
990 					vm_object_page_clean_pass2, &info);
991 	} while (info.error || generation != object->generation);
992 
993 	vm_object_clear_flag(object, OBJ_CLEANING);
994 	vm_object_drop(object);
995 }
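
/*
 * Example (illustrative sketch only, not compiled): a synchronous flush
 * of an entire vnode object, similar to the call made from
 * vm_object_terminate().  Passing end == 0 selects the whole object.
 * The helper name is hypothetical.
 */
#if 0
static void
example_flush_vnode_object(vm_object_t object)
{
	vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
}
#endif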
996 
997 /*
998  * The caller must hold the object.
999  */
1000 static
1001 int
1002 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1003 {
1004 	struct rb_vm_page_scan_info *info = data;
1005 
1006 	KKASSERT(p->object == info->object);
1007 
1008 	vm_page_flag_set(p, PG_CLEANCHK);
1009 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1010 		info->error = 1;
1011 	} else if (vm_page_busy_try(p, FALSE)) {
1012 		info->error = 1;
1013 	} else {
1014 		KKASSERT(p->object == info->object);
1015 		vm_page_protect(p, VM_PROT_READ);
1016 		vm_page_wakeup(p);
1017 	}
1018 
1019 	/*
1020 	 * Must be at end to avoid SMP races, caller holds object token
1021 	 */
1022 	if ((++info->count & 63) == 0)
1023 		lwkt_user_yield();
1024 	return(0);
1025 }
1026 
1027 /*
1028  * The caller must hold the object
1029  */
1030 static
1031 int
1032 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1033 {
1034 	struct rb_vm_page_scan_info *info = data;
1035 	int generation;
1036 
1037 	KKASSERT(p->object == info->object);
1038 
1039 	/*
1040 	 * Do not mess with pages that were inserted after we started
1041 	 * the cleaning pass.
1042 	 */
1043 	if ((p->flags & PG_CLEANCHK) == 0)
1044 		goto done;
1045 
1046 	generation = info->object->generation;
1047 
1048 	if (vm_page_busy_try(p, TRUE)) {
1049 		vm_page_sleep_busy(p, TRUE, "vpcwai");
1050 		info->error = 1;
1051 		goto done;
1052 	}
1053 
1054 	KKASSERT(p->object == info->object &&
1055 		 info->object->generation == generation);
1056 
1057 	/*
1058 	 * Before wasting time traversing the pmaps, check for trivial
1059 	 * cases where the page cannot be dirty.
1060 	 */
1061 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1062 		KKASSERT((p->dirty & p->valid) == 0 &&
1063 			 (p->flags & PG_NEED_COMMIT) == 0);
1064 		vm_page_wakeup(p);
1065 		goto done;
1066 	}
1067 
1068 	/*
1069 	 * Check whether the page is dirty or not.  The page has been set
1070 	 * to be read-only so the check will not race a user dirtying the
1071 	 * page.
1072 	 */
1073 	vm_page_test_dirty(p);
1074 	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1075 		vm_page_flag_clear(p, PG_CLEANCHK);
1076 		vm_page_wakeup(p);
1077 		goto done;
1078 	}
1079 
1080 	/*
1081 	 * If we have been asked to skip nosync pages and this is a
1082 	 * nosync page, skip it.  Note that the object flags were
1083 	 * not cleared in this case (because pass1 will have returned an
1084 	 * error), so we do not have to set them.
1085 	 */
1086 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1087 		vm_page_flag_clear(p, PG_CLEANCHK);
1088 		vm_page_wakeup(p);
1089 		goto done;
1090 	}
1091 
1092 	/*
1093 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
1094 	 * the pages that get successfully flushed.  Set info->error if
1095 	 * we raced an object modification.
1096 	 */
1097 	vm_object_page_collect_flush(info->object, p, info->pagerflags);
1098 	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1099 
1100 	/*
1101 	 * Must be at end to avoid SMP races, caller holds object token
1102 	 */
1103 done:
1104 	if ((++info->count & 63) == 0)
1105 		lwkt_user_yield();
1106 	return(0);
1107 }
1108 
1109 /*
1110  * Collect the specified page and nearby pages and flush them out.
1111  * The passed page is busied by the caller and we are responsible for
1112  * its disposition.
1113  *
1114  * The caller must hold the object.
1115  */
1116 static void
1117 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1118 {
1119 	int error;
1120 	int is;
1121 	int ib;
1122 	int i;
1123 	int page_base;
1124 	vm_pindex_t pi;
1125 	vm_page_t ma[BLIST_MAX_ALLOC];
1126 
1127 	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1128 
1129 	pi = p->pindex;
1130 	page_base = pi % BLIST_MAX_ALLOC;
1131 	ma[page_base] = p;
1132 	ib = page_base - 1;
1133 	is = page_base + 1;
1134 
1135 	while (ib >= 0) {
1136 		vm_page_t tp;
1137 
1138 		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1139 					     TRUE, &error);
1140 		if (error)
1141 			break;
1142 		if (tp == NULL)
1143 			break;
1144 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1145 		    (tp->flags & PG_CLEANCHK) == 0) {
1146 			vm_page_wakeup(tp);
1147 			break;
1148 		}
1149 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1150 			vm_page_flag_clear(tp, PG_CLEANCHK);
1151 			vm_page_wakeup(tp);
1152 			break;
1153 		}
1154 		vm_page_test_dirty(tp);
1155 		if ((tp->dirty & tp->valid) == 0 &&
1156 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1157 			vm_page_flag_clear(tp, PG_CLEANCHK);
1158 			vm_page_wakeup(tp);
1159 			break;
1160 		}
1161 		ma[ib] = tp;
1162 		--ib;
1163 	}
1164 	++ib;	/* fixup */
1165 
1166 	while (is < BLIST_MAX_ALLOC &&
1167 	       pi - page_base + is < object->size) {
1168 		vm_page_t tp;
1169 
1170 		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1171 					     TRUE, &error);
1172 		if (error)
1173 			break;
1174 		if (tp == NULL)
1175 			break;
1176 		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
1177 		    (tp->flags & PG_CLEANCHK) == 0) {
1178 			vm_page_wakeup(tp);
1179 			break;
1180 		}
1181 		if ((tp->queue - tp->pc) == PQ_CACHE) {
1182 			vm_page_flag_clear(tp, PG_CLEANCHK);
1183 			vm_page_wakeup(tp);
1184 			break;
1185 		}
1186 		vm_page_test_dirty(tp);
1187 		if ((tp->dirty & tp->valid) == 0 &&
1188 		    (tp->flags & PG_NEED_COMMIT) == 0) {
1189 			vm_page_flag_clear(tp, PG_CLEANCHK);
1190 			vm_page_wakeup(tp);
1191 			break;
1192 		}
1193 		ma[is] = tp;
1194 		++is;
1195 	}
1196 
1197 	/*
1198 	 * All pages in the ma[] array are busied now
1199 	 */
1200 	for (i = ib; i < is; ++i) {
1201 		vm_page_flag_clear(ma[i], PG_CLEANCHK);
1202 		vm_page_hold(ma[i]);	/* XXX need this any more? */
1203 	}
1204 	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1205 	for (i = ib; i < is; ++i)	/* XXX need this any more? */
1206 		vm_page_unhold(ma[i]);
1207 }
1208 
1209 /*
1210  * Same as vm_object_pmap_copy, except range checking really
1211  * works, and is meant for small sections of an object.
1212  *
1213  * This code protects resident pages by making them read-only
1214  * and is typically called on a fork or split when a page
1215  * is converted to copy-on-write.
1216  *
1217  * NOTE: If the page is already at VM_PROT_NONE, calling
1218  * vm_page_protect will have no effect.
1219  */
1220 void
1221 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1222 {
1223 	vm_pindex_t idx;
1224 	vm_page_t p;
1225 
1226 	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
1227 		return;
1228 
1229 	vm_object_hold(object);
1230 	for (idx = start; idx < end; idx++) {
1231 		p = vm_page_lookup(object, idx);
1232 		if (p == NULL)
1233 			continue;
1234 		vm_page_protect(p, VM_PROT_READ);
1235 	}
1236 	vm_object_drop(object);
1237 }
1238 
1239 /*
1240  * Removes all physical pages in the specified object range from all
1241  * physical maps.
1242  *
1243  * The object must *not* be locked.
1244  */
1245 
1246 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
1247 
1248 void
1249 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
1250 {
1251 	struct rb_vm_page_scan_info info;
1252 
1253 	if (object == NULL)
1254 		return;
1255 	if (start == end)
1256 		return;
1257 	info.start_pindex = start;
1258 	info.end_pindex = end - 1;
1259 	info.count = 0;
1260 	info.object = object;
1261 
1262 	vm_object_hold(object);
1263 	do {
1264 		info.error = 0;
1265 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1266 					vm_object_pmap_remove_callback, &info);
1267 	} while (info.error);
1268 	if (start == 0 && end == object->size)
1269 		vm_object_clear_flag(object, OBJ_WRITEABLE);
1270 	vm_object_drop(object);
1271 }
1272 
1273 /*
1274  * The caller must hold the object
1275  */
1276 static int
1277 vm_object_pmap_remove_callback(vm_page_t p, void *data)
1278 {
1279 	struct rb_vm_page_scan_info *info = data;
1280 
1281 	if (info->object != p->object ||
1282 	    p->pindex < info->start_pindex ||
1283 	    p->pindex > info->end_pindex) {
1284 		kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
1285 			info->object, p);
1286 		info->error = 1;
1287 		return(0);
1288 	}
1289 
1290 	vm_page_protect(p, VM_PROT_NONE);
1291 
1292 	/*
1293 	 * Must be at end to avoid SMP races, caller holds object token
1294 	 */
1295 	if ((++info->count & 63) == 0)
1296 		lwkt_user_yield();
1297 	return(0);
1298 }
1299 
1300 /*
1301  * Implements the madvise function at the object/page level.
1302  *
1303  * MADV_WILLNEED	(any object)
1304  *
1305  *	Activate the specified pages if they are resident.
1306  *
1307  * MADV_DONTNEED	(any object)
1308  *
1309  *	Deactivate the specified pages if they are resident.
1310  *
1311  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1312  *
1313  *	Deactivate and clean the specified pages if they are
1314  *	resident.  This permits the process to reuse the pages
1315  *	without faulting or the kernel to reclaim the pages
1316  *	without I/O.
1317  *
1318  * No requirements.
1319  */
1320 void
1321 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1322 		  vm_pindex_t count, int advise)
1323 {
1324 	vm_pindex_t end;
1325 	vm_page_t m;
1326 	int error;
1327 
1328 	if (object == NULL)
1329 		return;
1330 
1331 	end = pindex + count;
1332 
1333 	vm_object_hold(object);
1334 
1335 	/*
1336 	 * Locate and adjust resident pages.  This only applies to the
1337 	 * primary object in the mapping.
1338 	 */
1339 	for (; pindex < end; pindex += 1) {
1340 relookup:
1341 		/*
1342 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1343 		 * and those pages must be OBJ_ONEMAPPING.
1344 		 */
1345 		if (advise == MADV_FREE) {
1346 			if ((object->type != OBJT_DEFAULT &&
1347 			     object->type != OBJT_SWAP) ||
1348 			    (object->flags & OBJ_ONEMAPPING) == 0) {
1349 				continue;
1350 			}
1351 		}
1352 
1353 		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1354 
1355 		if (error) {
1356 			vm_page_sleep_busy(m, TRUE, "madvpo");
1357 			goto relookup;
1358 		}
1359 		if (m == NULL) {
1360 			/*
1361 			 * There may be swap even if there is no backing page
1362 			 */
1363 			if (advise == MADV_FREE && object->type == OBJT_SWAP)
1364 				swap_pager_freespace(object, pindex, 1);
1365 			continue;
1366 		}
1367 
1368 		/*
1369 		 * If the page is not in a normal active state, we skip it.
1370 		 * If the page is not managed there are no page queues to
1371 		 * mess with.  Things can break if we mess with pages in
1372 		 * any of the below states.
1373 		 */
1374 		if (m->wire_count ||
1375 		    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
1376 		    m->valid != VM_PAGE_BITS_ALL
1377 		) {
1378 			vm_page_wakeup(m);
1379 			continue;
1380 		}
1381 
1382 		/*
1383 		 * Theoretically once a page is known not to be busy, an
1384 		 * interrupt cannot come along and rip it out from under us.
1385 		 */
1386 		if (advise == MADV_WILLNEED) {
1387 			vm_page_activate(m);
1388 		} else if (advise == MADV_DONTNEED) {
1389 			vm_page_dontneed(m);
1390 		} else if (advise == MADV_FREE) {
1391 			/*
1392 			 * Mark the page clean.  This will allow the page
1393 			 * to be freed up by the system.  However, such pages
1394 			 * are often reused quickly by malloc()/free()
1395 			 * so we do not do anything that would cause
1396 			 * a page fault if we can help it.
1397 			 *
1398 			 * Specifically, we do not try to actually free
1399 			 * the page now nor do we try to put it in the
1400 			 * cache (which would cause a page fault on reuse).
1401 			 *
1402 			 * But we do make the page as freeable as we
1403 			 * can without actually taking the step of unmapping
1404 			 * it.
1405 			 */
1406 			pmap_clear_modify(m);
1407 			m->dirty = 0;
1408 			m->act_count = 0;
1409 			vm_page_dontneed(m);
1410 			if (object->type == OBJT_SWAP)
1411 				swap_pager_freespace(object, pindex, 1);
1412 		}
1413 		vm_page_wakeup(m);
1414 	}
1415 	vm_object_drop(object);
1416 }
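
/*
 * Example (illustrative sketch only, not compiled): applying MADV_FREE
 * to a single anonymous page.  The object/pindex pair would normally be
 * obtained from a vm_map lookup; the helper name is hypothetical.
 */
#if 0
static void
example_madv_free_one_page(vm_object_t obj, vm_pindex_t pindex)
{
	vm_object_madvise(obj, pindex, 1, MADV_FREE);
}
#endif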
1417 
1418 /*
1419  * Removes all physical pages in the specified object range from the
1420  * object's list of pages.
1421  *
1422  * No requirements.
1423  */
1424 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1425 
1426 void
1427 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1428 		      boolean_t clean_only)
1429 {
1430 	struct rb_vm_page_scan_info info;
1431 	int all;
1432 
1433 	/*
1434 	 * Degenerate cases and assertions
1435 	 */
1436 	vm_object_hold(object);
1437 	if (object == NULL ||
1438 	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
1439 		vm_object_drop(object);
1440 		return;
1441 	}
1442 	KASSERT(object->type != OBJT_PHYS,
1443 		("attempt to remove pages from a physical object"));
1444 
1445 	/*
1446 	 * Indicate that paging is occurring on the object
1447 	 */
1448 	vm_object_pip_add(object, 1);
1449 
1450 	/*
1451 	 * Figure out the actual removal range and whether we are removing
1452 	 * the entire contents of the object or not.  If removing the entire
1453 	 * contents, be sure to get all pages, even those that might be
1454 	 * beyond the end of the object.
1455 	 */
1456 	info.object = object;
1457 	info.start_pindex = start;
1458 	if (end == 0)
1459 		info.end_pindex = (vm_pindex_t)-1;
1460 	else
1461 		info.end_pindex = end - 1;
1462 	info.limit = clean_only;
1463 	info.count = 0;
1464 	all = (start == 0 && info.end_pindex >= object->size - 1);
1465 
1466 	/*
1467 	 * Loop until we are sure we have gotten them all.
1468 	 */
1469 	do {
1470 		info.error = 0;
1471 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1472 					vm_object_page_remove_callback, &info);
1473 	} while (info.error);
1474 
1475 	/*
1476 	 * Remove any related swap if throwing away pages, or for
1477 	 * non-swap objects (the swap is a clean copy in that case).
1478 	 */
1479 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1480 		if (all)
1481 			swap_pager_freespace_all(object);
1482 		else
1483 			swap_pager_freespace(object, info.start_pindex,
1484 			     info.end_pindex - info.start_pindex + 1);
1485 	}
1486 
1487 	/*
1488 	 * Cleanup
1489 	 */
1490 	vm_object_pip_wakeup(object);
1491 	vm_object_drop(object);
1492 }
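
/*
 * Example (illustrative sketch only, not compiled): discarding every page
 * from "newsize" (in pages) to the end of the object, as a truncation
 * path might.  clean_only == FALSE throws away dirty pages as well.  The
 * helper name and "newsize" argument are hypothetical.
 */
#if 0
static void
example_truncate_object(vm_object_t object, vm_pindex_t newsize)
{
	vm_object_page_remove(object, newsize, 0, FALSE);
}
#endif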
1493 
1494 /*
1495  * The caller must hold the object.
1496  *
1497  * NOTE: User yields are allowed when removing more than one page, but not
1498  *	 allowed if only removing one page (the path for single page removals
1499  *	 might hold a spinlock).
1500  */
1501 static int
1502 vm_object_page_remove_callback(vm_page_t p, void *data)
1503 {
1504 	struct rb_vm_page_scan_info *info = data;
1505 
1506 	if (info->object != p->object ||
1507 	    p->pindex < info->start_pindex ||
1508 	    p->pindex > info->end_pindex) {
1509 		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1510 			info->object, p);
1511 		return(0);
1512 	}
1513 	if (vm_page_busy_try(p, TRUE)) {
1514 		vm_page_sleep_busy(p, TRUE, "vmopar");
1515 		info->error = 1;
1516 		return(0);
1517 	}
1518 	if (info->object != p->object) {
1519 		/* this should never happen */
1520 		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1521 			info->object, p);
1522 		vm_page_wakeup(p);
1523 		return(0);
1524 	}
1525 
1526 	/*
1527 	 * Wired pages cannot be destroyed, but they can be invalidated
1528 	 * and we do so if clean_only (limit) is not set.
1529 	 *
1530 	 * WARNING!  The page may be wired due to being part of a buffer
1531 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1532 	 *	     This is fine as part of a truncation but VFSs must be
1533 	 *	     sure to fix the buffer up when re-extending the file.
1534 	 *
1535 	 * NOTE!     PG_NEED_COMMIT is ignored.
1536 	 */
1537 	if (p->wire_count != 0) {
1538 		vm_page_protect(p, VM_PROT_NONE);
1539 		if (info->limit == 0)
1540 			p->valid = 0;
1541 		vm_page_wakeup(p);
1542 		goto done;
1543 	}
1544 
1545 	/*
1546 	 * limit is our clean_only flag.  If set and the page is dirty or
1547 	 * requires a commit, do not free it.
1549 	 */
1550 	if (info->limit && p->valid) {
1551 		vm_page_test_dirty(p);
1552 		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1553 			vm_page_wakeup(p);
1554 			goto done;
1555 		}
1556 	}
1557 
1558 	/*
1559 	 * Destroy the page
1560 	 */
1561 	vm_page_protect(p, VM_PROT_NONE);
1562 	vm_page_free(p);
1563 
1564 	/*
1565 	 * Must be at end to avoid SMP races, caller holds object token
1566 	 */
1567 done:
1568 	if ((++info->count & 63) == 0)
1569 		lwkt_user_yield();
1570 
1571 	return(0);
1572 }
1573 
1574 /*
1575  * Try to extend prev_object into an adjoining region of virtual
1576  * memory, return TRUE on success.
1577  *
1578  * The caller does not need to hold (prev_object) but must have a stable
1579  * pointer to it (typically by holding the vm_map locked).
1580  *
1581  * This function only works for anonymous memory objects which either
1582  * have (a) one reference or (b) we are extending the object's size.
1583  * Otherwise the related VM pages we want to use for the object might
1584  * be in use by another mapping.
1585  */
1586 boolean_t
1587 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1588 		   vm_size_t prev_size, vm_size_t next_size)
1589 {
1590 	vm_pindex_t next_pindex;
1591 
1592 	if (prev_object == NULL)
1593 		return (TRUE);
1594 
1595 	vm_object_hold(prev_object);
1596 
1597 	if (prev_object->type != OBJT_DEFAULT &&
1598 	    prev_object->type != OBJT_SWAP) {
1599 		vm_object_drop(prev_object);
1600 		return (FALSE);
1601 	}
1602 
1603 #if 0
1604 	/* caller now checks this */
1605 	/*
1606 	 * Try to collapse the object first
1607 	 */
1608 	vm_object_collapse(prev_object, NULL);
1609 #endif
1610 
1611 #if 0
1612 	/* caller now checks this */
1613 	/*
1614 	 * We can't coalesce if we shadow another object (figuring out the
1615 	 * relationships become too complex).
1616 	 * relationships becomes too complex).
1617 	if (prev_object->backing_object != NULL) {
1618 		vm_object_chain_release(prev_object);
1619 		vm_object_drop(prev_object);
1620 		return (FALSE);
1621 	}
1622 #endif
1623 
1624 	prev_size >>= PAGE_SHIFT;
1625 	next_size >>= PAGE_SHIFT;
1626 	next_pindex = prev_pindex + prev_size;
1627 
1628 	/*
1629 	 * We can't coalesce if the object has more than one reference unless
1630 	 * we are extending it into newly minted space.
1631 	 */
1632 	if (prev_object->ref_count > 1 &&
1633 	    prev_object->size != next_pindex) {
1634 		vm_object_drop(prev_object);
1635 		return (FALSE);
1636 	}
1637 
1638 	/*
1639 	 * Remove any pages that may still be in the object from a previous
1640 	 * deallocation.
1641 	 */
1642 	if (next_pindex < prev_object->size) {
1643 		vm_object_page_remove(prev_object,
1644 				      next_pindex,
1645 				      next_pindex + next_size, FALSE);
1646 		if (prev_object->type == OBJT_SWAP)
1647 			swap_pager_freespace(prev_object,
1648 					     next_pindex, next_size);
1649 	}
1650 
1651 	/*
1652 	 * Extend the object if necessary.
1653 	 */
1654 	if (next_pindex + next_size > prev_object->size)
1655 		prev_object->size = next_pindex + next_size;
1656 	vm_object_drop(prev_object);
1657 
1658 	return (TRUE);
1659 }
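
/*
 * Example (illustrative sketch only, not compiled): attempting to extend
 * an existing anonymous mapping by one page.  On success the new range
 * can be served by prev_object and no new object needs to be allocated;
 * prev_size and the PAGE_SIZE extension are byte counts.  The helper
 * name is hypothetical.
 */
#if 0
static boolean_t
example_grow_anon_mapping(vm_object_t prev_object, vm_pindex_t prev_pindex,
			  vm_size_t prev_size)
{
	return (vm_object_coalesce(prev_object, prev_pindex,
				   prev_size, PAGE_SIZE));
}
#endif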
1660 
1661 /*
1662  * Make the object writable and flag it as possibly being dirty.
1663  *
1664  * The object might not be held (or might be held but held shared),
1665  * the related vnode is probably not held either.  Object and vnode are
1666  * stable by virtue of the vm_page busied by the caller preventing
1667  * destruction.
1668  *
1669  * If the related mount is flagged MNTK_THR_SYNC we need to call
1670  * vsetobjdirty().  Filesystems using this option usually shortcut
1671  * synchronization by only scanning the syncer list.
1672  */
1673 void
1674 vm_object_set_writeable_dirty(vm_object_t object)
1675 {
1676 	struct vnode *vp;
1677 
1678 	/*vm_object_assert_held(object);*/
1679 	/*
1680 	 * Avoid contention in vm fault path by checking the state before
1681 	 * issuing an atomic op on it.
1682 	 */
1683 	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1684 	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1685 		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1686 	}
1687 	if (object->type == OBJT_VNODE &&
1688 	    (vp = (struct vnode *)object->handle) != NULL) {
1689 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1690 			if (vp->v_mount &&
1691 			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1692 				/*
1693 				 * New style THR_SYNC places vnodes on the
1694 				 * syncer list more deterministically.
1695 				 */
1696 				vsetobjdirty(vp);
1697 			} else {
1698 				/*
1699 				 * Old style scan would not necessarily place
1700 				 * a vnode on the syncer list when possibly
1701 				 * modified via mmap.
1702 				 */
1703 				vsetflags(vp, VOBJDIRTY);
1704 			}
1705 		}
1706 	}
1707 }
1708 
1709 #include "opt_ddb.h"
1710 #ifdef DDB
1711 #include <sys/cons.h>
1712 
1713 #include <ddb/ddb.h>
1714 
1715 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1716 				       vm_map_entry_t entry);
1717 static int	vm_object_in_map (vm_object_t object);
1718 
1719 /*
1720  * The caller must hold the object.
1721  */
1722 static int
1723 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1724 {
1725 	vm_map_backing_t ba;
1726 	vm_map_t tmpm;
1727 	vm_map_entry_t tmpe;
1728 	int entcount;
1729 
1730 	if (map == NULL)
1731 		return 0;
1732 	if (entry == NULL) {
1733 		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1734 		entcount = map->nentries;
1735 		while (entcount-- && tmpe) {
1736 			if( _vm_object_in_map(map, object, tmpe)) {
1737 				return 1;
1738 			}
1739 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1740 		}
1741 		return (0);
1742 	}
1743 	switch(entry->maptype) {
1744 	case VM_MAPTYPE_SUBMAP:
1745 		tmpm = entry->ba.sub_map;
1746 		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1747 		entcount = tmpm->nentries;
1748 		while (entcount-- && tmpe) {
1749 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1750 				return 1;
1751 			}
1752 			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1753 		}
1754 		break;
1755 	case VM_MAPTYPE_NORMAL:
1756 	case VM_MAPTYPE_VPAGETABLE:
1757 		ba = &entry->ba;
1758 		while (ba) {
1759 			if (ba->object == object)
1760 				return TRUE;
1761 			ba = ba->backing_ba;
1762 		}
1763 		break;
1764 	default:
1765 		break;
1766 	}
1767 	return 0;
1768 }
1769 
1770 static int vm_object_in_map_callback(struct proc *p, void *data);
1771 
1772 struct vm_object_in_map_info {
1773 	vm_object_t object;
1774 	int rv;
1775 };
1776 
1777 /*
1778  * Debugging only
1779  */
1780 static int
1781 vm_object_in_map(vm_object_t object)
1782 {
1783 	struct vm_object_in_map_info info;
1784 
1785 	info.rv = 0;
1786 	info.object = object;
1787 
1788 	allproc_scan(vm_object_in_map_callback, &info, 0);
1789 	if (info.rv)
1790 		return 1;
1791 	if( _vm_object_in_map(&kernel_map, object, 0))
1792 		return 1;
1793 	if( _vm_object_in_map(&pager_map, object, 0))
1794 		return 1;
1795 	if( _vm_object_in_map(&buffer_map, object, 0))
1796 		return 1;
1797 	return 0;
1798 }
1799 
1800 /*
1801  * Debugging only
1802  */
1803 static int
1804 vm_object_in_map_callback(struct proc *p, void *data)
1805 {
1806 	struct vm_object_in_map_info *info = data;
1807 
1808 	if (p->p_vmspace) {
1809 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1810 			info->rv = 1;
1811 			return -1;
1812 		}
1813 	}
1814 	return (0);
1815 }
1816 
1817 DB_SHOW_COMMAND(vmochk, vm_object_check)
1818 {
1819 	struct vm_object_hash *hash;
1820 	vm_object_t object;
1821 	int n;
1822 
1823 	/*
1824 	 * make sure that internal objs are in a map somewhere
1825 	 * and none have zero ref counts.
1826 	 */
1827 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1828 		hash = &vm_object_hash[n];
1829 		for (object = TAILQ_FIRST(&hash->list);
1830 				object != NULL;
1831 				object = TAILQ_NEXT(object, object_list)) {
1832 			if (object->type == OBJT_MARKER)
1833 				continue;
1834 			if (object->handle != NULL ||
1835 			    (object->type != OBJT_DEFAULT &&
1836 			     object->type != OBJT_SWAP)) {
1837 				continue;
1838 			}
1839 			if (object->ref_count == 0) {
1840 				db_printf("vmochk: internal obj has "
1841 					  "zero ref count: %ld\n",
1842 					  (long)object->size);
1843 			}
1844 			if (vm_object_in_map(object))
1845 				continue;
1846 			db_printf("vmochk: internal obj is not in a map: "
1847 				  "ref: %d, size: %lu: 0x%lx\n",
1848 				  object->ref_count, (u_long)object->size,
1849 				  (u_long)object->size);
1850 		}
1851 	}
1852 }
1853 
1854 /*
1855  * Debugging only
1856  */
1857 DB_SHOW_COMMAND(object, vm_object_print_static)
1858 {
1859 	/* XXX convert args. */
1860 	vm_object_t object = (vm_object_t)addr;
1861 	boolean_t full = have_addr;
1862 
1863 	vm_page_t p;
1864 
1865 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1866 #define	count	was_count
1867 
1868 	int count;
1869 
1870 	if (object == NULL)
1871 		return;
1872 
1873 	db_iprintf(
1874 	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1875 	    object, (int)object->type, (u_long)object->size,
1876 	    object->resident_page_count, object->ref_count, object->flags);
1877 	/*
1878 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1879 	 */
1880 	db_iprintf("\n");
1881 
1882 	if (!full)
1883 		return;
1884 
1885 	db_indent += 2;
1886 	count = 0;
1887 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1888 		if (count == 0)
1889 			db_iprintf("memory:=");
1890 		else if (count == 6) {
1891 			db_printf("\n");
1892 			db_iprintf(" ...");
1893 			count = 0;
1894 		} else
1895 			db_printf(",");
1896 		count++;
1897 
1898 		db_printf("(off=0x%lx,page=0x%lx)",
1899 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1900 	}
1901 	if (count != 0)
1902 		db_printf("\n");
1903 	db_indent -= 2;
1904 }
1905 
1906 /* XXX. */
1907 #undef count
1908 
1909 /*
1910  * XXX need this non-static entry for calling from vm_map_print.
1911  *
1912  * Debugging only
1913  */
1914 void
1915 vm_object_print(/* db_expr_t */ long addr,
1916 		boolean_t have_addr,
1917 		/* db_expr_t */ long count,
1918 		char *modif)
1919 {
1920 	vm_object_print_static(addr, have_addr, count, modif);
1921 }
1922 
1923 /*
1924  * Debugging only
1925  */
1926 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1927 {
1928 	struct vm_object_hash *hash;
1929 	vm_object_t object;
1930 	int nl = 0;
1931 	int c;
1932 	int n;
1933 
1934 	for (n = 0; n < VMOBJ_HSIZE; ++n) {
1935 		hash = &vm_object_hash[n];
1936 		for (object = TAILQ_FIRST(&hash->list);
1937 				object != NULL;
1938 				object = TAILQ_NEXT(object, object_list)) {
1939 			vm_pindex_t idx, fidx;
1940 			vm_pindex_t osize;
1941 			vm_paddr_t pa = -1, padiff;
1942 			int rcount;
1943 			vm_page_t m;
1944 
1945 			if (object->type == OBJT_MARKER)
1946 				continue;
1947 			db_printf("new object: %p\n", (void *)object);
1948 			if ( nl > 18) {
1949 				c = cngetc();
1950 				if (c != ' ')
1951 					return;
1952 				nl = 0;
1953 			}
1954 			nl++;
1955 			rcount = 0;
1956 			fidx = 0;
1957 			osize = object->size;
1958 			if (osize > 128)
1959 				osize = 128;
1960 			for (idx = 0; idx < osize; idx++) {
1961 				m = vm_page_lookup(object, idx);
1962 				if (m == NULL) {
1963 					if (rcount) {
1964 						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1965 							(long)fidx, rcount, (long)pa);
1966 						if ( nl > 18) {
1967 							c = cngetc();
1968 							if (c != ' ')
1969 								return;
1970 							nl = 0;
1971 						}
1972 						nl++;
1973 						rcount = 0;
1974 					}
1975 					continue;
1976 				}
1977 
1978 				if (rcount &&
1979 					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1980 					++rcount;
1981 					continue;
1982 				}
1983 				if (rcount) {
1984 					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1985 					padiff >>= PAGE_SHIFT;
1986 					padiff &= PQ_L2_MASK;
1987 					if (padiff == 0) {
1988 						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
1989 						++rcount;
1990 						continue;
1991 					}
1992 					db_printf(" index(%ld)run(%d)pa(0x%lx)",
1993 						(long)fidx, rcount, (long)pa);
1994 					db_printf("pd(%ld)\n", (long)padiff);
1995 					if ( nl > 18) {
1996 						c = cngetc();
1997 						if (c != ' ')
1998 							return;
1999 						nl = 0;
2000 					}
2001 					nl++;
2002 				}
2003 				fidx = idx;
2004 				pa = VM_PAGE_TO_PHYS(m);
2005 				rcount = 1;
2006 			}
2007 			if (rcount) {
2008 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2009 					(long)fidx, rcount, (long)pa);
2010 				if ( nl > 18) {
2011 					c = cngetc();
2012 					if (c != ' ')
2013 						return;
2014 					nl = 0;
2015 				}
2016 				nl++;
2017 			}
2018 		}
2019 	}
2020 }
2021 #endif /* DDB */
2022