xref: /dragonfly/sys/vm/vm_object.c (revision dcd37f7d)
1 /*
2  * (MPSAFE)
3  *
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * The Mach Operating System project at Carnegie-Mellon University.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
39  *
40  *
41  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42  * All rights reserved.
43  *
44  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45  *
46  * Permission to use, copy, modify and distribute this software and
47  * its documentation is hereby granted, provided that both the copyright
48  * notice and this permission notice appear in all copies of the
49  * software, derivative works or modified versions, and any portions
50  * thereof, and that both notices appear in supporting documentation.
51  *
52  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55  *
56  * Carnegie Mellon requests users of this software to return to
57  *
58  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
59  *  School of Computer Science
60  *  Carnegie Mellon University
61  *  Pittsburgh PA 15213-3890
62  *
63  * any improvements or extensions that they make and grant Carnegie the
64  * rights to redistribute these changes.
65  *
66  * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
67  * $DragonFly: src/sys/vm/vm_object.c,v 1.33 2008/05/09 07:24:48 dillon Exp $
68  */
69 
70 /*
71  *	Virtual memory object module.
72  */
73 
74 #include <sys/param.h>
75 #include <sys/systm.h>
76 #include <sys/proc.h>		/* for curproc, pageproc */
77 #include <sys/vnode.h>
78 #include <sys/vmmeter.h>
79 #include <sys/mman.h>
80 #include <sys/mount.h>
81 #include <sys/kernel.h>
82 #include <sys/sysctl.h>
83 
84 #include <vm/vm.h>
85 #include <vm/vm_param.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pageout.h>
91 #include <vm/vm_pager.h>
92 #include <vm/swap_pager.h>
93 #include <vm/vm_kern.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vm_zone.h>
96 
97 #define EASY_SCAN_FACTOR	8
98 
99 static void	vm_object_qcollapse(vm_object_t object);
100 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
101 					     int pagerflags);
102 
103 /*
104  *	Virtual memory objects maintain the actual data
105  *	associated with allocated virtual memory.  A given
106  *	page of memory exists within exactly one object.
107  *
108  *	An object is only deallocated when all "references"
109  *	are given up.  Only one "reference" to a given
110  *	region of an object should be writeable.
111  *
112  *	Associated with each object is a list of all resident
113  *	memory pages belonging to that object; this list is
114  *	maintained by the "vm_page" module, and locked by the object's
115  *	lock.
116  *
117  *	Each object also records a "pager" routine which is
118  *	used to retrieve (and store) pages to the proper backing
119  *	storage.  In addition, objects may be backed by other
120  *	objects from which they were virtual-copied.
121  *
122  *	The only items within the object structure which are
123  *	modified after time of creation are:
124  *		reference count		locked by object's lock
125  *		pager routine		locked by object's lock
126  *
127  */
128 
129 struct object_q vm_object_list;
130 struct vm_object kernel_object;
131 
132 static long vm_object_count;		/* count of all objects */
133 extern int vm_pageout_page_count;
134 
135 static long object_collapses;
136 static long object_bypasses;
137 static int next_index;
138 static vm_zone_t obj_zone;
139 static struct vm_zone obj_zone_store;
140 static int object_hash_rand;
141 #define VM_OBJECTS_INIT 256
142 static struct vm_object vm_objects_init[VM_OBJECTS_INIT];
143 
144 /*
145  * Initialize a freshly allocated object
146  *
147  * Used only by vm_object_allocate() and zinitna().
148  *
149  * No requirements.
150  */
151 void
152 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
153 {
154 	int incr;
155 
156 	RB_INIT(&object->rb_memq);
157 	LIST_INIT(&object->shadow_head);
158 
159 	object->type = type;
160 	object->size = size;
161 	object->ref_count = 1;
162 	object->flags = 0;
163 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
164 		vm_object_set_flag(object, OBJ_ONEMAPPING);
165 	object->paging_in_progress = 0;
166 	object->resident_page_count = 0;
167 	object->shadow_count = 0;
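	/*
	 * Assign a starting page color and advance the global color index
	 * so successive objects are spread across the PQ_L2 page coloring
	 * queues.
	 */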
168 	object->pg_color = next_index;
169 	if ( size > (PQ_L2_SIZE / 3 + PQ_PRIME1))
170 		incr = PQ_L2_SIZE / 3 + PQ_PRIME1;
171 	else
172 		incr = size;
173 	next_index = (next_index + incr) & PQ_L2_MASK;
174 	object->handle = NULL;
175 	object->backing_object = NULL;
176 	object->backing_object_offset = (vm_ooffset_t) 0;
177 	/*
178 	 * Try to generate a number that will spread objects out in the
179 	 * hash table.  We 'wipe' new objects across the hash in 128 page
180 	 * increments plus 1 more to offset it a little more by the time
181 	 * it wraps around.
182 	 */
183 	object->hash_rand = object_hash_rand - 129;
184 
185 	object->generation++;
186 	object->swblock_count = 0;
187 	RB_INIT(&object->swblock_root);
188 
189 	crit_enter();
190 	lwkt_gettoken(&vm_token);
191 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
192 	vm_object_count++;
193 	object_hash_rand = object->hash_rand;
194 	lwkt_reltoken(&vm_token);
195 	crit_exit();
196 }
197 
198 /*
199  * Initialize the VM objects module.
200  *
201  * Called from the low level boot code only.
202  */
203 void
204 vm_object_init(void)
205 {
206 	TAILQ_INIT(&vm_object_list);
207 
208 	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
209 			    &kernel_object);
210 
211 	obj_zone = &obj_zone_store;
212 	zbootinit(obj_zone, "VM OBJECT", sizeof (struct vm_object),
213 		vm_objects_init, VM_OBJECTS_INIT);
214 }
215 
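/*
 * Finish initializing the object zone bootstrapped in vm_object_init().
 *
 * No requirements.
 */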
216 void
217 vm_object_init2(void)
218 {
219 	zinitna(obj_zone, NULL, NULL, 0, 0, ZONE_PANICFAIL, 1);
220 }
221 
222 /*
223  * Allocate and return a new object of the specified type and size.
224  *
225  * No requirements.
226  */
227 vm_object_t
228 vm_object_allocate(objtype_t type, vm_pindex_t size)
229 {
230 	vm_object_t result;
231 
232 	result = (vm_object_t) zalloc(obj_zone);
233 
234 	_vm_object_allocate(type, size, result);
235 
236 	return (result);
237 }
238 
239 /*
240  * Add an additional reference to a vm_object.
241  *
242  * The object passed by the caller must be stable, or the caller must
243  * already hold vm_token to avoid races.
244  */
245 void
246 vm_object_reference(vm_object_t object)
247 {
248 	if (object == NULL)
249 		return;
250 
251 	lwkt_gettoken(&vm_token);
252 	object->ref_count++;
253 	if (object->type == OBJT_VNODE) {
254 		vref(object->handle);
255 		/* XXX what if the vnode is being destroyed? */
256 	}
257 	lwkt_reltoken(&vm_token);
258 }
259 
260 /*
261  * Dereference an object and its underlying vnode.
262  *
263  * The caller must hold vm_token.
264  */
265 static void
266 vm_object_vndeallocate(vm_object_t object)
267 {
268 	struct vnode *vp = (struct vnode *) object->handle;
269 
270 	KASSERT(object->type == OBJT_VNODE,
271 	    ("vm_object_vndeallocate: not a vnode object"));
272 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
273 #ifdef INVARIANTS
274 	if (object->ref_count == 0) {
275 		vprint("vm_object_vndeallocate", vp);
276 		panic("vm_object_vndeallocate: bad object reference count");
277 	}
278 #endif
279 
280 	object->ref_count--;
281 	if (object->ref_count == 0)
282 		vclrflags(vp, VTEXT);
283 	vrele(vp);
284 }
285 
286 /*
287  * Release a reference to the specified object, gained either through a
288  * vm_object_allocate or a vm_object_reference call.  When all references
289  * are gone, storage associated with this object may be relinquished.
290  *
291  * The object must not be locked.
292  */
293 void
294 vm_object_deallocate(vm_object_t object)
295 {
296 	vm_object_t temp;
297 
298 	lwkt_gettoken(&vm_token);
299 
300 	while (object != NULL) {
301 		if (object->type == OBJT_VNODE) {
302 			vm_object_vndeallocate(object);
303 			break;
304 		}
305 
306 		if (object->ref_count == 0) {
307 			panic("vm_object_deallocate: object deallocated "
308 			      "too many times: %d", object->type);
309 		}
310 		if (object->ref_count > 2) {
311 			object->ref_count--;
312 			break;
313 		}
314 
315 		/*
316 		 * Here on ref_count of one or two, which are special cases for
317 		 * objects.
318 		 */
319 		if ((object->ref_count == 2) && (object->shadow_count == 0)) {
320 			vm_object_set_flag(object, OBJ_ONEMAPPING);
321 			object->ref_count--;
322 			break;
323 		}
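		/*
		 * ref_count == 2 with exactly one shadow object.  If both
		 * this object and its shadow are anonymous (no handle,
		 * OBJT_DEFAULT or OBJT_SWAP), try to collapse the shadow
		 * chain rather than leave a useless intermediate object.
		 */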
324 		if ((object->ref_count == 2) && (object->shadow_count == 1)) {
325 			object->ref_count--;
326 			if ((object->handle == NULL) &&
327 			    (object->type == OBJT_DEFAULT ||
328 			     object->type == OBJT_SWAP)) {
329 				vm_object_t robject;
330 
331 				robject = LIST_FIRST(&object->shadow_head);
332 				KASSERT(robject != NULL,
333 					("vm_object_deallocate: ref_count: "
334 					"%d, shadow_count: %d",
335 					object->ref_count,
336 					object->shadow_count));
337 
338 				if ((robject->handle == NULL) &&
339 				    (robject->type == OBJT_DEFAULT ||
340 				     robject->type == OBJT_SWAP)) {
341 
342 					robject->ref_count++;
343 
344 					while (
345 						robject->paging_in_progress ||
346 						object->paging_in_progress
347 					) {
348 						vm_object_pip_sleep(robject, "objde1");
349 						vm_object_pip_sleep(object, "objde2");
350 					}
351 
352 					if (robject->ref_count == 1) {
353 						robject->ref_count--;
354 						object = robject;
355 						goto doterm;
356 					}
357 
358 					object = robject;
359 					vm_object_collapse(object);
360 					continue;
361 				}
362 			}
363 			break;
364 		}
365 
366 		/*
367 		 * Normal dereferencing path
368 		 */
369 		object->ref_count--;
370 		if (object->ref_count != 0)
371 			break;
372 
373 		/*
374 		 * Termination path
375 		 */
376 doterm:
377 
378 		temp = object->backing_object;
379 		if (temp) {
380 			LIST_REMOVE(object, shadow_list);
381 			temp->shadow_count--;
382 			temp->generation++;
383 			object->backing_object = NULL;
384 		}
385 
386 		/*
387 		 * Don't double-terminate; we could be in a termination
388 		 * recursion due to the terminate having to sync data
389 		 * to disk.
390 		 */
391 		if ((object->flags & OBJ_DEAD) == 0)
392 			vm_object_terminate(object);
393 		object = temp;
394 	}
395 	lwkt_reltoken(&vm_token);
396 }
397 
398 /*
399  * Destroy the specified object, freeing up related resources.
400  *
401  * The object must have zero references.
402  *
403  * The caller must be holding vm_token and properly interlock with
404  * OBJ_DEAD.
405  */
406 static int vm_object_terminate_callback(vm_page_t p, void *data);
407 
408 void
409 vm_object_terminate(vm_object_t object)
410 {
411 	/*
412 	 * Make sure no one uses us.
413 	 */
414 	ASSERT_LWKT_TOKEN_HELD(&vm_token);
415 	vm_object_set_flag(object, OBJ_DEAD);
416 
417 	/*
418 	 * Wait for the pageout daemon to be done with the object.
419 	 */
420 	vm_object_pip_wait(object, "objtrm");
421 
422 	KASSERT(!object->paging_in_progress,
423 		("vm_object_terminate: pageout in progress"));
424 
425 	/*
426 	 * Clean and free the pages, as appropriate. All references to the
427 	 * object are gone, so we don't need to lock it.
428 	 */
429 	if (object->type == OBJT_VNODE) {
430 		struct vnode *vp;
431 
432 		/*
433 		 * Clean pages and flush buffers.
434 		 */
435 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
436 
437 		vp = (struct vnode *) object->handle;
438 		vinvalbuf(vp, V_SAVE, 0, 0);
439 	}
440 
441 	/*
442 	 * Wait for any I/O to complete, after which there had better not
443 	 * be any references left on the object.
444 	 */
445 	vm_object_pip_wait(object, "objtrm");
446 
447 	if (object->ref_count != 0)
448 		panic("vm_object_terminate: object with references, ref_count=%d", object->ref_count);
449 
450 	/*
451 	 * Now free any remaining pages. For internal objects, this also
452 	 * removes them from paging queues. Don't free wired pages; just
453 	 * remove them from the object.
454 	 */
455 	crit_enter();
456 	vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
457 				vm_object_terminate_callback, NULL);
458 	crit_exit();
459 
460 	/*
461 	 * Let the pager know object is dead.
462 	 */
463 	vm_pager_deallocate(object);
464 
465 	/*
466 	 * Remove the object from the global object list.
467 	 */
468 	crit_enter();
469 	TAILQ_REMOVE(&vm_object_list, object, object_list);
470 	vm_object_count--;
471 	crit_exit();
472 
473 	vm_object_dead_wakeup(object);
474 	if (object->ref_count != 0)
475 		panic("vm_object_terminate2: object with references, ref_count=%d", object->ref_count);
476 
477 	/*
478 	 * Free the space for the object.
479 	 */
480 	zfree(obj_zone, object);
481 }
482 
483 /*
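 * vm_object_terminate() scan callback: free each remaining page, or,
 * if the page is wired, just remove it from the object without
 * freeing it.
 *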
484  * The caller must hold vm_token.
485  */
486 static int
487 vm_object_terminate_callback(vm_page_t p, void *data __unused)
488 {
489 	if (p->busy || (p->flags & PG_BUSY))
490 		panic("vm_object_terminate: freeing busy page %p", p);
491 	if (p->wire_count == 0) {
492 		vm_page_busy(p);
493 		vm_page_free(p);
494 		mycpu->gd_cnt.v_pfree++;
495 	} else {
496 		if (p->queue != PQ_NONE)
497 			kprintf("vm_object_terminate: Warning: Encountered wired page %p on queue %d\n", p, p->queue);
498 		vm_page_busy(p);
499 		vm_page_remove(p);
500 		vm_page_wakeup(p);
501 	}
502 	return(0);
503 }
504 
505 /*
506  * The object is dead but still has an object<->pager association.  Sleep
507  * and return.  The caller typically retests the association in a loop.
508  *
509  * No requirements.
510  */
511 void
512 vm_object_dead_sleep(vm_object_t object, const char *wmesg)
513 {
514 	crit_enter();
515 	lwkt_gettoken(&vm_token);
516 	if (object->handle) {
517 		vm_object_set_flag(object, OBJ_DEADWNT);
518 		tsleep(object, 0, wmesg, 0);
519 	}
520 	lwkt_reltoken(&vm_token);
521 	crit_exit();
522 }
523 
524 /*
525  * Wakeup anyone waiting for the object<->pager disassociation on
526  * a dead object.
527  *
528  * No requirements.
529  */
530 void
531 vm_object_dead_wakeup(vm_object_t object)
532 {
533 	lwkt_gettoken(&vm_token);
534 	if (object->flags & OBJ_DEADWNT) {
535 		vm_object_clear_flag(object, OBJ_DEADWNT);
536 		wakeup(object);
537 	}
538 	lwkt_reltoken(&vm_token);
539 }
540 
541 /*
542  * Clean all dirty pages in the specified range of the object.  Leaves
543  * the page on whatever queue it is currently on.  If NOSYNC is set then
544  * do not write out pages with PG_NOSYNC set (which originally comes from
545  * MAP_NOSYNC), leaving the object dirty.
546  *
547  * When stuffing pages asynchronously, allow clustering.  XXX we need a
548  * synchronous clustering mode implementation.
549  *
550  * Odd semantics: if end == 0 we clean from start to the end of the object.
551  *
552  * The object must be locked? XXX
553  */
554 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
555 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
556 
557 void
558 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
559 		     int flags)
560 {
561 	struct rb_vm_page_scan_info info;
562 	struct vnode *vp;
563 	int wholescan;
564 	int pagerflags;
565 	int curgeneration;
566 
567 	lwkt_gettoken(&vm_token);
568 	if (object->type != OBJT_VNODE ||
569 	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
570 		lwkt_reltoken(&vm_token);
571 		return;
572 	}
573 
574 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
575 			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
576 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
577 
578 	vp = object->handle;
579 
580 	/*
581 	 * Interlock other major object operations.  This allows us to
582 	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
583 	 */
584 	crit_enter();
585 	vm_object_set_flag(object, OBJ_CLEANING);
586 
587 	/*
588 	 * Handle 'entire object' case
589 	 */
590 	info.start_pindex = start;
591 	if (end == 0) {
592 		info.end_pindex = object->size - 1;
593 	} else {
594 		info.end_pindex = end - 1;
595 	}
596 	wholescan = (start == 0 && info.end_pindex == object->size - 1);
597 	info.limit = flags;
598 	info.pagerflags = pagerflags;
599 	info.object = object;
600 
601 	/*
602 	 * If cleaning the entire object do a pass to mark the pages read-only.
603 	 * If everything worked out ok, clear OBJ_WRITEABLE and
604 	 * OBJ_MIGHTBEDIRTY.
605 	 */
606 	if (wholescan) {
607 		info.error = 0;
608 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
609 					vm_object_page_clean_pass1, &info);
610 		if (info.error == 0) {
611 			vm_object_clear_flag(object,
612 					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
613 			if (object->type == OBJT_VNODE &&
614 			    (vp = (struct vnode *)object->handle) != NULL) {
615 				if (vp->v_flag & VOBJDIRTY)
616 					vclrflags(vp, VOBJDIRTY);
617 			}
618 		}
619 	}
620 
621 	/*
622 	 * Do a pass to clean all the dirty pages we find.
623 	 */
624 	do {
625 		info.error = 0;
626 		curgeneration = object->generation;
627 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
628 					vm_object_page_clean_pass2, &info);
629 	} while (info.error || curgeneration != object->generation);
630 
631 	vm_object_clear_flag(object, OBJ_CLEANING);
632 	crit_exit();
633 	lwkt_reltoken(&vm_token);
634 }
635 
636 /*
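 * Pass 1 callback for vm_object_page_clean(): flag the page with
 * PG_CLEANCHK and write-protect it; if it is a NOSYNC page that we were
 * asked to skip, record an error instead of protecting it.
 *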
637  * The caller must hold vm_token.
638  */
639 static
640 int
641 vm_object_page_clean_pass1(struct vm_page *p, void *data)
642 {
643 	struct rb_vm_page_scan_info *info = data;
644 
645 	vm_page_flag_set(p, PG_CLEANCHK);
646 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC))
647 		info->error = 1;
648 	else
649 		vm_page_protect(p, VM_PROT_READ);	/* must not block */
650 	return(0);
651 }
652 
653 /*
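 * Pass 2 callback for vm_object_page_clean(): flush the remaining dirty
 * pages flagged by pass 1, clustering nearby dirty pages into a single
 * pageout where possible.
 *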
654  * The caller must hold vm_token.
655  */
656 static
657 int
658 vm_object_page_clean_pass2(struct vm_page *p, void *data)
659 {
660 	struct rb_vm_page_scan_info *info = data;
661 	int n;
662 
663 	/*
664 	 * Do not mess with pages that were inserted after we started
665 	 * the cleaning pass.
666 	 */
667 	if ((p->flags & PG_CLEANCHK) == 0)
668 		return(0);
669 
670 	/*
671 	 * Before wasting time traversing the pmaps, check for trivial
672 	 * cases where the page cannot be dirty.
673 	 */
674 	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
675 		KKASSERT((p->dirty & p->valid) == 0);
676 		return(0);
677 	}
678 
679 	/*
680 	 * Check whether the page is dirty or not.  The page has been set
681 	 * to be read-only so the check will not race a user dirtying the
682 	 * page.
683 	 */
684 	vm_page_test_dirty(p);
685 	if ((p->dirty & p->valid) == 0) {
686 		vm_page_flag_clear(p, PG_CLEANCHK);
687 		return(0);
688 	}
689 
690 	/*
691 	 * If we have been asked to skip nosync pages and this is a
692 	 * nosync page, skip it.  Note that the object flags were
693 	 * not cleared in this case (because pass1 will have returned an
694 	 * error), so we do not have to set them.
695 	 */
696 	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
697 		vm_page_flag_clear(p, PG_CLEANCHK);
698 		return(0);
699 	}
700 
701 	/*
702 	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
703 	 * the pages that get successfully flushed.  Set info->error if
704 	 * we raced an object modification.
705 	 */
706 	n = vm_object_page_collect_flush(info->object, p, info->pagerflags);
707 	if (n == 0)
708 		info->error = 1;
709 	return(0);
710 }
711 
712 /*
713  * Collect the specified page and nearby pages and flush them out.
714  * The number of pages flushed is returned.
715  *
716  * The caller must hold vm_token.
717  */
718 static int
719 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
720 {
721 	int runlen;
722 	int maxf;
723 	int chkb;
724 	int maxb;
725 	int i;
726 	int curgeneration;
727 	vm_pindex_t pi;
728 	vm_page_t maf[vm_pageout_page_count];
729 	vm_page_t mab[vm_pageout_page_count];
730 	vm_page_t ma[vm_pageout_page_count];
731 
732 	curgeneration = object->generation;
733 
734 	pi = p->pindex;
735 	while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
736 		if (object->generation != curgeneration) {
737 			return(0);
738 		}
739 	}
740 	KKASSERT(p->object == object && p->pindex == pi);
741 
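	/*
	 * Scan forward from the target page, collecting contiguous dirty
	 * pages that can be flushed in the same I/O.
	 */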
742 	maxf = 0;
743 	for(i = 1; i < vm_pageout_page_count; i++) {
744 		vm_page_t tp;
745 
746 		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
747 			if ((tp->flags & PG_BUSY) ||
748 				((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
749 				 (tp->flags & PG_CLEANCHK) == 0) ||
750 				(tp->busy != 0))
751 				break;
752 			if((tp->queue - tp->pc) == PQ_CACHE) {
753 				vm_page_flag_clear(tp, PG_CLEANCHK);
754 				break;
755 			}
756 			vm_page_test_dirty(tp);
757 			if ((tp->dirty & tp->valid) == 0) {
758 				vm_page_flag_clear(tp, PG_CLEANCHK);
759 				break;
760 			}
761 			maf[ i - 1 ] = tp;
762 			maxf++;
763 			continue;
764 		}
765 		break;
766 	}
767 
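	/*
	 * Scan backward from the target page for additional contiguous
	 * dirty pages (look-behind).
	 */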
768 	maxb = 0;
769 	chkb = vm_pageout_page_count -  maxf;
770 	if (chkb) {
771 		for(i = 1; i < chkb;i++) {
772 			vm_page_t tp;
773 
774 			if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
775 				if ((tp->flags & PG_BUSY) ||
776 					((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
777 					 (tp->flags & PG_CLEANCHK) == 0) ||
778 					(tp->busy != 0))
779 					break;
780 				if((tp->queue - tp->pc) == PQ_CACHE) {
781 					vm_page_flag_clear(tp, PG_CLEANCHK);
782 					break;
783 				}
784 				vm_page_test_dirty(tp);
785 				if ((tp->dirty & tp->valid) == 0) {
786 					vm_page_flag_clear(tp, PG_CLEANCHK);
787 					break;
788 				}
789 				mab[ i - 1 ] = tp;
790 				maxb++;
791 				continue;
792 			}
793 			break;
794 		}
795 	}
796 
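	/*
	 * Assemble the run in ma[]: look-behind pages in ascending index
	 * order, then the target page, then the look-ahead pages.
	 */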
797 	for(i = 0; i < maxb; i++) {
798 		int index = (maxb - i) - 1;
799 		ma[index] = mab[i];
800 		vm_page_flag_clear(ma[index], PG_CLEANCHK);
801 	}
802 	vm_page_flag_clear(p, PG_CLEANCHK);
803 	ma[maxb] = p;
804 	for(i = 0; i < maxf; i++) {
805 		int index = (maxb + i) + 1;
806 		ma[index] = maf[i];
807 		vm_page_flag_clear(ma[index], PG_CLEANCHK);
808 	}
809 	runlen = maxb + maxf + 1;
810 
811 	vm_pageout_flush(ma, runlen, pagerflags);
812 	for (i = 0; i < runlen; i++) {
813 		if (ma[i]->valid & ma[i]->dirty) {
814 			vm_page_protect(ma[i], VM_PROT_READ);
815 			vm_page_flag_set(ma[i], PG_CLEANCHK);
816 
817 			/*
818 			 * maxf will end up being the actual number of pages
819 			 * we wrote out contiguously, non-inclusive of the
820 			 * first page.  We do not count look-behind pages.
821 			 */
822 			if (i >= maxb + 1 && (maxf > i - maxb - 1))
823 				maxf = i - maxb - 1;
824 		}
825 	}
826 	return(maxf + 1);
827 }
828 
829 /*
830  * Same as vm_object_pmap_copy, except range checking really
831  * works, and is meant for small sections of an object.
832  *
833  * This code protects resident pages by making them read-only
834  * and is typically called on a fork or split when a page
835  * is converted to copy-on-write.
836  *
837  * NOTE: If the page is already at VM_PROT_NONE, calling
838  * vm_page_protect will have no effect.
839  */
840 void
841 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
842 {
843 	vm_pindex_t idx;
844 	vm_page_t p;
845 
846 	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
847 		return;
848 
849 	/*
850 	 * spl protection needed to prevent races between the lookup,
851 	 * an interrupt unbusy/free, and our protect call.
852 	 */
853 	crit_enter();
854 	lwkt_gettoken(&vm_token);
855 	for (idx = start; idx < end; idx++) {
856 		p = vm_page_lookup(object, idx);
857 		if (p == NULL)
858 			continue;
859 		vm_page_protect(p, VM_PROT_READ);
860 	}
861 	lwkt_reltoken(&vm_token);
862 	crit_exit();
863 }
864 
865 /*
866  * Removes all physical pages in the specified object range from all
867  * physical maps.
868  *
869  * The object must *not* be locked.
870  */
871 
872 static int vm_object_pmap_remove_callback(vm_page_t p, void *data);
873 
874 void
875 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
876 {
877 	struct rb_vm_page_scan_info info;
878 
879 	if (object == NULL)
880 		return;
881 	info.start_pindex = start;
882 	info.end_pindex = end - 1;
883 
884 	crit_enter();
885 	lwkt_gettoken(&vm_token);
886 	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
887 				vm_object_pmap_remove_callback, &info);
888 	if (start == 0 && end == object->size)
889 		vm_object_clear_flag(object, OBJ_WRITEABLE);
890 	lwkt_reltoken(&vm_token);
891 	crit_exit();
892 }
893 
894 /*
895  * The caller must hold vm_token.
896  */
897 static int
898 vm_object_pmap_remove_callback(vm_page_t p, void *data __unused)
899 {
900 	vm_page_protect(p, VM_PROT_NONE);
901 	return(0);
902 }
903 
904 /*
905  * Implements the madvise function at the object/page level.
906  *
907  * MADV_WILLNEED	(any object)
908  *
909  *	Activate the specified pages if they are resident.
910  *
911  * MADV_DONTNEED	(any object)
912  *
913  *	Deactivate the specified pages if they are resident.
914  *
915  * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
916  *
917  *	Deactivate and clean the specified pages if they are
918  *	resident.  This permits the process to reuse the pages
919  *	without faulting or the kernel to reclaim the pages
920  *	without I/O.
921  *
922  * No requirements.
923  */
924 void
925 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
926 {
927 	vm_pindex_t end, tpindex;
928 	vm_object_t tobject;
929 	vm_page_t m;
930 
931 	if (object == NULL)
932 		return;
933 
934 	end = pindex + count;
935 
936 	lwkt_gettoken(&vm_token);
937 
938 	/*
939 	 * Locate and adjust resident pages
940 	 */
941 	for (; pindex < end; pindex += 1) {
942 relookup:
943 		tobject = object;
944 		tpindex = pindex;
945 shadowlookup:
946 		/*
947 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
948 		 * and those pages must be OBJ_ONEMAPPING.
949 		 */
950 		if (advise == MADV_FREE) {
951 			if ((tobject->type != OBJT_DEFAULT &&
952 			     tobject->type != OBJT_SWAP) ||
953 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
954 				continue;
955 			}
956 		}
957 
958 		/*
959 		 * spl protection is required to avoid a race between the
960 		 * lookup, an interrupt unbusy/free, and our busy check.
961 		 */
962 
963 		crit_enter();
964 		m = vm_page_lookup(tobject, tpindex);
965 
966 		if (m == NULL) {
967 			/*
968 			 * There may be swap even if there is no backing page
969 			 */
970 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
971 				swap_pager_freespace(tobject, tpindex, 1);
972 
973 			/*
974 			 * next object
975 			 */
976 			crit_exit();
977 			if (tobject->backing_object == NULL)
978 				continue;
979 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
980 			tobject = tobject->backing_object;
981 			goto shadowlookup;
982 		}
983 
984 		/*
985 		 * If the page is busy or not in a normal active state,
986 		 * we skip it.  If the page is not managed there are no
987 		 * page queues to mess with.  Things can break if we mess
988 		 * with pages in any of the below states.
989 		 */
990 		if (
991 		    m->hold_count ||
992 		    m->wire_count ||
993 		    (m->flags & PG_UNMANAGED) ||
994 		    m->valid != VM_PAGE_BITS_ALL
995 		) {
996 			crit_exit();
997 			continue;
998 		}
999 
1000  		if (vm_page_sleep_busy(m, TRUE, "madvpo")) {
1001 			crit_exit();
1002   			goto relookup;
1003 		}
1004 		crit_exit();
1005 
1006 		/*
1007 		 * Theoretically once a page is known not to be busy, an
1008 		 * interrupt cannot come along and rip it out from under us.
1009 		 */
1010 
1011 		if (advise == MADV_WILLNEED) {
1012 			vm_page_activate(m);
1013 		} else if (advise == MADV_DONTNEED) {
1014 			vm_page_dontneed(m);
1015 		} else if (advise == MADV_FREE) {
1016 			/*
1017 			 * Mark the page clean.  This will allow the page
1018 			 * to be freed up by the system.  However, such pages
1019 			 * are often reused quickly by malloc()/free()
1020 			 * so we do not do anything that would cause
1021 			 * a page fault if we can help it.
1022 			 *
1023 			 * Specifically, we do not try to actually free
1024 			 * the page now nor do we try to put it in the
1025 			 * cache (which would cause a page fault on reuse).
1026 			 *
1027 			 * But we do make the page as freeable as we
1028 			 * can without actually taking the step of unmapping
1029 			 * it.
1030 			 */
1031 			pmap_clear_modify(m);
1032 			m->dirty = 0;
1033 			m->act_count = 0;
1034 			vm_page_dontneed(m);
1035 			if (tobject->type == OBJT_SWAP)
1036 				swap_pager_freespace(tobject, tpindex, 1);
1037 		}
1038 	}
1039 	lwkt_reltoken(&vm_token);
1040 }
1041 
1042 /*
1043  * Create a new object which is backed by the specified existing object
1044  * range.  The source object reference is deallocated.
1045  *
1046  * The new object and offset into that object are returned in the source
1047  * parameters.
1048  *
1049  * No other requirements.
1050  */
1051 void
1052 vm_object_shadow(vm_object_t *object, vm_ooffset_t *offset, vm_size_t length)
1053 {
1054 	vm_object_t source;
1055 	vm_object_t result;
1056 
1057 	source = *object;
1058 
1059 	/*
1060 	 * Don't create the new object if the old object isn't shared.
1061 	 */
1062 	lwkt_gettoken(&vm_token);
1063 
1064 	if (source != NULL &&
1065 	    source->ref_count == 1 &&
1066 	    source->handle == NULL &&
1067 	    (source->type == OBJT_DEFAULT ||
1068 	     source->type == OBJT_SWAP)) {
1069 		lwkt_reltoken(&vm_token);
1070 		return;
1071 	}
1072 
1073 	/*
1074 	 * Allocate a new object with the given length
1075 	 */
1076 
1077 	if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL)
1078 		panic("vm_object_shadow: no object for shadowing");
1079 
1080 	/*
1081 	 * The new object shadows the source object, adding a reference to it.
1082 	 * Our caller changes its reference to point to the new object,
1083 	 * removing a reference to the source object.  Net result: no change
1084 	 * of reference count.
1085 	 *
1086 	 * Try to optimize the result object's page color when shadowing
1087 	 * in order to maintain page coloring consistency in the combined
1088 	 * shadowed object.
1089 	 */
1090 	result->backing_object = source;
1091 	if (source) {
1092 		LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list);
1093 		source->shadow_count++;
1094 		source->generation++;
1095 		result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & PQ_L2_MASK;
1096 	}
1097 
1098 	/*
1099 	 * Store the offset into the source object, and fix up the offset into
1100 	 * the new object.
1101 	 */
1102 	result->backing_object_offset = *offset;
1103 	lwkt_reltoken(&vm_token);
1104 
1105 	/*
1106 	 * Return the new object and offset to the caller.
1107 	 */
1108 	*offset = 0;
1109 	*object = result;
1110 }
1111 
1112 #define	OBSC_TEST_ALL_SHADOWED	0x0001
1113 #define	OBSC_COLLAPSE_NOWAIT	0x0002
1114 #define	OBSC_COLLAPSE_WAIT	0x0004
1115 
1116 static int vm_object_backing_scan_callback(vm_page_t p, void *data);
1117 
1118 /*
1119  * The caller must hold vm_token.
1120  */
1121 static __inline int
1122 vm_object_backing_scan(vm_object_t object, int op)
1123 {
1124 	struct rb_vm_page_scan_info info;
1125 	vm_object_t backing_object;
1126 
1127 	crit_enter();
1128 
1129 	backing_object = object->backing_object;
1130 	info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
1131 
1132 	/*
1133 	 * Initial conditions
1134 	 */
1135 
1136 	if (op & OBSC_TEST_ALL_SHADOWED) {
1137 		/*
1138 		 * We do not want to have to test for the existence of
1139 		 * swap pages in the backing object.  XXX but with the
1140 		 * new swapper this would be pretty easy to do.
1141 		 *
1142 		 * XXX what about anonymous MAP_SHARED memory that hasn't
1143 		 * been ZFOD faulted yet?  If we do not test for this, the
1144 		 * shadow test may succeed! XXX
1145 		 */
1146 		if (backing_object->type != OBJT_DEFAULT) {
1147 			crit_exit();
1148 			return(0);
1149 		}
1150 	}
1151 	if (op & OBSC_COLLAPSE_WAIT) {
1152 		KKASSERT((backing_object->flags & OBJ_DEAD) == 0);
1153 		vm_object_set_flag(backing_object, OBJ_DEAD);
1154 	}
1155 
1156 	/*
1157 	 * Our scan.  We have to retry if a negative error code is returned;
1158 	 * otherwise 0 or 1 will be returned in info.error.  0 indicates that
1159 	 * the scan had to be stopped because the parent does not completely
1160 	 * shadow the child.
1161 	 */
1162 	info.object = object;
1163 	info.backing_object = backing_object;
1164 	info.limit = op;
1165 	do {
1166 		info.error = 1;
1167 		vm_page_rb_tree_RB_SCAN(&backing_object->rb_memq, NULL,
1168 					vm_object_backing_scan_callback,
1169 					&info);
1170 	} while (info.error < 0);
1171 	crit_exit();
1172 	return(info.error);
1173 }
1174 
1175 /*
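 * Per-page callback for vm_object_backing_scan().  For
 * OBSC_TEST_ALL_SHADOWED, verify that the parent shadows the page; for
 * the collapse operations, free the backing object's page or rename it
 * into the parent.
 *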
1176  * The caller must hold vm_token.
1177  */
1178 static int
1179 vm_object_backing_scan_callback(vm_page_t p, void *data)
1180 {
1181 	struct rb_vm_page_scan_info *info = data;
1182 	vm_object_t backing_object;
1183 	vm_object_t object;
1184 	vm_pindex_t new_pindex;
1185 	vm_pindex_t backing_offset_index;
1186 	int op;
1187 
1188 	new_pindex = p->pindex - info->backing_offset_index;
1189 	op = info->limit;
1190 	object = info->object;
1191 	backing_object = info->backing_object;
1192 	backing_offset_index = info->backing_offset_index;
1193 
1194 	if (op & OBSC_TEST_ALL_SHADOWED) {
1195 		vm_page_t pp;
1196 
1197 		/*
1198 		 * Ignore pages outside the parent object's range
1199 		 * and outside the parent object's mapping of the
1200 		 * backing object.
1201 		 *
1202 		 * note that we do not busy the backing object's
1203 		 * page.
1204 		 */
1205 		if (
1206 		    p->pindex < backing_offset_index ||
1207 		    new_pindex >= object->size
1208 		) {
1209 			return(0);
1210 		}
1211 
1212 		/*
1213 		 * See if the parent has the page or if the parent's
1214 		 * object pager has the page.  If the parent has the
1215 		 * page but the page is not valid, the parent's
1216 		 * object pager must have the page.
1217 		 *
1218 		 * If this fails, the parent does not completely shadow
1219 		 * the object and we might as well give up now.
1220 		 */
1221 
1222 		pp = vm_page_lookup(object, new_pindex);
1223 		if ((pp == NULL || pp->valid == 0) &&
1224 		    !vm_pager_has_page(object, new_pindex)
1225 		) {
1226 			info->error = 0;	/* problemo */
1227 			return(-1);		/* stop the scan */
1228 		}
1229 	}
1230 
1231 	/*
1232 	 * Check for busy page
1233 	 */
1234 
1235 	if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
1236 		vm_page_t pp;
1237 
1238 		if (op & OBSC_COLLAPSE_NOWAIT) {
1239 			if (
1240 			    (p->flags & PG_BUSY) ||
1241 			    !p->valid ||
1242 			    p->hold_count ||
1243 			    p->wire_count ||
1244 			    p->busy
1245 			) {
1246 				return(0);
1247 			}
1248 		} else if (op & OBSC_COLLAPSE_WAIT) {
1249 			if (vm_page_sleep_busy(p, TRUE, "vmocol")) {
1250 				/*
1251 				 * If we slept, anything could have
1252 				 * happened.   Ask that the scan be restarted.
1253 				 *
1254 				 * Since the object is marked dead, the
1255 				 * backing offset should not have changed.
1256 				 */
1257 				info->error = -1;
1258 				return(-1);
1259 			}
1260 		}
1261 
1262 		/*
1263 		 * Busy the page
1264 		 */
1265 		vm_page_busy(p);
1266 
1267 		KASSERT(
1268 		    p->object == backing_object,
1269 		    ("vm_object_qcollapse(): object mismatch")
1270 		);
1271 
1272 		/*
1273 		 * Destroy any associated swap
1274 		 */
1275 		if (backing_object->type == OBJT_SWAP)
1276 			swap_pager_freespace(backing_object, p->pindex, 1);
1277 
1278 		if (
1279 		    p->pindex < backing_offset_index ||
1280 		    new_pindex >= object->size
1281 		) {
1282 			/*
1283 			 * The page is out of the parent object's range; we
1284 			 * can simply destroy it.
1285 			 */
1286 			vm_page_protect(p, VM_PROT_NONE);
1287 			vm_page_free(p);
1288 			return(0);
1289 		}
1290 
1291 		pp = vm_page_lookup(object, new_pindex);
1292 		if (pp != NULL || vm_pager_has_page(object, new_pindex)) {
1293 			/*
1294 			 * The page already exists in the parent OR swap exists
1295 			 * for this location in the parent.  Destroy
1296 			 * the original page from the backing object.
1297 			 *
1298 			 * Leave the parent's page alone.
1299 			 */
1300 			vm_page_protect(p, VM_PROT_NONE);
1301 			vm_page_free(p);
1302 			return(0);
1303 		}
1304 
1305 		/*
1306 		 * The page does not exist in the parent; rename the
1307 		 * page from the backing object to the main object.
1308 		 *
1309 		 * If the page was mapped to a process, it can remain
1310 		 * mapped through the rename.
1311 		 */
1312 		if ((p->queue - p->pc) == PQ_CACHE)
1313 			vm_page_deactivate(p);
1314 
1315 		vm_page_rename(p, object, new_pindex);
1316 		/* page automatically made dirty by rename */
1317 	}
1318 	return(0);
1319 }
1320 
1321 /*
1322  * This version of collapse allows the operation to occur earlier and
1323  * when paging_in_progress is true for an object...  This is not a complete
1324  * operation, but should plug 99.9% of the rest of the leaks.
1325  *
1326  * The caller must hold vm_token.
1327  */
1328 static void
1329 vm_object_qcollapse(vm_object_t object)
1330 {
1331 	vm_object_t backing_object = object->backing_object;
1332 
1333 	if (backing_object->ref_count != 1)
1334 		return;
1335 
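	/*
	 * Hold extra references across the scan so the backing object
	 * cannot be torn down while we work on it.
	 */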
1336 	backing_object->ref_count += 2;
1337 
1338 	vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
1339 
1340 	backing_object->ref_count -= 2;
1341 }
1342 
1343 /*
1344  * Collapse an object with the object backing it.  Pages in the backing
1345  * object are moved into the parent, and the backing object is deallocated.
1346  */
1347 void
1348 vm_object_collapse(vm_object_t object)
1349 {
1350 	lwkt_gettoken(&vm_token);
1351 
1352 	while (TRUE) {
1353 		vm_object_t backing_object;
1354 
1355 		/*
1356 		 * Verify that the conditions are right for collapse:
1357 		 *
1358 		 * The object exists and the backing object exists.
1359 		 */
1360 		if (object == NULL)
1361 			break;
1362 
1363 		if ((backing_object = object->backing_object) == NULL)
1364 			break;
1365 
1366 		/*
1367 		 * We check the backing object first because it is most likely
1368 		 * not collapsible.
1369 		 */
1370 		if (backing_object->handle != NULL ||
1371 		    (backing_object->type != OBJT_DEFAULT &&
1372 		     backing_object->type != OBJT_SWAP) ||
1373 		    (backing_object->flags & OBJ_DEAD) ||
1374 		    object->handle != NULL ||
1375 		    (object->type != OBJT_DEFAULT &&
1376 		     object->type != OBJT_SWAP) ||
1377 		    (object->flags & OBJ_DEAD)) {
1378 			break;
1379 		}
1380 
1381 		if (
1382 		    object->paging_in_progress != 0 ||
1383 		    backing_object->paging_in_progress != 0
1384 		) {
1385 			vm_object_qcollapse(object);
1386 			break;
1387 		}
1388 
1389 		/*
1390 		 * We know that we can either collapse the backing object (if
1391 		 * the parent is the only reference to it) or (perhaps) have
1392 		 * the parent bypass the object if the parent happens to shadow
1393 		 * all the resident pages in the entire backing object.
1394 		 *
1395 		 * This is ignoring pager-backed pages such as swap pages.
1396 		 * vm_object_backing_scan fails the shadowing test in this
1397 		 * case.
1398 		 */
1399 
1400 		if (backing_object->ref_count == 1) {
1401 			/*
1402 			 * If there is exactly one reference to the backing
1403 			 * object, we can collapse it into the parent.
1404 			 */
1405 			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
1406 
1407 			/*
1408 			 * Move the pager from backing_object to object.
1409 			 */
1410 
1411 			if (backing_object->type == OBJT_SWAP) {
1412 				vm_object_pip_add(backing_object, 1);
1413 
1414 				/*
1415 				 * Scrap the paging_offset junk and do a
1416 				 * discrete copy.  This also keeps major
1417 				 * assumptions about how the swap-pager
1418 				 * works out of code where they don't belong.  The
1419 				 * new swapper is able to optimize the
1420 				 * destroy-source case.
1421 				 */
1422 
1423 				vm_object_pip_add(object, 1);
1424 				swap_pager_copy(
1425 				    backing_object,
1426 				    object,
1427 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
1428 				vm_object_pip_wakeup(object);
1429 
1430 				vm_object_pip_wakeup(backing_object);
1431 			}
1432 			/*
1433 			 * Object now shadows whatever backing_object did.
1434 			 * Note that the reference to
1435 			 * backing_object->backing_object moves from within
1436 			 * backing_object to within object.
1437 			 */
1438 
1439 			LIST_REMOVE(object, shadow_list);
1440 			object->backing_object->shadow_count--;
1441 			object->backing_object->generation++;
1442 			if (backing_object->backing_object) {
1443 				LIST_REMOVE(backing_object, shadow_list);
1444 				backing_object->backing_object->shadow_count--;
1445 				backing_object->backing_object->generation++;
1446 			}
1447 			object->backing_object = backing_object->backing_object;
1448 			if (object->backing_object) {
1449 				LIST_INSERT_HEAD(
1450 				    &object->backing_object->shadow_head,
1451 				    object,
1452 				    shadow_list
1453 				);
1454 				object->backing_object->shadow_count++;
1455 				object->backing_object->generation++;
1456 			}
1457 
1458 			object->backing_object_offset +=
1459 			    backing_object->backing_object_offset;
1460 
1461 			/*
1462 			 * Discard backing_object.
1463 			 *
1464 			 * Since the backing object has no pages, no pager left,
1465 			 * and no object references within it, all that is
1466 			 * necessary is to dispose of it.
1467 			 */
1468 
1469 			KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
1470 			KASSERT(RB_EMPTY(&backing_object->rb_memq), ("backing_object %p somehow has left over pages during collapse!", backing_object));
1471 			crit_enter();
1472 			TAILQ_REMOVE(&vm_object_list, backing_object,
1473 				     object_list);
1474 			vm_object_count--;
1475 			crit_exit();
1476 
1477 			zfree(obj_zone, backing_object);
1478 
1479 			object_collapses++;
1480 		} else {
1481 			vm_object_t new_backing_object;
1482 
1483 			/*
1484 			 * If we do not entirely shadow the backing object,
1485 			 * there is nothing we can do so we give up.
1486 			 */
1487 
1488 			if (vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) {
1489 				break;
1490 			}
1491 
1492 			/*
1493 			 * Make the parent shadow the next object in the
1494 			 * chain.  Deallocating backing_object will not remove
1495 			 * it, since its reference count is at least 2.
1496 			 */
1497 
1498 			LIST_REMOVE(object, shadow_list);
1499 			backing_object->shadow_count--;
1500 			backing_object->generation++;
1501 
1502 			new_backing_object = backing_object->backing_object;
1503 			if ((object->backing_object = new_backing_object) != NULL) {
1504 				vm_object_reference(new_backing_object);
1505 				LIST_INSERT_HEAD(
1506 				    &new_backing_object->shadow_head,
1507 				    object,
1508 				    shadow_list
1509 				);
1510 				new_backing_object->shadow_count++;
1511 				new_backing_object->generation++;
1512 				object->backing_object_offset +=
1513 					backing_object->backing_object_offset;
1514 			}
1515 
1516 			/*
1517 			 * Drop the reference count on backing_object. Since
1518 			 * its ref_count was at least 2, it will not vanish;
1519 			 * so we don't need to call vm_object_deallocate, but
1520 			 * we do anyway.
1521 			 */
1522 			vm_object_deallocate(backing_object);
1523 			object_bypasses++;
1524 		}
1525 
1526 		/*
1527 		 * Try again with this object's new backing object.
1528 		 */
1529 	}
1530 	lwkt_reltoken(&vm_token);
1531 }
1532 
1533 /*
1534  * Removes all physical pages in the specified object range from the
1535  * object's list of pages.
1536  *
1537  * No requirements.
1538  */
1539 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1540 
1541 void
1542 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1543 		      boolean_t clean_only)
1544 {
1545 	struct rb_vm_page_scan_info info;
1546 	int all;
1547 
1548 	/*
1549 	 * Degenerate cases and assertions
1550 	 */
1551 	lwkt_gettoken(&vm_token);
1552 	if (object == NULL ||
1553 	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
1554 		lwkt_reltoken(&vm_token);
1555 		return;
1556 	}
1557 	KASSERT(object->type != OBJT_PHYS,
1558 		("attempt to remove pages from a physical object"));
1559 
1560 	/*
1561 	 * Indicate that paging is occurring on the object
1562 	 */
1563 	crit_enter();
1564 	vm_object_pip_add(object, 1);
1565 
1566 	/*
1567 	 * Figure out the actual removal range and whether we are removing
1568 	 * the entire contents of the object or not.  If removing the entire
1569 	 * contents, be sure to get all pages, even those that might be
1570 	 * beyond the end of the object.
1571 	 */
1572 	info.start_pindex = start;
1573 	if (end == 0)
1574 		info.end_pindex = (vm_pindex_t)-1;
1575 	else
1576 		info.end_pindex = end - 1;
1577 	info.limit = clean_only;
1578 	all = (start == 0 && info.end_pindex >= object->size - 1);
1579 
1580 	/*
1581 	 * Loop until we are sure we have gotten them all.
1582 	 */
1583 	do {
1584 		info.error = 0;
1585 		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1586 					vm_object_page_remove_callback, &info);
1587 	} while (info.error);
1588 
1589 	/*
1590 	 * Remove any related swap if throwing away pages, or for
1591 	 * non-swap objects (the swap is a clean copy in that case).
1592 	 */
1593 	if (object->type != OBJT_SWAP || clean_only == FALSE) {
1594 		if (all)
1595 			swap_pager_freespace_all(object);
1596 		else
1597 			swap_pager_freespace(object, info.start_pindex,
1598 			     info.end_pindex - info.start_pindex + 1);
1599 	}
1600 
1601 	/*
1602 	 * Cleanup
1603 	 */
1604 	vm_object_pip_wakeup(object);
1605 	crit_exit();
1606 	lwkt_reltoken(&vm_token);
1607 }
1608 
1609 /*
1610  * The caller must hold vm_token.
1611  */
1612 static int
1613 vm_object_page_remove_callback(vm_page_t p, void *data)
1614 {
1615 	struct rb_vm_page_scan_info *info = data;
1616 
1617 	/*
1618 	 * Wired pages cannot be destroyed, but they can be invalidated
1619 	 * and we do so if clean_only (limit) is not set.
1620 	 *
1621 	 * WARNING!  The page may be wired due to being part of a buffer
1622 	 *	     cache buffer, and the buffer might be marked B_CACHE.
1623 	 *	     This is fine as part of a truncation but VFSs must be
1624 	 *	     sure to fix the buffer up when re-extending the file.
1625 	 */
1626 	if (p->wire_count != 0) {
1627 		vm_page_protect(p, VM_PROT_NONE);
1628 		if (info->limit == 0)
1629 			p->valid = 0;
1630 		return(0);
1631 	}
1632 
1633 	/*
1634 	 * The busy flags are only cleared at
1635 	 * interrupt time -- minimize the spl transitions.
1636 	 */
1637 
1638 	if (vm_page_sleep_busy(p, TRUE, "vmopar")) {
1639 		info->error = 1;
1640 		return(0);
1641 	}
1642 
1643 	/*
1644 	 * limit is our clean_only flag.  If set and the page is dirty, do
1645 	 * not free it.  If set and the page is being held by someone, do
1646 	 * not free it.
1647 	 */
1648 	if (info->limit && p->valid) {
1649 		vm_page_test_dirty(p);
1650 		if (p->valid & p->dirty)
1651 			return(0);
1652 		if (p->hold_count)
1653 			return(0);
1654 	}
1655 
1656 	/*
1657 	 * Destroy the page
1658 	 */
1659 	vm_page_busy(p);
1660 	vm_page_protect(p, VM_PROT_NONE);
1661 	vm_page_free(p);
1662 	return(0);
1663 }
1664 
1665 /*
1666  * Coalesces two objects backing up adjoining regions of memory into a
1667  * single object.
1668  *
1669  * Returns TRUE if the objects were combined.
1670  *
1671  * NOTE: Only works at the moment if the second object is NULL -
1672  *	 if it's not, which object do we lock first?
1673  *
1674  * Parameters:
1675  *	prev_object	First object to coalesce
1676  *	prev_offset	Offset into prev_object
1677  *	next_object	Second object to coalesce
1678  *	next_offset	Offset into next_object
1679  *
1680  *	prev_size	Size of reference to prev_object
1681  *	next_size	Size of reference to next_object
1682  *
1683  * The object must not be locked.
1684  */
1685 boolean_t
1686 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1687 		   vm_size_t prev_size, vm_size_t next_size)
1688 {
1689 	vm_pindex_t next_pindex;
1690 
1691 	if (prev_object == NULL) {
1692 		return (TRUE);
1693 	}
1694 
1695 	if (prev_object->type != OBJT_DEFAULT &&
1696 	    prev_object->type != OBJT_SWAP) {
1697 		return (FALSE);
1698 	}
1699 
1700 	lwkt_gettoken(&vm_token);
1701 
1702 	/*
1703 	 * Try to collapse the object first
1704 	 */
1705 	vm_object_collapse(prev_object);
1706 
1707 	/*
1708 	 * Can't coalesce if: more than one reference, paged out, shadows
1709 	 * another object, or has a copy elsewhere (any of which mean that
1710 	 * the pages not mapped to prev_entry may be in use anyway).
1711 	 */
1712 
1713 	if (prev_object->backing_object != NULL) {
1714 		lwkt_reltoken(&vm_token);
1715 		return (FALSE);
1716 	}
1717 
1718 	prev_size >>= PAGE_SHIFT;
1719 	next_size >>= PAGE_SHIFT;
1720 	next_pindex = prev_pindex + prev_size;
1721 
1722 	if ((prev_object->ref_count > 1) &&
1723 	    (prev_object->size != next_pindex)) {
1724 		lwkt_reltoken(&vm_token);
1725 		return (FALSE);
1726 	}
1727 
1728 	/*
1729 	 * Remove any pages that may still be in the object from a previous
1730 	 * deallocation.
1731 	 */
1732 	if (next_pindex < prev_object->size) {
1733 		vm_object_page_remove(prev_object,
1734 				      next_pindex,
1735 				      next_pindex + next_size, FALSE);
1736 		if (prev_object->type == OBJT_SWAP)
1737 			swap_pager_freespace(prev_object,
1738 					     next_pindex, next_size);
1739 	}
1740 
1741 	/*
1742 	 * Extend the object if necessary.
1743 	 */
1744 	if (next_pindex + next_size > prev_object->size)
1745 		prev_object->size = next_pindex + next_size;
1746 
1747 	lwkt_reltoken(&vm_token);
1748 	return (TRUE);
1749 }
1750 
1751 /*
1752  * Make the object writable and flag it as being possibly dirty.
1753  *
1754  * No requirements.
1755  */
1756 void
1757 vm_object_set_writeable_dirty(vm_object_t object)
1758 {
1759 	struct vnode *vp;
1760 
1761 	lwkt_gettoken(&vm_token);
1762 	vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1763 	if (object->type == OBJT_VNODE &&
1764 	    (vp = (struct vnode *)object->handle) != NULL) {
1765 		if ((vp->v_flag & VOBJDIRTY) == 0) {
1766 			vsetflags(vp, VOBJDIRTY);
1767 		}
1768 	}
1769 	lwkt_reltoken(&vm_token);
1770 }
1771 
1772 #include "opt_ddb.h"
1773 #ifdef DDB
1774 #include <sys/kernel.h>
1775 
1776 #include <sys/cons.h>
1777 
1778 #include <ddb/ddb.h>
1779 
1780 static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
1781 				       vm_map_entry_t entry);
1782 static int	vm_object_in_map (vm_object_t object);
1783 
1784 /*
1785  * The caller must hold vm_token.
1786  */
1787 static int
1788 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1789 {
1790 	vm_map_t tmpm;
1791 	vm_map_entry_t tmpe;
1792 	vm_object_t obj;
1793 	int entcount;
1794 
1795 	if (map == 0)
1796 		return 0;
1797 	if (entry == 0) {
1798 		tmpe = map->header.next;
1799 		entcount = map->nentries;
1800 		while (entcount-- && (tmpe != &map->header)) {
1801 			if( _vm_object_in_map(map, object, tmpe)) {
1802 				return 1;
1803 			}
1804 			tmpe = tmpe->next;
1805 		}
1806 		return (0);
1807 	}
1808 	switch(entry->maptype) {
1809 	case VM_MAPTYPE_SUBMAP:
1810 		tmpm = entry->object.sub_map;
1811 		tmpe = tmpm->header.next;
1812 		entcount = tmpm->nentries;
1813 		while (entcount-- && tmpe != &tmpm->header) {
1814 			if( _vm_object_in_map(tmpm, object, tmpe)) {
1815 				return 1;
1816 			}
1817 			tmpe = tmpe->next;
1818 		}
1819 		break;
1820 	case VM_MAPTYPE_NORMAL:
1821 	case VM_MAPTYPE_VPAGETABLE:
1822 		obj = entry->object.vm_object;
1823 		while (obj) {
1824 			if (obj == object)
1825 				return 1;
1826 			obj = obj->backing_object;
1827 		}
1828 		break;
1829 	default:
1830 		break;
1831 	}
1832 	return 0;
1833 }
1834 
1835 static int vm_object_in_map_callback(struct proc *p, void *data);
1836 
1837 struct vm_object_in_map_info {
1838 	vm_object_t object;
1839 	int rv;
1840 };
1841 
1842 /*
1843  * Debugging only
1844  */
1845 static int
1846 vm_object_in_map(vm_object_t object)
1847 {
1848 	struct vm_object_in_map_info info;
1849 
1850 	info.rv = 0;
1851 	info.object = object;
1852 
1853 	allproc_scan(vm_object_in_map_callback, &info);
1854 	if (info.rv)
1855 		return 1;
1856 	if( _vm_object_in_map(&kernel_map, object, 0))
1857 		return 1;
1858 	if( _vm_object_in_map(&pager_map, object, 0))
1859 		return 1;
1860 	if( _vm_object_in_map(&buffer_map, object, 0))
1861 		return 1;
1862 	return 0;
1863 }
1864 
1865 /*
1866  * Debugging only
1867  */
1868 static int
1869 vm_object_in_map_callback(struct proc *p, void *data)
1870 {
1871 	struct vm_object_in_map_info *info = data;
1872 
1873 	if (p->p_vmspace) {
1874 		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1875 			info->rv = 1;
1876 			return -1;
1877 		}
1878 	}
1879 	return (0);
1880 }
1881 
1882 DB_SHOW_COMMAND(vmochk, vm_object_check)
1883 {
1884 	vm_object_t object;
1885 
1886 	/*
1887 	 * Make sure that internal objects are in a map somewhere
1888 	 * and none have zero ref counts.
1889 	 */
1890 	for (object = TAILQ_FIRST(&vm_object_list);
1891 			object != NULL;
1892 			object = TAILQ_NEXT(object, object_list)) {
1893 		if (object->type == OBJT_MARKER)
1894 			continue;
1895 		if (object->handle == NULL &&
1896 		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
1897 			if (object->ref_count == 0) {
1898 				db_printf("vmochk: internal obj has zero ref count: %ld\n",
1899 					(long)object->size);
1900 			}
1901 			if (!vm_object_in_map(object)) {
1902 				db_printf(
1903 			"vmochk: internal obj is not in a map: "
1904 			"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
1905 				    object->ref_count, (u_long)object->size,
1906 				    (u_long)object->size,
1907 				    (void *)object->backing_object);
1908 			}
1909 		}
1910 	}
1911 }
1912 
1913 /*
1914  * Debugging only
1915  */
1916 DB_SHOW_COMMAND(object, vm_object_print_static)
1917 {
1918 	/* XXX convert args. */
1919 	vm_object_t object = (vm_object_t)addr;
1920 	boolean_t full = have_addr;
1921 
1922 	vm_page_t p;
1923 
1924 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
1925 #define	count	was_count
1926 
1927 	int count;
1928 
1929 	if (object == NULL)
1930 		return;
1931 
1932 	db_iprintf(
1933 	    "Object %p: type=%d, size=0x%lx, res=%d, ref=%d, flags=0x%x\n",
1934 	    object, (int)object->type, (u_long)object->size,
1935 	    object->resident_page_count, object->ref_count, object->flags);
1936 	/*
1937 	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
1938 	 */
1939 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%lx\n",
1940 	    object->shadow_count,
1941 	    object->backing_object ? object->backing_object->ref_count : 0,
1942 	    object->backing_object, (long)object->backing_object_offset);
1943 
1944 	if (!full)
1945 		return;
1946 
1947 	db_indent += 2;
1948 	count = 0;
1949 	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1950 		if (count == 0)
1951 			db_iprintf("memory:=");
1952 		else if (count == 6) {
1953 			db_printf("\n");
1954 			db_iprintf(" ...");
1955 			count = 0;
1956 		} else
1957 			db_printf(",");
1958 		count++;
1959 
1960 		db_printf("(off=0x%lx,page=0x%lx)",
1961 		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1962 	}
1963 	if (count != 0)
1964 		db_printf("\n");
1965 	db_indent -= 2;
1966 }
1967 
1968 /* XXX. */
1969 #undef count
1970 
1971 /*
1972  * XXX need this non-static entry for calling from vm_map_print.
1973  *
1974  * Debugging only
1975  */
1976 void
1977 vm_object_print(/* db_expr_t */ long addr,
1978 		boolean_t have_addr,
1979 		/* db_expr_t */ long count,
1980 		char *modif)
1981 {
1982 	vm_object_print_static(addr, have_addr, count, modif);
1983 }
1984 
1985 /*
1986  * Debugging only
1987  */
1988 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1989 {
1990 	vm_object_t object;
1991 	int nl = 0;
1992 	int c;
1993 	for (object = TAILQ_FIRST(&vm_object_list);
1994 			object != NULL;
1995 			object = TAILQ_NEXT(object, object_list)) {
1996 		vm_pindex_t idx, fidx;
1997 		vm_pindex_t osize;
1998 		vm_paddr_t pa = -1, padiff;
1999 		int rcount;
2000 		vm_page_t m;
2001 
2002 		if (object->type == OBJT_MARKER)
2003 			continue;
2004 		db_printf("new object: %p\n", (void *)object);
2005 		if ( nl > 18) {
2006 			c = cngetc();
2007 			if (c != ' ')
2008 				return;
2009 			nl = 0;
2010 		}
2011 		nl++;
2012 		rcount = 0;
2013 		fidx = 0;
2014 		osize = object->size;
2015 		if (osize > 128)
2016 			osize = 128;
2017 		for (idx = 0; idx < osize; idx++) {
2018 			m = vm_page_lookup(object, idx);
2019 			if (m == NULL) {
2020 				if (rcount) {
2021 					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2022 						(long)fidx, rcount, (long)pa);
2023 					if ( nl > 18) {
2024 						c = cngetc();
2025 						if (c != ' ')
2026 							return;
2027 						nl = 0;
2028 					}
2029 					nl++;
2030 					rcount = 0;
2031 				}
2032 				continue;
2033 			}
2034 
2035 
2036 			if (rcount &&
2037 				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
2038 				++rcount;
2039 				continue;
2040 			}
2041 			if (rcount) {
2042 				padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
2043 				padiff >>= PAGE_SHIFT;
2044 				padiff &= PQ_L2_MASK;
2045 				if (padiff == 0) {
2046 					pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
2047 					++rcount;
2048 					continue;
2049 				}
2050 				db_printf(" index(%ld)run(%d)pa(0x%lx)",
2051 					(long)fidx, rcount, (long)pa);
2052 				db_printf("pd(%ld)\n", (long)padiff);
2053 				if ( nl > 18) {
2054 					c = cngetc();
2055 					if (c != ' ')
2056 						return;
2057 					nl = 0;
2058 				}
2059 				nl++;
2060 			}
2061 			fidx = idx;
2062 			pa = VM_PAGE_TO_PHYS(m);
2063 			rcount = 1;
2064 		}
2065 		if (rcount) {
2066 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2067 				(long)fidx, rcount, (long)pa);
2068 			if ( nl > 18) {
2069 				c = cngetc();
2070 				if (c != ' ')
2071 					return;
2072 				nl = 0;
2073 			}
2074 			nl++;
2075 		}
2076 	}
2077 }
2078 #endif /* DDB */
2079