1 /*
2 * Copyright (c) 1991, 1993, 2013
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94
33 *
34 *
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
37 *
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39 *
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
45 *
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49 *
50 * Carnegie Mellon requests users of this software to return to
51 *
52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
53 * School of Computer Science
54 * Carnegie Mellon University
55 * Pittsburgh PA 15213-3890
56 *
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
59 *
60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
61 */
62
63 /*
64 * Virtual memory object module.
65 */
66
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/proc.h> /* for curproc, pageproc */
70 #include <sys/thread.h>
71 #include <sys/vnode.h>
72 #include <sys/vmmeter.h>
73 #include <sys/mman.h>
74 #include <sys/mount.h>
75 #include <sys/kernel.h>
76 #include <sys/malloc.h>
77 #include <sys/sysctl.h>
78 #include <sys/refcount.h>
79
80 #include <vm/vm.h>
81 #include <vm/vm_param.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_kern.h>
90 #include <vm/vm_extern.h>
91 #include <vm/vm_zone.h>
92
93 #include <vm/vm_page2.h>
94
95 #include <machine/specialreg.h>
96
97 #define EASY_SCAN_FACTOR 8
98
99 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
100 int pagerflags);
101 static void vm_object_lock_init(vm_object_t);
102
103 /*
104 * Virtual memory objects maintain the actual data
105 * associated with allocated virtual memory. A given
106 * page of memory exists within exactly one object.
107 *
108 * An object is only deallocated when all "references"
109 * are given up. Only one "reference" to a given
110 * region of an object should be writeable.
111 *
112 * Associated with each object is a list of all resident
113 * memory pages belonging to that object; this list is
114 * maintained by the "vm_page" module, and locked by the object's
115 * lock.
116 *
117 * Each object also records a "pager" routine which is
118 * used to retrieve (and store) pages to the proper backing
119 * storage. In addition, objects may be backed by other
120 * objects from which they were virtual-copied.
121 *
122 * The only items within the object structure which are
123 * modified after time of creation are:
124 * reference count locked by object's lock
125 * pager routine locked by object's lock
126 *
127 */
128
129 static struct vm_object kernel_object_store;
130 struct vm_object *kernel_object = &kernel_object_store;
131
132 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];
133
134 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
135 "vm_object", "vm_object structures");
136
137 #define VMOBJ_HASH_PRIME1 66555444443333333ULL
138 #define VMOBJ_HASH_PRIME2 989042931893ULL
139
140 int vm_object_debug;
141 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");
142
143 static __inline
144 struct vm_object_hash *
vmobj_hash(vm_object_t obj)145 vmobj_hash(vm_object_t obj)
146 {
147 uintptr_t hash1;
148 uintptr_t hash2;
149
150 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
151 hash1 %= VMOBJ_HASH_PRIME1;
152 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
153 hash2 %= VMOBJ_HASH_PRIME2;
154 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
155 }
156
157 #if defined(DEBUG_LOCKS)
158
159 #define vm_object_vndeallocate(obj, vpp) \
160 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)
161
162 /*
163 * Debug helper to track hold/drop/ref/deallocate calls.
164 */
165 static void
debugvm_object_add(vm_object_t obj,char * file,int line,int addrem)166 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
167 {
168 int i;
169
170 i = atomic_fetchadd_int(&obj->debug_index, 1);
171 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
172 ksnprintf(obj->debug_hold_thrs[i],
173 sizeof(obj->debug_hold_thrs[i]),
174 "%c%d:(%d):%s",
175 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
176 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
177 obj->ref_count,
178 curthread->td_comm);
179 obj->debug_hold_file[i] = file;
180 obj->debug_hold_line[i] = line;
181 #if 0
182 /* Uncomment for debugging obj refs/derefs in reproducable cases */
183 if (strcmp(curthread->td_comm, "sshd") == 0) {
184 kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
185 (curthread->td_proc ? curthread->td_proc->p_pid : -1),
186 obj, obj->ref_count, addrem, file, line);
187 }
188 #endif
189 }
190
191 #endif
192
193 /*
194 * Misc low level routines
195 */
196 static void
vm_object_lock_init(vm_object_t obj)197 vm_object_lock_init(vm_object_t obj)
198 {
199 #if defined(DEBUG_LOCKS)
200 int i;
201
202 obj->debug_index = 0;
203 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
204 obj->debug_hold_thrs[i][0] = 0;
205 obj->debug_hold_file[i] = NULL;
206 obj->debug_hold_line[i] = 0;
207 }
208 #endif
209 }
210
/*
 * Wrapper around lwkt_token_swap().  Takes no object argument; which
 * tokens are swapped is determined entirely by lwkt_token_swap().
 */
void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}
216
/*
 * Acquire the object's token with lwkt_gettoken() (the non-shared
 * acquire; vm_object_lock_shared() is the shared variant).
 *
 * This does not touch hold_count.
 */
void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}
222
/*
 * Try to acquire the object's token with lwkt_trytoken().
 *
 * Returns TRUE on success.  The return value of lwkt_trytoken() is passed
 * through unchanged; callers in this file treat 0 as failure.
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}
231
/*
 * Acquire the object's token in shared mode with lwkt_gettoken_shared().
 *
 * This does not touch hold_count.
 */
void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}
237
/*
 * Release the object's token with lwkt_reltoken().  The same call is used
 * regardless of whether the token was acquired shared or non-shared.
 */
void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}
243
/*
 * Convert the caller's hold on the object's token to a non-shared
 * (lwkt_gettoken()) acquisition.
 *
 * NOTE: This is a release followed by a fresh acquire, not an atomic
 *	 upgrade.  The token is not held between the two calls, so state
 *	 protected by it should be rechecked by the caller afterwards.
 */
void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}
250
/*
 * Convert the caller's hold on the object's token to a shared
 * (lwkt_gettoken_shared()) acquisition.
 *
 * NOTE: As with vm_object_upgrade() this is a release followed by a
 *	 fresh acquire; the token is not held between the two calls.
 */
void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}
257
/*
 * Assert, via ASSERT_LWKT_TOKEN_HELD(), that the object's token is held.
 */
static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}
263
264 /*
265 * Aquire a semi-random base page color for a new object. Our main concern
266 * is that the color be spread out a bit. Further spreading out occurs in
267 * bio_page_alloc().
268 */
269 int
vm_quickcolor(void)270 vm_quickcolor(void)
271 {
272 globaldata_t gd = mycpu;
273 int pg_color;
274
275 pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
276 pg_color += gd->gd_quick_color;
277 gd->gd_quick_color += PQ_PRIME2;
278
279 return pg_color;
280 }
281
/*
 * Hold an object: bump hold_count, then acquire the object's token
 * (non-shared).  The pairing release is vm_object_drop().
 *
 * obj must not be NULL (asserted).  With DEBUG_LOCKS the caller's file and
 * line are recorded in the object's debug ring.
 */
void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to callers
	 * context, typically already holding the token on a parent object)
	 * prior to potentially blocking on the lock, otherwise the object
	 * can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}
300
301 int
VMOBJDEBUG(vm_object_hold_try)302 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
303 {
304 KKASSERT(obj != NULL);
305
306 /*
307 * Object must be held (object allocation is stable due to callers
308 * context, typically already holding the token on a parent object)
309 * prior to potentially blocking on the lock, otherwise the object
310 * can get ripped away from us.
311 */
312 refcount_acquire(&obj->hold_count);
313 if (vm_object_lock_try(obj) == 0) {
314 if (refcount_release(&obj->hold_count)) {
315 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
316 kfree_obj(obj, M_VM_OBJECT);
317 }
318 return(0);
319 }
320
321 #if defined(DEBUG_LOCKS)
322 debugvm_object_add(obj, file, line, 1);
323 #endif
324 return(1);
325 }
326
/*
 * Hold an object with its token acquired shared: bump hold_count, then
 * acquire the object's token via vm_object_lock_shared().  The pairing
 * release is vm_object_drop().
 *
 * obj must not be NULL (asserted).  With DEBUG_LOCKS the caller's file and
 * line are recorded in the object's debug ring.
 */
void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to callers
	 * context, typically already holding the token on a parent object)
	 * prior to potentially blocking on the lock, otherwise the object
	 * can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}
345
346 /*
347 * Drop the token and hold_count on the object.
348 *
349 * WARNING! Token might be shared.
350 */
351 void
VMOBJDEBUG(vm_object_drop)352 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
353 {
354 if (obj == NULL)
355 return;
356
357 /*
358 * No new holders should be possible once we drop hold_count 1->0 as
359 * there is no longer any way to reference the object.
360 */
361 KKASSERT(obj->hold_count > 0);
362 if (refcount_release(&obj->hold_count)) {
363 #if defined(DEBUG_LOCKS)
364 debugvm_object_add(obj, file, line, -1);
365 #endif
366
367 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
368 vm_object_unlock(obj);
369 kfree_obj(obj, M_VM_OBJECT);
370 } else {
371 vm_object_unlock(obj);
372 }
373 } else {
374 #if defined(DEBUG_LOCKS)
375 debugvm_object_add(obj, file, line, -1);
376 #endif
377 vm_object_unlock(obj);
378 }
379 }
380
381 /*
382 * Initialize a freshly allocated object, returning a held object.
383 *
384 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
385 *
386 * No requirements.
387 */
388 void
_vm_object_allocate(objtype_t type,vm_pindex_t size,vm_object_t object,const char * ident)389 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
390 const char *ident)
391 {
392 struct vm_object_hash *hash;
393
394 RB_INIT(&object->rb_memq);
395 lwkt_token_init(&object->token, ident);
396
397 TAILQ_INIT(&object->backing_list);
398 lockinit(&object->backing_lk, "baclk", 0, 0);
399
400 object->type = type;
401 object->size = size;
402 object->ref_count = 1;
403 object->memattr = VM_MEMATTR_DEFAULT;
404 object->hold_count = 0;
405 object->flags = 0;
406 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
407 vm_object_set_flag(object, OBJ_ONEMAPPING);
408 object->paging_in_progress = 0;
409 object->resident_page_count = 0;
410 /* cpu localization twist */
411 object->pg_color = vm_quickcolor();
412 object->handle = NULL;
413
414 atomic_add_int(&object->generation, 1);
415 object->swblock_count = 0;
416 RB_INIT(&object->swblock_root);
417 vm_object_lock_init(object);
418 pmap_object_init(object);
419
420 vm_object_hold(object);
421
422 hash = vmobj_hash(object);
423 lwkt_gettoken(&hash->token);
424 TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
425 lwkt_reltoken(&hash->token);
426 }
427
/*
 * Initialize a VM object.
 *
 * The caller supplies the storage.  The object is set up as an
 * OBJT_DEFAULT object of the given size with its token named "vmobj".
 * _vm_object_allocate() returns the object held, so the hold is dropped
 * again before returning.
 */
void
vm_object_init(vm_object_t object, vm_pindex_t size)
{
	_vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj");
	vm_object_drop(object);
}
437
438 /*
439 * Initialize the VM objects module.
440 *
441 * Called from the low level boot code only. Note that this occurs before
442 * kmalloc is initialized so we cannot allocate any VM objects.
443 */
444 void
vm_object_init1(void)445 vm_object_init1(void)
446 {
447 int i;
448
449 for (i = 0; i < VMOBJ_HSIZE; ++i) {
450 TAILQ_INIT(&vm_object_hash[i].list);
451 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
452 }
453
454 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
455 kernel_object, "kobj");
456 vm_object_drop(kernel_object);
457 }
458
/*
 * Second-stage initialization: calls kmalloc_obj_set_unlimited() on the
 * M_VM_OBJECT malloc type used for vm_object structures.
 */
void
vm_object_init2(void)
{
	kmalloc_obj_set_unlimited(M_VM_OBJECT);
}
464
465 /*
466 * Allocate and return a new object of the specified type and size.
467 *
468 * No requirements.
469 */
470 vm_object_t
vm_object_allocate(objtype_t type,vm_pindex_t size)471 vm_object_allocate(objtype_t type, vm_pindex_t size)
472 {
473 vm_object_t obj;
474
475 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
476 _vm_object_allocate(type, size, obj, "vmobj");
477 vm_object_drop(obj);
478
479 return (obj);
480 }
481
482 /*
483 * This version returns a held object, allowing further atomic initialization
484 * of the object.
485 */
486 vm_object_t
vm_object_allocate_hold(objtype_t type,vm_pindex_t size)487 vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
488 {
489 vm_object_t obj;
490
491 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
492 _vm_object_allocate(type, size, obj, "vmobj");
493
494 return (obj);
495 }
496
/*
 * Add an additional reference to a vm_object.  The object must already be
 * held.  The original non-lock version is no longer supported.  The object
 * must NOT be chain locked by anyone at the time the reference is added.
 *
 * The object must be held, but may be held shared if desired (hence why
 * we use an atomic op).
 *
 * For OBJT_VNODE objects the vnode in object->handle is also vref()'d.
 * object must not be NULL (asserted).
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}
519
/*
 * Add a reference without requiring the object's token to be held.
 *
 * This version is only allowed in situations where the caller
 * already knows that the object is deterministically referenced
 * (usually because it's taken from a ref'd vnode, or during a map_entry
 * replication).
 *
 * The assertion accepts any OBJT_VNODE object, or any other object whose
 * ref_count is already non-zero.  For OBJT_VNODE objects the vnode in
 * object->handle is also vref()'d.
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE)
		vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}
537
/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 *
 * The 1->0 ref_count transition additionally clears VTEXT on the vnode,
 * after vm_object_upgrade() has been called on the object.
 *
 * The object must be an OBJT_VNODE object with a non-NULL handle (both
 * asserted) and its token must be held.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;
	int count;

	KASSERT(object->type == OBJT_VNODE,
		("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	/*
	 * Decrement ref_count with a compare-and-set loop.  count is only
	 * read from the object once; on a failed atomic_fcmpset_int() the
	 * loop retries with whatever the primitive left in count
	 * (presumably the freshly observed value -- confirm against the
	 * atomic_fcmpset_int() contract).
	 */
	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		if (count == 1) {
			/*
			 * Final reference.  Upgrade before attempting the
			 * 1->0 transition so VTEXT is cleared with the
			 * token held non-shared.  The upgrade is repeated
			 * on every retry that still sees count == 1.
			 */
			vm_object_upgrade(object);
			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}
602
/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * A NULL object is a no-op.
 *
 * XXX Currently all deallocations require an exclusive lock.
 *
 * NOTE(review): the XXX above looks stale; for ref counts above 3 the code
 *		 below decrements ref_count with a compare-and-set and never
 *		 takes the object's token.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately though, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 *
		 * NOTE(review): as written the OBJT_VNODE clause is subsumed
		 *		 by (count <= 3), so vnode objects take the
		 *		 locked path at the same threshold as all
		 *		 other objects.  Confirm whether that is
		 *		 intended.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
		 */
		if (object->type == OBJT_VNODE) {
			/*
			 * The vnode pointer is loaded before our reference
			 * is given up; the object is not dereferenced again
			 * after a successful decrement (except for the
			 * DEBUG_LOCKS bookkeeping).
			 */
			vp = (struct vnode *)object->handle;
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif

				vrele(vp);
				break;
			}
			/* retry */
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif
				break;
			}
			/* retry */
		}
		cpu_pause();
		/* retry */
	}
}
677
/*
 * Release a reference on an object whose token the caller already holds.
 * A NULL object is a no-op.
 *
 * OBJT_VNODE objects are handed to vm_object_vndeallocate() with a NULL
 * vpp, so the vnode is vrele()'d there.  For all other object types the
 * token must be held exclusively (asserted); the reference is dropped and
 * on the 1->0 transition the object is terminated unless it is already
 * flagged OBJ_DEAD.
 *
 * The caller's hold on the object is not released.
 */
void
VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
{
	/*
	 * Degenerate case
	 */
	if (object == NULL)
		return;

	/*
	 * vnode case, caller either locked the object exclusively
	 * or this is a recursion with must_drop != 0 and the vnode
	 * object will be locked shared.
	 *
	 * If locked shared we have to drop the object before we can
	 * call vrele() or risk a shared/exclusive livelock.
	 *
	 * NOTE(review): nothing named must_drop is visible in this
	 *		 function; that part of the comment appears to
	 *		 describe an earlier version of the code.
	 */
	if (object->type == OBJT_VNODE) {
		ASSERT_LWKT_TOKEN_HELD(&object->token);
		vm_object_vndeallocate(object, NULL);
		return;
	}
	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);

	/*
	 * Normal case (object is locked exclusively)
	 */
	if (object->ref_count == 0) {
		panic("vm_object_deallocate: object deallocated "
		      "too many times: %d", object->type);
	}
	if (object->ref_count > 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Drop the ref and handle termination on the 1->0 transition.
	 * We may have blocked above so we have to recheck.
	 */
	KKASSERT(object->ref_count != 0);
	if (object->ref_count >= 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Last reference.  The object is left held for the caller;
	 * vm_object_terminate() does not drop it.
	 */
	atomic_add_int(&object->ref_count, -1);
	if ((object->flags & OBJ_DEAD) == 0)
		vm_object_terminate(object);
}
734
735 /*
736 * Destroy the specified object, freeing up related resources.
737 *
738 * The object must have zero references.
739 *
 * The object must be held.  The caller is responsible for dropping the object
741 * after terminate returns. Terminate does NOT drop the object.
742 */
743 static int vm_object_terminate_callback(vm_page_t p, void *data);
744
/*
 * Tear down an unreferenced object: flag it OBJ_DEAD, wait out paging
 * activity, flush vnode-backed pages, release pmap state and all resident
 * pages, notify the pager and unlink the object from its hash bucket.
 * The structure itself is not freed here.
 */
void
vm_object_terminate(vm_object_t object)
{
	struct rb_vm_page_scan_info info;
	struct vm_object_hash *hash;

	/*
	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
	 * able to safely block.
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	KKASSERT((object->flags & OBJ_DEAD) == 0);
	vm_object_set_flag(object, OBJ_DEAD);

	/*
	 * Wait for the pageout daemon to be done with the object
	 */
	vm_object_pip_wait(object, "objtrm1");

	KASSERT(!object->paging_in_progress,
		("vm_object_terminate: pageout in progress"));

	/*
	 * Clean and free the pages, as appropriate.  All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp;

		/*
		 * Clean pages and flush buffers.
		 *
		 * NOTE!  TMPFS buffer flushes do not typically flush the
		 *	  actual page to swap as this would be highly
		 *	  inefficient, and normal filesystems usually wrap
		 *	  page flushes with buffer cache buffers.
		 *
		 *	  To deal with this we have to call vinvalbuf() both
		 *	  before and after the vm_object_page_clean().
		 */
		vp = (struct vnode *) object->handle;
		vinvalbuf(vp, V_SAVE, 0, 0);
		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * Wait for any I/O to complete, after which there had better not
	 * be any references left on the object.
	 */
	vm_object_pip_wait(object, "objtrm2");

	if (object->ref_count != 0) {
		panic("vm_object_terminate: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * Cleanup any shared pmaps associated with this object.
	 */
	pmap_object_free(object);

	/*
	 * Now free any remaining pages.  For internal objects, this also
	 * removes them from paging queues.  Don't free wired pages, just
	 * remove them from the object.
	 *
	 * The callback sets info.error when it had to sleep on a busy page
	 * (or found a page that changed objects), in which case the whole
	 * scan is repeated.
	 */
	info.count = 0;
	info.object = object;
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	} while (info.error);

	/*
	 * Let the pager know object is dead.
	 */
	vm_pager_deallocate(object);

	/*
	 * Keep scanning until the object's page tree is empty, complaining
	 * on the console for every extra pass that is needed.
	 *
	 * NOTE(review): the original comment here spoke of waiting for the
	 *		 hold count to hit 1 and of vmobj_token; neither
	 *		 appears in this loop, which only tests rb_memq.
	 */
	for (;;) {
		if (RB_ROOT(&object->rb_memq) == NULL)
			break;
		kprintf("vm_object_terminate: Warning, object %p "
			"still has %ld pages\n",
			object, object->resident_page_count);
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	}

	/*
	 * There had better not be any pages left
	 */
	KKASSERT(object->resident_page_count == 0);

	/*
	 * Remove the object from its vm_object_hash[] bucket list, under
	 * the bucket's token.
	 */
	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_REMOVE(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);

	if (object->ref_count != 0) {
		panic("vm_object_terminate2: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
	 *	 the object here.  See vm_object_drop().
	 */
}
864
865 /*
866 * The caller must hold the object.
867 *
868 * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
869 * or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
870 * is called, due to normal pmap operations. This is because only
871 * global pmap operations on the vm_page can clear the bits and not
872 * just local operations on individual pmaps.
873 *
874 * Most interactions that necessitate the clearing of these bits
875 * proactively call vm_page_protect(), and we must do so here as well.
876 */
877 static int
vm_object_terminate_callback(vm_page_t p,void * data)878 vm_object_terminate_callback(vm_page_t p, void *data)
879 {
880 struct rb_vm_page_scan_info *info = data;
881 vm_object_t object;
882
883 object = p->object;
884 KKASSERT(object == info->object);
885 if (vm_page_busy_try(p, TRUE)) {
886 vm_page_sleep_busy(p, TRUE, "vmotrm");
887 info->error = 1;
888 return 0;
889 }
890 if (object != p->object) {
891 /* XXX remove once we determine it can't happen */
892 kprintf("vm_object_terminate: Warning: Encountered "
893 "busied page %p on queue %d\n", p, p->queue);
894 vm_page_wakeup(p);
895 info->error = 1;
896 } else if (p->wire_count == 0) {
897 /*
898 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
899 */
900 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
901 vm_page_protect(p, VM_PROT_NONE);
902 vm_page_free(p);
903 mycpu->gd_cnt.v_pfree++;
904 } else {
905 if (p->queue != PQ_NONE) {
906 kprintf("vm_object_terminate: Warning: Encountered "
907 "wired page %p on queue %d\n", p, p->queue);
908 if (vm_object_debug > 0) {
909 --vm_object_debug;
910 print_backtrace(10);
911 }
912 }
913 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
914 vm_page_protect(p, VM_PROT_NONE);
915 vm_page_remove(p);
916 vm_page_wakeup(p);
917 }
918
919 /*
920 * Must be at end to avoid SMP races, caller holds object token
921 */
922 if ((++info->count & 63) == 0)
923 lwkt_user_yield();
924 return(0);
925 }
926
927 /*
928 * Clean all dirty pages in the specified range of object. Leaves page
929 * on whatever queue it is currently on. If NOSYNC is set then do not
930 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
931 * leaving the object dirty.
932 *
933 * When stuffing pages asynchronously, allow clustering. XXX we need a
934 * synchronous clustering mode implementation.
935 *
 * Odd semantics: if end == 0, we clean through the end of the object (so
 * start == end == 0 cleans everything).
937 *
938 * The object must be locked? XXX
939 */
940 static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
941 static int vm_object_page_clean_pass2(struct vm_page *p, void *data);
942
943 void
vm_object_page_clean(vm_object_t object,vm_pindex_t start,vm_pindex_t end,int flags)944 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
945 int flags)
946 {
947 struct rb_vm_page_scan_info info;
948 struct vnode *vp;
949 int wholescan;
950 int pagerflags;
951 int generation;
952
953 vm_object_hold(object);
954 if (object->type != OBJT_VNODE ||
955 (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
956 vm_object_drop(object);
957 return;
958 }
959
960 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
961 OBJPC_SYNC : OBJPC_CLUSTER_OK;
962 pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;
963
964 vp = object->handle;
965
966 /*
967 * Interlock other major object operations. This allows us to
968 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
969 */
970 vm_object_set_flag(object, OBJ_CLEANING);
971
972 /*
973 * Handle 'entire object' case
974 */
975 info.start_pindex = start;
976 if (end == 0) {
977 info.end_pindex = object->size - 1;
978 } else {
979 info.end_pindex = end - 1;
980 }
981 wholescan = (start == 0 && info.end_pindex == object->size - 1);
982 info.limit = flags;
983 info.pagerflags = pagerflags;
984 info.object = object;
985
986 /*
987 * If cleaning the entire object do a pass to mark the pages read-only.
988 * If everything worked out ok, clear OBJ_WRITEABLE and
989 * OBJ_MIGHTBEDIRTY.
990 */
991 if (wholescan) {
992 info.error = 0;
993 info.count = 0;
994 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
995 vm_object_page_clean_pass1, &info);
996 if (info.error == 0) {
997 vm_object_clear_flag(object,
998 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
999 if (object->type == OBJT_VNODE &&
1000 (vp = (struct vnode *)object->handle) != NULL) {
1001 /*
1002 * Use new-style interface to clear VISDIRTY
1003 * because the vnode is not necessarily removed
1004 * from the syncer list(s) as often as it was
1005 * under the old interface, which can leave
1006 * the vnode on the syncer list after reclaim.
1007 */
1008 vclrobjdirty(vp);
1009 }
1010 }
1011 }
1012
1013 /*
1014 * Do a pass to clean all the dirty pages we find.
1015 */
1016 do {
1017 info.error = 0;
1018 info.count = 0;
1019 generation = object->generation;
1020 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1021 vm_object_page_clean_pass2, &info);
1022 } while (info.error || generation != object->generation);
1023
1024 vm_object_clear_flag(object, OBJ_CLEANING);
1025 vm_object_drop(object);
1026 }
1027
1028 /*
1029 * The caller must hold the object.
1030 */
1031 static
1032 int
vm_object_page_clean_pass1(struct vm_page * p,void * data)1033 vm_object_page_clean_pass1(struct vm_page *p, void *data)
1034 {
1035 struct rb_vm_page_scan_info *info = data;
1036
1037 KKASSERT(p->object == info->object);
1038
1039 vm_page_flag_set(p, PG_CLEANCHK);
1040 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1041 info->error = 1;
1042 } else if (vm_page_busy_try(p, FALSE)) {
1043 info->error = 1;
1044 } else {
1045 KKASSERT(p->object == info->object);
1046 vm_page_protect(p, VM_PROT_READ);
1047 vm_page_wakeup(p);
1048 }
1049
1050 /*
1051 * Must be at end to avoid SMP races, caller holds object token
1052 */
1053 if ((++info->count & 63) == 0)
1054 lwkt_user_yield();
1055 return(0);
1056 }
1057
1058 /*
1059 * The caller must hold the object
1060 */
1061 static
1062 int
vm_object_page_clean_pass2(struct vm_page * p,void * data)1063 vm_object_page_clean_pass2(struct vm_page *p, void *data)
1064 {
1065 struct rb_vm_page_scan_info *info = data;
1066 int generation;
1067
1068 KKASSERT(p->object == info->object);
1069
1070 /*
1071 * Do not mess with pages that were inserted after we started
1072 * the cleaning pass.
1073 */
1074 if ((p->flags & PG_CLEANCHK) == 0)
1075 goto done;
1076
1077 generation = info->object->generation;
1078
1079 if (vm_page_busy_try(p, TRUE)) {
1080 vm_page_sleep_busy(p, TRUE, "vpcwai");
1081 info->error = 1;
1082 goto done;
1083 }
1084
1085 KKASSERT(p->object == info->object &&
1086 info->object->generation == generation);
1087
1088 /*
1089 * Before wasting time traversing the pmaps, check for trivial
1090 * cases where the page cannot be dirty.
1091 */
1092 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
1093 KKASSERT((p->dirty & p->valid) == 0 &&
1094 (p->flags & PG_NEED_COMMIT) == 0);
1095 vm_page_wakeup(p);
1096 goto done;
1097 }
1098
1099 /*
1100 * Check whether the page is dirty or not. The page has been set
1101 * to be read-only so the check will not race a user dirtying the
1102 * page.
1103 */
1104 vm_page_test_dirty(p);
1105 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
1106 vm_page_flag_clear(p, PG_CLEANCHK);
1107 vm_page_wakeup(p);
1108 goto done;
1109 }
1110
1111 /*
1112 * If we have been asked to skip nosync pages and this is a
1113 * nosync page, skip it. Note that the object flags were
1114 * not cleared in this case (because pass1 will have returned an
1115 * error), so we do not have to set them.
1116 */
1117 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
1118 vm_page_flag_clear(p, PG_CLEANCHK);
1119 vm_page_wakeup(p);
1120 goto done;
1121 }
1122
1123 /*
1124 * Flush as many pages as we can. PG_CLEANCHK will be cleared on
1125 * the pages that get successfully flushed. Set info->error if
1126 * we raced an object modification.
1127 */
1128 vm_object_page_collect_flush(info->object, p, info->pagerflags);
1129 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */
1130
1131 /*
1132 * Must be at end to avoid SMP races, caller holds object token
1133 */
1134 done:
1135 if ((++info->count & 63) == 0)
1136 lwkt_user_yield();
1137 return(0);
1138 }
1139
1140 /*
1141 * Collect the specified page and nearby pages and flush them out.
1142 * The number of pages flushed is returned. The passed page is busied
1143 * by the caller and we are responsible for its disposition.
1144 *
1145 * The caller must hold the object.
1146 */
1147 static void
vm_object_page_collect_flush(vm_object_t object,vm_page_t p,int pagerflags)1148 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
1149 {
1150 int error;
1151 int is;
1152 int ib;
1153 int i;
1154 int page_base;
1155 vm_pindex_t pi;
1156 vm_page_t ma[BLIST_MAX_ALLOC];
1157
1158 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1159
1160 pi = p->pindex;
1161 page_base = pi % BLIST_MAX_ALLOC;
1162 ma[page_base] = p;
1163 ib = page_base - 1;
1164 is = page_base + 1;
1165
1166 while (ib >= 0) {
1167 vm_page_t tp;
1168
1169 tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
1170 TRUE, &error);
1171 if (error)
1172 break;
1173 if (tp == NULL)
1174 break;
1175 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1176 (tp->flags & PG_CLEANCHK) == 0) {
1177 vm_page_wakeup(tp);
1178 break;
1179 }
1180 if ((tp->queue - tp->pc) == PQ_CACHE) {
1181 vm_page_flag_clear(tp, PG_CLEANCHK);
1182 vm_page_wakeup(tp);
1183 break;
1184 }
1185 vm_page_test_dirty(tp);
1186 if ((tp->dirty & tp->valid) == 0 &&
1187 (tp->flags & PG_NEED_COMMIT) == 0) {
1188 vm_page_flag_clear(tp, PG_CLEANCHK);
1189 vm_page_wakeup(tp);
1190 break;
1191 }
1192 ma[ib] = tp;
1193 --ib;
1194 }
1195 ++ib; /* fixup */
1196
1197 while (is < BLIST_MAX_ALLOC &&
1198 pi - page_base + is < object->size) {
1199 vm_page_t tp;
1200
1201 tp = vm_page_lookup_busy_try(object, pi - page_base + is,
1202 TRUE, &error);
1203 if (error)
1204 break;
1205 if (tp == NULL)
1206 break;
1207 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
1208 (tp->flags & PG_CLEANCHK) == 0) {
1209 vm_page_wakeup(tp);
1210 break;
1211 }
1212 if ((tp->queue - tp->pc) == PQ_CACHE) {
1213 vm_page_flag_clear(tp, PG_CLEANCHK);
1214 vm_page_wakeup(tp);
1215 break;
1216 }
1217 vm_page_test_dirty(tp);
1218 if ((tp->dirty & tp->valid) == 0 &&
1219 (tp->flags & PG_NEED_COMMIT) == 0) {
1220 vm_page_flag_clear(tp, PG_CLEANCHK);
1221 vm_page_wakeup(tp);
1222 break;
1223 }
1224 ma[is] = tp;
1225 ++is;
1226 }
1227
1228 /*
1229 * All pages in the ma[] array are busied now
1230 */
1231 for (i = ib; i < is; ++i) {
1232 vm_page_flag_clear(ma[i], PG_CLEANCHK);
1233 vm_page_hold(ma[i]); /* XXX need this any more? */
1234 }
1235 vm_pageout_flush(&ma[ib], is - ib, pagerflags);
1236 for (i = ib; i < is; ++i) /* XXX need this any more? */
1237 vm_page_unhold(ma[i]);
1238 }
1239
1240 /*
1241 * Implements the madvise function at the object/page level.
1242 *
1243 * MADV_WILLNEED (any object)
1244 *
1245 * Activate the specified pages if they are resident.
1246 *
1247 * MADV_DONTNEED (any object)
1248 *
1249 * Deactivate the specified pages if they are resident.
1250 *
1251 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
1252 *
1253 * Deactivate and clean the specified pages if they are
1254 * resident. This permits the process to reuse the pages
1255 * without faulting or the kernel to reclaim the pages
1256 * without I/O.
1257 *
1258 * No requirements.
1259 */
1260 void
vm_object_madvise(vm_object_t object,vm_pindex_t pindex,vm_pindex_t count,int advise)1261 vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
1262 vm_pindex_t count, int advise)
1263 {
1264 vm_pindex_t end;
1265 vm_page_t m;
1266 int error;
1267
1268 if (object == NULL)
1269 return;
1270
1271 end = pindex + count;
1272
1273 vm_object_hold(object);
1274
1275 /*
1276 * Locate and adjust resident pages. This only applies to the
1277 * primary object in the mapping.
1278 */
1279 for (; pindex < end; pindex += 1) {
1280 relookup:
1281 /*
1282 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
1283 * and those pages must be OBJ_ONEMAPPING.
1284 */
1285 if (advise == MADV_FREE) {
1286 if ((object->type != OBJT_DEFAULT &&
1287 object->type != OBJT_SWAP) ||
1288 (object->flags & OBJ_ONEMAPPING) == 0) {
1289 continue;
1290 }
1291 }
1292
1293 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
1294
1295 if (error) {
1296 vm_page_sleep_busy(m, TRUE, "madvpo");
1297 goto relookup;
1298 }
1299 if (m == NULL) {
1300 /*
1301 * There may be swap even if there is no backing page
1302 */
1303 if (advise == MADV_FREE && object->type == OBJT_SWAP)
1304 swap_pager_freespace(object, pindex, 1);
1305 continue;
1306 }
1307
1308 /*
1309 * If the page is not in a normal active state, we skip it.
1310 * If the page is not managed there are no page queues to
1311 * mess with. Things can break if we mess with pages in
1312 * any of the below states.
1313 */
1314 if (m->wire_count ||
1315 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
1316 PG_NEED_COMMIT)) ||
1317 m->valid != VM_PAGE_BITS_ALL
1318 ) {
1319 vm_page_wakeup(m);
1320 continue;
1321 }
1322
1323 /*
1324 * Theoretically once a page is known not to be busy, an
1325 * interrupt cannot come along and rip it out from under us.
1326 */
1327 if (advise == MADV_WILLNEED) {
1328 vm_page_activate(m);
1329 } else if (advise == MADV_DONTNEED) {
1330 vm_page_dontneed(m);
1331 } else if (advise == MADV_FREE) {
1332 /*
1333 * Mark the page clean. This will allow the page
1334 * to be freed up by the system. However, such pages
1335 * are often reused quickly by malloc()/free()
1336 * so we do not do anything that would cause
1337 * a page fault if we can help it.
1338 *
1339 * Specifically, we do not try to actually free
1340 * the page now nor do we try to put it in the
1341 * cache (which would cause a page fault on reuse).
1342 *
1343 * But we do make the page is freeable as we
1344 * can without actually taking the step of unmapping
1345 * it.
1346 */
1347 pmap_clear_modify(m);
1348 m->dirty = 0;
1349 m->act_count = 0;
1350 vm_page_dontneed(m);
1351 if (object->type == OBJT_SWAP)
1352 swap_pager_freespace(object, pindex, 1);
1353 }
1354 vm_page_wakeup(m);
1355 }
1356 vm_object_drop(object);
1357 }
1358
1359 /*
1360 * Removes all physical pages in the specified object range from the
1361 * object's list of pages.
1362 *
1363 * No requirements.
1364 */
1365 static int vm_object_page_remove_callback(vm_page_t p, void *data);
1366
1367 void
vm_object_page_remove(vm_object_t object,vm_pindex_t start,vm_pindex_t end,boolean_t clean_only)1368 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
1369 boolean_t clean_only)
1370 {
1371 struct rb_vm_page_scan_info info;
1372 int all;
1373
1374 /*
1375 * Degenerate cases and assertions.
1376 *
1377 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
1378 * These objects do not have to have their pages entered into
1379 * them and are handled via their vm_map_backing lists.
1380 */
1381 vm_object_hold(object);
1382 if (object == NULL ||
1383 (object->type != OBJT_MGTDEVICE &&
1384 object->resident_page_count == 0 && object->swblock_count == 0)) {
1385 vm_object_drop(object);
1386 return;
1387 }
1388 KASSERT(object->type != OBJT_PHYS,
1389 ("attempt to remove pages from a physical object"));
1390
1391 /*
1392 * Indicate that paging is occuring on the object
1393 */
1394 vm_object_pip_add(object, 1);
1395
1396 /*
1397 * Figure out the actual removal range and whether we are removing
1398 * the entire contents of the object or not. If removing the entire
1399 * contents, be sure to get all pages, even those that might be
1400 * beyond the end of the object.
1401 *
1402 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
1403 */
1404 info.object = object;
1405 info.start_pindex = start;
1406 if (end == 0 || end == (vm_pindex_t)-1) {
1407 info.end_pindex = (vm_pindex_t)-1;
1408 end = object->size;
1409 } else {
1410 info.end_pindex = end - 1;
1411 }
1412 info.limit = clean_only;
1413 info.count = 0;
1414 all = (start == 0 && info.end_pindex >= object->size - 1);
1415
1416 /*
1417 * Efficiently remove pages from the pmap via a backing scan.
1418 *
1419 * NOTE: This is the only way pages can be removed and unwired
1420 * from OBJT_MGTDEVICE devices which typically do not enter
1421 * their pages into the vm_object's RB tree. And possibly
1422 * other OBJT_* types in the future.
1423 */
1424 {
1425 vm_map_backing_t ba;
1426 vm_pindex_t sba, eba;
1427 vm_offset_t sva, eva;
1428
1429 lockmgr(&object->backing_lk, LK_EXCLUSIVE);
1430 TAILQ_FOREACH(ba, &object->backing_list, entry) {
1431 /*
1432 * object offset range within the ba, intersectioned
1433 * with the page range specified for the object
1434 */
1435 sba = OFF_TO_IDX(ba->offset);
1436 eba = sba + OFF_TO_IDX(ba->end - ba->start);
1437 if (sba < start)
1438 sba = start;
1439 if (eba > end)
1440 eba = end;
1441
1442 /*
1443 * If the intersection is valid, remove the related
1444 * pages.
1445 *
1446 * NOTE! This may also remove other incidental pages
1447 * in the pmap, as the backing area may be
1448 * overloaded.
1449 *
1450 * NOTE! pages for MGTDEVICE objects are only removed
1451 * here, they aren't entered into rb_memq, so
1452 * we must use pmap_remove() instead of
1453 * the non-TLB-invalidating pmap_remove_pages().
1454 */
1455 if (sba < eba) {
1456 sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
1457 eva = sva + IDX_TO_OFF(eba - sba);
1458 #if 0
1459 kprintf("VM_OBJECT_PAGE_REMOVE "
1460 "%p[%016jx] %016jx-%016jx\n",
1461 ba->pmap, ba->start, sva, eva);
1462 #endif
1463 pmap_remove(ba->pmap, sva, eva);
1464 }
1465 }
1466 lockmgr(&object->backing_lk, LK_RELEASE);
1467 }
1468
1469 /*
1470 * Remove and free pages entered onto the object list. Note that
1471 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
1472 *
1473 * Loop until we are sure we have gotten them all.
1474 */
1475 do {
1476 info.error = 0;
1477 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
1478 vm_object_page_remove_callback, &info);
1479 } while (info.error);
1480
1481 /*
1482 * Remove any related swap if throwing away pages, or for
1483 * non-swap objects (the swap is a clean copy in that case).
1484 */
1485 if (object->type != OBJT_SWAP || clean_only == FALSE) {
1486 if (all)
1487 swap_pager_freespace_all(object);
1488 else
1489 swap_pager_freespace(object, info.start_pindex,
1490 info.end_pindex - info.start_pindex + 1);
1491 }
1492
1493 /*
1494 * Cleanup
1495 */
1496 vm_object_pip_wakeup(object);
1497 vm_object_drop(object);
1498 }
1499
1500 /*
1501 * The caller must hold the object.
1502 *
1503 * NOTE: User yields are allowed when removing more than one page, but not
1504 * allowed if only removing one page (the path for single page removals
1505 * might hold a spinlock).
1506 */
1507 static int
vm_object_page_remove_callback(vm_page_t p,void * data)1508 vm_object_page_remove_callback(vm_page_t p, void *data)
1509 {
1510 struct rb_vm_page_scan_info *info = data;
1511
1512 if (info->object != p->object ||
1513 p->pindex < info->start_pindex ||
1514 p->pindex > info->end_pindex) {
1515 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
1516 info->object, p);
1517 return(0);
1518 }
1519 if (vm_page_busy_try(p, TRUE)) {
1520 vm_page_sleep_busy(p, TRUE, "vmopar");
1521 info->error = 1;
1522 return(0);
1523 }
1524 if (info->object != p->object) {
1525 /* this should never happen */
1526 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
1527 info->object, p);
1528 vm_page_wakeup(p);
1529 return(0);
1530 }
1531
1532 /*
1533 * Wired pages cannot be destroyed, but they can be invalidated
1534 * and we do so if clean_only (limit) is not set.
1535 *
1536 * WARNING! The page may be wired due to being part of a buffer
1537 * cache buffer, and the buffer might be marked B_CACHE.
1538 * This is fine as part of a truncation but VFSs must be
1539 * sure to fix the buffer up when re-extending the file.
1540 *
1541 * NOTE! PG_NEED_COMMIT is ignored.
1542 */
1543 if (p->wire_count != 0) {
1544 vm_page_protect(p, VM_PROT_NONE);
1545 if (info->limit == 0)
1546 p->valid = 0;
1547 vm_page_wakeup(p);
1548 goto done;
1549 }
1550
1551 /*
1552 * limit is our clean_only flag. If set and the page is dirty or
1553 * requires a commit, do not free it. If set and the page is being
1554 * held by someone, do not free it.
1555 */
1556 if (info->limit && p->valid) {
1557 vm_page_test_dirty(p);
1558 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1559 vm_page_wakeup(p);
1560 goto done;
1561 }
1562 }
1563
1564 /*
1565 * Destroy the page. But we have to re-test whether its dirty after
1566 * removing it from its pmaps.
1567 */
1568 vm_page_protect(p, VM_PROT_NONE);
1569 if (info->limit && p->valid) {
1570 vm_page_test_dirty(p);
1571 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
1572 vm_page_wakeup(p);
1573 goto done;
1574 }
1575 }
1576 vm_page_free(p);
1577
1578 /*
1579 * Must be at end to avoid SMP races, caller holds object token
1580 */
1581 done:
1582 if ((++info->count & 63) == 0)
1583 lwkt_user_yield();
1584
1585 return(0);
1586 }
1587
1588 /*
1589 * Try to extend prev_object into an adjoining region of virtual
1590 * memory, return TRUE on success.
1591 *
1592 * The caller does not need to hold (prev_object) but must have a stable
1593 * pointer to it (typically by holding the vm_map locked).
1594 *
1595 * This function only works for anonymous memory objects which either
1596 * have (a) one reference or (b) we are extending the object's size.
1597 * Otherwise the related VM pages we want to use for the object might
1598 * be in use by another mapping.
1599 */
1600 boolean_t
vm_object_coalesce(vm_object_t prev_object,vm_pindex_t prev_pindex,vm_size_t prev_size,vm_size_t next_size)1601 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
1602 vm_size_t prev_size, vm_size_t next_size)
1603 {
1604 vm_pindex_t next_pindex;
1605
1606 if (prev_object == NULL)
1607 return (TRUE);
1608
1609 vm_object_hold(prev_object);
1610
1611 if (prev_object->type != OBJT_DEFAULT &&
1612 prev_object->type != OBJT_SWAP) {
1613 vm_object_drop(prev_object);
1614 return (FALSE);
1615 }
1616
1617 #if 0
1618 /* caller now checks this */
1619 /*
1620 * Try to collapse the object first
1621 */
1622 vm_object_collapse(prev_object, NULL);
1623 #endif
1624
1625 #if 0
1626 /* caller now checks this */
1627 /*
1628 * We can't coalesce if we shadow another object (figuring out the
1629 * relationships become too complex).
1630 */
1631 if (prev_object->backing_object != NULL) {
1632 vm_object_chain_release(prev_object);
1633 vm_object_drop(prev_object);
1634 return (FALSE);
1635 }
1636 #endif
1637
1638 prev_size >>= PAGE_SHIFT;
1639 next_size >>= PAGE_SHIFT;
1640 next_pindex = prev_pindex + prev_size;
1641
1642 /*
1643 * We can't if the object has more than one ref count unless we
1644 * are extending it into newly minted space.
1645 */
1646 if (prev_object->ref_count > 1 &&
1647 prev_object->size != next_pindex) {
1648 vm_object_drop(prev_object);
1649 return (FALSE);
1650 }
1651
1652 /*
1653 * Remove any pages that may still be in the object from a previous
1654 * deallocation.
1655 */
1656 if (next_pindex < prev_object->size) {
1657 vm_object_page_remove(prev_object,
1658 next_pindex,
1659 next_pindex + next_size, FALSE);
1660 if (prev_object->type == OBJT_SWAP)
1661 swap_pager_freespace(prev_object,
1662 next_pindex, next_size);
1663 }
1664
1665 /*
1666 * Extend the object if necessary.
1667 */
1668 if (next_pindex + next_size > prev_object->size)
1669 prev_object->size = next_pindex + next_size;
1670 vm_object_drop(prev_object);
1671
1672 return (TRUE);
1673 }
1674
1675 /*
1676 * Make the object writable and flag is being possibly dirty.
1677 *
1678 * The object might not be held (or might be held but held shared),
1679 * the related vnode is probably not held either. Object and vnode are
1680 * stable by virtue of the vm_page busied by the caller preventing
1681 * destruction.
1682 *
1683 * If the related mount is flagged MNTK_THR_SYNC we need to call
1684 * vsetobjdirty(). Filesystems using this option usually shortcut
1685 * synchronization by only scanning the syncer list.
1686 */
1687 void
vm_object_set_writeable_dirty(vm_object_t object)1688 vm_object_set_writeable_dirty(vm_object_t object)
1689 {
1690 struct vnode *vp;
1691
1692 /*vm_object_assert_held(object);*/
1693 /*
1694 * Avoid contention in vm fault path by checking the state before
1695 * issuing an atomic op on it.
1696 */
1697 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
1698 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
1699 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
1700 }
1701 if (object->type == OBJT_VNODE &&
1702 (vp = (struct vnode *)object->handle) != NULL) {
1703 if ((vp->v_flag & VOBJDIRTY) == 0) {
1704 if (vp->v_mount &&
1705 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
1706 /*
1707 * New style THR_SYNC places vnodes on the
1708 * syncer list more deterministically.
1709 */
1710 vsetobjdirty(vp);
1711 } else {
1712 /*
1713 * Old style scan would not necessarily place
1714 * a vnode on the syncer list when possibly
1715 * modified via mmap.
1716 */
1717 vsetflags(vp, VOBJDIRTY);
1718 }
1719 }
1720 }
1721 }
1722
1723 #include "opt_ddb.h"
1724 #ifdef DDB
1725 #include <sys/cons.h>
1726
1727 #include <ddb/ddb.h>
1728
1729 static int _vm_object_in_map (vm_map_t map, vm_object_t object,
1730 vm_map_entry_t entry);
1731 static int vm_object_in_map (vm_object_t object);
1732
1733 /*
1734 * The caller must hold the object.
1735 */
1736 static int
_vm_object_in_map(vm_map_t map,vm_object_t object,vm_map_entry_t entry)1737 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
1738 {
1739 vm_map_backing_t ba;
1740 vm_map_t tmpm;
1741 vm_map_entry_t tmpe;
1742 int entcount;
1743
1744 if (map == NULL)
1745 return 0;
1746 if (entry == NULL) {
1747 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
1748 entcount = map->nentries;
1749 while (entcount-- && tmpe) {
1750 if( _vm_object_in_map(map, object, tmpe)) {
1751 return 1;
1752 }
1753 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1754 }
1755 return (0);
1756 }
1757 switch(entry->maptype) {
1758 case VM_MAPTYPE_SUBMAP:
1759 tmpm = entry->ba.sub_map;
1760 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
1761 entcount = tmpm->nentries;
1762 while (entcount-- && tmpe) {
1763 if( _vm_object_in_map(tmpm, object, tmpe)) {
1764 return 1;
1765 }
1766 tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
1767 }
1768 break;
1769 case VM_MAPTYPE_NORMAL:
1770 ba = &entry->ba;
1771 while (ba) {
1772 if (ba->object == object)
1773 return TRUE;
1774 ba = ba->backing_ba;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 return 0;
1781 }
1782
1783 static int vm_object_in_map_callback(struct proc *p, void *data);
1784
1785 struct vm_object_in_map_info {
1786 vm_object_t object;
1787 int rv;
1788 };
1789
1790 /*
1791 * Debugging only
1792 */
1793 static int
vm_object_in_map(vm_object_t object)1794 vm_object_in_map(vm_object_t object)
1795 {
1796 struct vm_object_in_map_info info;
1797
1798 info.rv = 0;
1799 info.object = object;
1800
1801 allproc_scan(vm_object_in_map_callback, &info, 0);
1802 if (info.rv)
1803 return 1;
1804 if( _vm_object_in_map(kernel_map, object, 0))
1805 return 1;
1806 if( _vm_object_in_map(pager_map, object, 0))
1807 return 1;
1808 if( _vm_object_in_map(buffer_map, object, 0))
1809 return 1;
1810 return 0;
1811 }
1812
1813 /*
1814 * Debugging only
1815 */
1816 static int
vm_object_in_map_callback(struct proc * p,void * data)1817 vm_object_in_map_callback(struct proc *p, void *data)
1818 {
1819 struct vm_object_in_map_info *info = data;
1820
1821 if (p->p_vmspace) {
1822 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
1823 info->rv = 1;
1824 return -1;
1825 }
1826 }
1827 return (0);
1828 }
1829
DB_SHOW_COMMAND(vmochk,vm_object_check)1830 DB_SHOW_COMMAND(vmochk, vm_object_check)
1831 {
1832 struct vm_object_hash *hash;
1833 vm_object_t object;
1834 int n;
1835
1836 /*
1837 * make sure that internal objs are in a map somewhere
1838 * and none have zero ref counts.
1839 */
1840 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1841 hash = &vm_object_hash[n];
1842 for (object = TAILQ_FIRST(&hash->list);
1843 object != NULL;
1844 object = TAILQ_NEXT(object, object_entry)) {
1845 if (object->type == OBJT_MARKER)
1846 continue;
1847 if (object->handle != NULL ||
1848 (object->type != OBJT_DEFAULT &&
1849 object->type != OBJT_SWAP)) {
1850 continue;
1851 }
1852 if (object->ref_count == 0) {
1853 db_printf("vmochk: internal obj has "
1854 "zero ref count: %ld\n",
1855 (long)object->size);
1856 }
1857 if (vm_object_in_map(object))
1858 continue;
1859 db_printf("vmochk: internal obj is not in a map: "
1860 "ref: %d, size: %lu: 0x%lx\n",
1861 object->ref_count, (u_long)object->size,
1862 (u_long)object->size);
1863 }
1864 }
1865 }
1866
1867 /*
1868 * Debugging only
1869 */
DB_SHOW_COMMAND(object,vm_object_print_static)1870 DB_SHOW_COMMAND(object, vm_object_print_static)
1871 {
1872 /* XXX convert args. */
1873 vm_object_t object = (vm_object_t)addr;
1874 boolean_t full = have_addr;
1875
1876 vm_page_t p;
1877
1878 /* XXX count is an (unused) arg. Avoid shadowing it. */
1879 #define count was_count
1880
1881 int count;
1882
1883 if (object == NULL)
1884 return;
1885
1886 db_iprintf(
1887 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
1888 object, (int)object->type, (u_long)object->size,
1889 object->resident_page_count, object->ref_count, object->flags);
1890 /*
1891 * XXX no %qd in kernel. Truncate object->backing_object_offset.
1892 */
1893 db_iprintf("\n");
1894
1895 if (!full)
1896 return;
1897
1898 db_indent += 2;
1899 count = 0;
1900 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
1901 if (count == 0)
1902 db_iprintf("memory:=");
1903 else if (count == 6) {
1904 db_printf("\n");
1905 db_iprintf(" ...");
1906 count = 0;
1907 } else
1908 db_printf(",");
1909 count++;
1910
1911 db_printf("(off=0x%lx,page=0x%lx)",
1912 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
1913 }
1914 if (count != 0)
1915 db_printf("\n");
1916 db_indent -= 2;
1917 }
1918
1919 /* XXX. */
1920 #undef count
1921
1922 /*
1923 * XXX need this non-static entry for calling from vm_map_print.
1924 *
1925 * Debugging only
1926 */
1927 void
vm_object_print(long addr,boolean_t have_addr,long count,char * modif)1928 vm_object_print(/* db_expr_t */ long addr,
1929 boolean_t have_addr,
1930 /* db_expr_t */ long count,
1931 char *modif)
1932 {
1933 vm_object_print_static(addr, have_addr, count, modif);
1934 }
1935
1936 /*
1937 * Debugging only
1938 */
DB_SHOW_COMMAND(vmopag,vm_object_print_pages)1939 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
1940 {
1941 struct vm_object_hash *hash;
1942 vm_object_t object;
1943 int nl = 0;
1944 int c;
1945 int n;
1946
1947 for (n = 0; n < VMOBJ_HSIZE; ++n) {
1948 hash = &vm_object_hash[n];
1949 for (object = TAILQ_FIRST(&hash->list);
1950 object != NULL;
1951 object = TAILQ_NEXT(object, object_entry)) {
1952 vm_pindex_t idx, fidx;
1953 vm_pindex_t osize;
1954 vm_paddr_t pa = -1, padiff;
1955 int rcount;
1956 vm_page_t m;
1957
1958 if (object->type == OBJT_MARKER)
1959 continue;
1960 db_printf("new object: %p\n", (void *)object);
1961 if ( nl > 18) {
1962 c = cngetc();
1963 if (c != ' ')
1964 return;
1965 nl = 0;
1966 }
1967 nl++;
1968 rcount = 0;
1969 fidx = 0;
1970 osize = object->size;
1971 if (osize > 128)
1972 osize = 128;
1973 for (idx = 0; idx < osize; idx++) {
1974 m = vm_page_lookup(object, idx);
1975 if (m == NULL) {
1976 if (rcount) {
1977 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
1978 (long)fidx, rcount, (long)pa);
1979 if ( nl > 18) {
1980 c = cngetc();
1981 if (c != ' ')
1982 return;
1983 nl = 0;
1984 }
1985 nl++;
1986 rcount = 0;
1987 }
1988 continue;
1989 }
1990
1991 if (rcount &&
1992 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
1993 ++rcount;
1994 continue;
1995 }
1996 if (rcount) {
1997 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
1998 padiff >>= PAGE_SHIFT;
1999 padiff &= PQ_L2_MASK;
2000 if (padiff == 0) {
2001 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
2002 ++rcount;
2003 continue;
2004 }
2005 db_printf(" index(%ld)run(%d)pa(0x%lx)",
2006 (long)fidx, rcount, (long)pa);
2007 db_printf("pd(%ld)\n", (long)padiff);
2008 if ( nl > 18) {
2009 c = cngetc();
2010 if (c != ' ')
2011 return;
2012 nl = 0;
2013 }
2014 nl++;
2015 }
2016 fidx = idx;
2017 pa = VM_PAGE_TO_PHYS(m);
2018 rcount = 1;
2019 }
2020 if (rcount) {
2021 db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
2022 (long)fidx, rcount, (long)pa);
2023 if ( nl > 18) {
2024 c = cngetc();
2025 if (c != ' ')
2026 return;
2027 nl = 0;
2028 }
2029 nl++;
2030 }
2031 }
2032 }
2033 }
2034 #endif /* DDB */
2035