1 /* 2 * Copyright (c) 1991, 1993, 2013 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * The Mach Operating System project at Carnegie-Mellon University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 33 * 34 * 35 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 36 * All rights reserved. 37 * 38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 39 * 40 * Permission to use, copy, modify and distribute this software and 41 * its documentation is hereby granted, provided that both the copyright 42 * notice and this permission notice appear in all copies of the 43 * software, derivative works or modified versions, and any portions 44 * thereof, and that both notices appear in supporting documentation. 45 * 46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 49 * 50 * Carnegie Mellon requests users of this software to return to 51 * 52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 53 * School of Computer Science 54 * Carnegie Mellon University 55 * Pittsburgh PA 15213-3890 56 * 57 * any improvements or extensions that they make and grant Carnegie the 58 * rights to redistribute these changes. 59 * 60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $ 61 */ 62 63 /* 64 * Virtual memory object module. 
65 */ 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/proc.h> /* for curproc, pageproc */ 70 #include <sys/thread.h> 71 #include <sys/vnode.h> 72 #include <sys/vmmeter.h> 73 #include <sys/mman.h> 74 #include <sys/mount.h> 75 #include <sys/kernel.h> 76 #include <sys/malloc.h> 77 #include <sys/sysctl.h> 78 #include <sys/refcount.h> 79 80 #include <vm/vm.h> 81 #include <vm/vm_param.h> 82 #include <vm/pmap.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_object.h> 85 #include <vm/vm_page.h> 86 #include <vm/vm_pageout.h> 87 #include <vm/vm_pager.h> 88 #include <vm/swap_pager.h> 89 #include <vm/vm_kern.h> 90 #include <vm/vm_extern.h> 91 #include <vm/vm_zone.h> 92 93 #include <vm/vm_page2.h> 94 95 #include <machine/specialreg.h> 96 97 #define EASY_SCAN_FACTOR 8 98 99 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, 100 int pagerflags); 101 static void vm_object_lock_init(vm_object_t); 102 103 /* 104 * Virtual memory objects maintain the actual data 105 * associated with allocated virtual memory. A given 106 * page of memory exists within exactly one object. 107 * 108 * An object is only deallocated when all "references" 109 * are given up. Only one "reference" to a given 110 * region of an object should be writeable. 111 * 112 * Associated with each object is a list of all resident 113 * memory pages belonging to that object; this list is 114 * maintained by the "vm_page" module, and locked by the object's 115 * lock. 116 * 117 * Each object also records a "pager" routine which is 118 * used to retrieve (and store) pages to the proper backing 119 * storage. In addition, objects may be backed by other 120 * objects from which they were virtual-copied. 121 * 122 * The only items within the object structure which are 123 * modified after time of creation are: 124 * reference count locked by object's lock 125 * pager routine locked by object's lock 126 * 127 */ 128 129 static struct vm_object kernel_object_store; 130 struct vm_object *kernel_object = &kernel_object_store; 131 132 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE]; 133 134 static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object), 135 "vm_object", "vm_object structures"); 136 137 #define VMOBJ_HASH_PRIME1 66555444443333333ULL 138 #define VMOBJ_HASH_PRIME2 989042931893ULL 139 140 int vm_object_debug; 141 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, ""); 142 143 static __inline 144 struct vm_object_hash * 145 vmobj_hash(vm_object_t obj) 146 { 147 uintptr_t hash1; 148 uintptr_t hash2; 149 150 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18); 151 hash1 %= VMOBJ_HASH_PRIME1; 152 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24); 153 hash2 %= VMOBJ_HASH_PRIME2; 154 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]); 155 } 156 157 #if defined(DEBUG_LOCKS) 158 159 #define vm_object_vndeallocate(obj, vpp) \ 160 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__) 161 162 /* 163 * Debug helper to track hold/drop/ref/deallocate calls. 164 */ 165 static void 166 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem) 167 { 168 int i; 169 170 i = atomic_fetchadd_int(&obj->debug_index, 1); 171 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1); 172 ksnprintf(obj->debug_hold_thrs[i], 173 sizeof(obj->debug_hold_thrs[i]), 174 "%c%d:(%d):%s", 175 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')), 176 (curthread->td_proc ? 
curthread->td_proc->p_pid : -1), 177 obj->ref_count, 178 curthread->td_comm); 179 obj->debug_hold_file[i] = file; 180 obj->debug_hold_line[i] = line; 181 #if 0 182 /* Uncomment for debugging obj refs/derefs in reproducible cases */ 183 if (strcmp(curthread->td_comm, "sshd") == 0) { 184 kprintf("%d %p refs=%d ar=%d file: %s/%d\n", 185 (curthread->td_proc ? curthread->td_proc->p_pid : -1), 186 obj, obj->ref_count, addrem, file, line); 187 } 188 #endif 189 } 190 191 #endif 192 193 /* 194 * Misc low level routines 195 */ 196 static void 197 vm_object_lock_init(vm_object_t obj) 198 { 199 #if defined(DEBUG_LOCKS) 200 int i; 201 202 obj->debug_index = 0; 203 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) { 204 obj->debug_hold_thrs[i][0] = 0; 205 obj->debug_hold_file[i] = NULL; 206 obj->debug_hold_line[i] = 0; 207 } 208 #endif 209 } 210 211 void 212 vm_object_lock_swap(void) 213 { 214 lwkt_token_swap(); 215 } 216 217 void 218 vm_object_lock(vm_object_t obj) 219 { 220 lwkt_gettoken(&obj->token); 221 } 222 223 /* 224 * Returns TRUE on success 225 */ 226 static int 227 vm_object_lock_try(vm_object_t obj) 228 { 229 return(lwkt_trytoken(&obj->token)); 230 } 231 232 void 233 vm_object_lock_shared(vm_object_t obj) 234 { 235 lwkt_gettoken_shared(&obj->token); 236 } 237 238 void 239 vm_object_unlock(vm_object_t obj) 240 { 241 lwkt_reltoken(&obj->token); 242 } 243 244 void 245 vm_object_upgrade(vm_object_t obj) 246 { 247 lwkt_reltoken(&obj->token); 248 lwkt_gettoken(&obj->token); 249 } 250 251 void 252 vm_object_downgrade(vm_object_t obj) 253 { 254 lwkt_reltoken(&obj->token); 255 lwkt_gettoken_shared(&obj->token); 256 } 257 258 static __inline void 259 vm_object_assert_held(vm_object_t obj) 260 { 261 ASSERT_LWKT_TOKEN_HELD(&obj->token); 262 } 263 264 int 265 vm_quickcolor(void) 266 { 267 globaldata_t gd = mycpu; 268 int pg_color; 269 270 pg_color = (int)(intptr_t)gd->gd_curthread >> 10; 271 pg_color += gd->gd_quick_color; 272 gd->gd_quick_color += PQ_PRIME2; 273 274 return pg_color; 275 } 276 277 void 278 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS) 279 { 280 KKASSERT(obj != NULL); 281 282 /* 283 * Object must be held (object allocation is stable due to caller's 284 * context, typically already holding the token on a parent object) 285 * prior to potentially blocking on the lock, otherwise the object 286 * can get ripped away from us. 287 */ 288 refcount_acquire(&obj->hold_count); 289 vm_object_lock(obj); 290 291 #if defined(DEBUG_LOCKS) 292 debugvm_object_add(obj, file, line, 1); 293 #endif 294 } 295 296 int 297 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS) 298 { 299 KKASSERT(obj != NULL); 300 301 /* 302 * Object must be held (object allocation is stable due to caller's 303 * context, typically already holding the token on a parent object) 304 * prior to potentially blocking on the lock, otherwise the object 305 * can get ripped away from us.
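 *
 * Illustrative usage sketch only (not part of the original sources):
 * a caller with a stable pointer but no token yet would typically do
 *
 *	if (vm_object_hold_try(obj)) {
 *		... operate on obj with the token held ...
 *		vm_object_drop(obj);
 *	} else {
 *		... fall back to the blocking vm_object_hold(obj) ...
 *	}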
306 */ 307 refcount_acquire(&obj->hold_count); 308 if (vm_object_lock_try(obj) == 0) { 309 if (refcount_release(&obj->hold_count)) { 310 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) 311 kfree_obj(obj, M_VM_OBJECT); 312 } 313 return(0); 314 } 315 316 #if defined(DEBUG_LOCKS) 317 debugvm_object_add(obj, file, line, 1); 318 #endif 319 return(1); 320 } 321 322 void 323 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS) 324 { 325 KKASSERT(obj != NULL); 326 327 /* 328 * Object must be held (object allocation is stable due to callers 329 * context, typically already holding the token on a parent object) 330 * prior to potentially blocking on the lock, otherwise the object 331 * can get ripped away from us. 332 */ 333 refcount_acquire(&obj->hold_count); 334 vm_object_lock_shared(obj); 335 336 #if defined(DEBUG_LOCKS) 337 debugvm_object_add(obj, file, line, 1); 338 #endif 339 } 340 341 /* 342 * Drop the token and hold_count on the object. 343 * 344 * WARNING! Token might be shared. 345 */ 346 void 347 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS) 348 { 349 if (obj == NULL) 350 return; 351 352 /* 353 * No new holders should be possible once we drop hold_count 1->0 as 354 * there is no longer any way to reference the object. 355 */ 356 KKASSERT(obj->hold_count > 0); 357 if (refcount_release(&obj->hold_count)) { 358 #if defined(DEBUG_LOCKS) 359 debugvm_object_add(obj, file, line, -1); 360 #endif 361 362 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) { 363 vm_object_unlock(obj); 364 kfree_obj(obj, M_VM_OBJECT); 365 } else { 366 vm_object_unlock(obj); 367 } 368 } else { 369 #if defined(DEBUG_LOCKS) 370 debugvm_object_add(obj, file, line, -1); 371 #endif 372 vm_object_unlock(obj); 373 } 374 } 375 376 /* 377 * Initialize a freshly allocated object, returning a held object. 378 * 379 * Used only by vm_object_allocate(), zinitna() and vm_object_init(). 380 * 381 * No requirements. 382 */ 383 void 384 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object, 385 const char *ident) 386 { 387 struct vm_object_hash *hash; 388 389 RB_INIT(&object->rb_memq); 390 lwkt_token_init(&object->token, ident); 391 392 TAILQ_INIT(&object->backing_list); 393 lockinit(&object->backing_lk, "baclk", 0, 0); 394 395 object->type = type; 396 object->size = size; 397 object->ref_count = 1; 398 object->memattr = VM_MEMATTR_DEFAULT; 399 object->hold_count = 0; 400 object->flags = 0; 401 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) 402 vm_object_set_flag(object, OBJ_ONEMAPPING); 403 object->paging_in_progress = 0; 404 object->resident_page_count = 0; 405 /* cpu localization twist */ 406 object->pg_color = vm_quickcolor(); 407 object->handle = NULL; 408 409 atomic_add_int(&object->generation, 1); 410 object->swblock_count = 0; 411 RB_INIT(&object->swblock_root); 412 vm_object_lock_init(object); 413 pmap_object_init(object); 414 415 vm_object_hold(object); 416 417 hash = vmobj_hash(object); 418 lwkt_gettoken(&hash->token); 419 TAILQ_INSERT_TAIL(&hash->list, object, object_entry); 420 lwkt_reltoken(&hash->token); 421 } 422 423 /* 424 * Initialize a VM object. 425 */ 426 void 427 vm_object_init(vm_object_t object, vm_pindex_t size) 428 { 429 _vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj"); 430 vm_object_drop(object); 431 } 432 433 /* 434 * Initialize the VM objects module. 435 * 436 * Called from the low level boot code only. Note that this occurs before 437 * kmalloc is initialized so we cannot allocate any VM objects. 
438 */ 439 void 440 vm_object_init1(void) 441 { 442 int i; 443 444 for (i = 0; i < VMOBJ_HSIZE; ++i) { 445 TAILQ_INIT(&vm_object_hash[i].list); 446 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst"); 447 } 448 449 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), 450 kernel_object, "kobj"); 451 vm_object_drop(kernel_object); 452 } 453 454 void 455 vm_object_init2(void) 456 { 457 kmalloc_obj_set_unlimited(M_VM_OBJECT); 458 } 459 460 /* 461 * Allocate and return a new object of the specified type and size. 462 * 463 * No requirements. 464 */ 465 vm_object_t 466 vm_object_allocate(objtype_t type, vm_pindex_t size) 467 { 468 vm_object_t obj; 469 470 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 471 _vm_object_allocate(type, size, obj, "vmobj"); 472 vm_object_drop(obj); 473 474 return (obj); 475 } 476 477 /* 478 * This version returns a held object, allowing further atomic initialization 479 * of the object. 480 */ 481 vm_object_t 482 vm_object_allocate_hold(objtype_t type, vm_pindex_t size) 483 { 484 vm_object_t obj; 485 486 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 487 _vm_object_allocate(type, size, obj, "vmobj"); 488 489 return (obj); 490 } 491 492 /* 493 * Add an additional reference to a vm_object. The object must already be 494 * held. The original non-lock version is no longer supported. The object 495 * must NOT be chain locked by anyone at the time the reference is added. 496 * 497 * The object must be held, but may be held shared if desired (which is why 498 * we use an atomic op). 499 */ 500 void 501 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS) 502 { 503 KKASSERT(object != NULL); 504 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 505 atomic_add_int(&object->ref_count, 1); 506 if (object->type == OBJT_VNODE) { 507 vref(object->handle); 508 /* XXX what if the vnode is being destroyed? */ 509 } 510 #if defined(DEBUG_LOCKS) 511 debugvm_object_add(object, file, line, 1); 512 #endif 513 } 514 515 /* 516 * This version is only allowed in situations where the caller 517 * already knows that the object is deterministically referenced 518 * (usually because it is taken from a ref'd vnode, or during a map_entry 519 * replication). 520 */ 521 void 522 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS) 523 { 524 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0); 525 atomic_add_int(&object->ref_count, 1); 526 if (object->type == OBJT_VNODE) 527 vref(object->handle); 528 #if defined(DEBUG_LOCKS) 529 debugvm_object_add(object, file, line, 1); 530 #endif 531 } 532 533 /* 534 * Dereference an object and its underlying vnode. The object may be 535 * held shared. On return the object will remain held. 536 * 537 * This function may return a vnode in *vpp which the caller must release 538 * after the caller drops its own lock. If vpp is NULL, we assume that 539 * the caller was holding an exclusive lock on the object and we vrele() 540 * the vp ourselves.
541 */ 542 static void 543 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp 544 VMOBJDBARGS) 545 { 546 struct vnode *vp = (struct vnode *) object->handle; 547 int count; 548 549 KASSERT(object->type == OBJT_VNODE, 550 ("vm_object_vndeallocate: not a vnode object")); 551 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); 552 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 553 #ifdef INVARIANTS 554 if (object->ref_count == 0) { 555 vprint("vm_object_vndeallocate", vp); 556 panic("vm_object_vndeallocate: bad object reference count"); 557 } 558 #endif 559 count = object->ref_count; 560 cpu_ccfence(); 561 for (;;) { 562 if (count == 1) { 563 vm_object_upgrade(object); 564 if (atomic_fcmpset_int(&object->ref_count, &count, 0)) { 565 vclrflags(vp, VTEXT); 566 break; 567 } 568 } else { 569 if (atomic_fcmpset_int(&object->ref_count, 570 &count, count - 1)) { 571 break; 572 } 573 } 574 cpu_pause(); 575 /* retry */ 576 } 577 #if defined(DEBUG_LOCKS) 578 debugvm_object_add(object, file, line, -1); 579 #endif 580 581 /* 582 * vrele or return the vp to vrele. We can only safely vrele(vp) 583 * if the object was locked exclusively. But there are two races 584 * here. 585 * 586 * We had to upgrade the object above to safely clear VTEXT 587 * but the alternative path where the shared lock is retained 588 * can STILL race to 0 in other paths and cause our own vrele() 589 * to terminate the vnode. We can't allow that if the VM object 590 * is still locked shared. 591 */ 592 if (vpp) 593 *vpp = vp; 594 else 595 vrele(vp); 596 } 597 598 /* 599 * Release a reference to the specified object, gained either through a 600 * vm_object_allocate or a vm_object_reference call. When all references 601 * are gone, storage associated with this object may be relinquished. 602 * 603 * The caller does not have to hold the object locked but must have control 604 * over the reference in question in order to guarantee that the object 605 * does not get ripped out from under us. 606 * 607 * XXX Currently all deallocations require an exclusive lock. 608 */ 609 void 610 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS) 611 { 612 struct vnode *vp; 613 int count; 614 615 if (object == NULL) 616 return; 617 618 count = object->ref_count; 619 cpu_ccfence(); 620 for (;;) { 621 /* 622 * If decrementing the count enters into special handling 623 * territory (0, 1, or 2) we have to do it the hard way. 624 * Fortunately though, objects with only a few refs like this 625 * are not likely to be heavily contended anyway. 626 * 627 * For vnode objects we only care about 1->0 transitions. 628 */ 629 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) { 630 #if defined(DEBUG_LOCKS) 631 debugvm_object_add(object, file, line, 0); 632 #endif 633 vm_object_hold(object); 634 vm_object_deallocate_locked(object); 635 vm_object_drop(object); 636 break; 637 } 638 639 /* 640 * Try to decrement ref_count without acquiring a hold on 641 * the object. This is particularly important for the exec*() 642 * and exit*() code paths because the program binary may 643 * have a great deal of sharing and an exclusive lock will 644 * crowbar performance in those circumstances.
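 *
 * The lockless path relies on atomic_fcmpset_int() updating 'count'
 * with the currently-observed value when the compare-and-set fails,
 * so the loop can retry without an explicit re-read.  A minimal
 * illustrative sketch of the pattern (not the code below verbatim):
 *
 *	count = object->ref_count;
 *	cpu_ccfence();
 *	while (!atomic_fcmpset_int(&object->ref_count, &count, count - 1))
 *		cpu_pause();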
645 */ 646 if (object->type == OBJT_VNODE) { 647 vp = (struct vnode *)object->handle; 648 if (atomic_fcmpset_int(&object->ref_count, 649 &count, count - 1)) { 650 #if defined(DEBUG_LOCKS) 651 debugvm_object_add(object, file, line, -1); 652 #endif 653 654 vrele(vp); 655 break; 656 } 657 /* retry */ 658 } else { 659 if (atomic_fcmpset_int(&object->ref_count, 660 &count, count - 1)) { 661 #if defined(DEBUG_LOCKS) 662 debugvm_object_add(object, file, line, -1); 663 #endif 664 break; 665 } 666 /* retry */ 667 } 668 cpu_pause(); 669 /* retry */ 670 } 671 } 672 673 void 674 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS) 675 { 676 /* 677 * Degenerate case 678 */ 679 if (object == NULL) 680 return; 681 682 /* 683 * vnode case, caller either locked the object exclusively 684 * or this is a recursion with must_drop != 0 and the vnode 685 * object will be locked shared. 686 * 687 * If locked shared we have to drop the object before we can 688 * call vrele() or risk a shared/exclusive livelock. 689 */ 690 if (object->type == OBJT_VNODE) { 691 ASSERT_LWKT_TOKEN_HELD(&object->token); 692 vm_object_vndeallocate(object, NULL); 693 return; 694 } 695 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token); 696 697 /* 698 * Normal case (object is locked exclusively) 699 */ 700 if (object->ref_count == 0) { 701 panic("vm_object_deallocate: object deallocated " 702 "too many times: %d", object->type); 703 } 704 if (object->ref_count > 2) { 705 atomic_add_int(&object->ref_count, -1); 706 #if defined(DEBUG_LOCKS) 707 debugvm_object_add(object, file, line, -1); 708 #endif 709 return; 710 } 711 712 /* 713 * Drop the ref and handle termination on the 1->0 transition. 714 * We may have blocked above so we have to recheck. 715 */ 716 KKASSERT(object->ref_count != 0); 717 if (object->ref_count >= 2) { 718 atomic_add_int(&object->ref_count, -1); 719 #if defined(DEBUG_LOCKS) 720 debugvm_object_add(object, file, line, -1); 721 #endif 722 return; 723 } 724 725 atomic_add_int(&object->ref_count, -1); 726 if ((object->flags & OBJ_DEAD) == 0) 727 vm_object_terminate(object); 728 } 729 730 /* 731 * Destroy the specified object, freeing up related resources. 732 * 733 * The object must have zero references. 734 * 735 * The object must be held. The caller is responsible for dropping the object 736 * after terminate returns. Terminate does NOT drop the object. 737 */ 738 static int vm_object_terminate_callback(vm_page_t p, void *data); 739 740 void 741 vm_object_terminate(vm_object_t object) 742 { 743 struct rb_vm_page_scan_info info; 744 struct vm_object_hash *hash; 745 746 /* 747 * Make sure no one uses us. Once we set OBJ_DEAD we should be 748 * able to safely block. 749 */ 750 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 751 KKASSERT((object->flags & OBJ_DEAD) == 0); 752 vm_object_set_flag(object, OBJ_DEAD); 753 754 /* 755 * Wait for the pageout daemon to be done with the object 756 */ 757 vm_object_pip_wait(object, "objtrm1"); 758 759 KASSERT(!object->paging_in_progress, 760 ("vm_object_terminate: pageout in progress")); 761 762 /* 763 * Clean and free the pages, as appropriate. All references to the 764 * object are gone, so we don't need to lock it. 765 */ 766 if (object->type == OBJT_VNODE) { 767 struct vnode *vp; 768 769 /* 770 * Clean pages and flush buffers. 771 * 772 * NOTE! TMPFS buffer flushes do not typically flush the 773 * actual page to swap as this would be highly 774 * inefficient, and normal filesystems usually wrap 775 * page flushes with buffer cache buffers.
776 * 777 * To deal with this we have to call vinvalbuf() both 778 * before and after the vm_object_page_clean(). 779 */ 780 vp = (struct vnode *) object->handle; 781 vinvalbuf(vp, V_SAVE, 0, 0); 782 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 783 vinvalbuf(vp, V_SAVE, 0, 0); 784 } 785 786 /* 787 * Wait for any I/O to complete, after which there had better not 788 * be any references left on the object. 789 */ 790 vm_object_pip_wait(object, "objtrm2"); 791 792 if (object->ref_count != 0) { 793 panic("vm_object_terminate: object with references, " 794 "ref_count=%d", object->ref_count); 795 } 796 797 /* 798 * Cleanup any shared pmaps associated with this object. 799 */ 800 pmap_object_free(object); 801 802 /* 803 * Now free any remaining pages. For internal objects, this also 804 * removes them from paging queues. Don't free wired pages, just 805 * remove them from the object. 806 */ 807 info.count = 0; 808 info.object = object; 809 do { 810 info.error = 0; 811 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 812 vm_object_terminate_callback, &info); 813 } while (info.error); 814 815 /* 816 * Let the pager know object is dead. 817 */ 818 vm_pager_deallocate(object); 819 820 /* 821 * Wait for the object hold count to hit 1, clean out pages as 822 * we go. vmobj_token interlocks any race conditions that might 823 * pick the object up from the vm_object_list after we have cleared 824 * rb_memq. 825 */ 826 for (;;) { 827 if (RB_ROOT(&object->rb_memq) == NULL) 828 break; 829 kprintf("vm_object_terminate: Warning, object %p " 830 "still has %ld pages\n", 831 object, object->resident_page_count); 832 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 833 vm_object_terminate_callback, &info); 834 } 835 836 /* 837 * There had better not be any pages left 838 */ 839 KKASSERT(object->resident_page_count == 0); 840 841 /* 842 * Remove the object from the global object list. 843 */ 844 hash = vmobj_hash(object); 845 lwkt_gettoken(&hash->token); 846 TAILQ_REMOVE(&hash->list, object, object_entry); 847 lwkt_reltoken(&hash->token); 848 849 if (object->ref_count != 0) { 850 panic("vm_object_terminate2: object with references, " 851 "ref_count=%d", object->ref_count); 852 } 853 854 /* 855 * NOTE: The object hold_count is at least 1, so we cannot kfree() 856 * the object here. See vm_object_drop(). 857 */ 858 } 859 860 /* 861 * The caller must hold the object. 862 * 863 * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED 864 * or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync() 865 * is called, due to normal pmap operations. This is because only 866 * global pmap operations on the vm_page can clear the bits and not 867 * just local operations on individual pmaps. 868 * 869 * Most interactions that necessitate the clearing of these bits 870 * proactively call vm_page_protect(), and we must do so here as well. 871 */ 872 static int 873 vm_object_terminate_callback(vm_page_t p, void *data) 874 { 875 struct rb_vm_page_scan_info *info = data; 876 vm_object_t object; 877 878 object = p->object; 879 KKASSERT(object == info->object); 880 if (vm_page_busy_try(p, TRUE)) { 881 vm_page_sleep_busy(p, TRUE, "vmotrm"); 882 info->error = 1; 883 return 0; 884 } 885 if (object != p->object) { 886 /* XXX remove once we determine it can't happen */ 887 kprintf("vm_object_terminate: Warning: Encountered " 888 "busied page %p on queue %d\n", p, p->queue); 889 vm_page_wakeup(p); 890 info->error = 1; 891 } else if (p->wire_count == 0) { 892 /* 893 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 
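 *
 * NOTE: pmap_mapped_sync() below re-checks whether the page is still
 *	 mapped anywhere; if so, vm_page_protect(p, VM_PROT_NONE)
 *	 removes those mappings before the page is freed or removed.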
894 */ 895 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE)) 896 vm_page_protect(p, VM_PROT_NONE); 897 vm_page_free(p); 898 mycpu->gd_cnt.v_pfree++; 899 } else { 900 if (p->queue != PQ_NONE) { 901 kprintf("vm_object_terminate: Warning: Encountered " 902 "wired page %p on queue %d\n", p, p->queue); 903 if (vm_object_debug > 0) { 904 --vm_object_debug; 905 print_backtrace(10); 906 } 907 } 908 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE)) 909 vm_page_protect(p, VM_PROT_NONE); 910 vm_page_remove(p); 911 vm_page_wakeup(p); 912 } 913 914 /* 915 * Must be at end to avoid SMP races, caller holds object token 916 */ 917 if ((++info->count & 63) == 0) 918 lwkt_user_yield(); 919 return(0); 920 } 921 922 /* 923 * Clean all dirty pages in the specified range of object. Leaves page 924 * on whatever queue it is currently on. If NOSYNC is set then do not 925 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), 926 * leaving the object dirty. 927 * 928 * When stuffing pages asynchronously, allow clustering. XXX we need a 929 * synchronous clustering mode implementation. 930 * 931 * Odd semantics: if start == end, we clean everything. 932 * 933 * The object must be locked? XXX 934 */ 935 static int vm_object_page_clean_pass1(struct vm_page *p, void *data); 936 static int vm_object_page_clean_pass2(struct vm_page *p, void *data); 937 938 void 939 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 940 int flags) 941 { 942 struct rb_vm_page_scan_info info; 943 struct vnode *vp; 944 int wholescan; 945 int pagerflags; 946 int generation; 947 948 vm_object_hold(object); 949 if (object->type != OBJT_VNODE || 950 (object->flags & OBJ_MIGHTBEDIRTY) == 0) { 951 vm_object_drop(object); 952 return; 953 } 954 955 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 956 OBJPC_SYNC : OBJPC_CLUSTER_OK; 957 pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0; 958 959 vp = object->handle; 960 961 /* 962 * Interlock other major object operations. This allows us to 963 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY. 964 */ 965 vm_object_set_flag(object, OBJ_CLEANING); 966 967 /* 968 * Handle 'entire object' case 969 */ 970 info.start_pindex = start; 971 if (end == 0) { 972 info.end_pindex = object->size - 1; 973 } else { 974 info.end_pindex = end - 1; 975 } 976 wholescan = (start == 0 && info.end_pindex == object->size - 1); 977 info.limit = flags; 978 info.pagerflags = pagerflags; 979 info.object = object; 980 981 /* 982 * If cleaning the entire object do a pass to mark the pages read-only. 983 * If everything worked out ok, clear OBJ_WRITEABLE and 984 * OBJ_MIGHTBEDIRTY. 985 */ 986 if (wholescan) { 987 info.error = 0; 988 info.count = 0; 989 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 990 vm_object_page_clean_pass1, &info); 991 if (info.error == 0) { 992 vm_object_clear_flag(object, 993 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 994 if (object->type == OBJT_VNODE && 995 (vp = (struct vnode *)object->handle) != NULL) { 996 /* 997 * Use new-style interface to clear VISDIRTY 998 * because the vnode is not necessarily removed 999 * from the syncer list(s) as often as it was 1000 * under the old interface, which can leave 1001 * the vnode on the syncer list after reclaim. 1002 */ 1003 vclrobjdirty(vp); 1004 } 1005 } 1006 } 1007 1008 /* 1009 * Do a pass to clean all the dirty pages we find. 
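 *
 * The pass is repeated if info.error was set (a busy page forced the
 * scan to sleep and restart) or if object->generation changed while
 * the scan was running (the object's page set was modified).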
1010 */ 1011 do { 1012 info.error = 0; 1013 info.count = 0; 1014 generation = object->generation; 1015 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1016 vm_object_page_clean_pass2, &info); 1017 } while (info.error || generation != object->generation); 1018 1019 vm_object_clear_flag(object, OBJ_CLEANING); 1020 vm_object_drop(object); 1021 } 1022 1023 /* 1024 * The caller must hold the object. 1025 */ 1026 static 1027 int 1028 vm_object_page_clean_pass1(struct vm_page *p, void *data) 1029 { 1030 struct rb_vm_page_scan_info *info = data; 1031 1032 KKASSERT(p->object == info->object); 1033 1034 vm_page_flag_set(p, PG_CLEANCHK); 1035 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1036 info->error = 1; 1037 } else if (vm_page_busy_try(p, FALSE)) { 1038 info->error = 1; 1039 } else { 1040 KKASSERT(p->object == info->object); 1041 vm_page_protect(p, VM_PROT_READ); 1042 vm_page_wakeup(p); 1043 } 1044 1045 /* 1046 * Must be at end to avoid SMP races, caller holds object token 1047 */ 1048 if ((++info->count & 63) == 0) 1049 lwkt_user_yield(); 1050 return(0); 1051 } 1052 1053 /* 1054 * The caller must hold the object 1055 */ 1056 static 1057 int 1058 vm_object_page_clean_pass2(struct vm_page *p, void *data) 1059 { 1060 struct rb_vm_page_scan_info *info = data; 1061 int generation; 1062 1063 KKASSERT(p->object == info->object); 1064 1065 /* 1066 * Do not mess with pages that were inserted after we started 1067 * the cleaning pass. 1068 */ 1069 if ((p->flags & PG_CLEANCHK) == 0) 1070 goto done; 1071 1072 generation = info->object->generation; 1073 1074 if (vm_page_busy_try(p, TRUE)) { 1075 vm_page_sleep_busy(p, TRUE, "vpcwai"); 1076 info->error = 1; 1077 goto done; 1078 } 1079 1080 KKASSERT(p->object == info->object && 1081 info->object->generation == generation); 1082 1083 /* 1084 * Before wasting time traversing the pmaps, check for trivial 1085 * cases where the page cannot be dirty. 1086 */ 1087 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { 1088 KKASSERT((p->dirty & p->valid) == 0 && 1089 (p->flags & PG_NEED_COMMIT) == 0); 1090 vm_page_wakeup(p); 1091 goto done; 1092 } 1093 1094 /* 1095 * Check whether the page is dirty or not. The page has been set 1096 * to be read-only so the check will not race a user dirtying the 1097 * page. 1098 */ 1099 vm_page_test_dirty(p); 1100 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) { 1101 vm_page_flag_clear(p, PG_CLEANCHK); 1102 vm_page_wakeup(p); 1103 goto done; 1104 } 1105 1106 /* 1107 * If we have been asked to skip nosync pages and this is a 1108 * nosync page, skip it. Note that the object flags were 1109 * not cleared in this case (because pass1 will have returned an 1110 * error), so we do not have to set them. 1111 */ 1112 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1113 vm_page_flag_clear(p, PG_CLEANCHK); 1114 vm_page_wakeup(p); 1115 goto done; 1116 } 1117 1118 /* 1119 * Flush as many pages as we can. PG_CLEANCHK will be cleared on 1120 * the pages that get successfully flushed. Set info->error if 1121 * we raced an object modification. 1122 */ 1123 vm_object_page_collect_flush(info->object, p, info->pagerflags); 1124 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ 1125 1126 /* 1127 * Must be at end to avoid SMP races, caller holds object token 1128 */ 1129 done: 1130 if ((++info->count & 63) == 0) 1131 lwkt_user_yield(); 1132 return(0); 1133 } 1134 1135 /* 1136 * Collect the specified page and nearby pages and flush them out. 
1137 * The number of pages flushed is returned. The passed page is busied 1138 * by the caller and we are responsible for its disposition. 1139 * 1140 * The caller must hold the object. 1141 */ 1142 static void 1143 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags) 1144 { 1145 int error; 1146 int is; 1147 int ib; 1148 int i; 1149 int page_base; 1150 vm_pindex_t pi; 1151 vm_page_t ma[BLIST_MAX_ALLOC]; 1152 1153 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1154 1155 pi = p->pindex; 1156 page_base = pi % BLIST_MAX_ALLOC; 1157 ma[page_base] = p; 1158 ib = page_base - 1; 1159 is = page_base + 1; 1160 1161 while (ib >= 0) { 1162 vm_page_t tp; 1163 1164 tp = vm_page_lookup_busy_try(object, pi - page_base + ib, 1165 TRUE, &error); 1166 if (error) 1167 break; 1168 if (tp == NULL) 1169 break; 1170 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 && 1171 (tp->flags & PG_CLEANCHK) == 0) { 1172 vm_page_wakeup(tp); 1173 break; 1174 } 1175 if ((tp->queue - tp->pc) == PQ_CACHE) { 1176 vm_page_flag_clear(tp, PG_CLEANCHK); 1177 vm_page_wakeup(tp); 1178 break; 1179 } 1180 vm_page_test_dirty(tp); 1181 if ((tp->dirty & tp->valid) == 0 && 1182 (tp->flags & PG_NEED_COMMIT) == 0) { 1183 vm_page_flag_clear(tp, PG_CLEANCHK); 1184 vm_page_wakeup(tp); 1185 break; 1186 } 1187 ma[ib] = tp; 1188 --ib; 1189 } 1190 ++ib; /* fixup */ 1191 1192 while (is < BLIST_MAX_ALLOC && 1193 pi - page_base + is < object->size) { 1194 vm_page_t tp; 1195 1196 tp = vm_page_lookup_busy_try(object, pi - page_base + is, 1197 TRUE, &error); 1198 if (error) 1199 break; 1200 if (tp == NULL) 1201 break; 1202 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 && 1203 (tp->flags & PG_CLEANCHK) == 0) { 1204 vm_page_wakeup(tp); 1205 break; 1206 } 1207 if ((tp->queue - tp->pc) == PQ_CACHE) { 1208 vm_page_flag_clear(tp, PG_CLEANCHK); 1209 vm_page_wakeup(tp); 1210 break; 1211 } 1212 vm_page_test_dirty(tp); 1213 if ((tp->dirty & tp->valid) == 0 && 1214 (tp->flags & PG_NEED_COMMIT) == 0) { 1215 vm_page_flag_clear(tp, PG_CLEANCHK); 1216 vm_page_wakeup(tp); 1217 break; 1218 } 1219 ma[is] = tp; 1220 ++is; 1221 } 1222 1223 /* 1224 * All pages in the ma[] array are busied now 1225 */ 1226 for (i = ib; i < is; ++i) { 1227 vm_page_flag_clear(ma[i], PG_CLEANCHK); 1228 vm_page_hold(ma[i]); /* XXX need this any more? */ 1229 } 1230 vm_pageout_flush(&ma[ib], is - ib, pagerflags); 1231 for (i = ib; i < is; ++i) /* XXX need this any more? */ 1232 vm_page_unhold(ma[i]); 1233 } 1234 1235 /* 1236 * Implements the madvise function at the object/page level. 1237 * 1238 * MADV_WILLNEED (any object) 1239 * 1240 * Activate the specified pages if they are resident. 1241 * 1242 * MADV_DONTNEED (any object) 1243 * 1244 * Deactivate the specified pages if they are resident. 1245 * 1246 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only) 1247 * 1248 * Deactivate and clean the specified pages if they are 1249 * resident. This permits the process to reuse the pages 1250 * without faulting or the kernel to reclaim the pages 1251 * without I/O. 1252 * 1253 * No requirements. 1254 */ 1255 void 1256 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, 1257 vm_pindex_t count, int advise) 1258 { 1259 vm_pindex_t end; 1260 vm_page_t m; 1261 int error; 1262 1263 if (object == NULL) 1264 return; 1265 1266 end = pindex + count; 1267 1268 vm_object_hold(object); 1269 1270 /* 1271 * Locate and adjust resident pages. This only applies to the 1272 * primary object in the mapping. 
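 *
 * For example (illustrative only), madvise(addr, len, MADV_DONTNEED)
 * on a mapped region ultimately reaches this loop with a pindex/count
 * range describing the pages of [addr, addr+len) within the mapping's
 * primary object.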
1273 */ 1274 for (; pindex < end; pindex += 1) { 1275 relookup: 1276 /* 1277 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages 1278 * and those pages must be OBJ_ONEMAPPING. 1279 */ 1280 if (advise == MADV_FREE) { 1281 if ((object->type != OBJT_DEFAULT && 1282 object->type != OBJT_SWAP) || 1283 (object->flags & OBJ_ONEMAPPING) == 0) { 1284 continue; 1285 } 1286 } 1287 1288 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); 1289 1290 if (error) { 1291 vm_page_sleep_busy(m, TRUE, "madvpo"); 1292 goto relookup; 1293 } 1294 if (m == NULL) { 1295 /* 1296 * There may be swap even if there is no backing page 1297 */ 1298 if (advise == MADV_FREE && object->type == OBJT_SWAP) 1299 swap_pager_freespace(object, pindex, 1); 1300 continue; 1301 } 1302 1303 /* 1304 * If the page is not in a normal active state, we skip it. 1305 * If the page is not managed there are no page queues to 1306 * mess with. Things can break if we mess with pages in 1307 * any of the below states. 1308 */ 1309 if (m->wire_count || 1310 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED | 1311 PG_NEED_COMMIT)) || 1312 m->valid != VM_PAGE_BITS_ALL 1313 ) { 1314 vm_page_wakeup(m); 1315 continue; 1316 } 1317 1318 /* 1319 * Theoretically once a page is known not to be busy, an 1320 * interrupt cannot come along and rip it out from under us. 1321 */ 1322 if (advise == MADV_WILLNEED) { 1323 vm_page_activate(m); 1324 } else if (advise == MADV_DONTNEED) { 1325 vm_page_dontneed(m); 1326 } else if (advise == MADV_FREE) { 1327 /* 1328 * Mark the page clean. This will allow the page 1329 * to be freed up by the system. However, such pages 1330 * are often reused quickly by malloc()/free() 1331 * so we do not do anything that would cause 1332 * a page fault if we can help it. 1333 * 1334 * Specifically, we do not try to actually free 1335 * the page now nor do we try to put it in the 1336 * cache (which would cause a page fault on reuse). 1337 * 1338 * But we do make the page as freeable as we 1339 * can without actually taking the step of unmapping 1340 * it. 1341 */ 1342 pmap_clear_modify(m); 1343 m->dirty = 0; 1344 m->act_count = 0; 1345 vm_page_dontneed(m); 1346 if (object->type == OBJT_SWAP) 1347 swap_pager_freespace(object, pindex, 1); 1348 } 1349 vm_page_wakeup(m); 1350 } 1351 vm_object_drop(object); 1352 } 1353 1354 /* 1355 * Removes all physical pages in the specified object range from the 1356 * object's list of pages. 1357 * 1358 * No requirements. 1359 */ 1360 static int vm_object_page_remove_callback(vm_page_t p, void *data); 1361 1362 void 1363 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 1364 boolean_t clean_only) 1365 { 1366 struct rb_vm_page_scan_info info; 1367 int all; 1368 1369 /* 1370 * Degenerate cases and assertions. 1371 * 1372 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects. 1373 * These objects do not have to have their pages entered into 1374 * them and are handled via their vm_map_backing lists. 1375 */ 1376 vm_object_hold(object); 1377 if (object == NULL || 1378 (object->type != OBJT_MGTDEVICE && 1379 object->resident_page_count == 0 && object->swblock_count == 0)) { 1380 vm_object_drop(object); 1381 return; 1382 } 1383 KASSERT(object->type != OBJT_PHYS, 1384 ("attempt to remove pages from a physical object")); 1385 1386 /* 1387 * Indicate that paging is occurring on the object 1388 */ 1389 vm_object_pip_add(object, 1); 1390 1391 /* 1392 * Figure out the actual removal range and whether we are removing 1393 * the entire contents of the object or not.
If removing the entire 1394 * contents, be sure to get all pages, even those that might be 1395 * beyond the end of the object. 1396 * 1397 * NOTE: end is non-inclusive, but info.end_pindex is inclusive. 1398 */ 1399 info.object = object; 1400 info.start_pindex = start; 1401 if (end == 0 || end == (vm_pindex_t)-1) { 1402 info.end_pindex = (vm_pindex_t)-1; 1403 end = object->size; 1404 } else { 1405 info.end_pindex = end - 1; 1406 } 1407 info.limit = clean_only; 1408 info.count = 0; 1409 all = (start == 0 && info.end_pindex >= object->size - 1); 1410 1411 /* 1412 * Efficiently remove pages from the pmap via a backing scan. 1413 * 1414 * NOTE: This is the only way pages can be removed and unwired 1415 * from OBJT_MGTDEVICE devices which typically do not enter 1416 * their pages into the vm_object's RB tree. And possibly 1417 * other OBJT_* types in the future. 1418 */ 1419 { 1420 vm_map_backing_t ba; 1421 vm_pindex_t sba, eba; 1422 vm_offset_t sva, eva; 1423 1424 lockmgr(&object->backing_lk, LK_EXCLUSIVE); 1425 TAILQ_FOREACH(ba, &object->backing_list, entry) { 1426 /* 1427 * object offset range within the ba, intersected 1428 * with the page range specified for the object 1429 */ 1430 sba = OFF_TO_IDX(ba->offset); 1431 eba = sba + OFF_TO_IDX(ba->end - ba->start); 1432 if (sba < start) 1433 sba = start; 1434 if (eba > end) 1435 eba = end; 1436 1437 /* 1438 * If the intersection is valid, remove the related 1439 * pages. 1440 * 1441 * NOTE! This may also remove other incidental pages 1442 * in the pmap, as the backing area may be 1443 * overloaded. 1444 * 1445 * NOTE! pages for MGTDEVICE objects are only removed 1446 * here, they aren't entered into rb_memq, so 1447 * we must use pmap_remove() instead of 1448 * the non-TLB-invalidating pmap_remove_pages(). 1449 */ 1450 if (sba < eba) { 1451 sva = ba->start + IDX_TO_OFF(sba) - ba->offset; 1452 eva = sva + IDX_TO_OFF(eba - sba); 1453 #if 0 1454 kprintf("VM_OBJECT_PAGE_REMOVE " 1455 "%p[%016jx] %016jx-%016jx\n", 1456 ba->pmap, ba->start, sva, eva); 1457 #endif 1458 pmap_remove(ba->pmap, sva, eva); 1459 } 1460 } 1461 lockmgr(&object->backing_lk, LK_RELEASE); 1462 } 1463 1464 /* 1465 * Remove and free pages entered onto the object list. Note that 1466 * for OBJT_MGTDEVICE objects, there are typically no pages entered. 1467 * 1468 * Loop until we are sure we have gotten them all. 1469 */ 1470 do { 1471 info.error = 0; 1472 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1473 vm_object_page_remove_callback, &info); 1474 } while (info.error); 1475 1476 /* 1477 * Remove any related swap if throwing away pages, or for 1478 * non-swap objects (the swap is a clean copy in that case). 1479 */ 1480 if (object->type != OBJT_SWAP || clean_only == FALSE) { 1481 if (all) 1482 swap_pager_freespace_all(object); 1483 else 1484 swap_pager_freespace(object, info.start_pindex, 1485 info.end_pindex - info.start_pindex + 1); 1486 } 1487 1488 /* 1489 * Cleanup 1490 */ 1491 vm_object_pip_wakeup(object); 1492 vm_object_drop(object); 1493 } 1494 1495 /* 1496 * The caller must hold the object. 1497 * 1498 * NOTE: User yields are allowed when removing more than one page, but not 1499 * allowed if only removing one page (the path for single page removals 1500 * might hold a spinlock).
1501 */ 1502 static int 1503 vm_object_page_remove_callback(vm_page_t p, void *data) 1504 { 1505 struct rb_vm_page_scan_info *info = data; 1506 1507 if (info->object != p->object || 1508 p->pindex < info->start_pindex || 1509 p->pindex > info->end_pindex) { 1510 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n", 1511 info->object, p); 1512 return(0); 1513 } 1514 if (vm_page_busy_try(p, TRUE)) { 1515 vm_page_sleep_busy(p, TRUE, "vmopar"); 1516 info->error = 1; 1517 return(0); 1518 } 1519 if (info->object != p->object) { 1520 /* this should never happen */ 1521 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n", 1522 info->object, p); 1523 vm_page_wakeup(p); 1524 return(0); 1525 } 1526 1527 /* 1528 * Wired pages cannot be destroyed, but they can be invalidated 1529 * and we do so if clean_only (limit) is not set. 1530 * 1531 * WARNING! The page may be wired due to being part of a buffer 1532 * cache buffer, and the buffer might be marked B_CACHE. 1533 * This is fine as part of a truncation but VFSs must be 1534 * sure to fix the buffer up when re-extending the file. 1535 * 1536 * NOTE! PG_NEED_COMMIT is ignored. 1537 */ 1538 if (p->wire_count != 0) { 1539 vm_page_protect(p, VM_PROT_NONE); 1540 if (info->limit == 0) 1541 p->valid = 0; 1542 vm_page_wakeup(p); 1543 goto done; 1544 } 1545 1546 /* 1547 * limit is our clean_only flag. If set and the page is dirty or 1548 * requires a commit, do not free it. If set and the page is being 1549 * held by someone, do not free it. 1550 */ 1551 if (info->limit && p->valid) { 1552 vm_page_test_dirty(p); 1553 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1554 vm_page_wakeup(p); 1555 goto done; 1556 } 1557 } 1558 1559 /* 1560 * Destroy the page. But we have to re-test whether it is dirty after 1561 * removing it from its pmaps. 1562 */ 1563 vm_page_protect(p, VM_PROT_NONE); 1564 if (info->limit && p->valid) { 1565 vm_page_test_dirty(p); 1566 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1567 vm_page_wakeup(p); 1568 goto done; 1569 } 1570 } 1571 vm_page_free(p); 1572 1573 /* 1574 * Must be at end to avoid SMP races, caller holds object token 1575 */ 1576 done: 1577 if ((++info->count & 63) == 0) 1578 lwkt_user_yield(); 1579 1580 return(0); 1581 } 1582 1583 /* 1584 * Try to extend prev_object into an adjoining region of virtual 1585 * memory, return TRUE on success. 1586 * 1587 * The caller does not need to hold (prev_object) but must have a stable 1588 * pointer to it (typically by holding the vm_map locked). 1589 * 1590 * This function only works for anonymous memory objects which either 1591 * have (a) one reference or (b) we are extending the object's size. 1592 * Otherwise the related VM pages we want to use for the object might 1593 * be in use by another mapping.
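 *
 * Illustrative example (not from the original sources): if an anonymous
 * mapping backed by prev_object currently ends at page index 16 and the
 * caller extends it by four pages, a successful coalesce simply grows
 * prev_object->size to 20 rather than allocating a second object.
 * prev_size and next_size are byte counts and are converted to page
 * counts via PAGE_SHIFT below.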
1594 */ 1595 boolean_t 1596 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex, 1597 vm_size_t prev_size, vm_size_t next_size) 1598 { 1599 vm_pindex_t next_pindex; 1600 1601 if (prev_object == NULL) 1602 return (TRUE); 1603 1604 vm_object_hold(prev_object); 1605 1606 if (prev_object->type != OBJT_DEFAULT && 1607 prev_object->type != OBJT_SWAP) { 1608 vm_object_drop(prev_object); 1609 return (FALSE); 1610 } 1611 1612 #if 0 1613 /* caller now checks this */ 1614 /* 1615 * Try to collapse the object first 1616 */ 1617 vm_object_collapse(prev_object, NULL); 1618 #endif 1619 1620 #if 0 1621 /* caller now checks this */ 1622 /* 1623 * We can't coalesce if we shadow another object (figuring out the 1624 * relationships becomes too complex). 1625 */ 1626 if (prev_object->backing_object != NULL) { 1627 vm_object_chain_release(prev_object); 1628 vm_object_drop(prev_object); 1629 return (FALSE); 1630 } 1631 #endif 1632 1633 prev_size >>= PAGE_SHIFT; 1634 next_size >>= PAGE_SHIFT; 1635 next_pindex = prev_pindex + prev_size; 1636 1637 /* 1638 * We can't if the object has more than one ref count unless we 1639 * are extending it into newly minted space. 1640 */ 1641 if (prev_object->ref_count > 1 && 1642 prev_object->size != next_pindex) { 1643 vm_object_drop(prev_object); 1644 return (FALSE); 1645 } 1646 1647 /* 1648 * Remove any pages that may still be in the object from a previous 1649 * deallocation. 1650 */ 1651 if (next_pindex < prev_object->size) { 1652 vm_object_page_remove(prev_object, 1653 next_pindex, 1654 next_pindex + next_size, FALSE); 1655 if (prev_object->type == OBJT_SWAP) 1656 swap_pager_freespace(prev_object, 1657 next_pindex, next_size); 1658 } 1659 1660 /* 1661 * Extend the object if necessary. 1662 */ 1663 if (next_pindex + next_size > prev_object->size) 1664 prev_object->size = next_pindex + next_size; 1665 vm_object_drop(prev_object); 1666 1667 return (TRUE); 1668 } 1669 1670 /* 1671 * Make the object writable and flag it as being possibly dirty. 1672 * 1673 * The object might not be held (or might be held but held shared), 1674 * the related vnode is probably not held either. Object and vnode are 1675 * stable by virtue of the vm_page busied by the caller preventing 1676 * destruction. 1677 * 1678 * If the related mount is flagged MNTK_THR_SYNC we need to call 1679 * vsetobjdirty(). Filesystems using this option usually shortcut 1680 * synchronization by only scanning the syncer list. 1681 */ 1682 void 1683 vm_object_set_writeable_dirty(vm_object_t object) 1684 { 1685 struct vnode *vp; 1686 1687 /*vm_object_assert_held(object);*/ 1688 /* 1689 * Avoid contention in vm fault path by checking the state before 1690 * issuing an atomic op on it. 1691 */ 1692 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) != 1693 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) { 1694 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 1695 } 1696 if (object->type == OBJT_VNODE && 1697 (vp = (struct vnode *)object->handle) != NULL) { 1698 if ((vp->v_flag & VOBJDIRTY) == 0) { 1699 if (vp->v_mount && 1700 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) { 1701 /* 1702 * New style THR_SYNC places vnodes on the 1703 * syncer list more deterministically. 1704 */ 1705 vsetobjdirty(vp); 1706 } else { 1707 /* 1708 * Old style scan would not necessarily place 1709 * a vnode on the syncer list when possibly 1710 * modified via mmap.
1711 */ 1712 vsetflags(vp, VOBJDIRTY); 1713 } 1714 } 1715 } 1716 } 1717 1718 #include "opt_ddb.h" 1719 #ifdef DDB 1720 #include <sys/cons.h> 1721 1722 #include <ddb/ddb.h> 1723 1724 static int _vm_object_in_map (vm_map_t map, vm_object_t object, 1725 vm_map_entry_t entry); 1726 static int vm_object_in_map (vm_object_t object); 1727 1728 /* 1729 * The caller must hold the object. 1730 */ 1731 static int 1732 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) 1733 { 1734 vm_map_backing_t ba; 1735 vm_map_t tmpm; 1736 vm_map_entry_t tmpe; 1737 int entcount; 1738 1739 if (map == NULL) 1740 return 0; 1741 if (entry == NULL) { 1742 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root); 1743 entcount = map->nentries; 1744 while (entcount-- && tmpe) { 1745 if( _vm_object_in_map(map, object, tmpe)) { 1746 return 1; 1747 } 1748 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1749 } 1750 return (0); 1751 } 1752 switch(entry->maptype) { 1753 case VM_MAPTYPE_SUBMAP: 1754 tmpm = entry->ba.sub_map; 1755 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root); 1756 entcount = tmpm->nentries; 1757 while (entcount-- && tmpe) { 1758 if( _vm_object_in_map(tmpm, object, tmpe)) { 1759 return 1; 1760 } 1761 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1762 } 1763 break; 1764 case VM_MAPTYPE_NORMAL: 1765 ba = &entry->ba; 1766 while (ba) { 1767 if (ba->object == object) 1768 return TRUE; 1769 ba = ba->backing_ba; 1770 } 1771 break; 1772 default: 1773 break; 1774 } 1775 return 0; 1776 } 1777 1778 static int vm_object_in_map_callback(struct proc *p, void *data); 1779 1780 struct vm_object_in_map_info { 1781 vm_object_t object; 1782 int rv; 1783 }; 1784 1785 /* 1786 * Debugging only 1787 */ 1788 static int 1789 vm_object_in_map(vm_object_t object) 1790 { 1791 struct vm_object_in_map_info info; 1792 1793 info.rv = 0; 1794 info.object = object; 1795 1796 allproc_scan(vm_object_in_map_callback, &info, 0); 1797 if (info.rv) 1798 return 1; 1799 if( _vm_object_in_map(kernel_map, object, 0)) 1800 return 1; 1801 if( _vm_object_in_map(pager_map, object, 0)) 1802 return 1; 1803 if( _vm_object_in_map(buffer_map, object, 0)) 1804 return 1; 1805 return 0; 1806 } 1807 1808 /* 1809 * Debugging only 1810 */ 1811 static int 1812 vm_object_in_map_callback(struct proc *p, void *data) 1813 { 1814 struct vm_object_in_map_info *info = data; 1815 1816 if (p->p_vmspace) { 1817 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) { 1818 info->rv = 1; 1819 return -1; 1820 } 1821 } 1822 return (0); 1823 } 1824 1825 DB_SHOW_COMMAND(vmochk, vm_object_check) 1826 { 1827 struct vm_object_hash *hash; 1828 vm_object_t object; 1829 int n; 1830 1831 /* 1832 * make sure that internal objs are in a map somewhere 1833 * and none have zero ref counts. 
1834 */ 1835 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1836 hash = &vm_object_hash[n]; 1837 for (object = TAILQ_FIRST(&hash->list); 1838 object != NULL; 1839 object = TAILQ_NEXT(object, object_entry)) { 1840 if (object->type == OBJT_MARKER) 1841 continue; 1842 if (object->handle != NULL || 1843 (object->type != OBJT_DEFAULT && 1844 object->type != OBJT_SWAP)) { 1845 continue; 1846 } 1847 if (object->ref_count == 0) { 1848 db_printf("vmochk: internal obj has " 1849 "zero ref count: %ld\n", 1850 (long)object->size); 1851 } 1852 if (vm_object_in_map(object)) 1853 continue; 1854 db_printf("vmochk: internal obj is not in a map: " 1855 "ref: %d, size: %lu: 0x%lx\n", 1856 object->ref_count, (u_long)object->size, 1857 (u_long)object->size); 1858 } 1859 } 1860 } 1861 1862 /* 1863 * Debugging only 1864 */ 1865 DB_SHOW_COMMAND(object, vm_object_print_static) 1866 { 1867 /* XXX convert args. */ 1868 vm_object_t object = (vm_object_t)addr; 1869 boolean_t full = have_addr; 1870 1871 vm_page_t p; 1872 1873 /* XXX count is an (unused) arg. Avoid shadowing it. */ 1874 #define count was_count 1875 1876 int count; 1877 1878 if (object == NULL) 1879 return; 1880 1881 db_iprintf( 1882 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n", 1883 object, (int)object->type, (u_long)object->size, 1884 object->resident_page_count, object->ref_count, object->flags); 1885 /* 1886 * XXX no %qd in kernel. Truncate object->backing_object_offset. 1887 */ 1888 db_iprintf("\n"); 1889 1890 if (!full) 1891 return; 1892 1893 db_indent += 2; 1894 count = 0; 1895 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) { 1896 if (count == 0) 1897 db_iprintf("memory:="); 1898 else if (count == 6) { 1899 db_printf("\n"); 1900 db_iprintf(" ..."); 1901 count = 0; 1902 } else 1903 db_printf(","); 1904 count++; 1905 1906 db_printf("(off=0x%lx,page=0x%lx)", 1907 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); 1908 } 1909 if (count != 0) 1910 db_printf("\n"); 1911 db_indent -= 2; 1912 } 1913 1914 /* XXX. */ 1915 #undef count 1916 1917 /* 1918 * XXX need this non-static entry for calling from vm_map_print. 
1919 * 1920 * Debugging only 1921 */ 1922 void 1923 vm_object_print(/* db_expr_t */ long addr, 1924 boolean_t have_addr, 1925 /* db_expr_t */ long count, 1926 char *modif) 1927 { 1928 vm_object_print_static(addr, have_addr, count, modif); 1929 } 1930 1931 /* 1932 * Debugging only 1933 */ 1934 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1935 { 1936 struct vm_object_hash *hash; 1937 vm_object_t object; 1938 int nl = 0; 1939 int c; 1940 int n; 1941 1942 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1943 hash = &vm_object_hash[n]; 1944 for (object = TAILQ_FIRST(&hash->list); 1945 object != NULL; 1946 object = TAILQ_NEXT(object, object_entry)) { 1947 vm_pindex_t idx, fidx; 1948 vm_pindex_t osize; 1949 vm_paddr_t pa = -1, padiff; 1950 int rcount; 1951 vm_page_t m; 1952 1953 if (object->type == OBJT_MARKER) 1954 continue; 1955 db_printf("new object: %p\n", (void *)object); 1956 if ( nl > 18) { 1957 c = cngetc(); 1958 if (c != ' ') 1959 return; 1960 nl = 0; 1961 } 1962 nl++; 1963 rcount = 0; 1964 fidx = 0; 1965 osize = object->size; 1966 if (osize > 128) 1967 osize = 128; 1968 for (idx = 0; idx < osize; idx++) { 1969 m = vm_page_lookup(object, idx); 1970 if (m == NULL) { 1971 if (rcount) { 1972 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1973 (long)fidx, rcount, (long)pa); 1974 if ( nl > 18) { 1975 c = cngetc(); 1976 if (c != ' ') 1977 return; 1978 nl = 0; 1979 } 1980 nl++; 1981 rcount = 0; 1982 } 1983 continue; 1984 } 1985 1986 if (rcount && 1987 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1988 ++rcount; 1989 continue; 1990 } 1991 if (rcount) { 1992 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1993 padiff >>= PAGE_SHIFT; 1994 padiff &= PQ_L2_MASK; 1995 if (padiff == 0) { 1996 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1997 ++rcount; 1998 continue; 1999 } 2000 db_printf(" index(%ld)run(%d)pa(0x%lx)", 2001 (long)fidx, rcount, (long)pa); 2002 db_printf("pd(%ld)\n", (long)padiff); 2003 if ( nl > 18) { 2004 c = cngetc(); 2005 if (c != ' ') 2006 return; 2007 nl = 0; 2008 } 2009 nl++; 2010 } 2011 fidx = idx; 2012 pa = VM_PAGE_TO_PHYS(m); 2013 rcount = 1; 2014 } 2015 if (rcount) { 2016 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 2017 (long)fidx, rcount, (long)pa); 2018 if ( nl > 18) { 2019 c = cngetc(); 2020 if (c != ' ') 2021 return; 2022 nl = 0; 2023 } 2024 nl++; 2025 } 2026 } 2027 } 2028 } 2029 #endif /* DDB */ 2030