1 /* 2 * Copyright (c) 1991, 1993, 2013 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * The Mach Operating System project at Carnegie-Mellon University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 33 * 34 * 35 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 36 * All rights reserved. 
37 * 38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 39 * 40 * Permission to use, copy, modify and distribute this software and 41 * its documentation is hereby granted, provided that both the copyright 42 * notice and this permission notice appear in all copies of the 43 * software, derivative works or modified versions, and any portions 44 * thereof, and that both notices appear in supporting documentation. 45 * 46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 49 * 50 * Carnegie Mellon requests users of this software to return to 51 * 52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 53 * School of Computer Science 54 * Carnegie Mellon University 55 * Pittsburgh PA 15213-3890 56 * 57 * any improvements or extensions that they make and grant Carnegie the 58 * rights to redistribute these changes. 59 * 60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $ 61 */ 62 63 /* 64 * Virtual memory object module. 
65 */ 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/proc.h> /* for curproc, pageproc */ 70 #include <sys/thread.h> 71 #include <sys/vnode.h> 72 #include <sys/vmmeter.h> 73 #include <sys/mman.h> 74 #include <sys/mount.h> 75 #include <sys/kernel.h> 76 #include <sys/malloc.h> 77 #include <sys/sysctl.h> 78 #include <sys/refcount.h> 79 80 #include <vm/vm.h> 81 #include <vm/vm_param.h> 82 #include <vm/pmap.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_object.h> 85 #include <vm/vm_page.h> 86 #include <vm/vm_pageout.h> 87 #include <vm/vm_pager.h> 88 #include <vm/swap_pager.h> 89 #include <vm/vm_kern.h> 90 #include <vm/vm_extern.h> 91 #include <vm/vm_zone.h> 92 93 #include <vm/vm_page2.h> 94 95 #include <machine/specialreg.h> 96 97 #define EASY_SCAN_FACTOR 8 98 99 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, 100 int pagerflags); 101 static void vm_object_lock_init(vm_object_t); 102 103 /* 104 * Virtual memory objects maintain the actual data 105 * associated with allocated virtual memory. A given 106 * page of memory exists within exactly one object. 107 * 108 * An object is only deallocated when all "references" 109 * are given up. Only one "reference" to a given 110 * region of an object should be writeable. 111 * 112 * Associated with each object is a list of all resident 113 * memory pages belonging to that object; this list is 114 * maintained by the "vm_page" module, and locked by the object's 115 * lock. 116 * 117 * Each object also records a "pager" routine which is 118 * used to retrieve (and store) pages to the proper backing 119 * storage. In addition, objects may be backed by other 120 * objects from which they were virtual-copied. 
 *
 *	The only items within the object structure which are
 *	modified after time of creation are:
 *		reference count		locked by object's lock
 *		pager routine		locked by object's lock
 *
 */

/* Singleton object describing kernel virtual memory. */
struct vm_object kernel_object;

/* Hash table replacing the old global object list; see vmobj_hash(). */
struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];

static MALLOC_DEFINE_OBJ(M_VM_OBJECT, sizeof(struct vm_object),
		"vm_object", "vm_object structures");

/* Large primes used to scramble object pointers into hash buckets. */
#define VMOBJ_HASH_PRIME1	66555444443333333ULL
#define VMOBJ_HASH_PRIME2	989042931893ULL

/* vm.object_debug sysctl: when > 0 extra diagnostics are printed. */
int vm_object_debug;
SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");

/*
 * Hash an object pointer to its vm_object_hash[] bucket.  Two independent
 * modulo-prime hashes over shifted copies of the pointer are mixed to
 * decorrelate allocator alignment patterns.
 */
static __inline
struct vm_object_hash *
vmobj_hash(vm_object_t obj)
{
	uintptr_t hash1;
	uintptr_t hash2;

	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
	hash1 %= VMOBJ_HASH_PRIME1;
	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
	hash2 %= VMOBJ_HASH_PRIME2;
	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
}

#if defined(DEBUG_LOCKS)

#define vm_object_vndeallocate(obj, vpp)	\
	debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)

/*
 * Debug helper to track hold/drop/ref/deallocate calls.
 *
 * Records into a small per-object ring buffer which thread performed the
 * operation, its direction (addrem: +1 add, -1 remove, other shown as '='),
 * the ref_count at the time, and the caller's file/line.
 */
static void
debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
{
	int i;

	i = atomic_fetchadd_int(&obj->debug_index, 1);
	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
	ksnprintf(obj->debug_hold_thrs[i],
		  sizeof(obj->debug_hold_thrs[i]),
		  "%c%d:(%d):%s",
		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
		  (curthread->td_proc ? curthread->td_proc->p_pid : -1),
		  obj->ref_count,
		  curthread->td_comm);
	obj->debug_hold_file[i] = file;
	obj->debug_hold_line[i] = line;
#if 0
	/* Uncomment for debugging obj refs/derefs in reproducible cases */
	if (strcmp(curthread->td_comm, "sshd") == 0) {
		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
			(curthread->td_proc ?
			 curthread->td_proc->p_pid : -1),
			obj, obj->ref_count, addrem, file, line);
	}
#endif
}

#endif

/*
 * Misc low level routines
 */

/*
 * Reset the per-object lock-debugging ring buffer (no-op unless
 * DEBUG_LOCKS is defined).
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
	int i;

	obj->debug_index = 0;
	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
		obj->debug_hold_thrs[i][0] = 0;
		obj->debug_hold_file[i] = NULL;
		obj->debug_hold_line[i] = 0;
	}
#endif
}

/*
 * Swap the priority of the two most recently acquired tokens held by the
 * current thread (deadlock-avoidance helper when two objects are locked).
 */
void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}

/*
 * Acquire the object's token exclusively (may block).
 */
void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}

/*
 * Non-blocking exclusive acquisition of the object's token.
 *
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}

/*
 * Acquire the object's token shared (may block).
 */
void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}

/*
 * Release the object's token (shared or exclusive).
 */
void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}

/*
 * Upgrade a possibly-shared token to exclusive.
 *
 * NOTE: Implemented as release-then-reacquire, not an atomic upgrade;
 *	 object state may change while the token is momentarily not held.
 */
void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}

/*
 * Downgrade an exclusive token to shared (release-then-reacquire; same
 * caveat as vm_object_upgrade()).
 */
void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}

/*
 * Return a cpu-localized page coloring hint, derived from the current
 * thread pointer and a per-cpu rotor that advances by PQ_PRIME2 on each
 * call to spread successive allocations across colors.
 */
int
vm_quickcolor(void)
{
	globaldata_t gd = mycpu;
	int pg_color;

	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
	pg_color += gd->gd_quick_color;
	gd->gd_quick_color += PQ_PRIME2;

	return pg_color;
}

/*
 * Acquire a hold reference on the object and lock it exclusively.
 */
void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Non-blocking version of vm_object_hold().  Returns non-zero on success,
 * 0 if the lock could not be acquired (in which case no hold remains).
 */
int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	if (vm_object_lock_try(obj) == 0) {
		/*
		 * Lock attempt failed, undo our hold.  If we were the
		 * last holder of a dead, unreferenced object we must
		 * free it here (see vm_object_drop()).
		 */
		if (refcount_release(&obj->hold_count)) {
			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
				kfree_obj(obj, M_VM_OBJECT);
		}
		return(0);
	}

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
	return(1);
}

/*
 * Shared-lock version of vm_object_hold().
 */
void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Drop the token and hold_count on the object.
 *
 * The final dropper of a dead, unreferenced object frees it here.
 *
 * WARNING! Token might be shared.
 */
void
VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
{
	if (obj == NULL)
		return;

	/*
	 * No new holders should be possible once we drop hold_count 1->0 as
	 * there is no longer any way to reference the object.
	 */
	KKASSERT(obj->hold_count > 0);
	if (refcount_release(&obj->hold_count)) {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif

		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
			vm_object_unlock(obj);
			kfree_obj(obj, M_VM_OBJECT);
		} else {
			vm_object_unlock(obj);
		}
	} else {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif
		vm_object_unlock(obj);
	}
}

/*
 * Initialize a freshly allocated object, returning a held object.
 *
 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
 *
 * No requirements.
 */
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object,
		    const char *ident)
{
	struct vm_object_hash *hash;

	RB_INIT(&object->rb_memq);
	lwkt_token_init(&object->token, ident);

	TAILQ_INIT(&object->backing_list);
	lockinit(&object->backing_lk, "baclk", 0, 0);

	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->memattr = VM_MEMATTR_DEFAULT;
	object->hold_count = 0;
	object->flags = 0;
	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
		vm_object_set_flag(object, OBJ_ONEMAPPING);
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	/* cpu localization twist */
	object->pg_color = vm_quickcolor();
	object->handle = NULL;

	atomic_add_int(&object->generation, 1);
	object->swblock_count = 0;
	RB_INIT(&object->swblock_root);
	vm_object_lock_init(object);
	pmap_object_init(object);

	/* return with the object held (hold_count 1) and locked */
	vm_object_hold(object);

	/* enter the object into the global hash table */
	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
}

/*
 * Initialize a VM object.
424 */ 425 void 426 vm_object_init(vm_object_t object, vm_pindex_t size) 427 { 428 _vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj"); 429 vm_object_drop(object); 430 } 431 432 /* 433 * Initialize the VM objects module. 434 * 435 * Called from the low level boot code only. Note that this occurs before 436 * kmalloc is initialized so we cannot allocate any VM objects. 437 */ 438 void 439 vm_object_init1(void) 440 { 441 int i; 442 443 for (i = 0; i < VMOBJ_HSIZE; ++i) { 444 TAILQ_INIT(&vm_object_hash[i].list); 445 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst"); 446 } 447 448 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), 449 &kernel_object, "kobj"); 450 vm_object_drop(&kernel_object); 451 } 452 453 void 454 vm_object_init2(void) 455 { 456 kmalloc_obj_set_unlimited(M_VM_OBJECT); 457 } 458 459 /* 460 * Allocate and return a new object of the specified type and size. 461 * 462 * No requirements. 463 */ 464 vm_object_t 465 vm_object_allocate(objtype_t type, vm_pindex_t size) 466 { 467 vm_object_t obj; 468 469 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 470 _vm_object_allocate(type, size, obj, "vmobj"); 471 vm_object_drop(obj); 472 473 return (obj); 474 } 475 476 /* 477 * This version returns a held object, allowing further atomic initialization 478 * of the object. 479 */ 480 vm_object_t 481 vm_object_allocate_hold(objtype_t type, vm_pindex_t size) 482 { 483 vm_object_t obj; 484 485 obj = kmalloc_obj(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 486 _vm_object_allocate(type, size, obj, "vmobj"); 487 488 return (obj); 489 } 490 491 /* 492 * Add an additional reference to a vm_object. The object must already be 493 * held. The original non-lock version is no longer supported. The object 494 * must NOT be chain locked by anyone at the time the reference is added. 495 * 496 * The object must be held, but may be held shared if desired (hence why 497 * we use an atomic op). 
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		/* a vnode object's ref also references the vnode */
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * This version is only allowed in situations where the caller
 * already knows that the object is deterministically referenced
 * (usually because it's taken from a ref'd vnode, or during a map_entry
 * replication).  No lock or hold on the object is required.
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE)
		vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;
	int count;

	KASSERT(object->type == OBJT_VNODE,
		("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		if (count == 1) {
			/*
			 * Final reference 1->0: upgrade to an exclusive
			 * token before clearing VTEXT (the vnode no longer
			 * backs executable text).
			 */
			vm_object_upgrade(object);
			if (atomic_fcmpset_int(&object->ref_count, &count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
				break;
			}
		}
		cpu_pause();
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}

/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * XXX Currently all deallocations require an exclusive lock.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	count = object->ref_count;
	cpu_ccfence();
	for (;;) {
		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
		 */
		if (object->type == OBJT_VNODE) {
			vp = (struct vnode *)object->handle;
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif

				vrele(vp);
				break;
			}
			/* retry */
		} else {
			if (atomic_fcmpset_int(&object->ref_count,
					       &count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif
				break;
			}
			/* retry */
		}
		cpu_pause();
		/* retry */
	}
}

/*
 * Release a reference on an object already held/locked by the caller,
 * handling the final-reference transition to termination.
 */
void
VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
{
	/*
	 * Degenerate case
	 */
	if (object == NULL)
		return;

	/*
	 * vnode case, caller either locked the object exclusively
	 * or this is a recursion with must_drop != 0 and the vnode
	 * object will be locked shared.
	 *
	 * If locked shared we have to drop the object before we can
	 * call vrele() or risk a shared/exclusive livelock.
	 */
	if (object->type == OBJT_VNODE) {
		ASSERT_LWKT_TOKEN_HELD(&object->token);
		vm_object_vndeallocate(object, NULL);
		return;
	}
	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);

	/*
	 * Normal case (object is locked exclusively)
	 */
	if (object->ref_count == 0) {
		panic("vm_object_deallocate: object deallocated "
		      "too many times: %d", object->type);
	}
	if (object->ref_count > 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Drop the ref and handle termination on the 1->0 transition.
	 * We may have blocked above so we have to recheck.
	 */
	KKASSERT(object->ref_count != 0);
	if (object->ref_count >= 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/* final reference 1->0, terminate unless already dying */
	atomic_add_int(&object->ref_count, -1);
	if ((object->flags & OBJ_DEAD) == 0)
		vm_object_terminate(object);
}

/*
 * Destroy the specified object, freeing up related resources.
 *
 * The object must have zero references.
 *
 * The object must be held.  The caller is responsible for dropping the
 * object after terminate returns.  Terminate does NOT drop the object.
 */
static int vm_object_terminate_callback(vm_page_t p, void *data);

void
vm_object_terminate(vm_object_t object)
{
	struct rb_vm_page_scan_info info;
	struct vm_object_hash *hash;

	/*
	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
	 * able to safely block.
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	KKASSERT((object->flags & OBJ_DEAD) == 0);
	vm_object_set_flag(object, OBJ_DEAD);

	/*
	 * Wait for the pageout daemon to be done with the object
	 */
	vm_object_pip_wait(object, "objtrm1");

	KASSERT(!object->paging_in_progress,
		("vm_object_terminate: pageout in progress"));

	/*
	 * Clean and free the pages, as appropriate. All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp;

		/*
		 * Clean pages and flush buffers.
		 *
		 * NOTE!  TMPFS buffer flushes do not typically flush the
		 *	  actual page to swap as this would be highly
		 *	  inefficient, and normal filesystems usually wrap
		 *	  page flushes with buffer cache buffers.
		 *
		 *	  To deal with this we have to call vinvalbuf() both
		 *	  before and after the vm_object_page_clean().
		 */
		vp = (struct vnode *) object->handle;
		vinvalbuf(vp, V_SAVE, 0, 0);
		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * Wait for any I/O to complete, after which there had better not
	 * be any references left on the object.
	 */
	vm_object_pip_wait(object, "objtrm2");

	if (object->ref_count != 0) {
		panic("vm_object_terminate: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * Cleanup any shared pmaps associated with this object.
	 */
	pmap_object_free(object);

	/*
	 * Now free any remaining pages. For internal objects, this also
	 * removes them from paging queues. Don't free wired pages, just
	 * remove them from the object.
	 */
	info.count = 0;
	info.object = object;
	do {
		/* rescan until a pass completes without blocking (error) */
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	} while (info.error);

	/*
	 * Let the pager know object is dead.
	 */
	vm_pager_deallocate(object);

	/*
	 * Wait for the object hold count to hit 1, clean out pages as
	 * we go.  vmobj_token interlocks any race conditions that might
	 * pick the object up from the vm_object_list after we have cleared
	 * rb_memq.
	 */
	for (;;) {
		if (RB_ROOT(&object->rb_memq) == NULL)
			break;
		kprintf("vm_object_terminate: Warning, object %p "
			"still has %ld pages\n",
			object, object->resident_page_count);
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	}

	/*
	 * There had better not be any pages left
	 */
	KKASSERT(object->resident_page_count == 0);

	/*
	 * Remove the object from the global object list.
	 */
	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_REMOVE(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);

	if (object->ref_count != 0) {
		panic("vm_object_terminate2: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
	 *	 the object here.  See vm_object_drop().
	 */
}

/*
 * Per-page callback for vm_object_terminate(): free unwired pages and
 * detach wired ones from the dying object.
 *
 * The caller must hold the object.
 *
 * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED
 *	 or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync()
 *	 is called, due to normal pmap operations.  This is because only
 *	 global pmap operations on the vm_page can clear the bits and not
 *	 just local operations on individual pmaps.
 *
 *	 Most interactions that necessitate the clearing of these bits
 *	 proactively call vm_page_protect(), and we must do so here as well.
 */
static int
vm_object_terminate_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_object_t object;

	object = p->object;
	KKASSERT(object == info->object);
	if (vm_page_busy_try(p, TRUE)) {
		/* page busy; wait and request a rescan of the tree */
		vm_page_sleep_busy(p, TRUE, "vmotrm");
		info->error = 1;
		return 0;
	}
	if (object != p->object) {
		/* XXX remove once we determine it can't happen */
		kprintf("vm_object_terminate: Warning: Encountered "
			"busied page %p on queue %d\n", p, p->queue);
		vm_page_wakeup(p);
		info->error = 1;
	} else if (p->wire_count == 0) {
		/*
		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
		 */
		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
			vm_page_protect(p, VM_PROT_NONE);
		vm_page_free(p);
		mycpu->gd_cnt.v_pfree++;
	} else {
		/* wired page: detach from the object but do not free */
		if (p->queue != PQ_NONE) {
			kprintf("vm_object_terminate: Warning: Encountered "
				"wired page %p on queue %d\n", p, p->queue);
			if (vm_object_debug > 0) {
				--vm_object_debug;
				print_backtrace(10);
			}
		}
		if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE))
			vm_page_protect(p, VM_PROT_NONE);
		vm_page_remove(p);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Clean all dirty pages in the specified range of object.  Leaves page
 * on whatever queue it is currently on.   If NOSYNC is set then do not
 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
 * leaving the object dirty.
 *
 * When stuffing pages asynchronously, allow clustering.  XXX we need a
 * synchronous clustering mode implementation.
 *
 * Odd semantics: if start == end, we clean everything.
 *
 * The object must be locked? XXX
 */
static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
static int vm_object_page_clean_pass2(struct vm_page *p, void *data);

void
vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		     int flags)
{
	struct rb_vm_page_scan_info info;
	struct vnode *vp;
	int wholescan;
	int pagerflags;
	int generation;

	vm_object_hold(object);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
		vm_object_drop(object);
		return;
	}

	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
			OBJPC_SYNC : OBJPC_CLUSTER_OK;
	pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0;

	vp = object->handle;

	/*
	 * Interlock other major object operations.  This allows us to
	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
	 */
	vm_object_set_flag(object, OBJ_CLEANING);

	/*
	 * Handle 'entire object' case
	 */
	info.start_pindex = start;
	if (end == 0) {
		info.end_pindex = object->size - 1;
	} else {
		info.end_pindex = end - 1;
	}
	wholescan = (start == 0 && info.end_pindex == object->size - 1);
	info.limit = flags;
	info.pagerflags = pagerflags;
	info.object = object;

	/*
	 * If cleaning the entire object do a pass to mark the pages read-only.
	 * If everything worked out ok, clear OBJ_WRITEABLE and
	 * OBJ_MIGHTBEDIRTY.
	 */
	if (wholescan) {
		info.error = 0;
		info.count = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass1, &info);
		if (info.error == 0) {
			vm_object_clear_flag(object,
					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
			if (object->type == OBJT_VNODE &&
			    (vp = (struct vnode *)object->handle) != NULL) {
				/*
				 * Use new-style interface to clear VISDIRTY
				 * because the vnode is not necessarily removed
				 * from the syncer list(s) as often as it was
				 * under the old interface, which can leave
				 * the vnode on the syncer list after reclaim.
				 */
				vclrobjdirty(vp);
			}
		}
	}

	/*
	 * Do a pass to clean all the dirty pages we find.
	 */
	do {
		info.error = 0;
		info.count = 0;
		generation = object->generation;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass2, &info);
	} while (info.error || generation != object->generation);

	vm_object_clear_flag(object, OBJ_CLEANING);
	vm_object_drop(object);
}

/*
 * Pass 1 scan callback: flag each page PG_CLEANCHK and downgrade it to
 * read-only so later dirty checks cannot race userland writes.
 *
 * The caller must hold the object.
 */
static
int
vm_object_page_clean_pass1(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	KKASSERT(p->object == info->object);

	vm_page_flag_set(p, PG_CLEANCHK);
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		/* skipping nosync pages; report so flags are not cleared */
		info->error = 1;
	} else if (vm_page_busy_try(p, FALSE)) {
		info->error = 1;
	} else {
		KKASSERT(p->object == info->object);
		vm_page_protect(p, VM_PROT_READ);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Pass 2 scan callback: flush pages that are actually dirty (or need a
 * commit), clearing PG_CLEANCHK on the ones successfully handled.
 *
 * The caller must hold the object
 */
static
int
vm_object_page_clean_pass2(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	int generation;

	KKASSERT(p->object == info->object);

	/*
	 * Do not mess with pages that were inserted after we started
	 * the cleaning pass.
	 */
	if ((p->flags & PG_CLEANCHK) == 0)
		goto done;

	generation = info->object->generation;

	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vpcwai");
		info->error = 1;
		goto done;
	}

	KKASSERT(p->object == info->object &&
		 info->object->generation == generation);

	/*
	 * Before wasting time traversing the pmaps, check for trivial
	 * cases where the page cannot be dirty.
	 */
	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
		KKASSERT((p->dirty & p->valid) == 0 &&
			 (p->flags & PG_NEED_COMMIT) == 0);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Check whether the page is dirty or not.  The page has been set
	 * to be read-only so the check will not race a user dirtying the
	 * page.
	 */
	vm_page_test_dirty(p);
	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * If we have been asked to skip nosync pages and this is a
	 * nosync page, skip it.  Note that the object flags were
	 * not cleared in this case (because pass1 will have returned an
	 * error), so we do not have to set them.
	 */
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
	 * the pages that get successfully flushed.  Set info->error if
	 * we raced an object modification.
	 */
	vm_object_page_collect_flush(info->object, p, info->pagerflags);
	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Collect the specified page and nearby pages and flush them out.
 * The number of pages flushed is returned.  The passed page is busied
 * by the caller and we are responsible for its disposition.
 *
 * The caller must hold the object.
 */
static void
vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
{
	int error;
	int is;
	int ib;
	int i;
	int page_base;
	vm_pindex_t pi;
	vm_page_t ma[BLIST_MAX_ALLOC];

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	/*
	 * The passed page anchors a BLIST_MAX_ALLOC-aligned window of
	 * pindices mapped onto ma[].  Scan backwards (ib) and forwards
	 * (is) from the anchor, collecting a contiguous run of busied
	 * dirty pages; a hole, a clean/cached page, a busy failure, or
	 * the window edge terminates the run on that side.
	 */
	pi = p->pindex;
	page_base = pi % BLIST_MAX_ALLOC;
	ma[page_base] = p;
	ib = page_base - 1;
	is = page_base + 1;

	while (ib >= 0) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[ib] = tp;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pi - page_base + is < object->size) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[is] = tp;
		++is;
	}

	/*
	 * All pages in the ma[] array are busied now
	 */
	for (i = ib; i < is; ++i) {
		vm_page_flag_clear(ma[i], PG_CLEANCHK);
		vm_page_hold(ma[i]);	/* XXX need this any more? */
	}
	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
	for (i = ib; i < is; ++i)	/* XXX need this any more? */
		vm_page_unhold(ma[i]);
}

/*
 * Implements the madvise function at the object/page level.
 *
 * MADV_WILLNEED	(any object)
 *
 *	Activate the specified pages if they are resident.
 *
 * MADV_DONTNEED	(any object)
 *
 *	Deactivate the specified pages if they are resident.
 *
 * MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
 *
 *	Deactivate and clean the specified pages if they are
 *	resident.  This permits the process to reuse the pages
 *	without faulting or the kernel to reclaim the pages
 *	without I/O.
 *
 * No requirements.
 */
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
		  vm_pindex_t count, int advise)
{
	vm_pindex_t end;
	vm_page_t m;
	int error;

	if (object == NULL)
		return;

	end = pindex + count;

	vm_object_hold(object);

	/*
	 * Locate and adjust resident pages.  This only applies to the
	 * primary object in the mapping.
	 */
	for (; pindex < end; pindex += 1) {
relookup:
		/*
		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
		 * and those pages must be OBJ_ONEMAPPING.
		 */
		if (advise == MADV_FREE) {
			if ((object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP) ||
			    (object->flags & OBJ_ONEMAPPING) == 0) {
				continue;
			}
		}

		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

		if (error) {
			/* page was busied by someone else; wait and retry */
			vm_page_sleep_busy(m, TRUE, "madvpo");
			goto relookup;
		}
		if (m == NULL) {
			/*
			 * There may be swap even if there is no backing page
			 */
			if (advise == MADV_FREE && object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
			continue;
		}

		/*
		 * If the page is not in a normal active state, we skip it.
		 * If the page is not managed there are no page queues to
		 * mess with.  Things can break if we mess with pages in
		 * any of the below states.
		 */
		if (m->wire_count ||
		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
				 PG_NEED_COMMIT)) ||
		    m->valid != VM_PAGE_BITS_ALL
		) {
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Theoretically once a page is known not to be busy, an
		 * interrupt cannot come along and rip it out from under us.
		 */
		if (advise == MADV_WILLNEED) {
			vm_page_activate(m);
		} else if (advise == MADV_DONTNEED) {
			vm_page_dontneed(m);
		} else if (advise == MADV_FREE) {
			/*
			 * Mark the page clean.  This will allow the page
			 * to be freed up by the system.  However, such pages
			 * are often reused quickly by malloc()/free()
			 * so we do not do anything that would cause
			 * a page fault if we can help it.
			 *
			 * Specifically, we do not try to actually free
			 * the page now nor do we try to put it in the
			 * cache (which would cause a page fault on reuse).
			 *
			 * But we do make the page as freeable as we
			 * can without actually taking the step of unmapping
			 * it.
			 */
			pmap_clear_modify(m);
			m->dirty = 0;
			m->act_count = 0;
			vm_page_dontneed(m);
			if (object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
		}
		vm_page_wakeup(m);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		      boolean_t clean_only)
{
	struct rb_vm_page_scan_info info;
	int all;

	/*
	 * Degenerate cases and assertions.
	 *
	 * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
	 *	 These objects do not have to have their pages entered into
	 *	 them and are handled via their vm_map_backing lists.
	 *
	 * NOTE(review): vm_object_hold() is called before the NULL check —
	 *	 presumably it tolerates a NULL object; confirm against its
	 *	 implementation.
	 */
	vm_object_hold(object);
	if (object == NULL ||
	    (object->type != OBJT_MGTDEVICE &&
	     object->resident_page_count == 0 && object->swblock_count == 0)) {
		vm_object_drop(object);
		return;
	}
	KASSERT(object->type != OBJT_PHYS,
		("attempt to remove pages from a physical object"));

	/*
	 * Indicate that paging is occurring on the object
	 */
	vm_object_pip_add(object, 1);

	/*
	 * Figure out the actual removal range and whether we are removing
	 * the entire contents of the object or not.  If removing the entire
	 * contents, be sure to get all pages, even those that might be
	 * beyond the end of the object.
	 *
	 * NOTE: end is non-inclusive, but info.end_pindex is inclusive.
	 */
	info.object = object;
	info.start_pindex = start;
	if (end == 0 || end == (vm_pindex_t)-1) {
		info.end_pindex = (vm_pindex_t)-1;
		end = object->size;
	} else {
		info.end_pindex = end - 1;
	}
	info.limit = clean_only;
	info.count = 0;
	all = (start == 0 && info.end_pindex >= object->size - 1);

	/*
	 * Efficiently remove pages from the pmap via a backing scan.
	 *
	 * NOTE: This is the only way pages can be removed and unwired
	 *	 from OBJT_MGTDEVICE devices which typically do not enter
	 *	 their pages into the vm_object's RB tree.  And possibly
	 *	 other OBJT_* types in the future.
	 */
	{
		vm_map_backing_t ba;
		vm_pindex_t sba, eba;
		vm_offset_t sva, eva;

		lockmgr(&object->backing_lk, LK_EXCLUSIVE);
		TAILQ_FOREACH(ba, &object->backing_list, entry) {
			/*
			 * object offset range within the ba, intersectioned
			 * with the page range specified for the object
			 */
			sba = OFF_TO_IDX(ba->offset);
			eba = sba + OFF_TO_IDX(ba->end - ba->start);
			if (sba < start)
				sba = start;
			if (eba > end)
				eba = end;

			/*
			 * If the intersection is valid, remove the related
			 * pages.
			 *
			 * NOTE! This may also remove other incidental pages
			 *	 in the pmap, as the backing area may be
			 *	 overloaded.
			 *
			 * NOTE! pages for MGTDEVICE objects are only removed
			 *	 here, they aren't entered into rb_memq, so
			 *	 we must use pmap_remove() instead of
			 *	 the non-TLB-invalidating pmap_remove_pages().
			 */
			if (sba < eba) {
				sva = ba->start + IDX_TO_OFF(sba) - ba->offset;
				eva = sva + IDX_TO_OFF(eba - sba);
#if 0
				kprintf("VM_OBJECT_PAGE_REMOVE "
					"%p[%016jx] %016jx-%016jx\n",
					ba->pmap, ba->start, sva, eva);
#endif
				pmap_remove(ba->pmap, sva, eva);
			}
		}
		lockmgr(&object->backing_lk, LK_RELEASE);
	}

	/*
	 * Remove and free pages entered onto the object list.  Note that
	 * for OBJT_MGTDEVICE objects, there are typically no pages entered.
	 *
	 * Loop until we are sure we have gotten them all.
	 */
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_remove_callback, &info);
	} while (info.error);

	/*
	 * Remove any related swap if throwing away pages, or for
	 * non-swap objects (the swap is a clean copy in that case).
	 */
	if (object->type != OBJT_SWAP || clean_only == FALSE) {
		if (all)
			swap_pager_freespace_all(object);
		else
			swap_pager_freespace(object, info.start_pindex,
					info.end_pindex - info.start_pindex + 1);
	}

	/*
	 * Cleanup
	 */
	vm_object_pip_wakeup(object);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 *
 * NOTE: User yields are allowed when removing more than one page, but not
 *	 allowed if only removing one page (the path for single page removals
 *	 might hold a spinlock).
 */
static int
vm_object_page_remove_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	/* detect an object/page race before we have the page busied */
	if (info->object != p->object ||
	    p->pindex < info->start_pindex ||
	    p->pindex > info->end_pindex) {
		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
			info->object, p);
		return(0);
	}
	if (vm_page_busy_try(p, TRUE)) {
		/* busy collision; ask the caller to rescan */
		vm_page_sleep_busy(p, TRUE, "vmopar");
		info->error = 1;
		return(0);
	}
	if (info->object != p->object) {
		/* this should never happen */
		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
			info->object, p);
		vm_page_wakeup(p);
		return(0);
	}

	/*
	 * Wired pages cannot be destroyed, but they can be invalidated
	 * and we do so if clean_only (limit) is not set.
	 *
	 * WARNING!  The page may be wired due to being part of a buffer
	 *	     cache buffer, and the buffer might be marked B_CACHE.
	 *	     This is fine as part of a truncation but VFSs must be
	 *	     sure to fix the buffer up when re-extending the file.
	 *
	 * NOTE!     PG_NEED_COMMIT is ignored.
	 */
	if (p->wire_count != 0) {
		vm_page_protect(p, VM_PROT_NONE);
		if (info->limit == 0)
			p->valid = 0;
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * limit is our clean_only flag.  If set and the page is dirty or
	 * requires a commit, do not free it.  If set and the page is being
	 * held by someone, do not free it.
	 */
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}

	/*
	 * Destroy the page.  But we have to re-test whether it is dirty
	 * after removing it from its pmaps.
	 */
	vm_page_protect(p, VM_PROT_NONE);
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}
	vm_page_free(p);

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();

	return(0);
}

/*
 * Try to extend prev_object into an adjoining region of virtual
 * memory, return TRUE on success.
 *
 * The caller does not need to hold (prev_object) but must have a stable
 * pointer to it (typically by holding the vm_map locked).
 *
 * This function only works for anonymous memory objects which either
 * have (a) one reference or (b) we are extending the object's size.
 * Otherwise the related VM pages we want to use for the object might
 * be in use by another mapping.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
		   vm_size_t prev_size, vm_size_t next_size)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);

	vm_object_hold(prev_object);

	/* only anonymous (default/swap) objects can be coalesced */
	if (prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

#if 0
	/* caller now checks this */
	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object, NULL);
#endif

#if 0
	/* caller now checks this */
	/*
	 * We can't coalesce if we shadow another object (figuring out the
	 * relationships become too complex).
	 */
	if (prev_object->backing_object != NULL) {
		vm_object_chain_release(prev_object);
		vm_object_drop(prev_object);
		return (FALSE);
	}
#endif

	/* sizes arrive in bytes; convert to page indices */
	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = prev_pindex + prev_size;

	/*
	 * We can't if the object has more than one ref count unless we
	 * are extending it into newly minted space.
	 */
	if (prev_object->ref_count > 1 &&
	    prev_object->size != next_pindex) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object,
				      next_pindex,
				      next_pindex + next_size, FALSE);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
	}

	/*
	 * Extend the object if necessary.
	 */
	if (next_pindex + next_size > prev_object->size)
		prev_object->size = next_pindex + next_size;
	vm_object_drop(prev_object);

	return (TRUE);
}

/*
 * Make the object writable and flag it as being possibly dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
	struct vnode *vp;

	/*vm_object_assert_held(object);*/
	/*
	 * Avoid contention in vm fault path by checking the state before
	 * issuing an atomic op on it.
	 */
	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	if (object->type == OBJT_VNODE &&
	    (vp = (struct vnode *)object->handle) != NULL) {
		if ((vp->v_flag & VOBJDIRTY) == 0) {
			if (vp->v_mount &&
			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
				/*
				 * New style THR_SYNC places vnodes on the
				 * syncer list more deterministically.
				 */
				vsetobjdirty(vp);
			} else {
				/*
				 * Old style scan would not necessarily place
				 * a vnode on the syncer list when possibly
				 * modified via mmap.
				 */
				vsetflags(vp, VOBJDIRTY);
			}
		}
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/cons.h>

#include <ddb/ddb.h>

static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
				   vm_map_entry_t entry);
static int	vm_object_in_map (vm_object_t object);

/*
 * The caller must hold the object.
 */
/*
 * Returns non-zero if (object) is referenced by the given map entry,
 * or by any entry of (map) when entry == NULL (recursing into submaps
 * and walking vm_map_backing chains for normal entries).
 *
 * Debugging only.
 */
static int
_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
{
	vm_map_backing_t ba;
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	int entcount;

	if (map == NULL)
		return 0;
	if (entry == NULL) {
		/* scan every entry of the map, recursing per-entry */
		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
		entcount = map->nentries;
		while (entcount-- && tmpe) {
			if( _vm_object_in_map(map, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		return (0);
	}
	switch(entry->maptype) {
	case VM_MAPTYPE_SUBMAP:
		/* descend into the submap and scan its entries */
		tmpm = entry->ba.sub_map;
		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
		entcount = tmpm->nentries;
		while (entcount-- && tmpe) {
			if( _vm_object_in_map(tmpm, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		break;
	case VM_MAPTYPE_NORMAL:
		/* walk the backing chain looking for the object */
		ba = &entry->ba;
		while (ba) {
			if (ba->object == object)
				return TRUE;
			ba = ba->backing_ba;
		}
		break;
	default:
		break;
	}
	return 0;
}

static int vm_object_in_map_callback(struct proc *p, void *data);

struct vm_object_in_map_info {
	vm_object_t object;	/* object being searched for */
	int rv;			/* set non-zero when found */
};

/*
 * Returns non-zero if (object) appears in any process vmspace or in
 * the kernel, pager, or buffer maps.
 *
 * Debugging only
 */
static int
vm_object_in_map(vm_object_t object)
{
	struct vm_object_in_map_info info;

	info.rv = 0;
	info.object = object;

	allproc_scan(vm_object_in_map_callback, &info, 0);
	if (info.rv)
		return 1;
	if( _vm_object_in_map(&kernel_map, object, 0))
		return 1;
	if( _vm_object_in_map(&pager_map, object, 0))
		return 1;
	if( _vm_object_in_map(&buffer_map, object, 0))
		return 1;
	return 0;
}

/*
 * allproc_scan() callback: check one process's vmspace for the object.
 * Returns -1 to terminate the scan early once found.
 *
 * Debugging only
 */
static int
vm_object_in_map_callback(struct proc *p, void *data)
{
	struct vm_object_in_map_info *info = data;

	if (p->p_vmspace) {
		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
			info->rv = 1;
			return -1;
		}
	}
	return (0);
}

/*
 * DDB command: sanity-check internal (anonymous, handle-less) objects,
 * reporting any with a zero ref count or not present in any map.
 */
DB_SHOW_COMMAND(vmochk, vm_object_check)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int n;

	/*
	 * make sure that internal objs are in a map somewhere
	 * and none have zero ref counts.
	 */
	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
		     object != NULL;
		     object = TAILQ_NEXT(object, object_entry)) {
			if (object->type == OBJT_MARKER)
				continue;
			if (object->handle != NULL ||
			    (object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP)) {
				continue;
			}
			if (object->ref_count == 0) {
				db_printf("vmochk: internal obj has "
					  "zero ref count: %ld\n",
					  (long)object->size);
			}
			if (vm_object_in_map(object))
				continue;
			/* size is printed twice: decimal then hex */
			db_printf("vmochk: internal obj is not in a map: "
				  "ref: %d, size: %lu: 0x%lx\n",
				  object->ref_count, (u_long)object->size,
				  (u_long)object->size);
		}
	}
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(object, vm_object_print_static)
{
	/* XXX convert args. */
	vm_object_t object = (vm_object_t)addr;
	boolean_t full = have_addr;

	vm_page_t p;

	/* XXX count is an (unused) arg.  Avoid shadowing it. */
#define count	was_count

	int count;

	if (object == NULL)
		return;

	db_iprintf(
	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
	    object, (int)object->type, (u_long)object->size,
	    object->resident_page_count, object->ref_count, object->flags);
	/*
	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
	 */
	db_iprintf("\n");

	if (!full)
		return;

	db_indent += 2;
	count = 0;
	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
		/* print up to 6 (pindex, phys) pairs per line */
		if (count == 0)
			db_iprintf("memory:=");
		else if (count == 6) {
			db_printf("\n");
			db_iprintf(" ...");
			count = 0;
		} else
			db_printf(",");
		count++;

		db_printf("(off=0x%lx,page=0x%lx)",
		    (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	}
	if (count != 0)
		db_printf("\n");
	db_indent -= 2;
}

/* XXX. */
#undef count

/*
 * XXX need this non-static entry for calling from vm_map_print.
 *
 * Debugging only
 */
void
vm_object_print(/* db_expr_t */ long addr,
		boolean_t have_addr,
		/* db_expr_t */ long count,
		char *modif)
{
	vm_object_print_static(addr, have_addr, count, modif);
}

/*
 * DDB command: dump the first 128 page indices of every object as runs
 * of physically-contiguous pages, pausing every ~18 lines for a
 * keypress (any key other than space aborts).
 *
 * Debugging only
 */
DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int nl = 0;
	int c;
	int n;

	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
		     object != NULL;
		     object = TAILQ_NEXT(object, object_entry)) {
			vm_pindex_t idx, fidx;
			vm_pindex_t osize;
			vm_paddr_t pa = -1, padiff;
			int rcount;
			vm_page_t m;

			if (object->type == OBJT_MARKER)
				continue;
			db_printf("new object: %p\n", (void *)object);
			if ( nl > 18) {
				c = cngetc();
				if (c != ' ')
					return;
				nl = 0;
			}
			nl++;
			rcount = 0;
			fidx = 0;
			osize = object->size;
			if (osize > 128)
				osize = 128;
			for (idx = 0; idx < osize; idx++) {
				m = vm_page_lookup(object, idx);
				if (m == NULL) {
					/* hole terminates any current run */
					if (rcount) {
						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
							(long)fidx, rcount, (long)pa);
						if ( nl > 18) {
							c = cngetc();
							if (c != ' ')
								return;
							nl = 0;
						}
						nl++;
						rcount = 0;
					}
					continue;
				}

				/* extend the run if physically contiguous */
				if (rcount &&
					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
					++rcount;
					continue;
				}
				if (rcount) {
					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
					padiff >>= PAGE_SHIFT;
					padiff &= PQ_L2_MASK;
					if (padiff == 0) {
						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
						++rcount;
						continue;
					}
					db_printf(" index(%ld)run(%d)pa(0x%lx)",
						(long)fidx, rcount, (long)pa);
					db_printf("pd(%ld)\n", (long)padiff);
					if ( nl > 18) {
						c = cngetc();
						if (c != ' ')
							return;
						nl = 0;
					}
					nl++;
				}
				/* start a new run at this page */
				fidx = idx;
				pa = VM_PAGE_TO_PHYS(m);
				rcount = 1;
			}
			if (rcount) {
				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
					(long)fidx, rcount, (long)pa);
				if ( nl > 18) {
					c = cngetc();
					if (c != ' ')
						return;
					nl = 0;
				}
				nl++;
			}
		}
	}
}
#endif /* DDB */