1 /* 2 * Copyright (c) 1991, 1993, 2013 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * The Mach Operating System project at Carnegie-Mellon University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 33 * 34 * 35 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 36 * All rights reserved. 37 * 38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 39 * 40 * Permission to use, copy, modify and distribute this software and 41 * its documentation is hereby granted, provided that both the copyright 42 * notice and this permission notice appear in all copies of the 43 * software, derivative works or modified versions, and any portions 44 * thereof, and that both notices appear in supporting documentation. 45 * 46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 49 * 50 * Carnegie Mellon requests users of this software to return to 51 * 52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 53 * School of Computer Science 54 * Carnegie Mellon University 55 * Pittsburgh PA 15213-3890 56 * 57 * any improvements or extensions that they make and grant Carnegie the 58 * rights to redistribute these changes. 59 * 60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $ 61 */ 62 63 /* 64 * Virtual memory object module. 
65 */ 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/proc.h> /* for curproc, pageproc */ 70 #include <sys/thread.h> 71 #include <sys/vnode.h> 72 #include <sys/vmmeter.h> 73 #include <sys/mman.h> 74 #include <sys/mount.h> 75 #include <sys/kernel.h> 76 #include <sys/malloc.h> 77 #include <sys/sysctl.h> 78 #include <sys/refcount.h> 79 80 #include <vm/vm.h> 81 #include <vm/vm_param.h> 82 #include <vm/pmap.h> 83 #include <vm/vm_map.h> 84 #include <vm/vm_object.h> 85 #include <vm/vm_page.h> 86 #include <vm/vm_pageout.h> 87 #include <vm/vm_pager.h> 88 #include <vm/swap_pager.h> 89 #include <vm/vm_kern.h> 90 #include <vm/vm_extern.h> 91 #include <vm/vm_zone.h> 92 93 #include <vm/vm_page2.h> 94 95 #include <machine/specialreg.h> 96 97 #define EASY_SCAN_FACTOR 8 98 99 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, 100 int pagerflags); 101 static void vm_object_lock_init(vm_object_t); 102 103 /* 104 * Virtual memory objects maintain the actual data 105 * associated with allocated virtual memory. A given 106 * page of memory exists within exactly one object. 107 * 108 * An object is only deallocated when all "references" 109 * are given up. Only one "reference" to a given 110 * region of an object should be writeable. 111 * 112 * Associated with each object is a list of all resident 113 * memory pages belonging to that object; this list is 114 * maintained by the "vm_page" module, and locked by the object's 115 * lock. 116 * 117 * Each object also records a "pager" routine which is 118 * used to retrieve (and store) pages to the proper backing 119 * storage. In addition, objects may be backed by other 120 * objects from which they were virtual-copied. 121 * 122 * The only items within the object structure which are 123 * modified after time of creation are: 124 * reference count locked by object's lock 125 * pager routine locked by object's lock 126 * 127 */ 128 129 struct vm_object kernel_object; 130 131 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE]; 132 133 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures"); 134 135 #define VMOBJ_HASH_PRIME1 66555444443333333ULL 136 #define VMOBJ_HASH_PRIME2 989042931893ULL 137 138 int vm_object_debug; 139 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, ""); 140 141 static __inline 142 struct vm_object_hash * 143 vmobj_hash(vm_object_t obj) 144 { 145 uintptr_t hash1; 146 uintptr_t hash2; 147 148 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18); 149 hash1 %= VMOBJ_HASH_PRIME1; 150 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24); 151 hash2 %= VMOBJ_HASH_PRIME2; 152 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]); 153 } 154 155 #if defined(DEBUG_LOCKS) 156 157 #define vm_object_vndeallocate(obj, vpp) \ 158 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__) 159 160 /* 161 * Debug helper to track hold/drop/ref/deallocate calls. 162 */ 163 static void 164 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem) 165 { 166 int i; 167 168 i = atomic_fetchadd_int(&obj->debug_index, 1); 169 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1); 170 ksnprintf(obj->debug_hold_thrs[i], 171 sizeof(obj->debug_hold_thrs[i]), 172 "%c%d:(%d):%s", 173 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')), 174 (curthread->td_proc ? 
curthread->td_proc->p_pid : -1),
                  obj->ref_count,
                  curthread->td_comm);
        obj->debug_hold_file[i] = file;
        obj->debug_hold_line[i] = line;
#if 0
        /* Uncomment for debugging obj refs/derefs in reproducible cases */
        if (strcmp(curthread->td_comm, "sshd") == 0) {
                kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
                        (curthread->td_proc ? curthread->td_proc->p_pid : -1),
                        obj, obj->ref_count, addrem, file, line);
        }
#endif
}

#endif

/*
 * Misc low level routines
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
        int i;

        obj->debug_index = 0;
        for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
                obj->debug_hold_thrs[i][0] = 0;
                obj->debug_hold_file[i] = NULL;
                obj->debug_hold_line[i] = 0;
        }
#endif
}

void
vm_object_lock_swap(void)
{
        lwkt_token_swap();
}

void
vm_object_lock(vm_object_t obj)
{
        lwkt_gettoken(&obj->token);
}

/*
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
        return(lwkt_trytoken(&obj->token));
}

void
vm_object_lock_shared(vm_object_t obj)
{
        lwkt_gettoken_shared(&obj->token);
}

void
vm_object_unlock(vm_object_t obj)
{
        lwkt_reltoken(&obj->token);
}

void
vm_object_upgrade(vm_object_t obj)
{
        lwkt_reltoken(&obj->token);
        lwkt_gettoken(&obj->token);
}

void
vm_object_downgrade(vm_object_t obj)
{
        lwkt_reltoken(&obj->token);
        lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
        ASSERT_LWKT_TOKEN_HELD(&obj->token);
}

int
vm_quickcolor(void)
{
        globaldata_t gd = mycpu;
        int pg_color;

        pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
        pg_color += gd->gd_quick_color;
        gd->gd_quick_color += PQ_PRIME2;

        return pg_color;
}

void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
        KKASSERT(obj != NULL);

        /*
         * Object must be held (object allocation is stable due to the
         * caller's context, typically already holding the token on a
         * parent object) prior to potentially blocking on the lock,
         * otherwise the object can get ripped away from us.
         */
        refcount_acquire(&obj->hold_count);
        vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
        debugvm_object_add(obj, file, line, 1);
#endif
}

int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
        KKASSERT(obj != NULL);

        /*
         * Object must be held (object allocation is stable due to the
         * caller's context, typically already holding the token on a
         * parent object) prior to potentially blocking on the lock,
         * otherwise the object can get ripped away from us.
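         *
         * Illustrative sketch only (not taken from any particular caller,
         * the local name is hypothetical) of the usual hold/drop pairing
         * around an operation that may block:
         *
         *      vm_object_t obj = ba->object;
         *
         *      vm_object_hold(obj);     (hold_count ref + exclusive token)
         *      ... work on obj, possibly blocking ...
         *      vm_object_drop(obj);     (release token, drop hold_count)
         *
         * vm_object_hold_try() below is the non-blocking variant; it backs
         * the hold_count out again if the token cannot be acquired.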
304 */ 305 refcount_acquire(&obj->hold_count); 306 if (vm_object_lock_try(obj) == 0) { 307 if (refcount_release(&obj->hold_count)) { 308 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) 309 kfree(obj, M_VM_OBJECT); 310 } 311 return(0); 312 } 313 314 #if defined(DEBUG_LOCKS) 315 debugvm_object_add(obj, file, line, 1); 316 #endif 317 return(1); 318 } 319 320 void 321 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS) 322 { 323 KKASSERT(obj != NULL); 324 325 /* 326 * Object must be held (object allocation is stable due to callers 327 * context, typically already holding the token on a parent object) 328 * prior to potentially blocking on the lock, otherwise the object 329 * can get ripped away from us. 330 */ 331 refcount_acquire(&obj->hold_count); 332 vm_object_lock_shared(obj); 333 334 #if defined(DEBUG_LOCKS) 335 debugvm_object_add(obj, file, line, 1); 336 #endif 337 } 338 339 /* 340 * Drop the token and hold_count on the object. 341 * 342 * WARNING! Token might be shared. 343 */ 344 void 345 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS) 346 { 347 if (obj == NULL) 348 return; 349 350 /* 351 * No new holders should be possible once we drop hold_count 1->0 as 352 * there is no longer any way to reference the object. 353 */ 354 KKASSERT(obj->hold_count > 0); 355 if (refcount_release(&obj->hold_count)) { 356 #if defined(DEBUG_LOCKS) 357 debugvm_object_add(obj, file, line, -1); 358 #endif 359 360 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) { 361 vm_object_unlock(obj); 362 kfree(obj, M_VM_OBJECT); 363 } else { 364 vm_object_unlock(obj); 365 } 366 } else { 367 #if defined(DEBUG_LOCKS) 368 debugvm_object_add(obj, file, line, -1); 369 #endif 370 vm_object_unlock(obj); 371 } 372 } 373 374 /* 375 * Initialize a freshly allocated object, returning a held object. 376 * 377 * Used only by vm_object_allocate(), zinitna() and vm_object_init(). 378 * 379 * No requirements. 380 */ 381 void 382 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object, 383 const char *ident) 384 { 385 struct vm_object_hash *hash; 386 387 RB_INIT(&object->rb_memq); 388 lwkt_token_init(&object->token, ident); 389 390 TAILQ_INIT(&object->backing_list); 391 lockinit(&object->backing_lk, "baclk", 0, 0); 392 393 object->type = type; 394 object->size = size; 395 object->ref_count = 1; 396 object->memattr = VM_MEMATTR_DEFAULT; 397 object->hold_count = 0; 398 object->flags = 0; 399 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) 400 vm_object_set_flag(object, OBJ_ONEMAPPING); 401 object->paging_in_progress = 0; 402 object->resident_page_count = 0; 403 /* cpu localization twist */ 404 object->pg_color = vm_quickcolor(); 405 object->handle = NULL; 406 407 atomic_add_int(&object->generation, 1); 408 object->swblock_count = 0; 409 RB_INIT(&object->swblock_root); 410 vm_object_lock_init(object); 411 pmap_object_init(object); 412 413 vm_object_hold(object); 414 415 hash = vmobj_hash(object); 416 lwkt_gettoken(&hash->token); 417 TAILQ_INSERT_TAIL(&hash->list, object, object_entry); 418 lwkt_reltoken(&hash->token); 419 } 420 421 /* 422 * Initialize a VM object. 423 */ 424 void 425 vm_object_init(vm_object_t object, vm_pindex_t size) 426 { 427 _vm_object_allocate(OBJT_DEFAULT, size, object, "vmobj"); 428 vm_object_drop(object); 429 } 430 431 /* 432 * Initialize the VM objects module. 433 * 434 * Called from the low level boot code only. Note that this occurs before 435 * kmalloc is initialized so we cannot allocate any VM objects. 
436 */ 437 void 438 vm_object_init1(void) 439 { 440 int i; 441 442 for (i = 0; i < VMOBJ_HSIZE; ++i) { 443 TAILQ_INIT(&vm_object_hash[i].list); 444 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst"); 445 } 446 447 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), 448 &kernel_object, "kobj"); 449 vm_object_drop(&kernel_object); 450 } 451 452 void 453 vm_object_init2(void) 454 { 455 kmalloc_set_unlimited(M_VM_OBJECT); 456 } 457 458 /* 459 * Allocate and return a new object of the specified type and size. 460 * 461 * No requirements. 462 */ 463 vm_object_t 464 vm_object_allocate(objtype_t type, vm_pindex_t size) 465 { 466 vm_object_t obj; 467 468 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 469 _vm_object_allocate(type, size, obj, "vmobj"); 470 vm_object_drop(obj); 471 472 return (obj); 473 } 474 475 /* 476 * This version returns a held object, allowing further atomic initialization 477 * of the object. 478 */ 479 vm_object_t 480 vm_object_allocate_hold(objtype_t type, vm_pindex_t size) 481 { 482 vm_object_t obj; 483 484 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 485 _vm_object_allocate(type, size, obj, "vmobj"); 486 487 return (obj); 488 } 489 490 /* 491 * Add an additional reference to a vm_object. The object must already be 492 * held. The original non-lock version is no longer supported. The object 493 * must NOT be chain locked by anyone at the time the reference is added. 494 * 495 * The object must be held, but may be held shared if desired (hence why 496 * we use an atomic op). 497 */ 498 void 499 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS) 500 { 501 KKASSERT(object != NULL); 502 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 503 atomic_add_int(&object->ref_count, 1); 504 if (object->type == OBJT_VNODE) { 505 vref(object->handle); 506 /* XXX what if the vnode is being destroyed? */ 507 } 508 #if defined(DEBUG_LOCKS) 509 debugvm_object_add(object, file, line, 1); 510 #endif 511 } 512 513 /* 514 * This version is only allowed in situations where the caller 515 * already knows that the object is deterministically referenced 516 * (usually because its taken from a ref'd vnode, or during a map_entry 517 * replication). 518 */ 519 void 520 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS) 521 { 522 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0); 523 atomic_add_int(&object->ref_count, 1); 524 if (object->type == OBJT_VNODE) 525 vref(object->handle); 526 #if defined(DEBUG_LOCKS) 527 debugvm_object_add(object, file, line, 1); 528 #endif 529 } 530 531 /* 532 * Dereference an object and its underlying vnode. The object may be 533 * held shared. On return the object will remain held. 534 * 535 * This function may return a vnode in *vpp which the caller must release 536 * after the caller drops its own lock. If vpp is NULL, we assume that 537 * the caller was holding an exclusive lock on the object and we vrele() 538 * the vp ourselves. 
539 */ 540 static void 541 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp 542 VMOBJDBARGS) 543 { 544 struct vnode *vp = (struct vnode *) object->handle; 545 int count; 546 547 KASSERT(object->type == OBJT_VNODE, 548 ("vm_object_vndeallocate: not a vnode object")); 549 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); 550 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 551 #ifdef INVARIANTS 552 if (object->ref_count == 0) { 553 vprint("vm_object_vndeallocate", vp); 554 panic("vm_object_vndeallocate: bad object reference count"); 555 } 556 #endif 557 count = object->ref_count; 558 cpu_ccfence(); 559 for (;;) { 560 if (count == 1) { 561 vm_object_upgrade(object); 562 if (atomic_fcmpset_int(&object->ref_count, &count, 0)) { 563 vclrflags(vp, VTEXT); 564 break; 565 } 566 } else { 567 if (atomic_fcmpset_int(&object->ref_count, 568 &count, count - 1)) { 569 break; 570 } 571 } 572 cpu_pause(); 573 /* retry */ 574 } 575 #if defined(DEBUG_LOCKS) 576 debugvm_object_add(object, file, line, -1); 577 #endif 578 579 /* 580 * vrele or return the vp to vrele. We can only safely vrele(vp) 581 * if the object was locked exclusively. But there are two races 582 * here. 583 * 584 * We had to upgrade the object above to safely clear VTEXT 585 * but the alternative path where the shared lock is retained 586 * can STILL race to 0 in other paths and cause our own vrele() 587 * to terminate the vnode. We can't allow that if the VM object 588 * is still locked shared. 589 */ 590 if (vpp) 591 *vpp = vp; 592 else 593 vrele(vp); 594 } 595 596 /* 597 * Release a reference to the specified object, gained either through a 598 * vm_object_allocate or a vm_object_reference call. When all references 599 * are gone, storage associated with this object may be relinquished. 600 * 601 * The caller does not have to hold the object locked but must have control 602 * over the reference in question in order to guarantee that the object 603 * does not get ripped out from under us. 604 * 605 * XXX Currently all deallocations require an exclusive lock. 606 */ 607 void 608 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS) 609 { 610 struct vnode *vp; 611 int count; 612 613 if (object == NULL) 614 return; 615 616 count = object->ref_count; 617 cpu_ccfence(); 618 for (;;) { 619 /* 620 * If decrementing the count enters into special handling 621 * territory (0, 1, or 2) we have to do it the hard way. 622 * Fortunate though, objects with only a few refs like this 623 * are not likely to be heavily contended anyway. 624 * 625 * For vnode objects we only care about 1->0 transitions. 626 */ 627 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) { 628 #if defined(DEBUG_LOCKS) 629 debugvm_object_add(object, file, line, 0); 630 #endif 631 vm_object_hold(object); 632 vm_object_deallocate_locked(object); 633 vm_object_drop(object); 634 break; 635 } 636 637 /* 638 * Try to decrement ref_count without acquiring a hold on 639 * the object. This is particularly important for the exec*() 640 * and exit*() code paths because the program binary may 641 * have a great deal of sharing and an exclusive lock will 642 * crowbar performance in those circumstances. 
643 */ 644 if (object->type == OBJT_VNODE) { 645 vp = (struct vnode *)object->handle; 646 if (atomic_fcmpset_int(&object->ref_count, 647 &count, count - 1)) { 648 #if defined(DEBUG_LOCKS) 649 debugvm_object_add(object, file, line, -1); 650 #endif 651 652 vrele(vp); 653 break; 654 } 655 /* retry */ 656 } else { 657 if (atomic_fcmpset_int(&object->ref_count, 658 &count, count - 1)) { 659 #if defined(DEBUG_LOCKS) 660 debugvm_object_add(object, file, line, -1); 661 #endif 662 break; 663 } 664 /* retry */ 665 } 666 cpu_pause(); 667 /* retry */ 668 } 669 } 670 671 void 672 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS) 673 { 674 /* 675 * Degenerate case 676 */ 677 if (object == NULL) 678 return; 679 680 /* 681 * vnode case, caller either locked the object exclusively 682 * or this is a recursion with must_drop != 0 and the vnode 683 * object will be locked shared. 684 * 685 * If locked shared we have to drop the object before we can 686 * call vrele() or risk a shared/exclusive livelock. 687 */ 688 if (object->type == OBJT_VNODE) { 689 ASSERT_LWKT_TOKEN_HELD(&object->token); 690 vm_object_vndeallocate(object, NULL); 691 return; 692 } 693 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token); 694 695 /* 696 * Normal case (object is locked exclusively) 697 */ 698 if (object->ref_count == 0) { 699 panic("vm_object_deallocate: object deallocated " 700 "too many times: %d", object->type); 701 } 702 if (object->ref_count > 2) { 703 atomic_add_int(&object->ref_count, -1); 704 #if defined(DEBUG_LOCKS) 705 debugvm_object_add(object, file, line, -1); 706 #endif 707 return; 708 } 709 710 /* 711 * Drop the ref and handle termination on the 1->0 transition. 712 * We may have blocked above so we have to recheck. 713 */ 714 KKASSERT(object->ref_count != 0); 715 if (object->ref_count >= 2) { 716 atomic_add_int(&object->ref_count, -1); 717 #if defined(DEBUG_LOCKS) 718 debugvm_object_add(object, file, line, -1); 719 #endif 720 return; 721 } 722 723 atomic_add_int(&object->ref_count, -1); 724 if ((object->flags & OBJ_DEAD) == 0) 725 vm_object_terminate(object); 726 } 727 728 /* 729 * Destroy the specified object, freeing up related resources. 730 * 731 * The object must have zero references. 732 * 733 * The object must held. The caller is responsible for dropping the object 734 * after terminate returns. Terminate does NOT drop the object. 735 */ 736 static int vm_object_terminate_callback(vm_page_t p, void *data); 737 738 void 739 vm_object_terminate(vm_object_t object) 740 { 741 struct rb_vm_page_scan_info info; 742 struct vm_object_hash *hash; 743 744 /* 745 * Make sure no one uses us. Once we set OBJ_DEAD we should be 746 * able to safely block. 747 */ 748 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 749 KKASSERT((object->flags & OBJ_DEAD) == 0); 750 vm_object_set_flag(object, OBJ_DEAD); 751 752 /* 753 * Wait for the pageout daemon to be done with the object 754 */ 755 vm_object_pip_wait(object, "objtrm1"); 756 757 KASSERT(!object->paging_in_progress, 758 ("vm_object_terminate: pageout in progress")); 759 760 /* 761 * Clean and free the pages, as appropriate. All references to the 762 * object are gone, so we don't need to lock it. 763 */ 764 if (object->type == OBJT_VNODE) { 765 struct vnode *vp; 766 767 /* 768 * Clean pages and flush buffers. 769 * 770 * NOTE! TMPFS buffer flushes do not typically flush the 771 * actual page to swap as this would be highly 772 * inefficient, and normal filesystems usually wrap 773 * page flushes with buffer cache buffers. 
774 * 775 * To deal with this we have to call vinvalbuf() both 776 * before and after the vm_object_page_clean(). 777 */ 778 vp = (struct vnode *) object->handle; 779 vinvalbuf(vp, V_SAVE, 0, 0); 780 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 781 vinvalbuf(vp, V_SAVE, 0, 0); 782 } 783 784 /* 785 * Wait for any I/O to complete, after which there had better not 786 * be any references left on the object. 787 */ 788 vm_object_pip_wait(object, "objtrm2"); 789 790 if (object->ref_count != 0) { 791 panic("vm_object_terminate: object with references, " 792 "ref_count=%d", object->ref_count); 793 } 794 795 /* 796 * Cleanup any shared pmaps associated with this object. 797 */ 798 pmap_object_free(object); 799 800 /* 801 * Now free any remaining pages. For internal objects, this also 802 * removes them from paging queues. Don't free wired pages, just 803 * remove them from the object. 804 */ 805 info.count = 0; 806 info.object = object; 807 do { 808 info.error = 0; 809 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 810 vm_object_terminate_callback, &info); 811 } while (info.error); 812 813 /* 814 * Let the pager know object is dead. 815 */ 816 vm_pager_deallocate(object); 817 818 /* 819 * Wait for the object hold count to hit 1, clean out pages as 820 * we go. vmobj_token interlocks any race conditions that might 821 * pick the object up from the vm_object_list after we have cleared 822 * rb_memq. 823 */ 824 for (;;) { 825 if (RB_ROOT(&object->rb_memq) == NULL) 826 break; 827 kprintf("vm_object_terminate: Warning, object %p " 828 "still has %ld pages\n", 829 object, object->resident_page_count); 830 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 831 vm_object_terminate_callback, &info); 832 } 833 834 /* 835 * There had better not be any pages left 836 */ 837 KKASSERT(object->resident_page_count == 0); 838 839 /* 840 * Remove the object from the global object list. 841 */ 842 hash = vmobj_hash(object); 843 lwkt_gettoken(&hash->token); 844 TAILQ_REMOVE(&hash->list, object, object_entry); 845 lwkt_reltoken(&hash->token); 846 847 if (object->ref_count != 0) { 848 panic("vm_object_terminate2: object with references, " 849 "ref_count=%d", object->ref_count); 850 } 851 852 /* 853 * NOTE: The object hold_count is at least 1, so we cannot kfree() 854 * the object here. See vm_object_drop(). 855 */ 856 } 857 858 /* 859 * The caller must hold the object. 860 * 861 * NOTE: It is possible for vm_page's to remain flagged PG_MAPPED 862 * or PG_MAPPED|PG_WRITEABLE, even after pmap_mapped_sync() 863 * is called, due to normal pmap operations. This is because only 864 * global pmap operations on the vm_page can clear the bits and not 865 * just local operations on individual pmaps. 866 * 867 * Most interactions that necessitate the clearing of these bits 868 * proactively call vm_page_protect(), and we must do so here as well. 869 */ 870 static int 871 vm_object_terminate_callback(vm_page_t p, void *data) 872 { 873 struct rb_vm_page_scan_info *info = data; 874 vm_object_t object; 875 876 object = p->object; 877 KKASSERT(object == info->object); 878 if (vm_page_busy_try(p, TRUE)) { 879 vm_page_sleep_busy(p, TRUE, "vmotrm"); 880 info->error = 1; 881 return 0; 882 } 883 if (object != p->object) { 884 /* XXX remove once we determine it can't happen */ 885 kprintf("vm_object_terminate: Warning: Encountered " 886 "busied page %p on queue %d\n", p, p->queue); 887 vm_page_wakeup(p); 888 info->error = 1; 889 } else if (p->wire_count == 0) { 890 /* 891 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 
892 */ 893 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE)) 894 vm_page_protect(p, VM_PROT_NONE); 895 vm_page_free(p); 896 mycpu->gd_cnt.v_pfree++; 897 } else { 898 if (p->queue != PQ_NONE) { 899 kprintf("vm_object_terminate: Warning: Encountered " 900 "wired page %p on queue %d\n", p, p->queue); 901 if (vm_object_debug > 0) { 902 --vm_object_debug; 903 print_backtrace(10); 904 } 905 } 906 if (pmap_mapped_sync(p) & (PG_MAPPED | PG_WRITEABLE)) 907 vm_page_protect(p, VM_PROT_NONE); 908 vm_page_remove(p); 909 vm_page_wakeup(p); 910 } 911 912 /* 913 * Must be at end to avoid SMP races, caller holds object token 914 */ 915 if ((++info->count & 63) == 0) 916 lwkt_user_yield(); 917 return(0); 918 } 919 920 /* 921 * Clean all dirty pages in the specified range of object. Leaves page 922 * on whatever queue it is currently on. If NOSYNC is set then do not 923 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), 924 * leaving the object dirty. 925 * 926 * When stuffing pages asynchronously, allow clustering. XXX we need a 927 * synchronous clustering mode implementation. 928 * 929 * Odd semantics: if start == end, we clean everything. 930 * 931 * The object must be locked? XXX 932 */ 933 static int vm_object_page_clean_pass1(struct vm_page *p, void *data); 934 static int vm_object_page_clean_pass2(struct vm_page *p, void *data); 935 936 void 937 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 938 int flags) 939 { 940 struct rb_vm_page_scan_info info; 941 struct vnode *vp; 942 int wholescan; 943 int pagerflags; 944 int generation; 945 946 vm_object_hold(object); 947 if (object->type != OBJT_VNODE || 948 (object->flags & OBJ_MIGHTBEDIRTY) == 0) { 949 vm_object_drop(object); 950 return; 951 } 952 953 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 954 OBJPC_SYNC : OBJPC_CLUSTER_OK; 955 pagerflags |= (flags & OBJPC_INVAL) ? OBJPC_INVAL : 0; 956 957 vp = object->handle; 958 959 /* 960 * Interlock other major object operations. This allows us to 961 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY. 962 */ 963 vm_object_set_flag(object, OBJ_CLEANING); 964 965 /* 966 * Handle 'entire object' case 967 */ 968 info.start_pindex = start; 969 if (end == 0) { 970 info.end_pindex = object->size - 1; 971 } else { 972 info.end_pindex = end - 1; 973 } 974 wholescan = (start == 0 && info.end_pindex == object->size - 1); 975 info.limit = flags; 976 info.pagerflags = pagerflags; 977 info.object = object; 978 979 /* 980 * If cleaning the entire object do a pass to mark the pages read-only. 981 * If everything worked out ok, clear OBJ_WRITEABLE and 982 * OBJ_MIGHTBEDIRTY. 983 */ 984 if (wholescan) { 985 info.error = 0; 986 info.count = 0; 987 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 988 vm_object_page_clean_pass1, &info); 989 if (info.error == 0) { 990 vm_object_clear_flag(object, 991 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 992 if (object->type == OBJT_VNODE && 993 (vp = (struct vnode *)object->handle) != NULL) { 994 /* 995 * Use new-style interface to clear VISDIRTY 996 * because the vnode is not necessarily removed 997 * from the syncer list(s) as often as it was 998 * under the old interface, which can leave 999 * the vnode on the syncer list after reclaim. 1000 */ 1001 vclrobjdirty(vp); 1002 } 1003 } 1004 } 1005 1006 /* 1007 * Do a pass to clean all the dirty pages we find. 
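         * The do/while loop below repeats the scan until pass2 completes
         * without setting info.error and without the object generation
         * changing, i.e. until no pages were inserted or removed behind
         * our back while we were flushing.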
1008 */ 1009 do { 1010 info.error = 0; 1011 info.count = 0; 1012 generation = object->generation; 1013 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1014 vm_object_page_clean_pass2, &info); 1015 } while (info.error || generation != object->generation); 1016 1017 vm_object_clear_flag(object, OBJ_CLEANING); 1018 vm_object_drop(object); 1019 } 1020 1021 /* 1022 * The caller must hold the object. 1023 */ 1024 static 1025 int 1026 vm_object_page_clean_pass1(struct vm_page *p, void *data) 1027 { 1028 struct rb_vm_page_scan_info *info = data; 1029 1030 KKASSERT(p->object == info->object); 1031 1032 vm_page_flag_set(p, PG_CLEANCHK); 1033 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1034 info->error = 1; 1035 } else if (vm_page_busy_try(p, FALSE)) { 1036 info->error = 1; 1037 } else { 1038 KKASSERT(p->object == info->object); 1039 vm_page_protect(p, VM_PROT_READ); 1040 vm_page_wakeup(p); 1041 } 1042 1043 /* 1044 * Must be at end to avoid SMP races, caller holds object token 1045 */ 1046 if ((++info->count & 63) == 0) 1047 lwkt_user_yield(); 1048 return(0); 1049 } 1050 1051 /* 1052 * The caller must hold the object 1053 */ 1054 static 1055 int 1056 vm_object_page_clean_pass2(struct vm_page *p, void *data) 1057 { 1058 struct rb_vm_page_scan_info *info = data; 1059 int generation; 1060 1061 KKASSERT(p->object == info->object); 1062 1063 /* 1064 * Do not mess with pages that were inserted after we started 1065 * the cleaning pass. 1066 */ 1067 if ((p->flags & PG_CLEANCHK) == 0) 1068 goto done; 1069 1070 generation = info->object->generation; 1071 1072 if (vm_page_busy_try(p, TRUE)) { 1073 vm_page_sleep_busy(p, TRUE, "vpcwai"); 1074 info->error = 1; 1075 goto done; 1076 } 1077 1078 KKASSERT(p->object == info->object && 1079 info->object->generation == generation); 1080 1081 /* 1082 * Before wasting time traversing the pmaps, check for trivial 1083 * cases where the page cannot be dirty. 1084 */ 1085 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { 1086 KKASSERT((p->dirty & p->valid) == 0 && 1087 (p->flags & PG_NEED_COMMIT) == 0); 1088 vm_page_wakeup(p); 1089 goto done; 1090 } 1091 1092 /* 1093 * Check whether the page is dirty or not. The page has been set 1094 * to be read-only so the check will not race a user dirtying the 1095 * page. 1096 */ 1097 vm_page_test_dirty(p); 1098 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) { 1099 vm_page_flag_clear(p, PG_CLEANCHK); 1100 vm_page_wakeup(p); 1101 goto done; 1102 } 1103 1104 /* 1105 * If we have been asked to skip nosync pages and this is a 1106 * nosync page, skip it. Note that the object flags were 1107 * not cleared in this case (because pass1 will have returned an 1108 * error), so we do not have to set them. 1109 */ 1110 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1111 vm_page_flag_clear(p, PG_CLEANCHK); 1112 vm_page_wakeup(p); 1113 goto done; 1114 } 1115 1116 /* 1117 * Flush as many pages as we can. PG_CLEANCHK will be cleared on 1118 * the pages that get successfully flushed. Set info->error if 1119 * we raced an object modification. 1120 */ 1121 vm_object_page_collect_flush(info->object, p, info->pagerflags); 1122 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ 1123 1124 /* 1125 * Must be at end to avoid SMP races, caller holds object token 1126 */ 1127 done: 1128 if ((++info->count & 63) == 0) 1129 lwkt_user_yield(); 1130 return(0); 1131 } 1132 1133 /* 1134 * Collect the specified page and nearby pages and flush them out. 
1135 * The number of pages flushed is returned. The passed page is busied 1136 * by the caller and we are responsible for its disposition. 1137 * 1138 * The caller must hold the object. 1139 */ 1140 static void 1141 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags) 1142 { 1143 int error; 1144 int is; 1145 int ib; 1146 int i; 1147 int page_base; 1148 vm_pindex_t pi; 1149 vm_page_t ma[BLIST_MAX_ALLOC]; 1150 1151 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1152 1153 pi = p->pindex; 1154 page_base = pi % BLIST_MAX_ALLOC; 1155 ma[page_base] = p; 1156 ib = page_base - 1; 1157 is = page_base + 1; 1158 1159 while (ib >= 0) { 1160 vm_page_t tp; 1161 1162 tp = vm_page_lookup_busy_try(object, pi - page_base + ib, 1163 TRUE, &error); 1164 if (error) 1165 break; 1166 if (tp == NULL) 1167 break; 1168 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 && 1169 (tp->flags & PG_CLEANCHK) == 0) { 1170 vm_page_wakeup(tp); 1171 break; 1172 } 1173 if ((tp->queue - tp->pc) == PQ_CACHE) { 1174 vm_page_flag_clear(tp, PG_CLEANCHK); 1175 vm_page_wakeup(tp); 1176 break; 1177 } 1178 vm_page_test_dirty(tp); 1179 if ((tp->dirty & tp->valid) == 0 && 1180 (tp->flags & PG_NEED_COMMIT) == 0) { 1181 vm_page_flag_clear(tp, PG_CLEANCHK); 1182 vm_page_wakeup(tp); 1183 break; 1184 } 1185 ma[ib] = tp; 1186 --ib; 1187 } 1188 ++ib; /* fixup */ 1189 1190 while (is < BLIST_MAX_ALLOC && 1191 pi - page_base + is < object->size) { 1192 vm_page_t tp; 1193 1194 tp = vm_page_lookup_busy_try(object, pi - page_base + is, 1195 TRUE, &error); 1196 if (error) 1197 break; 1198 if (tp == NULL) 1199 break; 1200 if ((pagerflags & OBJPC_IGNORE_CLEANCHK) == 0 && 1201 (tp->flags & PG_CLEANCHK) == 0) { 1202 vm_page_wakeup(tp); 1203 break; 1204 } 1205 if ((tp->queue - tp->pc) == PQ_CACHE) { 1206 vm_page_flag_clear(tp, PG_CLEANCHK); 1207 vm_page_wakeup(tp); 1208 break; 1209 } 1210 vm_page_test_dirty(tp); 1211 if ((tp->dirty & tp->valid) == 0 && 1212 (tp->flags & PG_NEED_COMMIT) == 0) { 1213 vm_page_flag_clear(tp, PG_CLEANCHK); 1214 vm_page_wakeup(tp); 1215 break; 1216 } 1217 ma[is] = tp; 1218 ++is; 1219 } 1220 1221 /* 1222 * All pages in the ma[] array are busied now 1223 */ 1224 for (i = ib; i < is; ++i) { 1225 vm_page_flag_clear(ma[i], PG_CLEANCHK); 1226 vm_page_hold(ma[i]); /* XXX need this any more? */ 1227 } 1228 vm_pageout_flush(&ma[ib], is - ib, pagerflags); 1229 for (i = ib; i < is; ++i) /* XXX need this any more? */ 1230 vm_page_unhold(ma[i]); 1231 } 1232 1233 /* 1234 * Implements the madvise function at the object/page level. 1235 * 1236 * MADV_WILLNEED (any object) 1237 * 1238 * Activate the specified pages if they are resident. 1239 * 1240 * MADV_DONTNEED (any object) 1241 * 1242 * Deactivate the specified pages if they are resident. 1243 * 1244 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only) 1245 * 1246 * Deactivate and clean the specified pages if they are 1247 * resident. This permits the process to reuse the pages 1248 * without faulting or the kernel to reclaim the pages 1249 * without I/O. 1250 * 1251 * No requirements. 1252 */ 1253 void 1254 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, 1255 vm_pindex_t count, int advise) 1256 { 1257 vm_pindex_t end; 1258 vm_page_t m; 1259 int error; 1260 1261 if (object == NULL) 1262 return; 1263 1264 end = pindex + count; 1265 1266 vm_object_hold(object); 1267 1268 /* 1269 * Locate and adjust resident pages. This only applies to the 1270 * primary object in the mapping. 
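         *
         * For example, a userland madvise(addr, len, MADV_DONTNEED) over a
         * region backed by this object ends up in the loop below, which
         * deactivates each resident page via vm_page_dontneed().
         * MADV_WILLNEED activates the pages instead, and MADV_FREE also
         * clears the dirty state and releases any backing swap space.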
         */
        for (; pindex < end; pindex += 1) {
relookup:
                /*
                 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
                 * and those pages must be OBJ_ONEMAPPING.
                 */
                if (advise == MADV_FREE) {
                        if ((object->type != OBJT_DEFAULT &&
                             object->type != OBJT_SWAP) ||
                            (object->flags & OBJ_ONEMAPPING) == 0) {
                                continue;
                        }
                }

                m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

                if (error) {
                        vm_page_sleep_busy(m, TRUE, "madvpo");
                        goto relookup;
                }
                if (m == NULL) {
                        /*
                         * There may be swap even if there is no backing page
                         */
                        if (advise == MADV_FREE && object->type == OBJT_SWAP)
                                swap_pager_freespace(object, pindex, 1);
                        continue;
                }

                /*
                 * If the page is not in a normal active state, we skip it.
                 * If the page is not managed there are no page queues to
                 * mess with.  Things can break if we mess with pages in
                 * any of the below states.
                 */
                if (m->wire_count ||
                    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
                                 PG_NEED_COMMIT)) ||
                    m->valid != VM_PAGE_BITS_ALL
                ) {
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * Theoretically once a page is known not to be busy, an
                 * interrupt cannot come along and rip it out from under us.
                 */
                if (advise == MADV_WILLNEED) {
                        vm_page_activate(m);
                } else if (advise == MADV_DONTNEED) {
                        vm_page_dontneed(m);
                } else if (advise == MADV_FREE) {
                        /*
                         * Mark the page clean.  This will allow the page
                         * to be freed up by the system.  However, such pages
                         * are often reused quickly by malloc()/free()
                         * so we do not do anything that would cause
                         * a page fault if we can help it.
                         *
                         * Specifically, we do not try to actually free
                         * the page now nor do we try to put it in the
                         * cache (which would cause a page fault on reuse).
                         *
                         * But we do make the page as freeable as we can
                         * without actually taking the step of unmapping
                         * it.
                         */
                        pmap_clear_modify(m);
                        m->dirty = 0;
                        m->act_count = 0;
                        vm_page_dontneed(m);
                        if (object->type == OBJT_SWAP)
                                swap_pager_freespace(object, pindex, 1);
                }
                vm_page_wakeup(m);
        }
        vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
                      boolean_t clean_only)
{
        struct rb_vm_page_scan_info info;
        int all;

        /*
         * Degenerate cases and assertions.
         *
         * NOTE: Don't shortcut on resident_page_count for MGTDEVICE objects.
         *       These objects do not have to have their pages entered into
         *       them and are handled via their vm_map_backing lists.
         */
        vm_object_hold(object);
        if (object == NULL ||
            (object->type != OBJT_MGTDEVICE &&
             object->resident_page_count == 0 && object->swblock_count == 0)) {
                vm_object_drop(object);
                return;
        }
        KASSERT(object->type != OBJT_PHYS,
                ("attempt to remove pages from a physical object"));

        /*
         * Indicate that paging is occurring on the object
         */
        vm_object_pip_add(object, 1);

        /*
         * Figure out the actual removal range and whether we are removing
         * the entire contents of the object or not.
If removing the entire 1392 * contents, be sure to get all pages, even those that might be 1393 * beyond the end of the object. 1394 * 1395 * NOTE: end is non-inclusive, but info.end_pindex is inclusive. 1396 */ 1397 info.object = object; 1398 info.start_pindex = start; 1399 if (end == 0 || end == (vm_pindex_t)-1) { 1400 info.end_pindex = (vm_pindex_t)-1; 1401 end = object->size; 1402 } else { 1403 info.end_pindex = end - 1; 1404 } 1405 info.limit = clean_only; 1406 info.count = 0; 1407 all = (start == 0 && info.end_pindex >= object->size - 1); 1408 1409 /* 1410 * Efficiently remove pages from the pmap via a backing scan. 1411 * 1412 * NOTE: This is the only way pages can be removed and unwired 1413 * from OBJT_MGTDEVICE devices which typically do not enter 1414 * their pages into the vm_object's RB tree. And possibly 1415 * other OBJT_* types in the future. 1416 */ 1417 { 1418 vm_map_backing_t ba; 1419 vm_pindex_t sba, eba; 1420 vm_offset_t sva, eva; 1421 1422 lockmgr(&object->backing_lk, LK_EXCLUSIVE); 1423 TAILQ_FOREACH(ba, &object->backing_list, entry) { 1424 /* 1425 * object offset range within the ba, intersectioned 1426 * with the page range specified for the object 1427 */ 1428 sba = OFF_TO_IDX(ba->offset); 1429 eba = sba + OFF_TO_IDX(ba->end - ba->start); 1430 if (sba < start) 1431 sba = start; 1432 if (eba > end) 1433 eba = end; 1434 1435 /* 1436 * If the intersection is valid, remove the related 1437 * pages. 1438 * 1439 * NOTE! This may also remove other incidental pages 1440 * in the pmap, as the backing area may be 1441 * overloaded. 1442 * 1443 * NOTE! pages for MGTDEVICE objects are only removed 1444 * here, they aren't entered into rb_memq, so 1445 * we must use pmap_remove() instead of 1446 * the non-TLB-invalidating pmap_remove_pages(). 1447 */ 1448 if (sba < eba) { 1449 sva = ba->start + IDX_TO_OFF(sba) - ba->offset; 1450 eva = sva + IDX_TO_OFF(eba - sba); 1451 #if 0 1452 kprintf("VM_OBJECT_PAGE_REMOVE " 1453 "%p[%016jx] %016jx-%016jx\n", 1454 ba->pmap, ba->start, sva, eva); 1455 #endif 1456 pmap_remove(ba->pmap, sva, eva); 1457 } 1458 } 1459 lockmgr(&object->backing_lk, LK_RELEASE); 1460 } 1461 1462 /* 1463 * Remove and free pages entered onto the object list. Note that 1464 * for OBJT_MGTDEVICE objects, there are typically no pages entered. 1465 * 1466 * Loop until we are sure we have gotten them all. 1467 */ 1468 do { 1469 info.error = 0; 1470 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1471 vm_object_page_remove_callback, &info); 1472 } while (info.error); 1473 1474 /* 1475 * Remove any related swap if throwing away pages, or for 1476 * non-swap objects (the swap is a clean copy in that case). 1477 */ 1478 if (object->type != OBJT_SWAP || clean_only == FALSE) { 1479 if (all) 1480 swap_pager_freespace_all(object); 1481 else 1482 swap_pager_freespace(object, info.start_pindex, 1483 info.end_pindex - info.start_pindex + 1); 1484 } 1485 1486 /* 1487 * Cleanup 1488 */ 1489 vm_object_pip_wakeup(object); 1490 vm_object_drop(object); 1491 } 1492 1493 /* 1494 * The caller must hold the object. 1495 * 1496 * NOTE: User yields are allowed when removing more than one page, but not 1497 * allowed if only removing one page (the path for single page removals 1498 * might hold a spinlock). 
1499 */ 1500 static int 1501 vm_object_page_remove_callback(vm_page_t p, void *data) 1502 { 1503 struct rb_vm_page_scan_info *info = data; 1504 1505 if (info->object != p->object || 1506 p->pindex < info->start_pindex || 1507 p->pindex > info->end_pindex) { 1508 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n", 1509 info->object, p); 1510 return(0); 1511 } 1512 if (vm_page_busy_try(p, TRUE)) { 1513 vm_page_sleep_busy(p, TRUE, "vmopar"); 1514 info->error = 1; 1515 return(0); 1516 } 1517 if (info->object != p->object) { 1518 /* this should never happen */ 1519 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n", 1520 info->object, p); 1521 vm_page_wakeup(p); 1522 return(0); 1523 } 1524 1525 /* 1526 * Wired pages cannot be destroyed, but they can be invalidated 1527 * and we do so if clean_only (limit) is not set. 1528 * 1529 * WARNING! The page may be wired due to being part of a buffer 1530 * cache buffer, and the buffer might be marked B_CACHE. 1531 * This is fine as part of a truncation but VFSs must be 1532 * sure to fix the buffer up when re-extending the file. 1533 * 1534 * NOTE! PG_NEED_COMMIT is ignored. 1535 */ 1536 if (p->wire_count != 0) { 1537 vm_page_protect(p, VM_PROT_NONE); 1538 if (info->limit == 0) 1539 p->valid = 0; 1540 vm_page_wakeup(p); 1541 goto done; 1542 } 1543 1544 /* 1545 * limit is our clean_only flag. If set and the page is dirty or 1546 * requires a commit, do not free it. If set and the page is being 1547 * held by someone, do not free it. 1548 */ 1549 if (info->limit && p->valid) { 1550 vm_page_test_dirty(p); 1551 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1552 vm_page_wakeup(p); 1553 goto done; 1554 } 1555 } 1556 1557 /* 1558 * Destroy the page. But we have to re-test whether its dirty after 1559 * removing it from its pmaps. 1560 */ 1561 vm_page_protect(p, VM_PROT_NONE); 1562 if (info->limit && p->valid) { 1563 vm_page_test_dirty(p); 1564 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1565 vm_page_wakeup(p); 1566 goto done; 1567 } 1568 } 1569 vm_page_free(p); 1570 1571 /* 1572 * Must be at end to avoid SMP races, caller holds object token 1573 */ 1574 done: 1575 if ((++info->count & 63) == 0) 1576 lwkt_user_yield(); 1577 1578 return(0); 1579 } 1580 1581 /* 1582 * Try to extend prev_object into an adjoining region of virtual 1583 * memory, return TRUE on success. 1584 * 1585 * The caller does not need to hold (prev_object) but must have a stable 1586 * pointer to it (typically by holding the vm_map locked). 1587 * 1588 * This function only works for anonymous memory objects which either 1589 * have (a) one reference or (b) we are extending the object's size. 1590 * Otherwise the related VM pages we want to use for the object might 1591 * be in use by another mapping. 
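 *
 * Illustrative example (sizes are hypothetical): growing a 4-page
 * anonymous mapping by 2 adjacent pages would call this with
 * prev_size = 4 * PAGE_SIZE and next_size = 2 * PAGE_SIZE.  Both byte
 * counts are converted to page counts below and, on success,
 * prev_object->size is simply extended to cover the new range.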
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
                   vm_size_t prev_size, vm_size_t next_size)
{
        vm_pindex_t next_pindex;

        if (prev_object == NULL)
                return (TRUE);

        vm_object_hold(prev_object);

        if (prev_object->type != OBJT_DEFAULT &&
            prev_object->type != OBJT_SWAP) {
                vm_object_drop(prev_object);
                return (FALSE);
        }

#if 0
        /* caller now checks this */
        /*
         * Try to collapse the object first
         */
        vm_object_collapse(prev_object, NULL);
#endif

#if 0
        /* caller now checks this */
        /*
         * We can't coalesce if we shadow another object (figuring out the
         * relationships becomes too complex).
         */
        if (prev_object->backing_object != NULL) {
                vm_object_chain_release(prev_object);
                vm_object_drop(prev_object);
                return (FALSE);
        }
#endif

        prev_size >>= PAGE_SHIFT;
        next_size >>= PAGE_SHIFT;
        next_pindex = prev_pindex + prev_size;

        /*
         * We can't if the object has more than one ref count unless we
         * are extending it into newly minted space.
         */
        if (prev_object->ref_count > 1 &&
            prev_object->size != next_pindex) {
                vm_object_drop(prev_object);
                return (FALSE);
        }

        /*
         * Remove any pages that may still be in the object from a previous
         * deallocation.
         */
        if (next_pindex < prev_object->size) {
                vm_object_page_remove(prev_object,
                                      next_pindex,
                                      next_pindex + next_size, FALSE);
                if (prev_object->type == OBJT_SWAP)
                        swap_pager_freespace(prev_object,
                                             next_pindex, next_size);
        }

        /*
         * Extend the object if necessary.
         */
        if (next_pindex + next_size > prev_object->size)
                prev_object->size = next_pindex + next_size;
        vm_object_drop(prev_object);

        return (TRUE);
}

/*
 * Make the object writable and flag it as being possibly dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
        struct vnode *vp;

        /*vm_object_assert_held(object);*/
        /*
         * Avoid contention in vm fault path by checking the state before
         * issuing an atomic op on it.
         */
        if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
            (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
                vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
        }
        if (object->type == OBJT_VNODE &&
            (vp = (struct vnode *)object->handle) != NULL) {
                if ((vp->v_flag & VOBJDIRTY) == 0) {
                        if (vp->v_mount &&
                            (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
                                /*
                                 * New style THR_SYNC places vnodes on the
                                 * syncer list more deterministically.
                                 */
                                vsetobjdirty(vp);
                        } else {
                                /*
                                 * Old style scan would not necessarily place
                                 * a vnode on the syncer list when possibly
                                 * modified via mmap.
1709 */ 1710 vsetflags(vp, VOBJDIRTY); 1711 } 1712 } 1713 } 1714 } 1715 1716 #include "opt_ddb.h" 1717 #ifdef DDB 1718 #include <sys/cons.h> 1719 1720 #include <ddb/ddb.h> 1721 1722 static int _vm_object_in_map (vm_map_t map, vm_object_t object, 1723 vm_map_entry_t entry); 1724 static int vm_object_in_map (vm_object_t object); 1725 1726 /* 1727 * The caller must hold the object. 1728 */ 1729 static int 1730 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) 1731 { 1732 vm_map_backing_t ba; 1733 vm_map_t tmpm; 1734 vm_map_entry_t tmpe; 1735 int entcount; 1736 1737 if (map == NULL) 1738 return 0; 1739 if (entry == NULL) { 1740 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root); 1741 entcount = map->nentries; 1742 while (entcount-- && tmpe) { 1743 if( _vm_object_in_map(map, object, tmpe)) { 1744 return 1; 1745 } 1746 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1747 } 1748 return (0); 1749 } 1750 switch(entry->maptype) { 1751 case VM_MAPTYPE_SUBMAP: 1752 tmpm = entry->ba.sub_map; 1753 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root); 1754 entcount = tmpm->nentries; 1755 while (entcount-- && tmpe) { 1756 if( _vm_object_in_map(tmpm, object, tmpe)) { 1757 return 1; 1758 } 1759 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1760 } 1761 break; 1762 case VM_MAPTYPE_NORMAL: 1763 case VM_MAPTYPE_VPAGETABLE: 1764 ba = &entry->ba; 1765 while (ba) { 1766 if (ba->object == object) 1767 return TRUE; 1768 ba = ba->backing_ba; 1769 } 1770 break; 1771 default: 1772 break; 1773 } 1774 return 0; 1775 } 1776 1777 static int vm_object_in_map_callback(struct proc *p, void *data); 1778 1779 struct vm_object_in_map_info { 1780 vm_object_t object; 1781 int rv; 1782 }; 1783 1784 /* 1785 * Debugging only 1786 */ 1787 static int 1788 vm_object_in_map(vm_object_t object) 1789 { 1790 struct vm_object_in_map_info info; 1791 1792 info.rv = 0; 1793 info.object = object; 1794 1795 allproc_scan(vm_object_in_map_callback, &info, 0); 1796 if (info.rv) 1797 return 1; 1798 if( _vm_object_in_map(&kernel_map, object, 0)) 1799 return 1; 1800 if( _vm_object_in_map(&pager_map, object, 0)) 1801 return 1; 1802 if( _vm_object_in_map(&buffer_map, object, 0)) 1803 return 1; 1804 return 0; 1805 } 1806 1807 /* 1808 * Debugging only 1809 */ 1810 static int 1811 vm_object_in_map_callback(struct proc *p, void *data) 1812 { 1813 struct vm_object_in_map_info *info = data; 1814 1815 if (p->p_vmspace) { 1816 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) { 1817 info->rv = 1; 1818 return -1; 1819 } 1820 } 1821 return (0); 1822 } 1823 1824 DB_SHOW_COMMAND(vmochk, vm_object_check) 1825 { 1826 struct vm_object_hash *hash; 1827 vm_object_t object; 1828 int n; 1829 1830 /* 1831 * make sure that internal objs are in a map somewhere 1832 * and none have zero ref counts. 
1833 */ 1834 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1835 hash = &vm_object_hash[n]; 1836 for (object = TAILQ_FIRST(&hash->list); 1837 object != NULL; 1838 object = TAILQ_NEXT(object, object_entry)) { 1839 if (object->type == OBJT_MARKER) 1840 continue; 1841 if (object->handle != NULL || 1842 (object->type != OBJT_DEFAULT && 1843 object->type != OBJT_SWAP)) { 1844 continue; 1845 } 1846 if (object->ref_count == 0) { 1847 db_printf("vmochk: internal obj has " 1848 "zero ref count: %ld\n", 1849 (long)object->size); 1850 } 1851 if (vm_object_in_map(object)) 1852 continue; 1853 db_printf("vmochk: internal obj is not in a map: " 1854 "ref: %d, size: %lu: 0x%lx\n", 1855 object->ref_count, (u_long)object->size, 1856 (u_long)object->size); 1857 } 1858 } 1859 } 1860 1861 /* 1862 * Debugging only 1863 */ 1864 DB_SHOW_COMMAND(object, vm_object_print_static) 1865 { 1866 /* XXX convert args. */ 1867 vm_object_t object = (vm_object_t)addr; 1868 boolean_t full = have_addr; 1869 1870 vm_page_t p; 1871 1872 /* XXX count is an (unused) arg. Avoid shadowing it. */ 1873 #define count was_count 1874 1875 int count; 1876 1877 if (object == NULL) 1878 return; 1879 1880 db_iprintf( 1881 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n", 1882 object, (int)object->type, (u_long)object->size, 1883 object->resident_page_count, object->ref_count, object->flags); 1884 /* 1885 * XXX no %qd in kernel. Truncate object->backing_object_offset. 1886 */ 1887 db_iprintf("\n"); 1888 1889 if (!full) 1890 return; 1891 1892 db_indent += 2; 1893 count = 0; 1894 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) { 1895 if (count == 0) 1896 db_iprintf("memory:="); 1897 else if (count == 6) { 1898 db_printf("\n"); 1899 db_iprintf(" ..."); 1900 count = 0; 1901 } else 1902 db_printf(","); 1903 count++; 1904 1905 db_printf("(off=0x%lx,page=0x%lx)", 1906 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); 1907 } 1908 if (count != 0) 1909 db_printf("\n"); 1910 db_indent -= 2; 1911 } 1912 1913 /* XXX. */ 1914 #undef count 1915 1916 /* 1917 * XXX need this non-static entry for calling from vm_map_print. 
1918 * 1919 * Debugging only 1920 */ 1921 void 1922 vm_object_print(/* db_expr_t */ long addr, 1923 boolean_t have_addr, 1924 /* db_expr_t */ long count, 1925 char *modif) 1926 { 1927 vm_object_print_static(addr, have_addr, count, modif); 1928 } 1929 1930 /* 1931 * Debugging only 1932 */ 1933 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1934 { 1935 struct vm_object_hash *hash; 1936 vm_object_t object; 1937 int nl = 0; 1938 int c; 1939 int n; 1940 1941 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1942 hash = &vm_object_hash[n]; 1943 for (object = TAILQ_FIRST(&hash->list); 1944 object != NULL; 1945 object = TAILQ_NEXT(object, object_entry)) { 1946 vm_pindex_t idx, fidx; 1947 vm_pindex_t osize; 1948 vm_paddr_t pa = -1, padiff; 1949 int rcount; 1950 vm_page_t m; 1951 1952 if (object->type == OBJT_MARKER) 1953 continue; 1954 db_printf("new object: %p\n", (void *)object); 1955 if ( nl > 18) { 1956 c = cngetc(); 1957 if (c != ' ') 1958 return; 1959 nl = 0; 1960 } 1961 nl++; 1962 rcount = 0; 1963 fidx = 0; 1964 osize = object->size; 1965 if (osize > 128) 1966 osize = 128; 1967 for (idx = 0; idx < osize; idx++) { 1968 m = vm_page_lookup(object, idx); 1969 if (m == NULL) { 1970 if (rcount) { 1971 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1972 (long)fidx, rcount, (long)pa); 1973 if ( nl > 18) { 1974 c = cngetc(); 1975 if (c != ' ') 1976 return; 1977 nl = 0; 1978 } 1979 nl++; 1980 rcount = 0; 1981 } 1982 continue; 1983 } 1984 1985 if (rcount && 1986 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1987 ++rcount; 1988 continue; 1989 } 1990 if (rcount) { 1991 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1992 padiff >>= PAGE_SHIFT; 1993 padiff &= PQ_L2_MASK; 1994 if (padiff == 0) { 1995 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1996 ++rcount; 1997 continue; 1998 } 1999 db_printf(" index(%ld)run(%d)pa(0x%lx)", 2000 (long)fidx, rcount, (long)pa); 2001 db_printf("pd(%ld)\n", (long)padiff); 2002 if ( nl > 18) { 2003 c = cngetc(); 2004 if (c != ' ') 2005 return; 2006 nl = 0; 2007 } 2008 nl++; 2009 } 2010 fidx = idx; 2011 pa = VM_PAGE_TO_PHYS(m); 2012 rcount = 1; 2013 } 2014 if (rcount) { 2015 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 2016 (long)fidx, rcount, (long)pa); 2017 if ( nl > 18) { 2018 c = cngetc(); 2019 if (c != ' ') 2020 return; 2021 nl = 0; 2022 } 2023 nl++; 2024 } 2025 } 2026 } 2027 } 2028 #endif /* DDB */ 2029