/*
 * Copyright (c) 1991, 1993, 2013
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
 */

/*
 * Virtual memory object module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>		/* for curproc, pageproc */
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/refcount.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <vm/vm_page2.h>

#include <machine/specialreg.h>

#define EASY_SCAN_FACTOR	8

static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
					     int pagerflags);
static void	vm_object_lock_init(vm_object_t);

/*
 *	Virtual memory objects maintain the actual data
 *	associated with allocated virtual memory.  A given
 *	page of memory exists within exactly one object.
 *
 *	An object is only deallocated when all "references"
 *	are given up.  Only one "reference" to a given
 *	region of an object should be writeable.
 *
 *	Associated with each object is a list of all resident
 *	memory pages belonging to that object; this list is
 *	maintained by the "vm_page" module, and locked by the object's
 *	lock.
 *
 *	Each object also records a "pager" routine which is
 *	used to retrieve (and store) pages to the proper backing
 *	storage.  In addition, objects may be backed by other
 *	objects from which they were virtual-copied.
 *
 *	The only items within the object structure which are
 *	modified after time of creation are:
 *		reference count		locked by object's lock
 *		pager routine		locked by object's lock
 *
 */

struct vm_object kernel_object;

struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];

MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");

#define VMOBJ_HASH_PRIME1	66555444443333333ULL
#define VMOBJ_HASH_PRIME2	989042931893ULL

int vm_object_debug;
SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");

static __inline
struct vm_object_hash *
vmobj_hash(vm_object_t obj)
{
	uintptr_t hash1;
	uintptr_t hash2;

	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
	hash1 %= VMOBJ_HASH_PRIME1;
	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
	hash2 %= VMOBJ_HASH_PRIME2;
	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
}

#if defined(DEBUG_LOCKS)

#define vm_object_vndeallocate(obj, vpp)	\
		debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)

/*
 * Debug helper to track hold/drop/ref/deallocate calls.
 */
static void
debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
{
	int i;

	i = atomic_fetchadd_int(&obj->debug_index, 1);
	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
	ksnprintf(obj->debug_hold_thrs[i],
		  sizeof(obj->debug_hold_thrs[i]),
		  "%c%d:(%d):%s",
		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
		  (curthread->td_proc ?
		   curthread->td_proc->p_pid : -1),
		  obj->ref_count,
		  curthread->td_comm);
	obj->debug_hold_file[i] = file;
	obj->debug_hold_line[i] = line;
#if 0
	/* Uncomment for debugging obj refs/derefs in reproducible cases */
	if (strcmp(curthread->td_comm, "sshd") == 0) {
		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
			obj, obj->ref_count, addrem, file, line);
	}
#endif
}

#endif

/*
 * Misc low level routines
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
	int i;

	obj->debug_index = 0;
	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
		obj->debug_hold_thrs[i][0] = 0;
		obj->debug_hold_file[i] = NULL;
		obj->debug_hold_line[i] = 0;
	}
#endif
}

void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}

void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}

/*
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}

void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}

void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}

void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}

void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}

int
vm_quickcolor(void)
{
	globaldata_t gd = mycpu;
	int pg_color;

	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
	pg_color += gd->gd_quick_color;
	gd->gd_quick_color += PQ_PRIME2;

	return pg_color;
}

void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
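	 *
	 * If the try-lock below fails, the hold acquired here is released
	 * again and a dead object whose ref_count and hold_count have both
	 * reached zero is freed on the spot, mirroring vm_object_drop().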
	 */
	refcount_acquire(&obj->hold_count);
	if (vm_object_lock_try(obj) == 0) {
		if (refcount_release(&obj->hold_count)) {
			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
				kfree(obj, M_VM_OBJECT);
		}
		return(0);
	}

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
	return(1);
}

void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Drop the token and hold_count on the object.
 *
 * WARNING! Token might be shared.
 */
void
VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
{
	if (obj == NULL)
		return;

	/*
	 * No new holders should be possible once we drop hold_count 1->0 as
	 * there is no longer any way to reference the object.
	 */
	KKASSERT(obj->hold_count > 0);
	if (refcount_release(&obj->hold_count)) {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif

		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
			vm_object_unlock(obj);
			kfree(obj, M_VM_OBJECT);
		} else {
			vm_object_unlock(obj);
		}
	} else {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif
		vm_object_unlock(obj);
	}
}

/*
 * Initialize a freshly allocated object, returning a held object.
 *
 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
 *
 * No requirements.
 */
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
{
	struct vm_object_hash *hash;

	RB_INIT(&object->rb_memq);
	lwkt_token_init(&object->token, "vmobj");

	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->memattr = VM_MEMATTR_DEFAULT;
	object->hold_count = 0;
	object->flags = 0;
	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
		vm_object_set_flag(object, OBJ_ONEMAPPING);
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	/* cpu localization twist */
	object->pg_color = vm_quickcolor();
	object->handle = NULL;

	atomic_add_int(&object->generation, 1);
	object->swblock_count = 0;
	RB_INIT(&object->swblock_root);
	vm_object_lock_init(object);
	pmap_object_init(object);

	vm_object_hold(object);

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_list);
	lwkt_reltoken(&hash->token);
}

/*
 * Initialize a VM object.
 */
void
vm_object_init(vm_object_t object, vm_pindex_t size)
{
	_vm_object_allocate(OBJT_DEFAULT, size, object);
	vm_object_drop(object);
}

/*
 * Initialize the VM objects module.
 *
 * Called from the low level boot code only.  Note that this occurs before
 * kmalloc is initialized so we cannot allocate any VM objects.
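 *
 * This is why kernel_object is a statically declared struct vm_object
 * rather than a kmalloc()d one, and why only the hash table lists and
 * tokens are set up here.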
 */
void
vm_object_init1(void)
{
	int i;

	for (i = 0; i < VMOBJ_HSIZE; ++i) {
		TAILQ_INIT(&vm_object_hash[i].list);
		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
	}

	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
			    &kernel_object);
	vm_object_drop(&kernel_object);
}

void
vm_object_init2(void)
{
	kmalloc_set_unlimited(M_VM_OBJECT);
}

/*
 * Allocate and return a new object of the specified type and size.
 *
 * No requirements.
 */
vm_object_t
vm_object_allocate(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);
	vm_object_drop(obj);

	return (obj);
}

/*
 * This version returns a held object, allowing further atomic initialization
 * of the object.
 */
vm_object_t
vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);

	return (obj);
}

/*
 * Add an additional reference to a vm_object.  The object must already be
 * held.  The original non-lock version is no longer supported.  The object
 * must NOT be chain locked by anyone at the time the reference is added.
 *
 * The object must be held, but may be held shared if desired (hence the
 * use of an atomic op).
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * This version is only allowed for vnode objects.
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE);
	atomic_add_int(&object->ref_count, 1);
	vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;

	KASSERT(object->type == OBJT_VNODE,
	    ("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	for (;;) {
		int count = object->ref_count;
		cpu_ccfence();
		if (count == 1) {
			vm_object_upgrade(object);
			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
				break;
			}
		}
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}

/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * XXX Currently all deallocations require an exclusive lock.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	for (;;) {
		count = object->ref_count;
		cpu_ccfence();

		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
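		 *
		 * This lockless path is only reached when the count is high
		 * enough (checked above) that the decrement cannot enter the
		 * special-handling territory.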
		 */
		if (object->type == OBJT_VNODE) {
			vp = (struct vnode *)object->handle;
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif

				vrele(vp);
				break;
			}
			/* retry */
		} else {
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif
				break;
			}
			/* retry */
		}
		/* retry */
	}
}

void
VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
{
	/*
	 * Degenerate case
	 */
	if (object == NULL)
		return;

	/*
	 * vnode case, caller either locked the object exclusively
	 * or this is a recursion with must_drop != 0 and the vnode
	 * object will be locked shared.
	 *
	 * If locked shared we have to drop the object before we can
	 * call vrele() or risk a shared/exclusive livelock.
	 */
	if (object->type == OBJT_VNODE) {
		ASSERT_LWKT_TOKEN_HELD(&object->token);
		vm_object_vndeallocate(object, NULL);
		return;
	}
	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);

	/*
	 * Normal case (object is locked exclusively)
	 */
	if (object->ref_count == 0) {
		panic("vm_object_deallocate: object deallocated "
		      "too many times: %d", object->type);
	}
	if (object->ref_count > 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Drop the ref and handle termination on the 1->0 transition.
	 * We may have blocked above so we have to recheck.
	 */
	KKASSERT(object->ref_count != 0);
	if (object->ref_count >= 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	atomic_add_int(&object->ref_count, -1);
	if ((object->flags & OBJ_DEAD) == 0)
		vm_object_terminate(object);
}

/*
 * Destroy the specified object, freeing up related resources.
 *
 * The object must have zero references.
 *
 * The object must be held.  The caller is responsible for dropping the
 * object after terminate returns.  Terminate does NOT drop the object.
 */
static int vm_object_terminate_callback(vm_page_t p, void *data);

void
vm_object_terminate(vm_object_t object)
{
	struct rb_vm_page_scan_info info;
	struct vm_object_hash *hash;

	/*
	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
	 * able to safely block.
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	KKASSERT((object->flags & OBJ_DEAD) == 0);
	vm_object_set_flag(object, OBJ_DEAD);

	/*
	 * Wait for the pageout daemon to be done with the object
	 */
	vm_object_pip_wait(object, "objtrm1");

	KASSERT(!object->paging_in_progress,
		("vm_object_terminate: pageout in progress"));

	/*
	 * Clean and free the pages, as appropriate.  All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp;

		/*
		 * Clean pages and flush buffers.
		 *
		 * NOTE!  TMPFS buffer flushes do not typically flush the
		 *	  actual page to swap as this would be highly
		 *	  inefficient, and normal filesystems usually wrap
		 *	  page flushes with buffer cache buffers.
		 *
		 *	  To deal with this we have to call vinvalbuf() both
		 *	  before and after the vm_object_page_clean().
		 */
		vp = (struct vnode *) object->handle;
		vinvalbuf(vp, V_SAVE, 0, 0);
		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
		vinvalbuf(vp, V_SAVE, 0, 0);
	}

	/*
	 * Wait for any I/O to complete, after which there had better not
	 * be any references left on the object.
	 */
	vm_object_pip_wait(object, "objtrm2");

	if (object->ref_count != 0) {
		panic("vm_object_terminate: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * Cleanup any shared pmaps associated with this object.
	 */
	pmap_object_free(object);

	/*
	 * Now free any remaining pages.  For internal objects, this also
	 * removes them from paging queues.  Don't free wired pages, just
	 * remove them from the object.
	 */
	info.count = 0;
	info.object = object;
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	} while (info.error);

	/*
	 * Let the pager know object is dead.
	 */
	vm_pager_deallocate(object);

	/*
	 * Wait for the object hold count to hit 1, clean out pages as
	 * we go.  vmobj_token interlocks any race conditions that might
	 * pick the object up from the vm_object_list after we have cleared
	 * rb_memq.
	 */
	for (;;) {
		if (RB_ROOT(&object->rb_memq) == NULL)
			break;
		kprintf("vm_object_terminate: Warning, object %p "
			"still has %ld pages\n",
			object, object->resident_page_count);
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
					vm_object_terminate_callback, &info);
	}

	/*
	 * There had better not be any pages left
	 */
	KKASSERT(object->resident_page_count == 0);

	/*
	 * Remove the object from the global object list.
	 */
	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_REMOVE(&hash->list, object, object_list);
	lwkt_reltoken(&hash->token);

	if (object->ref_count != 0) {
		panic("vm_object_terminate2: object with references, "
		      "ref_count=%d", object->ref_count);
	}

	/*
	 * NOTE: The object hold_count is at least 1, so we cannot kfree()
	 *	 the object here.  See vm_object_drop().
	 */
}

/*
 * The caller must hold the object.
 */
static int
vm_object_terminate_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	vm_object_t object;

	object = p->object;
	KKASSERT(object == info->object);
	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vmotrm");
		info->error = 1;
		return 0;
	}
	if (object != p->object) {
		/* XXX remove once we determine it can't happen */
		kprintf("vm_object_terminate: Warning: Encountered "
			"busied page %p on queue %d\n", p, p->queue);
		vm_page_wakeup(p);
		info->error = 1;
	} else if (p->wire_count == 0) {
		/*
		 * NOTE: p->dirty and PG_NEED_COMMIT are ignored.
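		 *	 The object is being destroyed, so there is no
		 *	 point writing the page contents back out.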
		 */
		vm_page_free(p);
		mycpu->gd_cnt.v_pfree++;
	} else {
		if (p->queue != PQ_NONE) {
			kprintf("vm_object_terminate: Warning: Encountered "
				"wired page %p on queue %d\n", p, p->queue);
			if (vm_object_debug > 0) {
				--vm_object_debug;
				print_backtrace(10);
			}
		}
		vm_page_remove(p);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Clean all dirty pages in the specified range of object.  Leaves page
 * on whatever queue it is currently on.   If NOSYNC is set then do not
 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
 * leaving the object dirty.
 *
 * When stuffing pages asynchronously, allow clustering.  XXX we need a
 * synchronous clustering mode implementation.
 *
 * Odd semantics: if start == end, we clean everything.
 *
 * The object must be locked? XXX
 */
static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
static int vm_object_page_clean_pass2(struct vm_page *p, void *data);

void
vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		     int flags)
{
	struct rb_vm_page_scan_info info;
	struct vnode *vp;
	int wholescan;
	int pagerflags;
	int generation;

	vm_object_hold(object);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
		vm_object_drop(object);
		return;
	}

	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;

	vp = object->handle;

	/*
	 * Interlock other major object operations.  This allows us to
	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
	 */
	vm_object_set_flag(object, OBJ_CLEANING);

	/*
	 * Handle 'entire object' case
	 */
	info.start_pindex = start;
	if (end == 0) {
		info.end_pindex = object->size - 1;
	} else {
		info.end_pindex = end - 1;
	}
	wholescan = (start == 0 && info.end_pindex == object->size - 1);
	info.limit = flags;
	info.pagerflags = pagerflags;
	info.object = object;

	/*
	 * If cleaning the entire object do a pass to mark the pages read-only.
	 * If everything worked out ok, clear OBJ_WRITEABLE and
	 * OBJ_MIGHTBEDIRTY.
	 */
	if (wholescan) {
		info.error = 0;
		info.count = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass1, &info);
		if (info.error == 0) {
			vm_object_clear_flag(object,
					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
			if (object->type == OBJT_VNODE &&
			    (vp = (struct vnode *)object->handle) != NULL) {
				/*
				 * Use new-style interface to clear VISDIRTY
				 * because the vnode is not necessarily removed
				 * from the syncer list(s) as often as it was
				 * under the old interface, which can leave
				 * the vnode on the syncer list after reclaim.
				 */
				vclrobjdirty(vp);
			}
		}
	}

	/*
	 * Do a pass to clean all the dirty pages we find.
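	 *
	 * Repeat the scan if the callback flagged an error (a page was
	 * busy) or if the object generation changed underneath us while
	 * we were flushing.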
	 */
	do {
		info.error = 0;
		info.count = 0;
		generation = object->generation;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass2, &info);
	} while (info.error || generation != object->generation);

	vm_object_clear_flag(object, OBJ_CLEANING);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 */
static
int
vm_object_page_clean_pass1(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	KKASSERT(p->object == info->object);

	vm_page_flag_set(p, PG_CLEANCHK);
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		info->error = 1;
	} else if (vm_page_busy_try(p, FALSE)) {
		info->error = 1;
	} else {
		KKASSERT(p->object == info->object);
		vm_page_protect(p, VM_PROT_READ);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * The caller must hold the object
 */
static
int
vm_object_page_clean_pass2(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	int generation;

	KKASSERT(p->object == info->object);

	/*
	 * Do not mess with pages that were inserted after we started
	 * the cleaning pass.
	 */
	if ((p->flags & PG_CLEANCHK) == 0)
		goto done;

	generation = info->object->generation;

	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vpcwai");
		info->error = 1;
		goto done;
	}

	KKASSERT(p->object == info->object &&
		 info->object->generation == generation);

	/*
	 * Before wasting time traversing the pmaps, check for trivial
	 * cases where the page cannot be dirty.
	 */
	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
		KKASSERT((p->dirty & p->valid) == 0 &&
			 (p->flags & PG_NEED_COMMIT) == 0);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Check whether the page is dirty or not.  The page has been set
	 * to be read-only so the check will not race a user dirtying the
	 * page.
	 */
	vm_page_test_dirty(p);
	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * If we have been asked to skip nosync pages and this is a
	 * nosync page, skip it.  Note that the object flags were
	 * not cleared in this case (because pass1 will have returned an
	 * error), so we do not have to set them.
	 */
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
	 * the pages that get successfully flushed.  Set info->error if
	 * we raced an object modification.
	 */
	vm_object_page_collect_flush(info->object, p, info->pagerflags);
	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Collect the specified page and nearby pages and flush them out.
 * The passed page is busied by the caller and we are responsible for
 * its disposition.
 *
 * The caller must hold the object.
 */
static void
vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
{
	int error;
	int is;
	int ib;
	int i;
	int page_base;
	vm_pindex_t pi;
	vm_page_t ma[BLIST_MAX_ALLOC];

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	pi = p->pindex;
	page_base = pi % BLIST_MAX_ALLOC;
	ma[page_base] = p;
	ib = page_base - 1;
	is = page_base + 1;

	while (ib >= 0) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[ib] = tp;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pi - page_base + is < object->size) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[is] = tp;
		++is;
	}

	/*
	 * All pages in the ma[] array are busied now
	 */
	for (i = ib; i < is; ++i) {
		vm_page_flag_clear(ma[i], PG_CLEANCHK);
		vm_page_hold(ma[i]);	/* XXX need this any more? */
	}
	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
	for (i = ib; i < is; ++i)	/* XXX need this any more? */
		vm_page_unhold(ma[i]);
}

/*
 * Same as vm_object_pmap_copy, except range checking really
 * works, and is meant for small sections of an object.
 *
 * This code protects resident pages by making them read-only
 * and is typically called on a fork or split when a page
 * is converted to copy-on-write.
 *
 * NOTE: If the page is already at VM_PROT_NONE, calling
 *	 vm_page_protect will have no effect.
 */
void
vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	vm_pindex_t idx;
	vm_page_t p;

	if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0)
		return;

	vm_object_hold(object);
	for (idx = start; idx < end; idx++) {
		p = vm_page_lookup(object, idx);
		if (p == NULL)
			continue;
		vm_page_protect(p, VM_PROT_READ);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from all
 * physical maps.
 *
 * The object must *not* be locked.
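 * The routine acquires and releases its own hold on the object internally.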
 */

static int vm_object_pmap_remove_callback(vm_page_t p, void *data);

void
vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	struct rb_vm_page_scan_info info;

	if (object == NULL)
		return;
	if (start == end)
		return;
	info.start_pindex = start;
	info.end_pindex = end - 1;
	info.count = 0;
	info.object = object;

	vm_object_hold(object);
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_pmap_remove_callback, &info);
	} while (info.error);
	if (start == 0 && end == object->size)
		vm_object_clear_flag(object, OBJ_WRITEABLE);
	vm_object_drop(object);
}

/*
 * The caller must hold the object
 */
static int
vm_object_pmap_remove_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (info->object != p->object ||
	    p->pindex < info->start_pindex ||
	    p->pindex > info->end_pindex) {
		kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n",
			info->object, p);
		info->error = 1;
		return(0);
	}

	vm_page_protect(p, VM_PROT_NONE);

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Implements the madvise function at the object/page level.
 *
 *	MADV_WILLNEED	(any object)
 *
 *	    Activate the specified pages if they are resident.
 *
 *	MADV_DONTNEED	(any object)
 *
 *	    Deactivate the specified pages if they are resident.
 *
 *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
 *
 *	    Deactivate and clean the specified pages if they are
 *	    resident.  This permits the process to reuse the pages
 *	    without faulting or the kernel to reclaim the pages
 *	    without I/O.
 *
 * No requirements.
 */
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
		  vm_pindex_t count, int advise)
{
	vm_pindex_t end;
	vm_page_t m;
	int error;

	if (object == NULL)
		return;

	end = pindex + count;

	vm_object_hold(object);

	/*
	 * Locate and adjust resident pages.  This only applies to the
	 * primary object in the mapping.
	 */
	for (; pindex < end; pindex += 1) {
relookup:
		/*
		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
		 * and those pages must be OBJ_ONEMAPPING.
		 */
		if (advise == MADV_FREE) {
			if ((object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP) ||
			    (object->flags & OBJ_ONEMAPPING) == 0) {
				continue;
			}
		}

		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

		if (error) {
			vm_page_sleep_busy(m, TRUE, "madvpo");
			goto relookup;
		}
		if (m == NULL) {
			/*
			 * There may be swap even if there is no backing page
			 */
			if (advise == MADV_FREE && object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
			continue;
		}

		/*
		 * If the page is not in a normal active state, we skip it.
		 * If the page is not managed there are no page queues to
		 * mess with.  Things can break if we mess with pages in
		 * any of the below states.
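		 *
		 * Concretely, pages that are wired, unmanaged, marked
		 * PG_NEED_COMMIT, or not fully valid are left alone.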
		 */
		if (m->wire_count ||
		    (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
		    m->valid != VM_PAGE_BITS_ALL
		) {
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Theoretically once a page is known not to be busy, an
		 * interrupt cannot come along and rip it out from under us.
		 */
		if (advise == MADV_WILLNEED) {
			vm_page_activate(m);
		} else if (advise == MADV_DONTNEED) {
			vm_page_dontneed(m);
		} else if (advise == MADV_FREE) {
			/*
			 * Mark the page clean.  This will allow the page
			 * to be freed up by the system.  However, such pages
			 * are often reused quickly by malloc()/free()
			 * so we do not do anything that would cause
			 * a page fault if we can help it.
			 *
			 * Specifically, we do not try to actually free
			 * the page now nor do we try to put it in the
			 * cache (which would cause a page fault on reuse).
			 *
			 * But we do make the page as freeable as we
			 * can without actually taking the step of unmapping
			 * it.
			 */
			pmap_clear_modify(m);
			m->dirty = 0;
			m->act_count = 0;
			vm_page_dontneed(m);
			if (object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
		}
		vm_page_wakeup(m);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		      boolean_t clean_only)
{
	struct rb_vm_page_scan_info info;
	int all;

	/*
	 * Degenerate cases and assertions
	 */
	vm_object_hold(object);
	if (object == NULL ||
	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
		vm_object_drop(object);
		return;
	}
	KASSERT(object->type != OBJT_PHYS,
		("attempt to remove pages from a physical object"));

	/*
	 * Indicate that paging is occurring on the object
	 */
	vm_object_pip_add(object, 1);

	/*
	 * Figure out the actual removal range and whether we are removing
	 * the entire contents of the object or not.  If removing the entire
	 * contents, be sure to get all pages, even those that might be
	 * beyond the end of the object.
	 */
	info.object = object;
	info.start_pindex = start;
	if (end == 0)
		info.end_pindex = (vm_pindex_t)-1;
	else
		info.end_pindex = end - 1;
	info.limit = clean_only;
	info.count = 0;
	all = (start == 0 && info.end_pindex >= object->size - 1);

	/*
	 * Loop until we are sure we have gotten them all.
	 */
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_remove_callback, &info);
	} while (info.error);

	/*
	 * Remove any related swap if throwing away pages, or for
	 * non-swap objects (the swap is a clean copy in that case).
	 */
	if (object->type != OBJT_SWAP || clean_only == FALSE) {
		if (all)
			swap_pager_freespace_all(object);
		else
			swap_pager_freespace(object, info.start_pindex,
				info.end_pindex - info.start_pindex + 1);
	}

	/*
	 * Cleanup
	 */
	vm_object_pip_wakeup(object);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 *
 * NOTE: User yields are allowed when removing more than one page, but not
 *	 allowed if only removing one page (the path for single page removals
 *	 might hold a spinlock).
 */
static int
vm_object_page_remove_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (info->object != p->object ||
	    p->pindex < info->start_pindex ||
	    p->pindex > info->end_pindex) {
		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
			info->object, p);
		return(0);
	}
	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vmopar");
		info->error = 1;
		return(0);
	}
	if (info->object != p->object) {
		/* this should never happen */
		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
			info->object, p);
		vm_page_wakeup(p);
		return(0);
	}

	/*
	 * Wired pages cannot be destroyed, but they can be invalidated
	 * and we do so if clean_only (limit) is not set.
	 *
	 * WARNING!  The page may be wired due to being part of a buffer
	 *	     cache buffer, and the buffer might be marked B_CACHE.
	 *	     This is fine as part of a truncation but VFSs must be
	 *	     sure to fix the buffer up when re-extending the file.
	 *
	 * NOTE!     PG_NEED_COMMIT is ignored.
	 */
	if (p->wire_count != 0) {
		vm_page_protect(p, VM_PROT_NONE);
		if (info->limit == 0)
			p->valid = 0;
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * limit is our clean_only flag.  If set and the page is dirty or
	 * requires a commit, do not free it.  If set and the page is being
	 * held by someone, do not free it.
	 */
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}

	/*
	 * Destroy the page
	 */
	vm_page_protect(p, VM_PROT_NONE);
	vm_page_free(p);

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();

	return(0);
}

/*
 * Try to extend prev_object into an adjoining region of virtual
 * memory, return TRUE on success.
 *
 * The caller does not need to hold (prev_object) but must have a stable
 * pointer to it (typically by holding the vm_map locked).
 *
 * This function only works for anonymous memory objects which either
 * have (a) one reference or (b) we are extending the object's size.
 * Otherwise the related VM pages we want to use for the object might
 * be in use by another mapping.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
		   vm_size_t prev_size, vm_size_t next_size)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);

	vm_object_hold(prev_object);

	if (prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

#if 0
	/* caller now checks this */
	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object, NULL);
#endif

#if 0
	/* caller now checks this */
	/*
	 * We can't coalesce if we shadow another object (figuring out the
	 * relationships become too complex).
	 */
	if (prev_object->backing_object != NULL) {
		vm_object_chain_release(prev_object);
		vm_object_drop(prev_object);
		return (FALSE);
	}
#endif

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = prev_pindex + prev_size;

	/*
	 * We can't coalesce if the object has more than one ref count
	 * unless we are extending it into newly minted space.
	 */
	if (prev_object->ref_count > 1 &&
	    prev_object->size != next_pindex) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object,
				      next_pindex,
				      next_pindex + next_size, FALSE);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
	}

	/*
	 * Extend the object if necessary.
	 */
	if (next_pindex + next_size > prev_object->size)
		prev_object->size = next_pindex + next_size;
	vm_object_drop(prev_object);

	return (TRUE);
}

/*
 * Make the object writable and flag it as being possibly dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
	struct vnode *vp;

	/*vm_object_assert_held(object);*/
	/*
	 * Avoid contention in vm fault path by checking the state before
	 * issuing an atomic op on it.
	 */
	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	if (object->type == OBJT_VNODE &&
	    (vp = (struct vnode *)object->handle) != NULL) {
		if ((vp->v_flag & VOBJDIRTY) == 0) {
			if (vp->v_mount &&
			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
				/*
				 * New style THR_SYNC places vnodes on the
				 * syncer list more deterministically.
				 */
				vsetobjdirty(vp);
			} else {
				/*
				 * Old style scan would not necessarily place
				 * a vnode on the syncer list when possibly
				 * modified via mmap.
				 */
				vsetflags(vp, VOBJDIRTY);
			}
		}
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/cons.h>

#include <ddb/ddb.h>

static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
				   vm_map_entry_t entry);
static int	vm_object_in_map (vm_object_t object);

/*
 * The caller must hold the object.
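 *
 * Returns non-zero if the object is referenced by the given entry or,
 * when entry is NULL, by any entry in the map.  Submaps are descended
 * into and each entry's vm_map_backing chain is walked.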
 */
static int
_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
{
	vm_map_backing_t ba;
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	int entcount;

	if (map == NULL)
		return 0;
	if (entry == NULL) {
		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
		entcount = map->nentries;
		while (entcount-- && tmpe) {
			if( _vm_object_in_map(map, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		return (0);
	}
	switch(entry->maptype) {
	case VM_MAPTYPE_SUBMAP:
		tmpm = entry->ba.sub_map;
		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
		entcount = tmpm->nentries;
		while (entcount-- && tmpe) {
			if( _vm_object_in_map(tmpm, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		break;
	case VM_MAPTYPE_NORMAL:
	case VM_MAPTYPE_VPAGETABLE:
		ba = &entry->ba;
		while (ba) {
			if (ba->object == object)
				return TRUE;
			ba = ba->backing_ba;
		}
		break;
	default:
		break;
	}
	return 0;
}

static int vm_object_in_map_callback(struct proc *p, void *data);

struct vm_object_in_map_info {
	vm_object_t object;
	int rv;
};

/*
 * Debugging only
 */
static int
vm_object_in_map(vm_object_t object)
{
	struct vm_object_in_map_info info;

	info.rv = 0;
	info.object = object;

	allproc_scan(vm_object_in_map_callback, &info, 0);
	if (info.rv)
		return 1;
	if( _vm_object_in_map(&kernel_map, object, 0))
		return 1;
	if( _vm_object_in_map(&pager_map, object, 0))
		return 1;
	if( _vm_object_in_map(&buffer_map, object, 0))
		return 1;
	return 0;
}

/*
 * Debugging only
 */
static int
vm_object_in_map_callback(struct proc *p, void *data)
{
	struct vm_object_in_map_info *info = data;

	if (p->p_vmspace) {
		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
			info->rv = 1;
			return -1;
		}
	}
	return (0);
}

DB_SHOW_COMMAND(vmochk, vm_object_check)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int n;

	/*
	 * make sure that internal objs are in a map somewhere
	 * and none have zero ref counts.
	 */
	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
				object != NULL;
				object = TAILQ_NEXT(object, object_list)) {
			if (object->type == OBJT_MARKER)
				continue;
			if (object->handle != NULL ||
			    (object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP)) {
				continue;
			}
			if (object->ref_count == 0) {
				db_printf("vmochk: internal obj has "
					  "zero ref count: %ld\n",
					  (long)object->size);
			}
			if (vm_object_in_map(object))
				continue;
			db_printf("vmochk: internal obj is not in a map: "
				  "ref: %d, size: %lu: 0x%lx\n",
				  object->ref_count, (u_long)object->size,
				  (u_long)object->size);
		}
	}
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(object, vm_object_print_static)
{
	/* XXX convert args. */
	vm_object_t object = (vm_object_t)addr;
	boolean_t full = have_addr;

	vm_page_t p;

	/* XXX count is an (unused) arg.  Avoid shadowing it.
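	 * The function generated by DB_SHOW_COMMAND() receives addr,
	 * have_addr, count and modif parameters (see vm_object_print()
	 * below), so a plain local named 'count' would shadow the argument.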
	 */
#define count	was_count

	int count;

	if (object == NULL)
		return;

	db_iprintf(
	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
	    object, (int)object->type, (u_long)object->size,
	    object->resident_page_count, object->ref_count, object->flags);
	/*
	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
	 */
	db_iprintf("\n");

	if (!full)
		return;

	db_indent += 2;
	count = 0;
	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
		if (count == 0)
			db_iprintf("memory:=");
		else if (count == 6) {
			db_printf("\n");
			db_iprintf(" ...");
			count = 0;
		} else
			db_printf(",");
		count++;

		db_printf("(off=0x%lx,page=0x%lx)",
			  (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	}
	if (count != 0)
		db_printf("\n");
	db_indent -= 2;
}

/* XXX. */
#undef count

/*
 * XXX need this non-static entry for calling from vm_map_print.
 *
 * Debugging only
 */
void
vm_object_print(/* db_expr_t */ long addr,
		boolean_t have_addr,
		/* db_expr_t */ long count,
		char *modif)
{
	vm_object_print_static(addr, have_addr, count, modif);
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int nl = 0;
	int c;
	int n;

	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
				object != NULL;
				object = TAILQ_NEXT(object, object_list)) {
			vm_pindex_t idx, fidx;
			vm_pindex_t osize;
			vm_paddr_t pa = -1, padiff;
			int rcount;
			vm_page_t m;

			if (object->type == OBJT_MARKER)
				continue;
			db_printf("new object: %p\n", (void *)object);
			if ( nl > 18) {
				c = cngetc();
				if (c != ' ')
					return;
				nl = 0;
			}
			nl++;
			rcount = 0;
			fidx = 0;
			osize = object->size;
			if (osize > 128)
				osize = 128;
			for (idx = 0; idx < osize; idx++) {
				m = vm_page_lookup(object, idx);
				if (m == NULL) {
					if (rcount) {
						db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
							(long)fidx, rcount, (long)pa);
						if ( nl > 18) {
							c = cngetc();
							if (c != ' ')
								return;
							nl = 0;
						}
						nl++;
						rcount = 0;
					}
					continue;
				}

				if (rcount &&
					(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
					++rcount;
					continue;
				}
				if (rcount) {
					padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m);
					padiff >>= PAGE_SHIFT;
					padiff &= PQ_L2_MASK;
					if (padiff == 0) {
						pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE;
						++rcount;
						continue;
					}
					db_printf(" index(%ld)run(%d)pa(0x%lx)",
						(long)fidx, rcount, (long)pa);
					db_printf("pd(%ld)\n", (long)padiff);
					if ( nl > 18) {
						c = cngetc();
						if (c != ' ')
							return;
						nl = 0;
					}
					nl++;
				}
				fidx = idx;
				pa = VM_PAGE_TO_PHYS(m);
				rcount = 1;
			}
			if (rcount) {
				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
					(long)fidx, rcount, (long)pa);
				if ( nl > 18) {
					c = cngetc();
					if (c != ' ')
						return;
					nl = 0;
				}
				nl++;
			}
		}
	}
}
#endif /* DDB */