1 /* 2 * Copyright (c) 1991, 1993, 2013 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * The Mach Operating System project at Carnegie-Mellon University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 33 * 34 * 35 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 36 * All rights reserved. 37 * 38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 39 * 40 * Permission to use, copy, modify and distribute this software and 41 * its documentation is hereby granted, provided that both the copyright 42 * notice and this permission notice appear in all copies of the 43 * software, derivative works or modified versions, and any portions 44 * thereof, and that both notices appear in supporting documentation. 45 * 46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 49 * 50 * Carnegie Mellon requests users of this software to return to 51 * 52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 53 * School of Computer Science 54 * Carnegie Mellon University 55 * Pittsburgh PA 15213-3890 56 * 57 * any improvements or extensions that they make and grant Carnegie the 58 * rights to redistribute these changes. 59 * 60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $ 61 */ 62 63 /* 64 * Virtual memory object module. 65 */ 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/proc.h> /* for curproc, pageproc */ 70 #include <sys/thread.h> 71 #include <sys/vnode.h> 72 #include <sys/vmmeter.h> 73 #include <sys/mman.h> 74 #include <sys/mount.h> 75 #include <sys/kernel.h> 76 #include <sys/sysctl.h> 77 #include <sys/refcount.h> 78 79 #include <vm/vm.h> 80 #include <vm/vm_param.h> 81 #include <vm/pmap.h> 82 #include <vm/vm_map.h> 83 #include <vm/vm_object.h> 84 #include <vm/vm_page.h> 85 #include <vm/vm_pageout.h> 86 #include <vm/vm_pager.h> 87 #include <vm/swap_pager.h> 88 #include <vm/vm_kern.h> 89 #include <vm/vm_extern.h> 90 #include <vm/vm_zone.h> 91 92 #include <vm/vm_page2.h> 93 94 #include <machine/specialreg.h> 95 96 #define EASY_SCAN_FACTOR 8 97 98 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, 99 int pagerflags); 100 static void vm_object_lock_init(vm_object_t); 101 102 /* 103 * Virtual memory objects maintain the actual data 104 * associated with allocated virtual memory. A given 105 * page of memory exists within exactly one object. 106 * 107 * An object is only deallocated when all "references" 108 * are given up. Only one "reference" to a given 109 * region of an object should be writeable. 110 * 111 * Associated with each object is a list of all resident 112 * memory pages belonging to that object; this list is 113 * maintained by the "vm_page" module, and locked by the object's 114 * lock. 115 * 116 * Each object also records a "pager" routine which is 117 * used to retrieve (and store) pages to the proper backing 118 * storage. In addition, objects may be backed by other 119 * objects from which they were virtual-copied. 120 * 121 * The only items within the object structure which are 122 * modified after time of creation are: 123 * reference count locked by object's lock 124 * pager routine locked by object's lock 125 * 126 */ 127 128 struct vm_object kernel_object; 129 130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE]; 131 132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures"); 133 134 #define VMOBJ_HASH_PRIME1 66555444443333333ULL 135 #define VMOBJ_HASH_PRIME2 989042931893ULL 136 137 int vm_object_debug; 138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, ""); 139 140 static __inline 141 struct vm_object_hash * 142 vmobj_hash(vm_object_t obj) 143 { 144 uintptr_t hash1; 145 uintptr_t hash2; 146 147 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18); 148 hash1 %= VMOBJ_HASH_PRIME1; 149 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24); 150 hash2 %= VMOBJ_HASH_PRIME2; 151 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]); 152 } 153 154 #if defined(DEBUG_LOCKS) 155 156 #define vm_object_vndeallocate(obj, vpp) \ 157 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__) 158 159 /* 160 * Debug helper to track hold/drop/ref/deallocate calls. 161 */ 162 static void 163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem) 164 { 165 int i; 166 167 i = atomic_fetchadd_int(&obj->debug_index, 1); 168 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1); 169 ksnprintf(obj->debug_hold_thrs[i], 170 sizeof(obj->debug_hold_thrs[i]), 171 "%c%d:(%d):%s", 172 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')), 173 (curthread->td_proc ? curthread->td_proc->p_pid : -1), 174 obj->ref_count, 175 curthread->td_comm); 176 obj->debug_hold_file[i] = file; 177 obj->debug_hold_line[i] = line; 178 #if 0 179 /* Uncomment for debugging obj refs/derefs in reproducable cases */ 180 if (strcmp(curthread->td_comm, "sshd") == 0) { 181 kprintf("%d %p refs=%d ar=%d file: %s/%d\n", 182 (curthread->td_proc ? curthread->td_proc->p_pid : -1), 183 obj, obj->ref_count, addrem, file, line); 184 } 185 #endif 186 } 187 188 #endif 189 190 /* 191 * Misc low level routines 192 */ 193 static void 194 vm_object_lock_init(vm_object_t obj) 195 { 196 #if defined(DEBUG_LOCKS) 197 int i; 198 199 obj->debug_index = 0; 200 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) { 201 obj->debug_hold_thrs[i][0] = 0; 202 obj->debug_hold_file[i] = NULL; 203 obj->debug_hold_line[i] = 0; 204 } 205 #endif 206 } 207 208 void 209 vm_object_lock_swap(void) 210 { 211 lwkt_token_swap(); 212 } 213 214 void 215 vm_object_lock(vm_object_t obj) 216 { 217 lwkt_gettoken(&obj->token); 218 } 219 220 /* 221 * Returns TRUE on sucesss 222 */ 223 static int 224 vm_object_lock_try(vm_object_t obj) 225 { 226 return(lwkt_trytoken(&obj->token)); 227 } 228 229 void 230 vm_object_lock_shared(vm_object_t obj) 231 { 232 lwkt_gettoken_shared(&obj->token); 233 } 234 235 void 236 vm_object_unlock(vm_object_t obj) 237 { 238 lwkt_reltoken(&obj->token); 239 } 240 241 void 242 vm_object_upgrade(vm_object_t obj) 243 { 244 lwkt_reltoken(&obj->token); 245 lwkt_gettoken(&obj->token); 246 } 247 248 void 249 vm_object_downgrade(vm_object_t obj) 250 { 251 lwkt_reltoken(&obj->token); 252 lwkt_gettoken_shared(&obj->token); 253 } 254 255 static __inline void 256 vm_object_assert_held(vm_object_t obj) 257 { 258 ASSERT_LWKT_TOKEN_HELD(&obj->token); 259 } 260 261 int 262 vm_quickcolor(void) 263 { 264 globaldata_t gd = mycpu; 265 int pg_color; 266 267 pg_color = (int)(intptr_t)gd->gd_curthread >> 10; 268 pg_color += gd->gd_quick_color; 269 gd->gd_quick_color += PQ_PRIME2; 270 271 return pg_color; 272 } 273 274 void 275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS) 276 { 277 KKASSERT(obj != NULL); 278 279 /* 280 * Object must be held (object allocation is stable due to callers 281 * context, typically already holding the token on a parent object) 282 * prior to potentially blocking on the lock, otherwise the object 283 * can get ripped away from us. 284 */ 285 refcount_acquire(&obj->hold_count); 286 vm_object_lock(obj); 287 288 #if defined(DEBUG_LOCKS) 289 debugvm_object_add(obj, file, line, 1); 290 #endif 291 } 292 293 int 294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS) 295 { 296 KKASSERT(obj != NULL); 297 298 /* 299 * Object must be held (object allocation is stable due to callers 300 * context, typically already holding the token on a parent object) 301 * prior to potentially blocking on the lock, otherwise the object 302 * can get ripped away from us. 303 */ 304 refcount_acquire(&obj->hold_count); 305 if (vm_object_lock_try(obj) == 0) { 306 if (refcount_release(&obj->hold_count)) { 307 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) 308 kfree(obj, M_VM_OBJECT); 309 } 310 return(0); 311 } 312 313 #if defined(DEBUG_LOCKS) 314 debugvm_object_add(obj, file, line, 1); 315 #endif 316 return(1); 317 } 318 319 void 320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS) 321 { 322 KKASSERT(obj != NULL); 323 324 /* 325 * Object must be held (object allocation is stable due to callers 326 * context, typically already holding the token on a parent object) 327 * prior to potentially blocking on the lock, otherwise the object 328 * can get ripped away from us. 329 */ 330 refcount_acquire(&obj->hold_count); 331 vm_object_lock_shared(obj); 332 333 #if defined(DEBUG_LOCKS) 334 debugvm_object_add(obj, file, line, 1); 335 #endif 336 } 337 338 /* 339 * Drop the token and hold_count on the object. 340 * 341 * WARNING! Token might be shared. 342 */ 343 void 344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS) 345 { 346 if (obj == NULL) 347 return; 348 349 /* 350 * No new holders should be possible once we drop hold_count 1->0 as 351 * there is no longer any way to reference the object. 352 */ 353 KKASSERT(obj->hold_count > 0); 354 if (refcount_release(&obj->hold_count)) { 355 #if defined(DEBUG_LOCKS) 356 debugvm_object_add(obj, file, line, -1); 357 #endif 358 359 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) { 360 vm_object_unlock(obj); 361 kfree(obj, M_VM_OBJECT); 362 } else { 363 vm_object_unlock(obj); 364 } 365 } else { 366 #if defined(DEBUG_LOCKS) 367 debugvm_object_add(obj, file, line, -1); 368 #endif 369 vm_object_unlock(obj); 370 } 371 } 372 373 /* 374 * Initialize a freshly allocated object, returning a held object. 375 * 376 * Used only by vm_object_allocate(), zinitna() and vm_object_init(). 377 * 378 * No requirements. 379 */ 380 void 381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) 382 { 383 struct vm_object_hash *hash; 384 385 RB_INIT(&object->rb_memq); 386 lwkt_token_init(&object->token, "vmobj"); 387 388 TAILQ_INIT(&object->backing_list); 389 object->type = type; 390 object->size = size; 391 object->ref_count = 1; 392 object->memattr = VM_MEMATTR_DEFAULT; 393 object->hold_count = 0; 394 object->flags = 0; 395 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) 396 vm_object_set_flag(object, OBJ_ONEMAPPING); 397 object->paging_in_progress = 0; 398 object->resident_page_count = 0; 399 /* cpu localization twist */ 400 object->pg_color = vm_quickcolor(); 401 object->handle = NULL; 402 403 atomic_add_int(&object->generation, 1); 404 object->swblock_count = 0; 405 RB_INIT(&object->swblock_root); 406 vm_object_lock_init(object); 407 pmap_object_init(object); 408 409 vm_object_hold(object); 410 411 hash = vmobj_hash(object); 412 lwkt_gettoken(&hash->token); 413 TAILQ_INSERT_TAIL(&hash->list, object, object_entry); 414 lwkt_reltoken(&hash->token); 415 } 416 417 /* 418 * Initialize a VM object. 419 */ 420 void 421 vm_object_init(vm_object_t object, vm_pindex_t size) 422 { 423 _vm_object_allocate(OBJT_DEFAULT, size, object); 424 vm_object_drop(object); 425 } 426 427 /* 428 * Initialize the VM objects module. 429 * 430 * Called from the low level boot code only. Note that this occurs before 431 * kmalloc is initialized so we cannot allocate any VM objects. 432 */ 433 void 434 vm_object_init1(void) 435 { 436 int i; 437 438 for (i = 0; i < VMOBJ_HSIZE; ++i) { 439 TAILQ_INIT(&vm_object_hash[i].list); 440 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst"); 441 } 442 443 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), 444 &kernel_object); 445 vm_object_drop(&kernel_object); 446 } 447 448 void 449 vm_object_init2(void) 450 { 451 kmalloc_set_unlimited(M_VM_OBJECT); 452 } 453 454 /* 455 * Allocate and return a new object of the specified type and size. 456 * 457 * No requirements. 458 */ 459 vm_object_t 460 vm_object_allocate(objtype_t type, vm_pindex_t size) 461 { 462 vm_object_t obj; 463 464 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 465 _vm_object_allocate(type, size, obj); 466 vm_object_drop(obj); 467 468 return (obj); 469 } 470 471 /* 472 * This version returns a held object, allowing further atomic initialization 473 * of the object. 474 */ 475 vm_object_t 476 vm_object_allocate_hold(objtype_t type, vm_pindex_t size) 477 { 478 vm_object_t obj; 479 480 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 481 _vm_object_allocate(type, size, obj); 482 483 return (obj); 484 } 485 486 /* 487 * Add an additional reference to a vm_object. The object must already be 488 * held. The original non-lock version is no longer supported. The object 489 * must NOT be chain locked by anyone at the time the reference is added. 490 * 491 * The object must be held, but may be held shared if desired (hence why 492 * we use an atomic op). 493 */ 494 void 495 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS) 496 { 497 KKASSERT(object != NULL); 498 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 499 atomic_add_int(&object->ref_count, 1); 500 if (object->type == OBJT_VNODE) { 501 vref(object->handle); 502 /* XXX what if the vnode is being destroyed? */ 503 } 504 #if defined(DEBUG_LOCKS) 505 debugvm_object_add(object, file, line, 1); 506 #endif 507 } 508 509 /* 510 * This version is only allowed in situations where the caller 511 * already knows that the object is deterministically referenced 512 * (usually because its taken from a ref'd vnode, or during a map_entry 513 * replication). 514 */ 515 void 516 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS) 517 { 518 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0); 519 atomic_add_int(&object->ref_count, 1); 520 if (object->type == OBJT_VNODE) 521 vref(object->handle); 522 #if defined(DEBUG_LOCKS) 523 debugvm_object_add(object, file, line, 1); 524 #endif 525 } 526 527 /* 528 * Dereference an object and its underlying vnode. The object may be 529 * held shared. On return the object will remain held. 530 * 531 * This function may return a vnode in *vpp which the caller must release 532 * after the caller drops its own lock. If vpp is NULL, we assume that 533 * the caller was holding an exclusive lock on the object and we vrele() 534 * the vp ourselves. 535 */ 536 static void 537 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp 538 VMOBJDBARGS) 539 { 540 struct vnode *vp = (struct vnode *) object->handle; 541 542 KASSERT(object->type == OBJT_VNODE, 543 ("vm_object_vndeallocate: not a vnode object")); 544 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); 545 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 546 #ifdef INVARIANTS 547 if (object->ref_count == 0) { 548 vprint("vm_object_vndeallocate", vp); 549 panic("vm_object_vndeallocate: bad object reference count"); 550 } 551 #endif 552 for (;;) { 553 int count = object->ref_count; 554 cpu_ccfence(); 555 if (count == 1) { 556 vm_object_upgrade(object); 557 if (atomic_cmpset_int(&object->ref_count, count, 0)) { 558 vclrflags(vp, VTEXT); 559 break; 560 } 561 } else { 562 if (atomic_cmpset_int(&object->ref_count, 563 count, count - 1)) { 564 break; 565 } 566 } 567 /* retry */ 568 } 569 #if defined(DEBUG_LOCKS) 570 debugvm_object_add(object, file, line, -1); 571 #endif 572 573 /* 574 * vrele or return the vp to vrele. We can only safely vrele(vp) 575 * if the object was locked exclusively. But there are two races 576 * here. 577 * 578 * We had to upgrade the object above to safely clear VTEXT 579 * but the alternative path where the shared lock is retained 580 * can STILL race to 0 in other paths and cause our own vrele() 581 * to terminate the vnode. We can't allow that if the VM object 582 * is still locked shared. 583 */ 584 if (vpp) 585 *vpp = vp; 586 else 587 vrele(vp); 588 } 589 590 /* 591 * Release a reference to the specified object, gained either through a 592 * vm_object_allocate or a vm_object_reference call. When all references 593 * are gone, storage associated with this object may be relinquished. 594 * 595 * The caller does not have to hold the object locked but must have control 596 * over the reference in question in order to guarantee that the object 597 * does not get ripped out from under us. 598 * 599 * XXX Currently all deallocations require an exclusive lock. 600 */ 601 void 602 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS) 603 { 604 struct vnode *vp; 605 int count; 606 607 if (object == NULL) 608 return; 609 610 for (;;) { 611 count = object->ref_count; 612 cpu_ccfence(); 613 614 /* 615 * If decrementing the count enters into special handling 616 * territory (0, 1, or 2) we have to do it the hard way. 617 * Fortunate though, objects with only a few refs like this 618 * are not likely to be heavily contended anyway. 619 * 620 * For vnode objects we only care about 1->0 transitions. 621 */ 622 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) { 623 #if defined(DEBUG_LOCKS) 624 debugvm_object_add(object, file, line, 0); 625 #endif 626 vm_object_hold(object); 627 vm_object_deallocate_locked(object); 628 vm_object_drop(object); 629 break; 630 } 631 632 /* 633 * Try to decrement ref_count without acquiring a hold on 634 * the object. This is particularly important for the exec*() 635 * and exit*() code paths because the program binary may 636 * have a great deal of sharing and an exclusive lock will 637 * crowbar performance in those circumstances. 638 */ 639 if (object->type == OBJT_VNODE) { 640 vp = (struct vnode *)object->handle; 641 if (atomic_cmpset_int(&object->ref_count, 642 count, count - 1)) { 643 #if defined(DEBUG_LOCKS) 644 debugvm_object_add(object, file, line, -1); 645 #endif 646 647 vrele(vp); 648 break; 649 } 650 /* retry */ 651 } else { 652 if (atomic_cmpset_int(&object->ref_count, 653 count, count - 1)) { 654 #if defined(DEBUG_LOCKS) 655 debugvm_object_add(object, file, line, -1); 656 #endif 657 break; 658 } 659 /* retry */ 660 } 661 /* retry */ 662 } 663 } 664 665 void 666 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS) 667 { 668 /* 669 * Degenerate case 670 */ 671 if (object == NULL) 672 return; 673 674 /* 675 * vnode case, caller either locked the object exclusively 676 * or this is a recursion with must_drop != 0 and the vnode 677 * object will be locked shared. 678 * 679 * If locked shared we have to drop the object before we can 680 * call vrele() or risk a shared/exclusive livelock. 681 */ 682 if (object->type == OBJT_VNODE) { 683 ASSERT_LWKT_TOKEN_HELD(&object->token); 684 vm_object_vndeallocate(object, NULL); 685 return; 686 } 687 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token); 688 689 /* 690 * Normal case (object is locked exclusively) 691 */ 692 if (object->ref_count == 0) { 693 panic("vm_object_deallocate: object deallocated " 694 "too many times: %d", object->type); 695 } 696 if (object->ref_count > 2) { 697 atomic_add_int(&object->ref_count, -1); 698 #if defined(DEBUG_LOCKS) 699 debugvm_object_add(object, file, line, -1); 700 #endif 701 return; 702 } 703 704 /* 705 * Drop the ref and handle termination on the 1->0 transition. 706 * We may have blocked above so we have to recheck. 707 */ 708 KKASSERT(object->ref_count != 0); 709 if (object->ref_count >= 2) { 710 atomic_add_int(&object->ref_count, -1); 711 #if defined(DEBUG_LOCKS) 712 debugvm_object_add(object, file, line, -1); 713 #endif 714 return; 715 } 716 717 atomic_add_int(&object->ref_count, -1); 718 if ((object->flags & OBJ_DEAD) == 0) 719 vm_object_terminate(object); 720 } 721 722 /* 723 * Destroy the specified object, freeing up related resources. 724 * 725 * The object must have zero references. 726 * 727 * The object must held. The caller is responsible for dropping the object 728 * after terminate returns. Terminate does NOT drop the object. 729 */ 730 static int vm_object_terminate_callback(vm_page_t p, void *data); 731 732 void 733 vm_object_terminate(vm_object_t object) 734 { 735 struct rb_vm_page_scan_info info; 736 struct vm_object_hash *hash; 737 738 /* 739 * Make sure no one uses us. Once we set OBJ_DEAD we should be 740 * able to safely block. 741 */ 742 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 743 KKASSERT((object->flags & OBJ_DEAD) == 0); 744 vm_object_set_flag(object, OBJ_DEAD); 745 746 /* 747 * Wait for the pageout daemon to be done with the object 748 */ 749 vm_object_pip_wait(object, "objtrm1"); 750 751 KASSERT(!object->paging_in_progress, 752 ("vm_object_terminate: pageout in progress")); 753 754 /* 755 * Clean and free the pages, as appropriate. All references to the 756 * object are gone, so we don't need to lock it. 757 */ 758 if (object->type == OBJT_VNODE) { 759 struct vnode *vp; 760 761 /* 762 * Clean pages and flush buffers. 763 * 764 * NOTE! TMPFS buffer flushes do not typically flush the 765 * actual page to swap as this would be highly 766 * inefficient, and normal filesystems usually wrap 767 * page flushes with buffer cache buffers. 768 * 769 * To deal with this we have to call vinvalbuf() both 770 * before and after the vm_object_page_clean(). 771 */ 772 vp = (struct vnode *) object->handle; 773 vinvalbuf(vp, V_SAVE, 0, 0); 774 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 775 vinvalbuf(vp, V_SAVE, 0, 0); 776 } 777 778 /* 779 * Wait for any I/O to complete, after which there had better not 780 * be any references left on the object. 781 */ 782 vm_object_pip_wait(object, "objtrm2"); 783 784 if (object->ref_count != 0) { 785 panic("vm_object_terminate: object with references, " 786 "ref_count=%d", object->ref_count); 787 } 788 789 /* 790 * Cleanup any shared pmaps associated with this object. 791 */ 792 pmap_object_free(object); 793 794 /* 795 * Now free any remaining pages. For internal objects, this also 796 * removes them from paging queues. Don't free wired pages, just 797 * remove them from the object. 798 */ 799 info.count = 0; 800 info.object = object; 801 do { 802 info.error = 0; 803 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 804 vm_object_terminate_callback, &info); 805 } while (info.error); 806 807 /* 808 * Let the pager know object is dead. 809 */ 810 vm_pager_deallocate(object); 811 812 /* 813 * Wait for the object hold count to hit 1, clean out pages as 814 * we go. vmobj_token interlocks any race conditions that might 815 * pick the object up from the vm_object_list after we have cleared 816 * rb_memq. 817 */ 818 for (;;) { 819 if (RB_ROOT(&object->rb_memq) == NULL) 820 break; 821 kprintf("vm_object_terminate: Warning, object %p " 822 "still has %ld pages\n", 823 object, object->resident_page_count); 824 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 825 vm_object_terminate_callback, &info); 826 } 827 828 /* 829 * There had better not be any pages left 830 */ 831 KKASSERT(object->resident_page_count == 0); 832 833 /* 834 * Remove the object from the global object list. 835 */ 836 hash = vmobj_hash(object); 837 lwkt_gettoken(&hash->token); 838 TAILQ_REMOVE(&hash->list, object, object_entry); 839 lwkt_reltoken(&hash->token); 840 841 if (object->ref_count != 0) { 842 panic("vm_object_terminate2: object with references, " 843 "ref_count=%d", object->ref_count); 844 } 845 846 /* 847 * NOTE: The object hold_count is at least 1, so we cannot kfree() 848 * the object here. See vm_object_drop(). 849 */ 850 } 851 852 /* 853 * The caller must hold the object. 854 */ 855 static int 856 vm_object_terminate_callback(vm_page_t p, void *data) 857 { 858 struct rb_vm_page_scan_info *info = data; 859 vm_object_t object; 860 861 object = p->object; 862 KKASSERT(object == info->object); 863 if (vm_page_busy_try(p, TRUE)) { 864 vm_page_sleep_busy(p, TRUE, "vmotrm"); 865 info->error = 1; 866 return 0; 867 } 868 if (object != p->object) { 869 /* XXX remove once we determine it can't happen */ 870 kprintf("vm_object_terminate: Warning: Encountered " 871 "busied page %p on queue %d\n", p, p->queue); 872 vm_page_wakeup(p); 873 info->error = 1; 874 } else if (p->wire_count == 0) { 875 /* 876 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 877 */ 878 vm_page_free(p); 879 mycpu->gd_cnt.v_pfree++; 880 } else { 881 if (p->queue != PQ_NONE) { 882 kprintf("vm_object_terminate: Warning: Encountered " 883 "wired page %p on queue %d\n", p, p->queue); 884 if (vm_object_debug > 0) { 885 --vm_object_debug; 886 print_backtrace(10); 887 } 888 } 889 vm_page_remove(p); 890 vm_page_wakeup(p); 891 } 892 893 /* 894 * Must be at end to avoid SMP races, caller holds object token 895 */ 896 if ((++info->count & 63) == 0) 897 lwkt_user_yield(); 898 return(0); 899 } 900 901 /* 902 * Clean all dirty pages in the specified range of object. Leaves page 903 * on whatever queue it is currently on. If NOSYNC is set then do not 904 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), 905 * leaving the object dirty. 906 * 907 * When stuffing pages asynchronously, allow clustering. XXX we need a 908 * synchronous clustering mode implementation. 909 * 910 * Odd semantics: if start == end, we clean everything. 911 * 912 * The object must be locked? XXX 913 */ 914 static int vm_object_page_clean_pass1(struct vm_page *p, void *data); 915 static int vm_object_page_clean_pass2(struct vm_page *p, void *data); 916 917 void 918 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 919 int flags) 920 { 921 struct rb_vm_page_scan_info info; 922 struct vnode *vp; 923 int wholescan; 924 int pagerflags; 925 int generation; 926 927 vm_object_hold(object); 928 if (object->type != OBJT_VNODE || 929 (object->flags & OBJ_MIGHTBEDIRTY) == 0) { 930 vm_object_drop(object); 931 return; 932 } 933 934 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 935 VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; 936 pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; 937 938 vp = object->handle; 939 940 /* 941 * Interlock other major object operations. This allows us to 942 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY. 943 */ 944 vm_object_set_flag(object, OBJ_CLEANING); 945 946 /* 947 * Handle 'entire object' case 948 */ 949 info.start_pindex = start; 950 if (end == 0) { 951 info.end_pindex = object->size - 1; 952 } else { 953 info.end_pindex = end - 1; 954 } 955 wholescan = (start == 0 && info.end_pindex == object->size - 1); 956 info.limit = flags; 957 info.pagerflags = pagerflags; 958 info.object = object; 959 960 /* 961 * If cleaning the entire object do a pass to mark the pages read-only. 962 * If everything worked out ok, clear OBJ_WRITEABLE and 963 * OBJ_MIGHTBEDIRTY. 964 */ 965 if (wholescan) { 966 info.error = 0; 967 info.count = 0; 968 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 969 vm_object_page_clean_pass1, &info); 970 if (info.error == 0) { 971 vm_object_clear_flag(object, 972 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 973 if (object->type == OBJT_VNODE && 974 (vp = (struct vnode *)object->handle) != NULL) { 975 /* 976 * Use new-style interface to clear VISDIRTY 977 * because the vnode is not necessarily removed 978 * from the syncer list(s) as often as it was 979 * under the old interface, which can leave 980 * the vnode on the syncer list after reclaim. 981 */ 982 vclrobjdirty(vp); 983 } 984 } 985 } 986 987 /* 988 * Do a pass to clean all the dirty pages we find. 989 */ 990 do { 991 info.error = 0; 992 info.count = 0; 993 generation = object->generation; 994 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 995 vm_object_page_clean_pass2, &info); 996 } while (info.error || generation != object->generation); 997 998 vm_object_clear_flag(object, OBJ_CLEANING); 999 vm_object_drop(object); 1000 } 1001 1002 /* 1003 * The caller must hold the object. 1004 */ 1005 static 1006 int 1007 vm_object_page_clean_pass1(struct vm_page *p, void *data) 1008 { 1009 struct rb_vm_page_scan_info *info = data; 1010 1011 KKASSERT(p->object == info->object); 1012 1013 vm_page_flag_set(p, PG_CLEANCHK); 1014 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1015 info->error = 1; 1016 } else if (vm_page_busy_try(p, FALSE)) { 1017 info->error = 1; 1018 } else { 1019 KKASSERT(p->object == info->object); 1020 vm_page_protect(p, VM_PROT_READ); 1021 vm_page_wakeup(p); 1022 } 1023 1024 /* 1025 * Must be at end to avoid SMP races, caller holds object token 1026 */ 1027 if ((++info->count & 63) == 0) 1028 lwkt_user_yield(); 1029 return(0); 1030 } 1031 1032 /* 1033 * The caller must hold the object 1034 */ 1035 static 1036 int 1037 vm_object_page_clean_pass2(struct vm_page *p, void *data) 1038 { 1039 struct rb_vm_page_scan_info *info = data; 1040 int generation; 1041 1042 KKASSERT(p->object == info->object); 1043 1044 /* 1045 * Do not mess with pages that were inserted after we started 1046 * the cleaning pass. 1047 */ 1048 if ((p->flags & PG_CLEANCHK) == 0) 1049 goto done; 1050 1051 generation = info->object->generation; 1052 1053 if (vm_page_busy_try(p, TRUE)) { 1054 vm_page_sleep_busy(p, TRUE, "vpcwai"); 1055 info->error = 1; 1056 goto done; 1057 } 1058 1059 KKASSERT(p->object == info->object && 1060 info->object->generation == generation); 1061 1062 /* 1063 * Before wasting time traversing the pmaps, check for trivial 1064 * cases where the page cannot be dirty. 1065 */ 1066 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { 1067 KKASSERT((p->dirty & p->valid) == 0 && 1068 (p->flags & PG_NEED_COMMIT) == 0); 1069 vm_page_wakeup(p); 1070 goto done; 1071 } 1072 1073 /* 1074 * Check whether the page is dirty or not. The page has been set 1075 * to be read-only so the check will not race a user dirtying the 1076 * page. 1077 */ 1078 vm_page_test_dirty(p); 1079 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) { 1080 vm_page_flag_clear(p, PG_CLEANCHK); 1081 vm_page_wakeup(p); 1082 goto done; 1083 } 1084 1085 /* 1086 * If we have been asked to skip nosync pages and this is a 1087 * nosync page, skip it. Note that the object flags were 1088 * not cleared in this case (because pass1 will have returned an 1089 * error), so we do not have to set them. 1090 */ 1091 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1092 vm_page_flag_clear(p, PG_CLEANCHK); 1093 vm_page_wakeup(p); 1094 goto done; 1095 } 1096 1097 /* 1098 * Flush as many pages as we can. PG_CLEANCHK will be cleared on 1099 * the pages that get successfully flushed. Set info->error if 1100 * we raced an object modification. 1101 */ 1102 vm_object_page_collect_flush(info->object, p, info->pagerflags); 1103 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ 1104 1105 /* 1106 * Must be at end to avoid SMP races, caller holds object token 1107 */ 1108 done: 1109 if ((++info->count & 63) == 0) 1110 lwkt_user_yield(); 1111 return(0); 1112 } 1113 1114 /* 1115 * Collect the specified page and nearby pages and flush them out. 1116 * The number of pages flushed is returned. The passed page is busied 1117 * by the caller and we are responsible for its disposition. 1118 * 1119 * The caller must hold the object. 1120 */ 1121 static void 1122 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags) 1123 { 1124 int error; 1125 int is; 1126 int ib; 1127 int i; 1128 int page_base; 1129 vm_pindex_t pi; 1130 vm_page_t ma[BLIST_MAX_ALLOC]; 1131 1132 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1133 1134 pi = p->pindex; 1135 page_base = pi % BLIST_MAX_ALLOC; 1136 ma[page_base] = p; 1137 ib = page_base - 1; 1138 is = page_base + 1; 1139 1140 while (ib >= 0) { 1141 vm_page_t tp; 1142 1143 tp = vm_page_lookup_busy_try(object, pi - page_base + ib, 1144 TRUE, &error); 1145 if (error) 1146 break; 1147 if (tp == NULL) 1148 break; 1149 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1150 (tp->flags & PG_CLEANCHK) == 0) { 1151 vm_page_wakeup(tp); 1152 break; 1153 } 1154 if ((tp->queue - tp->pc) == PQ_CACHE) { 1155 vm_page_flag_clear(tp, PG_CLEANCHK); 1156 vm_page_wakeup(tp); 1157 break; 1158 } 1159 vm_page_test_dirty(tp); 1160 if ((tp->dirty & tp->valid) == 0 && 1161 (tp->flags & PG_NEED_COMMIT) == 0) { 1162 vm_page_flag_clear(tp, PG_CLEANCHK); 1163 vm_page_wakeup(tp); 1164 break; 1165 } 1166 ma[ib] = tp; 1167 --ib; 1168 } 1169 ++ib; /* fixup */ 1170 1171 while (is < BLIST_MAX_ALLOC && 1172 pi - page_base + is < object->size) { 1173 vm_page_t tp; 1174 1175 tp = vm_page_lookup_busy_try(object, pi - page_base + is, 1176 TRUE, &error); 1177 if (error) 1178 break; 1179 if (tp == NULL) 1180 break; 1181 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1182 (tp->flags & PG_CLEANCHK) == 0) { 1183 vm_page_wakeup(tp); 1184 break; 1185 } 1186 if ((tp->queue - tp->pc) == PQ_CACHE) { 1187 vm_page_flag_clear(tp, PG_CLEANCHK); 1188 vm_page_wakeup(tp); 1189 break; 1190 } 1191 vm_page_test_dirty(tp); 1192 if ((tp->dirty & tp->valid) == 0 && 1193 (tp->flags & PG_NEED_COMMIT) == 0) { 1194 vm_page_flag_clear(tp, PG_CLEANCHK); 1195 vm_page_wakeup(tp); 1196 break; 1197 } 1198 ma[is] = tp; 1199 ++is; 1200 } 1201 1202 /* 1203 * All pages in the ma[] array are busied now 1204 */ 1205 for (i = ib; i < is; ++i) { 1206 vm_page_flag_clear(ma[i], PG_CLEANCHK); 1207 vm_page_hold(ma[i]); /* XXX need this any more? */ 1208 } 1209 vm_pageout_flush(&ma[ib], is - ib, pagerflags); 1210 for (i = ib; i < is; ++i) /* XXX need this any more? */ 1211 vm_page_unhold(ma[i]); 1212 } 1213 1214 /* 1215 * Same as vm_object_pmap_copy, except range checking really 1216 * works, and is meant for small sections of an object. 1217 * 1218 * This code protects resident pages by making them read-only 1219 * and is typically called on a fork or split when a page 1220 * is converted to copy-on-write. 1221 * 1222 * NOTE: If the page is already at VM_PROT_NONE, calling 1223 * vm_page_protect will have no effect. 1224 */ 1225 void 1226 vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end) 1227 { 1228 vm_pindex_t idx; 1229 vm_page_t p; 1230 1231 if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0) 1232 return; 1233 1234 vm_object_hold(object); 1235 for (idx = start; idx < end; idx++) { 1236 p = vm_page_lookup(object, idx); 1237 if (p == NULL) 1238 continue; 1239 vm_page_protect(p, VM_PROT_READ); 1240 } 1241 vm_object_drop(object); 1242 } 1243 1244 /* 1245 * Removes all physical pages in the specified object range from all 1246 * physical maps. 1247 * 1248 * The object must *not* be locked. 1249 */ 1250 1251 static int vm_object_pmap_remove_callback(vm_page_t p, void *data); 1252 1253 void 1254 vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end) 1255 { 1256 struct rb_vm_page_scan_info info; 1257 1258 if (object == NULL) 1259 return; 1260 if (start == end) 1261 return; 1262 info.start_pindex = start; 1263 info.end_pindex = end - 1; 1264 info.count = 0; 1265 info.object = object; 1266 1267 vm_object_hold(object); 1268 do { 1269 info.error = 0; 1270 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1271 vm_object_pmap_remove_callback, &info); 1272 } while (info.error); 1273 if (start == 0 && end == object->size) 1274 vm_object_clear_flag(object, OBJ_WRITEABLE); 1275 vm_object_drop(object); 1276 } 1277 1278 /* 1279 * The caller must hold the object 1280 */ 1281 static int 1282 vm_object_pmap_remove_callback(vm_page_t p, void *data) 1283 { 1284 struct rb_vm_page_scan_info *info = data; 1285 1286 if (info->object != p->object || 1287 p->pindex < info->start_pindex || 1288 p->pindex > info->end_pindex) { 1289 kprintf("vm_object_pmap_remove_callback: obj/pg race %p/%p\n", 1290 info->object, p); 1291 info->error = 1; 1292 return(0); 1293 } 1294 1295 vm_page_protect(p, VM_PROT_NONE); 1296 1297 /* 1298 * Must be at end to avoid SMP races, caller holds object token 1299 */ 1300 if ((++info->count & 63) == 0) 1301 lwkt_user_yield(); 1302 return(0); 1303 } 1304 1305 /* 1306 * Implements the madvise function at the object/page level. 1307 * 1308 * MADV_WILLNEED (any object) 1309 * 1310 * Activate the specified pages if they are resident. 1311 * 1312 * MADV_DONTNEED (any object) 1313 * 1314 * Deactivate the specified pages if they are resident. 1315 * 1316 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only) 1317 * 1318 * Deactivate and clean the specified pages if they are 1319 * resident. This permits the process to reuse the pages 1320 * without faulting or the kernel to reclaim the pages 1321 * without I/O. 1322 * 1323 * No requirements. 1324 */ 1325 void 1326 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, 1327 vm_pindex_t count, int advise) 1328 { 1329 vm_pindex_t end; 1330 vm_page_t m; 1331 int error; 1332 1333 if (object == NULL) 1334 return; 1335 1336 end = pindex + count; 1337 1338 vm_object_hold(object); 1339 1340 /* 1341 * Locate and adjust resident pages. This only applies to the 1342 * primary object in the mapping. 1343 */ 1344 for (; pindex < end; pindex += 1) { 1345 relookup: 1346 /* 1347 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages 1348 * and those pages must be OBJ_ONEMAPPING. 1349 */ 1350 if (advise == MADV_FREE) { 1351 if ((object->type != OBJT_DEFAULT && 1352 object->type != OBJT_SWAP) || 1353 (object->flags & OBJ_ONEMAPPING) == 0) { 1354 continue; 1355 } 1356 } 1357 1358 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); 1359 1360 if (error) { 1361 vm_page_sleep_busy(m, TRUE, "madvpo"); 1362 goto relookup; 1363 } 1364 if (m == NULL) { 1365 /* 1366 * There may be swap even if there is no backing page 1367 */ 1368 if (advise == MADV_FREE && object->type == OBJT_SWAP) 1369 swap_pager_freespace(object, pindex, 1); 1370 continue; 1371 } 1372 1373 /* 1374 * If the page is not in a normal active state, we skip it. 1375 * If the page is not managed there are no page queues to 1376 * mess with. Things can break if we mess with pages in 1377 * any of the below states. 1378 */ 1379 if (m->wire_count || 1380 (m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1381 m->valid != VM_PAGE_BITS_ALL 1382 ) { 1383 vm_page_wakeup(m); 1384 continue; 1385 } 1386 1387 /* 1388 * Theoretically once a page is known not to be busy, an 1389 * interrupt cannot come along and rip it out from under us. 1390 */ 1391 if (advise == MADV_WILLNEED) { 1392 vm_page_activate(m); 1393 } else if (advise == MADV_DONTNEED) { 1394 vm_page_dontneed(m); 1395 } else if (advise == MADV_FREE) { 1396 /* 1397 * Mark the page clean. This will allow the page 1398 * to be freed up by the system. However, such pages 1399 * are often reused quickly by malloc()/free() 1400 * so we do not do anything that would cause 1401 * a page fault if we can help it. 1402 * 1403 * Specifically, we do not try to actually free 1404 * the page now nor do we try to put it in the 1405 * cache (which would cause a page fault on reuse). 1406 * 1407 * But we do make the page is freeable as we 1408 * can without actually taking the step of unmapping 1409 * it. 1410 */ 1411 pmap_clear_modify(m); 1412 m->dirty = 0; 1413 m->act_count = 0; 1414 vm_page_dontneed(m); 1415 if (object->type == OBJT_SWAP) 1416 swap_pager_freespace(object, pindex, 1); 1417 } 1418 vm_page_wakeup(m); 1419 } 1420 vm_object_drop(object); 1421 } 1422 1423 /* 1424 * Removes all physical pages in the specified object range from the 1425 * object's list of pages. 1426 * 1427 * No requirements. 1428 */ 1429 static int vm_object_page_remove_callback(vm_page_t p, void *data); 1430 1431 void 1432 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 1433 boolean_t clean_only) 1434 { 1435 struct rb_vm_page_scan_info info; 1436 int all; 1437 1438 /* 1439 * Degenerate cases and assertions 1440 */ 1441 vm_object_hold(object); 1442 if (object == NULL || 1443 (object->resident_page_count == 0 && object->swblock_count == 0)) { 1444 vm_object_drop(object); 1445 return; 1446 } 1447 KASSERT(object->type != OBJT_PHYS, 1448 ("attempt to remove pages from a physical object")); 1449 1450 /* 1451 * Indicate that paging is occuring on the object 1452 */ 1453 vm_object_pip_add(object, 1); 1454 1455 /* 1456 * Figure out the actual removal range and whether we are removing 1457 * the entire contents of the object or not. If removing the entire 1458 * contents, be sure to get all pages, even those that might be 1459 * beyond the end of the object. 1460 */ 1461 info.object = object; 1462 info.start_pindex = start; 1463 if (end == 0) 1464 info.end_pindex = (vm_pindex_t)-1; 1465 else 1466 info.end_pindex = end - 1; 1467 info.limit = clean_only; 1468 info.count = 0; 1469 all = (start == 0 && info.end_pindex >= object->size - 1); 1470 1471 /* 1472 * Loop until we are sure we have gotten them all. 1473 */ 1474 do { 1475 info.error = 0; 1476 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1477 vm_object_page_remove_callback, &info); 1478 } while (info.error); 1479 1480 /* 1481 * Remove any related swap if throwing away pages, or for 1482 * non-swap objects (the swap is a clean copy in that case). 1483 */ 1484 if (object->type != OBJT_SWAP || clean_only == FALSE) { 1485 if (all) 1486 swap_pager_freespace_all(object); 1487 else 1488 swap_pager_freespace(object, info.start_pindex, 1489 info.end_pindex - info.start_pindex + 1); 1490 } 1491 1492 /* 1493 * Cleanup 1494 */ 1495 vm_object_pip_wakeup(object); 1496 vm_object_drop(object); 1497 } 1498 1499 /* 1500 * The caller must hold the object. 1501 * 1502 * NOTE: User yields are allowed when removing more than one page, but not 1503 * allowed if only removing one page (the path for single page removals 1504 * might hold a spinlock). 1505 */ 1506 static int 1507 vm_object_page_remove_callback(vm_page_t p, void *data) 1508 { 1509 struct rb_vm_page_scan_info *info = data; 1510 1511 if (info->object != p->object || 1512 p->pindex < info->start_pindex || 1513 p->pindex > info->end_pindex) { 1514 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n", 1515 info->object, p); 1516 return(0); 1517 } 1518 if (vm_page_busy_try(p, TRUE)) { 1519 vm_page_sleep_busy(p, TRUE, "vmopar"); 1520 info->error = 1; 1521 return(0); 1522 } 1523 if (info->object != p->object) { 1524 /* this should never happen */ 1525 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n", 1526 info->object, p); 1527 vm_page_wakeup(p); 1528 return(0); 1529 } 1530 1531 /* 1532 * Wired pages cannot be destroyed, but they can be invalidated 1533 * and we do so if clean_only (limit) is not set. 1534 * 1535 * WARNING! The page may be wired due to being part of a buffer 1536 * cache buffer, and the buffer might be marked B_CACHE. 1537 * This is fine as part of a truncation but VFSs must be 1538 * sure to fix the buffer up when re-extending the file. 1539 * 1540 * NOTE! PG_NEED_COMMIT is ignored. 1541 */ 1542 if (p->wire_count != 0) { 1543 vm_page_protect(p, VM_PROT_NONE); 1544 if (info->limit == 0) 1545 p->valid = 0; 1546 vm_page_wakeup(p); 1547 goto done; 1548 } 1549 1550 /* 1551 * limit is our clean_only flag. If set and the page is dirty or 1552 * requires a commit, do not free it. If set and the page is being 1553 * held by someone, do not free it. 1554 */ 1555 if (info->limit && p->valid) { 1556 vm_page_test_dirty(p); 1557 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1558 vm_page_wakeup(p); 1559 goto done; 1560 } 1561 } 1562 1563 /* 1564 * Destroy the page 1565 */ 1566 vm_page_protect(p, VM_PROT_NONE); 1567 vm_page_free(p); 1568 1569 /* 1570 * Must be at end to avoid SMP races, caller holds object token 1571 */ 1572 done: 1573 if ((++info->count & 63) == 0) 1574 lwkt_user_yield(); 1575 1576 return(0); 1577 } 1578 1579 /* 1580 * Try to extend prev_object into an adjoining region of virtual 1581 * memory, return TRUE on success. 1582 * 1583 * The caller does not need to hold (prev_object) but must have a stable 1584 * pointer to it (typically by holding the vm_map locked). 1585 * 1586 * This function only works for anonymous memory objects which either 1587 * have (a) one reference or (b) we are extending the object's size. 1588 * Otherwise the related VM pages we want to use for the object might 1589 * be in use by another mapping. 1590 */ 1591 boolean_t 1592 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex, 1593 vm_size_t prev_size, vm_size_t next_size) 1594 { 1595 vm_pindex_t next_pindex; 1596 1597 if (prev_object == NULL) 1598 return (TRUE); 1599 1600 vm_object_hold(prev_object); 1601 1602 if (prev_object->type != OBJT_DEFAULT && 1603 prev_object->type != OBJT_SWAP) { 1604 vm_object_drop(prev_object); 1605 return (FALSE); 1606 } 1607 1608 #if 0 1609 /* caller now checks this */ 1610 /* 1611 * Try to collapse the object first 1612 */ 1613 vm_object_collapse(prev_object, NULL); 1614 #endif 1615 1616 #if 0 1617 /* caller now checks this */ 1618 /* 1619 * We can't coalesce if we shadow another object (figuring out the 1620 * relationships become too complex). 1621 */ 1622 if (prev_object->backing_object != NULL) { 1623 vm_object_chain_release(prev_object); 1624 vm_object_drop(prev_object); 1625 return (FALSE); 1626 } 1627 #endif 1628 1629 prev_size >>= PAGE_SHIFT; 1630 next_size >>= PAGE_SHIFT; 1631 next_pindex = prev_pindex + prev_size; 1632 1633 /* 1634 * We can't if the object has more than one ref count unless we 1635 * are extending it into newly minted space. 1636 */ 1637 if (prev_object->ref_count > 1 && 1638 prev_object->size != next_pindex) { 1639 vm_object_drop(prev_object); 1640 return (FALSE); 1641 } 1642 1643 /* 1644 * Remove any pages that may still be in the object from a previous 1645 * deallocation. 1646 */ 1647 if (next_pindex < prev_object->size) { 1648 vm_object_page_remove(prev_object, 1649 next_pindex, 1650 next_pindex + next_size, FALSE); 1651 if (prev_object->type == OBJT_SWAP) 1652 swap_pager_freespace(prev_object, 1653 next_pindex, next_size); 1654 } 1655 1656 /* 1657 * Extend the object if necessary. 1658 */ 1659 if (next_pindex + next_size > prev_object->size) 1660 prev_object->size = next_pindex + next_size; 1661 vm_object_drop(prev_object); 1662 1663 return (TRUE); 1664 } 1665 1666 /* 1667 * Make the object writable and flag is being possibly dirty. 1668 * 1669 * The object might not be held (or might be held but held shared), 1670 * the related vnode is probably not held either. Object and vnode are 1671 * stable by virtue of the vm_page busied by the caller preventing 1672 * destruction. 1673 * 1674 * If the related mount is flagged MNTK_THR_SYNC we need to call 1675 * vsetobjdirty(). Filesystems using this option usually shortcut 1676 * synchronization by only scanning the syncer list. 1677 */ 1678 void 1679 vm_object_set_writeable_dirty(vm_object_t object) 1680 { 1681 struct vnode *vp; 1682 1683 /*vm_object_assert_held(object);*/ 1684 /* 1685 * Avoid contention in vm fault path by checking the state before 1686 * issuing an atomic op on it. 1687 */ 1688 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) != 1689 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) { 1690 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 1691 } 1692 if (object->type == OBJT_VNODE && 1693 (vp = (struct vnode *)object->handle) != NULL) { 1694 if ((vp->v_flag & VOBJDIRTY) == 0) { 1695 if (vp->v_mount && 1696 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) { 1697 /* 1698 * New style THR_SYNC places vnodes on the 1699 * syncer list more deterministically. 1700 */ 1701 vsetobjdirty(vp); 1702 } else { 1703 /* 1704 * Old style scan would not necessarily place 1705 * a vnode on the syncer list when possibly 1706 * modified via mmap. 1707 */ 1708 vsetflags(vp, VOBJDIRTY); 1709 } 1710 } 1711 } 1712 } 1713 1714 #include "opt_ddb.h" 1715 #ifdef DDB 1716 #include <sys/cons.h> 1717 1718 #include <ddb/ddb.h> 1719 1720 static int _vm_object_in_map (vm_map_t map, vm_object_t object, 1721 vm_map_entry_t entry); 1722 static int vm_object_in_map (vm_object_t object); 1723 1724 /* 1725 * The caller must hold the object. 1726 */ 1727 static int 1728 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) 1729 { 1730 vm_map_backing_t ba; 1731 vm_map_t tmpm; 1732 vm_map_entry_t tmpe; 1733 int entcount; 1734 1735 if (map == NULL) 1736 return 0; 1737 if (entry == NULL) { 1738 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root); 1739 entcount = map->nentries; 1740 while (entcount-- && tmpe) { 1741 if( _vm_object_in_map(map, object, tmpe)) { 1742 return 1; 1743 } 1744 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1745 } 1746 return (0); 1747 } 1748 switch(entry->maptype) { 1749 case VM_MAPTYPE_SUBMAP: 1750 tmpm = entry->ba.sub_map; 1751 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root); 1752 entcount = tmpm->nentries; 1753 while (entcount-- && tmpe) { 1754 if( _vm_object_in_map(tmpm, object, tmpe)) { 1755 return 1; 1756 } 1757 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1758 } 1759 break; 1760 case VM_MAPTYPE_NORMAL: 1761 case VM_MAPTYPE_VPAGETABLE: 1762 ba = &entry->ba; 1763 while (ba) { 1764 if (ba->object == object) 1765 return TRUE; 1766 ba = ba->backing_ba; 1767 } 1768 break; 1769 default: 1770 break; 1771 } 1772 return 0; 1773 } 1774 1775 static int vm_object_in_map_callback(struct proc *p, void *data); 1776 1777 struct vm_object_in_map_info { 1778 vm_object_t object; 1779 int rv; 1780 }; 1781 1782 /* 1783 * Debugging only 1784 */ 1785 static int 1786 vm_object_in_map(vm_object_t object) 1787 { 1788 struct vm_object_in_map_info info; 1789 1790 info.rv = 0; 1791 info.object = object; 1792 1793 allproc_scan(vm_object_in_map_callback, &info, 0); 1794 if (info.rv) 1795 return 1; 1796 if( _vm_object_in_map(&kernel_map, object, 0)) 1797 return 1; 1798 if( _vm_object_in_map(&pager_map, object, 0)) 1799 return 1; 1800 if( _vm_object_in_map(&buffer_map, object, 0)) 1801 return 1; 1802 return 0; 1803 } 1804 1805 /* 1806 * Debugging only 1807 */ 1808 static int 1809 vm_object_in_map_callback(struct proc *p, void *data) 1810 { 1811 struct vm_object_in_map_info *info = data; 1812 1813 if (p->p_vmspace) { 1814 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) { 1815 info->rv = 1; 1816 return -1; 1817 } 1818 } 1819 return (0); 1820 } 1821 1822 DB_SHOW_COMMAND(vmochk, vm_object_check) 1823 { 1824 struct vm_object_hash *hash; 1825 vm_object_t object; 1826 int n; 1827 1828 /* 1829 * make sure that internal objs are in a map somewhere 1830 * and none have zero ref counts. 1831 */ 1832 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1833 hash = &vm_object_hash[n]; 1834 for (object = TAILQ_FIRST(&hash->list); 1835 object != NULL; 1836 object = TAILQ_NEXT(object, object_entry)) { 1837 if (object->type == OBJT_MARKER) 1838 continue; 1839 if (object->handle != NULL || 1840 (object->type != OBJT_DEFAULT && 1841 object->type != OBJT_SWAP)) { 1842 continue; 1843 } 1844 if (object->ref_count == 0) { 1845 db_printf("vmochk: internal obj has " 1846 "zero ref count: %ld\n", 1847 (long)object->size); 1848 } 1849 if (vm_object_in_map(object)) 1850 continue; 1851 db_printf("vmochk: internal obj is not in a map: " 1852 "ref: %d, size: %lu: 0x%lx\n", 1853 object->ref_count, (u_long)object->size, 1854 (u_long)object->size); 1855 } 1856 } 1857 } 1858 1859 /* 1860 * Debugging only 1861 */ 1862 DB_SHOW_COMMAND(object, vm_object_print_static) 1863 { 1864 /* XXX convert args. */ 1865 vm_object_t object = (vm_object_t)addr; 1866 boolean_t full = have_addr; 1867 1868 vm_page_t p; 1869 1870 /* XXX count is an (unused) arg. Avoid shadowing it. */ 1871 #define count was_count 1872 1873 int count; 1874 1875 if (object == NULL) 1876 return; 1877 1878 db_iprintf( 1879 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n", 1880 object, (int)object->type, (u_long)object->size, 1881 object->resident_page_count, object->ref_count, object->flags); 1882 /* 1883 * XXX no %qd in kernel. Truncate object->backing_object_offset. 1884 */ 1885 db_iprintf("\n"); 1886 1887 if (!full) 1888 return; 1889 1890 db_indent += 2; 1891 count = 0; 1892 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) { 1893 if (count == 0) 1894 db_iprintf("memory:="); 1895 else if (count == 6) { 1896 db_printf("\n"); 1897 db_iprintf(" ..."); 1898 count = 0; 1899 } else 1900 db_printf(","); 1901 count++; 1902 1903 db_printf("(off=0x%lx,page=0x%lx)", 1904 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); 1905 } 1906 if (count != 0) 1907 db_printf("\n"); 1908 db_indent -= 2; 1909 } 1910 1911 /* XXX. */ 1912 #undef count 1913 1914 /* 1915 * XXX need this non-static entry for calling from vm_map_print. 1916 * 1917 * Debugging only 1918 */ 1919 void 1920 vm_object_print(/* db_expr_t */ long addr, 1921 boolean_t have_addr, 1922 /* db_expr_t */ long count, 1923 char *modif) 1924 { 1925 vm_object_print_static(addr, have_addr, count, modif); 1926 } 1927 1928 /* 1929 * Debugging only 1930 */ 1931 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1932 { 1933 struct vm_object_hash *hash; 1934 vm_object_t object; 1935 int nl = 0; 1936 int c; 1937 int n; 1938 1939 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1940 hash = &vm_object_hash[n]; 1941 for (object = TAILQ_FIRST(&hash->list); 1942 object != NULL; 1943 object = TAILQ_NEXT(object, object_entry)) { 1944 vm_pindex_t idx, fidx; 1945 vm_pindex_t osize; 1946 vm_paddr_t pa = -1, padiff; 1947 int rcount; 1948 vm_page_t m; 1949 1950 if (object->type == OBJT_MARKER) 1951 continue; 1952 db_printf("new object: %p\n", (void *)object); 1953 if ( nl > 18) { 1954 c = cngetc(); 1955 if (c != ' ') 1956 return; 1957 nl = 0; 1958 } 1959 nl++; 1960 rcount = 0; 1961 fidx = 0; 1962 osize = object->size; 1963 if (osize > 128) 1964 osize = 128; 1965 for (idx = 0; idx < osize; idx++) { 1966 m = vm_page_lookup(object, idx); 1967 if (m == NULL) { 1968 if (rcount) { 1969 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1970 (long)fidx, rcount, (long)pa); 1971 if ( nl > 18) { 1972 c = cngetc(); 1973 if (c != ' ') 1974 return; 1975 nl = 0; 1976 } 1977 nl++; 1978 rcount = 0; 1979 } 1980 continue; 1981 } 1982 1983 if (rcount && 1984 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1985 ++rcount; 1986 continue; 1987 } 1988 if (rcount) { 1989 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1990 padiff >>= PAGE_SHIFT; 1991 padiff &= PQ_L2_MASK; 1992 if (padiff == 0) { 1993 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1994 ++rcount; 1995 continue; 1996 } 1997 db_printf(" index(%ld)run(%d)pa(0x%lx)", 1998 (long)fidx, rcount, (long)pa); 1999 db_printf("pd(%ld)\n", (long)padiff); 2000 if ( nl > 18) { 2001 c = cngetc(); 2002 if (c != ' ') 2003 return; 2004 nl = 0; 2005 } 2006 nl++; 2007 } 2008 fidx = idx; 2009 pa = VM_PAGE_TO_PHYS(m); 2010 rcount = 1; 2011 } 2012 if (rcount) { 2013 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 2014 (long)fidx, rcount, (long)pa); 2015 if ( nl > 18) { 2016 c = cngetc(); 2017 if (c != ' ') 2018 return; 2019 nl = 0; 2020 } 2021 nl++; 2022 } 2023 } 2024 } 2025 } 2026 #endif /* DDB */ 2027