/*
 * Copyright (c) 1991, 1993, 2013
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $
 */

/*
 * Virtual memory object module.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>		/* for curproc, pageproc */
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/refcount.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <vm/vm_page2.h>

#include <machine/specialreg.h>

#define EASY_SCAN_FACTOR	8

static void	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
					     int pagerflags);
static void	vm_object_lock_init(vm_object_t);

/*
 * Virtual memory objects maintain the actual data
 * associated with allocated virtual memory.  A given
 * page of memory exists within exactly one object.
 *
 * An object is only deallocated when all "references"
 * are given up.  Only one "reference" to a given
 * region of an object should be writeable.
 *
 * Associated with each object is a list of all resident
 * memory pages belonging to that object; this list is
 * maintained by the "vm_page" module, and locked by the object's
 * lock.
 *
 * Each object also records a "pager" routine which is
 * used to retrieve (and store) pages to the proper backing
 * storage.  In addition, objects may be backed by other
 * objects from which they were virtual-copied.
 *
 * The only items within the object structure which are
 * modified after time of creation are:
 *	reference count		locked by object's lock
 *	pager routine		locked by object's lock
 *
 */

struct vm_object kernel_object;

struct vm_object_hash vm_object_hash[VMOBJ_HSIZE];

MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures");

#define VMOBJ_HASH_PRIME1	66555444443333333ULL
#define VMOBJ_HASH_PRIME2	989042931893ULL

int vm_object_debug;
SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, "");

static __inline
struct vm_object_hash *
vmobj_hash(vm_object_t obj)
{
	uintptr_t hash1;
	uintptr_t hash2;

	hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18);
	hash1 %= VMOBJ_HASH_PRIME1;
	hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24);
	hash2 %= VMOBJ_HASH_PRIME2;
	return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]);
}

#if defined(DEBUG_LOCKS)

#define vm_object_vndeallocate(obj, vpp)	\
		debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__)

/*
 * Debug helper to track hold/drop/ref/deallocate calls.
 */
static void
debugvm_object_add(vm_object_t obj, char *file, int line, int addrem)
{
	int i;

	i = atomic_fetchadd_int(&obj->debug_index, 1);
	i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1);
	ksnprintf(obj->debug_hold_thrs[i],
		  sizeof(obj->debug_hold_thrs[i]),
		  "%c%d:(%d):%s",
		  (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')),
		  (curthread->td_proc ?
		   curthread->td_proc->p_pid : -1),
		  obj->ref_count,
		  curthread->td_comm);
	obj->debug_hold_file[i] = file;
	obj->debug_hold_line[i] = line;
#if 0
	/* Uncomment for debugging obj refs/derefs in reproducible cases */
	if (strcmp(curthread->td_comm, "sshd") == 0) {
		kprintf("%d %p refs=%d ar=%d file: %s/%d\n",
			(curthread->td_proc ? curthread->td_proc->p_pid : -1),
			obj, obj->ref_count, addrem, file, line);
	}
#endif
}

#endif

/*
 * Misc low level routines
 */
static void
vm_object_lock_init(vm_object_t obj)
{
#if defined(DEBUG_LOCKS)
	int i;

	obj->debug_index = 0;
	for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) {
		obj->debug_hold_thrs[i][0] = 0;
		obj->debug_hold_file[i] = NULL;
		obj->debug_hold_line[i] = 0;
	}
#endif
}

void
vm_object_lock_swap(void)
{
	lwkt_token_swap();
}

void
vm_object_lock(vm_object_t obj)
{
	lwkt_gettoken(&obj->token);
}

/*
 * Returns TRUE on success
 */
static int
vm_object_lock_try(vm_object_t obj)
{
	return(lwkt_trytoken(&obj->token));
}

void
vm_object_lock_shared(vm_object_t obj)
{
	lwkt_gettoken_shared(&obj->token);
}

void
vm_object_unlock(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
}

void
vm_object_upgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken(&obj->token);
}

void
vm_object_downgrade(vm_object_t obj)
{
	lwkt_reltoken(&obj->token);
	lwkt_gettoken_shared(&obj->token);
}

static __inline void
vm_object_assert_held(vm_object_t obj)
{
	ASSERT_LWKT_TOKEN_HELD(&obj->token);
}

int
vm_quickcolor(void)
{
	globaldata_t gd = mycpu;
	int pg_color;

	pg_color = (int)(intptr_t)gd->gd_curthread >> 10;
	pg_color += gd->gd_quick_color;
	gd->gd_quick_color += PQ_PRIME2;

	return pg_color;
}

void
VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

int
VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
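	 *
	 * The hold also keeps the vm_object structure itself from being
	 * kfree()d out from under us (see vm_object_drop()), even if the
	 * object's last ref_count reference goes away while we block.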
	 */
	refcount_acquire(&obj->hold_count);
	if (vm_object_lock_try(obj) == 0) {
		if (refcount_release(&obj->hold_count)) {
			if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD))
				kfree(obj, M_VM_OBJECT);
		}
		return(0);
	}

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
	return(1);
}

void
VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS)
{
	KKASSERT(obj != NULL);

	/*
	 * Object must be held (object allocation is stable due to the
	 * caller's context, typically already holding the token on a
	 * parent object) prior to potentially blocking on the lock,
	 * otherwise the object can get ripped away from us.
	 */
	refcount_acquire(&obj->hold_count);
	vm_object_lock_shared(obj);

#if defined(DEBUG_LOCKS)
	debugvm_object_add(obj, file, line, 1);
#endif
}

/*
 * Drop the token and hold_count on the object.
 *
 * WARNING! Token might be shared.
 */
void
VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS)
{
	if (obj == NULL)
		return;

	/*
	 * No new holders should be possible once we drop hold_count 1->0 as
	 * there is no longer any way to reference the object.
	 */
	KKASSERT(obj->hold_count > 0);
	if (refcount_release(&obj->hold_count)) {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif

		if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) {
			vm_object_unlock(obj);
			kfree(obj, M_VM_OBJECT);
		} else {
			vm_object_unlock(obj);
		}
	} else {
#if defined(DEBUG_LOCKS)
		debugvm_object_add(obj, file, line, -1);
#endif
		vm_object_unlock(obj);
	}
}

/*
 * Initialize a freshly allocated object, returning a held object.
 *
 * Used only by vm_object_allocate(), zinitna() and vm_object_init().
 *
 * No requirements.
 */
void
_vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
{
	struct vm_object_hash *hash;

	RB_INIT(&object->rb_memq);
	lwkt_token_init(&object->token, "vmobj");

	TAILQ_INIT(&object->backing_list);
	lockinit(&object->backing_lk, "baclk", 0, 0);

	object->type = type;
	object->size = size;
	object->ref_count = 1;
	object->memattr = VM_MEMATTR_DEFAULT;
	object->hold_count = 0;
	object->flags = 0;
	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
		vm_object_set_flag(object, OBJ_ONEMAPPING);
	object->paging_in_progress = 0;
	object->resident_page_count = 0;
	/* cpu localization twist */
	object->pg_color = vm_quickcolor();
	object->handle = NULL;

	atomic_add_int(&object->generation, 1);
	object->swblock_count = 0;
	RB_INIT(&object->swblock_root);
	vm_object_lock_init(object);
	pmap_object_init(object);

	vm_object_hold(object);

	hash = vmobj_hash(object);
	lwkt_gettoken(&hash->token);
	TAILQ_INSERT_TAIL(&hash->list, object, object_entry);
	lwkt_reltoken(&hash->token);
}

/*
 * Initialize a VM object.
 */
void
vm_object_init(vm_object_t object, vm_pindex_t size)
{
	_vm_object_allocate(OBJT_DEFAULT, size, object);
	vm_object_drop(object);
}

/*
 * Initialize the VM objects module.
 *
 * Called from the low level boot code only.  Note that this occurs before
 * kmalloc is initialized so we cannot allocate any VM objects.
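 *
 * The kernel_object used here is statically declared above, so it can be
 * set up without any allocation.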
 */
void
vm_object_init1(void)
{
	int i;

	for (i = 0; i < VMOBJ_HSIZE; ++i) {
		TAILQ_INIT(&vm_object_hash[i].list);
		lwkt_token_init(&vm_object_hash[i].token, "vmobjlst");
	}

	_vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd),
			    &kernel_object);
	vm_object_drop(&kernel_object);
}

void
vm_object_init2(void)
{
	kmalloc_set_unlimited(M_VM_OBJECT);
}

/*
 * Allocate and return a new object of the specified type and size.
 *
 * No requirements.
 */
vm_object_t
vm_object_allocate(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);
	vm_object_drop(obj);

	return (obj);
}

/*
 * This version returns a held object, allowing further atomic initialization
 * of the object.
 */
vm_object_t
vm_object_allocate_hold(objtype_t type, vm_pindex_t size)
{
	vm_object_t obj;

	obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO);
	_vm_object_allocate(type, size, obj);

	return (obj);
}

/*
 * Add an additional reference to a vm_object.  The object must already be
 * held.  The original non-lock version is no longer supported.  The object
 * must NOT be chain locked by anyone at the time the reference is added.
 *
 * The object must be held, but may be held shared if desired (hence why
 * we use an atomic op).
 */
void
VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object != NULL);
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE) {
		vref(object->handle);
		/* XXX what if the vnode is being destroyed? */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * This version is only allowed in situations where the caller
 * already knows that the object is deterministically referenced
 * (usually because it's taken from a ref'd vnode, or during a map_entry
 * replication).
 */
void
VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS)
{
	KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0);
	atomic_add_int(&object->ref_count, 1);
	if (object->type == OBJT_VNODE)
		vref(object->handle);
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, 1);
#endif
}

/*
 * Dereference an object and its underlying vnode.  The object may be
 * held shared.  On return the object will remain held.
 *
 * This function may return a vnode in *vpp which the caller must release
 * after the caller drops its own lock.  If vpp is NULL, we assume that
 * the caller was holding an exclusive lock on the object and we vrele()
 * the vp ourselves.
 */
static void
VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp
				   VMOBJDBARGS)
{
	struct vnode *vp = (struct vnode *) object->handle;

	KASSERT(object->type == OBJT_VNODE,
	    ("vm_object_vndeallocate: not a vnode object"));
	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
#ifdef INVARIANTS
	if (object->ref_count == 0) {
		vprint("vm_object_vndeallocate", vp);
		panic("vm_object_vndeallocate: bad object reference count");
	}
#endif
	for (;;) {
		int count = object->ref_count;
		cpu_ccfence();
		if (count == 1) {
			vm_object_upgrade(object);
			if (atomic_cmpset_int(&object->ref_count, count, 0)) {
				vclrflags(vp, VTEXT);
				break;
			}
		} else {
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
				break;
			}
		}
		/* retry */
	}
#if defined(DEBUG_LOCKS)
	debugvm_object_add(object, file, line, -1);
#endif

	/*
	 * vrele or return the vp to vrele.  We can only safely vrele(vp)
	 * if the object was locked exclusively.  But there are two races
	 * here.
	 *
	 * We had to upgrade the object above to safely clear VTEXT
	 * but the alternative path where the shared lock is retained
	 * can STILL race to 0 in other paths and cause our own vrele()
	 * to terminate the vnode.  We can't allow that if the VM object
	 * is still locked shared.
	 */
	if (vpp)
		*vpp = vp;
	else
		vrele(vp);
}

/*
 * Release a reference to the specified object, gained either through a
 * vm_object_allocate or a vm_object_reference call.  When all references
 * are gone, storage associated with this object may be relinquished.
 *
 * The caller does not have to hold the object locked but must have control
 * over the reference in question in order to guarantee that the object
 * does not get ripped out from under us.
 *
 * XXX Currently all deallocations require an exclusive lock.
 */
void
VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS)
{
	struct vnode *vp;
	int count;

	if (object == NULL)
		return;

	for (;;) {
		count = object->ref_count;
		cpu_ccfence();

		/*
		 * If decrementing the count enters into special handling
		 * territory (0, 1, or 2) we have to do it the hard way.
		 * Fortunately, though, objects with only a few refs like this
		 * are not likely to be heavily contended anyway.
		 *
		 * For vnode objects we only care about 1->0 transitions.
		 */
		if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) {
#if defined(DEBUG_LOCKS)
			debugvm_object_add(object, file, line, 0);
#endif
			vm_object_hold(object);
			vm_object_deallocate_locked(object);
			vm_object_drop(object);
			break;
		}

		/*
		 * Try to decrement ref_count without acquiring a hold on
		 * the object.  This is particularly important for the exec*()
		 * and exit*() code paths because the program binary may
		 * have a great deal of sharing and an exclusive lock will
		 * crowbar performance in those circumstances.
		 */
		if (object->type == OBJT_VNODE) {
			vp = (struct vnode *)object->handle;
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif

				vrele(vp);
				break;
			}
			/* retry */
		} else {
			if (atomic_cmpset_int(&object->ref_count,
					      count, count - 1)) {
#if defined(DEBUG_LOCKS)
				debugvm_object_add(object, file, line, -1);
#endif
				break;
			}
			/* retry */
		}
		/* retry */
	}
}

void
VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS)
{
	/*
	 * Degenerate case
	 */
	if (object == NULL)
		return;

	/*
	 * vnode case, caller either locked the object exclusively
	 * or this is a recursion with must_drop != 0 and the vnode
	 * object will be locked shared.
	 *
	 * If locked shared we have to drop the object before we can
	 * call vrele() or risk a shared/exclusive livelock.
	 */
	if (object->type == OBJT_VNODE) {
		ASSERT_LWKT_TOKEN_HELD(&object->token);
		vm_object_vndeallocate(object, NULL);
		return;
	}
	ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token);

	/*
	 * Normal case (object is locked exclusively)
	 */
	if (object->ref_count == 0) {
		panic("vm_object_deallocate: object deallocated "
		      "too many times: %d", object->type);
	}
	if (object->ref_count > 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	/*
	 * Drop the ref and handle termination on the 1->0 transition.
	 * We may have blocked above so we have to recheck.
	 */
	KKASSERT(object->ref_count != 0);
	if (object->ref_count >= 2) {
		atomic_add_int(&object->ref_count, -1);
#if defined(DEBUG_LOCKS)
		debugvm_object_add(object, file, line, -1);
#endif
		return;
	}

	atomic_add_int(&object->ref_count, -1);
	if ((object->flags & OBJ_DEAD) == 0)
		vm_object_terminate(object);
}

/*
 * Destroy the specified object, freeing up related resources.
 *
 * The object must have zero references.
 *
 * The object must be held.  The caller is responsible for dropping the
 * object after terminate returns.  Terminate does NOT drop the object.
 */
static int vm_object_terminate_callback(vm_page_t p, void *data);

void
vm_object_terminate(vm_object_t object)
{
	struct rb_vm_page_scan_info info;
	struct vm_object_hash *hash;

	/*
	 * Make sure no one uses us.  Once we set OBJ_DEAD we should be
	 * able to safely block.
	 */
	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
	KKASSERT((object->flags & OBJ_DEAD) == 0);
	vm_object_set_flag(object, OBJ_DEAD);

	/*
	 * Wait for the pageout daemon to be done with the object
	 */
	vm_object_pip_wait(object, "objtrm1");

	KASSERT(!object->paging_in_progress,
		("vm_object_terminate: pageout in progress"));

	/*
	 * Clean and free the pages, as appropriate.  All references to the
	 * object are gone, so we don't need to lock it.
	 */
	if (object->type == OBJT_VNODE) {
		struct vnode *vp;

		/*
		 * Clean pages and flush buffers.
		 *
		 * NOTE!  TMPFS buffer flushes do not typically flush the
		 *	  actual page to swap as this would be highly
		 *	  inefficient, and normal filesystems usually wrap
		 *	  page flushes with buffer cache buffers.
771 * 772 * To deal with this we have to call vinvalbuf() both 773 * before and after the vm_object_page_clean(). 774 */ 775 vp = (struct vnode *) object->handle; 776 vinvalbuf(vp, V_SAVE, 0, 0); 777 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 778 vinvalbuf(vp, V_SAVE, 0, 0); 779 } 780 781 /* 782 * Wait for any I/O to complete, after which there had better not 783 * be any references left on the object. 784 */ 785 vm_object_pip_wait(object, "objtrm2"); 786 787 if (object->ref_count != 0) { 788 panic("vm_object_terminate: object with references, " 789 "ref_count=%d", object->ref_count); 790 } 791 792 /* 793 * Cleanup any shared pmaps associated with this object. 794 */ 795 pmap_object_free(object); 796 797 /* 798 * Now free any remaining pages. For internal objects, this also 799 * removes them from paging queues. Don't free wired pages, just 800 * remove them from the object. 801 */ 802 info.count = 0; 803 info.object = object; 804 do { 805 info.error = 0; 806 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 807 vm_object_terminate_callback, &info); 808 } while (info.error); 809 810 /* 811 * Let the pager know object is dead. 812 */ 813 vm_pager_deallocate(object); 814 815 /* 816 * Wait for the object hold count to hit 1, clean out pages as 817 * we go. vmobj_token interlocks any race conditions that might 818 * pick the object up from the vm_object_list after we have cleared 819 * rb_memq. 820 */ 821 for (;;) { 822 if (RB_ROOT(&object->rb_memq) == NULL) 823 break; 824 kprintf("vm_object_terminate: Warning, object %p " 825 "still has %ld pages\n", 826 object, object->resident_page_count); 827 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 828 vm_object_terminate_callback, &info); 829 } 830 831 /* 832 * There had better not be any pages left 833 */ 834 KKASSERT(object->resident_page_count == 0); 835 836 /* 837 * Remove the object from the global object list. 838 */ 839 hash = vmobj_hash(object); 840 lwkt_gettoken(&hash->token); 841 TAILQ_REMOVE(&hash->list, object, object_entry); 842 lwkt_reltoken(&hash->token); 843 844 if (object->ref_count != 0) { 845 panic("vm_object_terminate2: object with references, " 846 "ref_count=%d", object->ref_count); 847 } 848 849 /* 850 * NOTE: The object hold_count is at least 1, so we cannot kfree() 851 * the object here. See vm_object_drop(). 852 */ 853 } 854 855 /* 856 * The caller must hold the object. 857 */ 858 static int 859 vm_object_terminate_callback(vm_page_t p, void *data) 860 { 861 struct rb_vm_page_scan_info *info = data; 862 vm_object_t object; 863 864 object = p->object; 865 KKASSERT(object == info->object); 866 if (vm_page_busy_try(p, TRUE)) { 867 vm_page_sleep_busy(p, TRUE, "vmotrm"); 868 info->error = 1; 869 return 0; 870 } 871 if (object != p->object) { 872 /* XXX remove once we determine it can't happen */ 873 kprintf("vm_object_terminate: Warning: Encountered " 874 "busied page %p on queue %d\n", p, p->queue); 875 vm_page_wakeup(p); 876 info->error = 1; 877 } else if (p->wire_count == 0) { 878 /* 879 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 
		 */
		vm_page_free(p);
		mycpu->gd_cnt.v_pfree++;
	} else {
		if (p->queue != PQ_NONE) {
			kprintf("vm_object_terminate: Warning: Encountered "
				"wired page %p on queue %d\n", p, p->queue);
			if (vm_object_debug > 0) {
				--vm_object_debug;
				print_backtrace(10);
			}
		}
		vm_page_remove(p);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Clean all dirty pages in the specified range of the object.  Leaves the
 * page on whatever queue it is currently on.  If NOSYNC is set then do not
 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC),
 * leaving the object dirty.
 *
 * When stuffing pages asynchronously, allow clustering.  XXX we need a
 * synchronous clustering mode implementation.
 *
 * Odd semantics: if start == end, we clean everything.
 *
 * The object must be locked? XXX
 */
static int vm_object_page_clean_pass1(struct vm_page *p, void *data);
static int vm_object_page_clean_pass2(struct vm_page *p, void *data);

void
vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		     int flags)
{
	struct rb_vm_page_scan_info info;
	struct vnode *vp;
	int wholescan;
	int pagerflags;
	int generation;

	vm_object_hold(object);
	if (object->type != OBJT_VNODE ||
	    (object->flags & OBJ_MIGHTBEDIRTY) == 0) {
		vm_object_drop(object);
		return;
	}

	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ?
			VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;

	vp = object->handle;

	/*
	 * Interlock other major object operations.  This allows us to
	 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
	 */
	vm_object_set_flag(object, OBJ_CLEANING);

	/*
	 * Handle 'entire object' case
	 */
	info.start_pindex = start;
	if (end == 0) {
		info.end_pindex = object->size - 1;
	} else {
		info.end_pindex = end - 1;
	}
	wholescan = (start == 0 && info.end_pindex == object->size - 1);
	info.limit = flags;
	info.pagerflags = pagerflags;
	info.object = object;

	/*
	 * If cleaning the entire object do a pass to mark the pages read-only.
	 * If everything worked out ok, clear OBJ_WRITEABLE and
	 * OBJ_MIGHTBEDIRTY.
	 */
	if (wholescan) {
		info.error = 0;
		info.count = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass1, &info);
		if (info.error == 0) {
			vm_object_clear_flag(object,
					     OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
			if (object->type == OBJT_VNODE &&
			    (vp = (struct vnode *)object->handle) != NULL) {
				/*
				 * Use new-style interface to clear VISDIRTY
				 * because the vnode is not necessarily removed
				 * from the syncer list(s) as often as it was
				 * under the old interface, which can leave
				 * the vnode on the syncer list after reclaim.
				 */
				vclrobjdirty(vp);
			}
		}
	}

	/*
	 * Do a pass to clean all the dirty pages we find.
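	 * The pass is repeated until it completes without being interrupted
	 * and the object's generation count remains stable.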
	 */
	do {
		info.error = 0;
		info.count = 0;
		generation = object->generation;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_clean_pass2, &info);
	} while (info.error || generation != object->generation);

	vm_object_clear_flag(object, OBJ_CLEANING);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 */
static
int
vm_object_page_clean_pass1(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	KKASSERT(p->object == info->object);

	vm_page_flag_set(p, PG_CLEANCHK);
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		info->error = 1;
	} else if (vm_page_busy_try(p, FALSE)) {
		info->error = 1;
	} else {
		KKASSERT(p->object == info->object);
		vm_page_protect(p, VM_PROT_READ);
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * The caller must hold the object
 */
static
int
vm_object_page_clean_pass2(struct vm_page *p, void *data)
{
	struct rb_vm_page_scan_info *info = data;
	int generation;

	KKASSERT(p->object == info->object);

	/*
	 * Do not mess with pages that were inserted after we started
	 * the cleaning pass.
	 */
	if ((p->flags & PG_CLEANCHK) == 0)
		goto done;

	generation = info->object->generation;

	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vpcwai");
		info->error = 1;
		goto done;
	}

	KKASSERT(p->object == info->object &&
		 info->object->generation == generation);

	/*
	 * Before wasting time traversing the pmaps, check for trivial
	 * cases where the page cannot be dirty.
	 */
	if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) {
		KKASSERT((p->dirty & p->valid) == 0 &&
			 (p->flags & PG_NEED_COMMIT) == 0);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Check whether the page is dirty or not.  The page has been set
	 * to be read-only so the check will not race a user dirtying the
	 * page.
	 */
	vm_page_test_dirty(p);
	if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * If we have been asked to skip nosync pages and this is a
	 * nosync page, skip it.  Note that the object flags were
	 * not cleared in this case (because pass1 will have returned an
	 * error), so we do not have to set them.
	 */
	if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
		vm_page_flag_clear(p, PG_CLEANCHK);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Flush as many pages as we can.  PG_CLEANCHK will be cleared on
	 * the pages that get successfully flushed.  Set info->error if
	 * we raced an object modification.
	 */
	vm_object_page_collect_flush(info->object, p, info->pagerflags);
	/* vm_wait_nominal(); this can deadlock the system in syncer/pageout */

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();
	return(0);
}

/*
 * Collect the specified page and nearby pages and flush them out.
 * The number of pages flushed is returned.  The passed page is busied
 * by the caller and we are responsible for its disposition.
 *
 * The caller must hold the object.
 */
static void
vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags)
{
	int error;
	int is;
	int ib;
	int i;
	int page_base;
	vm_pindex_t pi;
	vm_page_t ma[BLIST_MAX_ALLOC];

	ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));

	pi = p->pindex;
	page_base = pi % BLIST_MAX_ALLOC;
	ma[page_base] = p;
	ib = page_base - 1;
	is = page_base + 1;

	while (ib >= 0) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + ib,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[ib] = tp;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pi - page_base + is < object->size) {
		vm_page_t tp;

		tp = vm_page_lookup_busy_try(object, pi - page_base + is,
					     TRUE, &error);
		if (error)
			break;
		if (tp == NULL)
			break;
		if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
		    (tp->flags & PG_CLEANCHK) == 0) {
			vm_page_wakeup(tp);
			break;
		}
		if ((tp->queue - tp->pc) == PQ_CACHE) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		vm_page_test_dirty(tp);
		if ((tp->dirty & tp->valid) == 0 &&
		    (tp->flags & PG_NEED_COMMIT) == 0) {
			vm_page_flag_clear(tp, PG_CLEANCHK);
			vm_page_wakeup(tp);
			break;
		}
		ma[is] = tp;
		++is;
	}

	/*
	 * All pages in the ma[] array are busied now
	 */
	for (i = ib; i < is; ++i) {
		vm_page_flag_clear(ma[i], PG_CLEANCHK);
		vm_page_hold(ma[i]);	/* XXX need this any more? */
	}
	vm_pageout_flush(&ma[ib], is - ib, pagerflags);
	for (i = ib; i < is; ++i)	/* XXX need this any more? */
		vm_page_unhold(ma[i]);
}

/*
 * Implements the madvise function at the object/page level.
 *
 *	MADV_WILLNEED	(any object)
 *
 *	    Activate the specified pages if they are resident.
 *
 *	MADV_DONTNEED	(any object)
 *
 *	    Deactivate the specified pages if they are resident.
 *
 *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only)
 *
 *	    Deactivate and clean the specified pages if they are
 *	    resident.  This permits the process to reuse the pages
 *	    without faulting or the kernel to reclaim the pages
 *	    without I/O.
 *
 * No requirements.
 */
void
vm_object_madvise(vm_object_t object, vm_pindex_t pindex,
		  vm_pindex_t count, int advise)
{
	vm_pindex_t end;
	vm_page_t m;
	int error;

	if (object == NULL)
		return;

	end = pindex + count;

	vm_object_hold(object);

	/*
	 * Locate and adjust resident pages.  This only applies to the
	 * primary object in the mapping.
	 */
	for (; pindex < end; pindex += 1) {
relookup:
		/*
		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
		 * and those pages must be OBJ_ONEMAPPING.
		 */
		if (advise == MADV_FREE) {
			if ((object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP) ||
			    (object->flags & OBJ_ONEMAPPING) == 0) {
				continue;
			}
		}

		m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);

		if (error) {
			vm_page_sleep_busy(m, TRUE, "madvpo");
			goto relookup;
		}
		if (m == NULL) {
			/*
			 * There may be swap even if there is no backing page
			 */
			if (advise == MADV_FREE && object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
			continue;
		}

		/*
		 * If the page is not in a normal active state, we skip it.
		 * If the page is not managed there are no page queues to
		 * mess with.  Things can break if we mess with pages in
		 * any of the below states.
		 */
		if (m->wire_count ||
		    (m->flags & (PG_FICTITIOUS | PG_UNQUEUED |
				 PG_NEED_COMMIT)) ||
		    m->valid != VM_PAGE_BITS_ALL
		) {
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Theoretically once a page is known not to be busy, an
		 * interrupt cannot come along and rip it out from under us.
		 */
		if (advise == MADV_WILLNEED) {
			vm_page_activate(m);
		} else if (advise == MADV_DONTNEED) {
			vm_page_dontneed(m);
		} else if (advise == MADV_FREE) {
			/*
			 * Mark the page clean.  This will allow the page
			 * to be freed up by the system.  However, such pages
			 * are often reused quickly by malloc()/free()
			 * so we do not do anything that would cause
			 * a page fault if we can help it.
			 *
			 * Specifically, we do not try to actually free
			 * the page now nor do we try to put it in the
			 * cache (which would cause a page fault on reuse).
			 *
			 * But we do make the page as freeable as we
			 * can without actually taking the step of unmapping
			 * it.
			 */
			pmap_clear_modify(m);
			m->dirty = 0;
			m->act_count = 0;
			vm_page_dontneed(m);
			if (object->type == OBJT_SWAP)
				swap_pager_freespace(object, pindex, 1);
		}
		vm_page_wakeup(m);
	}
	vm_object_drop(object);
}

/*
 * Removes all physical pages in the specified object range from the
 * object's list of pages.
 *
 * No requirements.
 */
static int vm_object_page_remove_callback(vm_page_t p, void *data);

void
vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
		      boolean_t clean_only)
{
	struct rb_vm_page_scan_info info;
	int all;

	/*
	 * Degenerate cases and assertions
	 */
	vm_object_hold(object);
	if (object == NULL ||
	    (object->resident_page_count == 0 && object->swblock_count == 0)) {
		vm_object_drop(object);
		return;
	}
	KASSERT(object->type != OBJT_PHYS,
		("attempt to remove pages from a physical object"));

	/*
	 * Indicate that paging is occurring on the object
	 */
	vm_object_pip_add(object, 1);

	/*
	 * Figure out the actual removal range and whether we are removing
	 * the entire contents of the object or not.  If removing the entire
	 * contents, be sure to get all pages, even those that might be
	 * beyond the end of the object.
	 */
	info.object = object;
	info.start_pindex = start;
	if (end == 0)
		info.end_pindex = (vm_pindex_t)-1;
	else
		info.end_pindex = end - 1;
	info.limit = clean_only;
	info.count = 0;
	all = (start == 0 && info.end_pindex >= object->size - 1);

	/*
	 * Loop until we are sure we have gotten them all.
	 */
	do {
		info.error = 0;
		vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
					vm_object_page_remove_callback, &info);
	} while (info.error);

	/*
	 * Remove any related swap if throwing away pages, or for
	 * non-swap objects (the swap is a clean copy in that case).
	 */
	if (object->type != OBJT_SWAP || clean_only == FALSE) {
		if (all)
			swap_pager_freespace_all(object);
		else
			swap_pager_freespace(object, info.start_pindex,
				info.end_pindex - info.start_pindex + 1);
	}

	/*
	 * Cleanup
	 */
	vm_object_pip_wakeup(object);
	vm_object_drop(object);
}

/*
 * The caller must hold the object.
 *
 * NOTE: User yields are allowed when removing more than one page, but not
 *	 allowed if only removing one page (the path for single page removals
 *	 might hold a spinlock).
 */
static int
vm_object_page_remove_callback(vm_page_t p, void *data)
{
	struct rb_vm_page_scan_info *info = data;

	if (info->object != p->object ||
	    p->pindex < info->start_pindex ||
	    p->pindex > info->end_pindex) {
		kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n",
			info->object, p);
		return(0);
	}
	if (vm_page_busy_try(p, TRUE)) {
		vm_page_sleep_busy(p, TRUE, "vmopar");
		info->error = 1;
		return(0);
	}
	if (info->object != p->object) {
		/* this should never happen */
		kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n",
			info->object, p);
		vm_page_wakeup(p);
		return(0);
	}

	/*
	 * Wired pages cannot be destroyed, but they can be invalidated
	 * and we do so if clean_only (limit) is not set.
	 *
	 * WARNING!  The page may be wired due to being part of a buffer
	 *	     cache buffer, and the buffer might be marked B_CACHE.
	 *	     This is fine as part of a truncation but VFSs must be
	 *	     sure to fix the buffer up when re-extending the file.
	 *
	 * NOTE!     PG_NEED_COMMIT is ignored.
	 */
	if (p->wire_count != 0) {
		vm_page_protect(p, VM_PROT_NONE);
		if (info->limit == 0)
			p->valid = 0;
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * limit is our clean_only flag.  If set and the page is dirty or
	 * requires a commit, do not free it.  If set and the page is being
	 * held by someone, do not free it.
	 */
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}

	/*
	 * Destroy the page.  But we have to re-test whether it's dirty after
	 * removing it from its pmaps.
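	 * (Removing the pmap entries can propagate modified-bit state from
	 * the pmaps into p->dirty, which is why the dirty test is repeated.)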
	 */
	vm_page_protect(p, VM_PROT_NONE);
	if (info->limit && p->valid) {
		vm_page_test_dirty(p);
		if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) {
			vm_page_wakeup(p);
			goto done;
		}
	}
	vm_page_free(p);

	/*
	 * Must be at end to avoid SMP races, caller holds object token
	 */
done:
	if ((++info->count & 63) == 0)
		lwkt_user_yield();

	return(0);
}

/*
 * Try to extend prev_object into an adjoining region of virtual
 * memory, return TRUE on success.
 *
 * The caller does not need to hold (prev_object) but must have a stable
 * pointer to it (typically by holding the vm_map locked).
 *
 * This function only works for anonymous memory objects which either
 * have (a) one reference or (b) we are extending the object's size.
 * Otherwise the related VM pages we want to use for the object might
 * be in use by another mapping.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex,
		   vm_size_t prev_size, vm_size_t next_size)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);

	vm_object_hold(prev_object);

	if (prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

#if 0
	/* caller now checks this */
	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object, NULL);
#endif

#if 0
	/* caller now checks this */
	/*
	 * We can't coalesce if we shadow another object (figuring out the
	 * relationships become too complex).
	 */
	if (prev_object->backing_object != NULL) {
		vm_object_chain_release(prev_object);
		vm_object_drop(prev_object);
		return (FALSE);
	}
#endif

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = prev_pindex + prev_size;

	/*
	 * We can't if the object has more than one ref count unless we
	 * are extending it into newly minted space.
	 */
	if (prev_object->ref_count > 1 &&
	    prev_object->size != next_pindex) {
		vm_object_drop(prev_object);
		return (FALSE);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object,
				      next_pindex,
				      next_pindex + next_size, FALSE);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
	}

	/*
	 * Extend the object if necessary.
	 */
	if (next_pindex + next_size > prev_object->size)
		prev_object->size = next_pindex + next_size;
	vm_object_drop(prev_object);

	return (TRUE);
}

/*
 * Make the object writable and flag it as being possibly dirty.
 *
 * The object might not be held (or might be held but held shared),
 * the related vnode is probably not held either.  Object and vnode are
 * stable by virtue of the vm_page busied by the caller preventing
 * destruction.
 *
 * If the related mount is flagged MNTK_THR_SYNC we need to call
 * vsetobjdirty().  Filesystems using this option usually shortcut
 * synchronization by only scanning the syncer list.
 */
void
vm_object_set_writeable_dirty(vm_object_t object)
{
	struct vnode *vp;

	/*vm_object_assert_held(object);*/
	/*
	 * Avoid contention in vm fault path by checking the state before
	 * issuing an atomic op on it.
	 */
	if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
	    (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
		vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
	}
	if (object->type == OBJT_VNODE &&
	    (vp = (struct vnode *)object->handle) != NULL) {
		if ((vp->v_flag & VOBJDIRTY) == 0) {
			if (vp->v_mount &&
			    (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) {
				/*
				 * New style THR_SYNC places vnodes on the
				 * syncer list more deterministically.
				 */
				vsetobjdirty(vp);
			} else {
				/*
				 * Old style scan would not necessarily place
				 * a vnode on the syncer list when possibly
				 * modified via mmap.
				 */
				vsetflags(vp, VOBJDIRTY);
			}
		}
	}
}

#include "opt_ddb.h"
#ifdef DDB
#include <sys/cons.h>

#include <ddb/ddb.h>

static int	_vm_object_in_map (vm_map_t map, vm_object_t object,
				   vm_map_entry_t entry);
static int	vm_object_in_map (vm_object_t object);

/*
 * The caller must hold the object.
 */
static int
_vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
{
	vm_map_backing_t ba;
	vm_map_t tmpm;
	vm_map_entry_t tmpe;
	int entcount;

	if (map == NULL)
		return 0;
	if (entry == NULL) {
		tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root);
		entcount = map->nentries;
		while (entcount-- && tmpe) {
			if (_vm_object_in_map(map, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		return (0);
	}
	switch(entry->maptype) {
	case VM_MAPTYPE_SUBMAP:
		tmpm = entry->ba.sub_map;
		tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root);
		entcount = tmpm->nentries;
		while (entcount-- && tmpe) {
			if (_vm_object_in_map(tmpm, object, tmpe)) {
				return 1;
			}
			tmpe = vm_map_rb_tree_RB_NEXT(tmpe);
		}
		break;
	case VM_MAPTYPE_NORMAL:
	case VM_MAPTYPE_VPAGETABLE:
		ba = &entry->ba;
		while (ba) {
			if (ba->object == object)
				return TRUE;
			ba = ba->backing_ba;
		}
		break;
	default:
		break;
	}
	return 0;
}

static int vm_object_in_map_callback(struct proc *p, void *data);

struct vm_object_in_map_info {
	vm_object_t object;
	int rv;
};

/*
 * Debugging only
 */
static int
vm_object_in_map(vm_object_t object)
{
	struct vm_object_in_map_info info;

	info.rv = 0;
	info.object = object;

	allproc_scan(vm_object_in_map_callback, &info, 0);
	if (info.rv)
		return 1;
	if (_vm_object_in_map(&kernel_map, object, 0))
		return 1;
	if (_vm_object_in_map(&pager_map, object, 0))
		return 1;
	if (_vm_object_in_map(&buffer_map, object, 0))
		return 1;
	return 0;
}

/*
 * Debugging only
 */
static int
vm_object_in_map_callback(struct proc *p, void *data)
{
	struct vm_object_in_map_info *info = data;

	if (p->p_vmspace) {
		if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) {
			info->rv = 1;
			return -1;
		}
	}
	return (0);
}

DB_SHOW_COMMAND(vmochk, vm_object_check)
{
	struct vm_object_hash *hash;
	vm_object_t object;
	int n;

	/*
	 * make sure that internal objs are in a map somewhere
	 * and none have zero ref counts.
	 */
	for (n = 0; n < VMOBJ_HSIZE; ++n) {
		hash = &vm_object_hash[n];
		for (object = TAILQ_FIRST(&hash->list);
				object != NULL;
				object = TAILQ_NEXT(object, object_entry)) {
			if (object->type == OBJT_MARKER)
				continue;
			if (object->handle != NULL ||
			    (object->type != OBJT_DEFAULT &&
			     object->type != OBJT_SWAP)) {
				continue;
			}
			if (object->ref_count == 0) {
				db_printf("vmochk: internal obj has "
					  "zero ref count: %ld\n",
					  (long)object->size);
			}
			if (vm_object_in_map(object))
				continue;
			db_printf("vmochk: internal obj is not in a map: "
				  "ref: %d, size: %lu: 0x%lx\n",
				  object->ref_count, (u_long)object->size,
				  (u_long)object->size);
		}
	}
}

/*
 * Debugging only
 */
DB_SHOW_COMMAND(object, vm_object_print_static)
{
	/* XXX convert args. */
	vm_object_t object = (vm_object_t)addr;
	boolean_t full = have_addr;

	vm_page_t p;

	/* XXX count is an (unused) arg.  Avoid shadowing it. */
#define	count	was_count

	int count;

	if (object == NULL)
		return;

	db_iprintf(
	    "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n",
	    object, (int)object->type, (u_long)object->size,
	    object->resident_page_count, object->ref_count, object->flags);
	/*
	 * XXX no %qd in kernel.  Truncate object->backing_object_offset.
	 */
	db_iprintf("\n");

	if (!full)
		return;

	db_indent += 2;
	count = 0;
	RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) {
		if (count == 0)
			db_iprintf("memory:=");
		else if (count == 6) {
			db_printf("\n");
			db_iprintf(" ...");
			count = 0;
		} else
			db_printf(",");
		count++;

		db_printf("(off=0x%lx,page=0x%lx)",
			  (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p));
	}
	if (count != 0)
		db_printf("\n");
	db_indent -= 2;
}

/* XXX. */
#undef count

/*
 * XXX need this non-static entry for calling from vm_map_print.
1837 * 1838 * Debugging only 1839 */ 1840 void 1841 vm_object_print(/* db_expr_t */ long addr, 1842 boolean_t have_addr, 1843 /* db_expr_t */ long count, 1844 char *modif) 1845 { 1846 vm_object_print_static(addr, have_addr, count, modif); 1847 } 1848 1849 /* 1850 * Debugging only 1851 */ 1852 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1853 { 1854 struct vm_object_hash *hash; 1855 vm_object_t object; 1856 int nl = 0; 1857 int c; 1858 int n; 1859 1860 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1861 hash = &vm_object_hash[n]; 1862 for (object = TAILQ_FIRST(&hash->list); 1863 object != NULL; 1864 object = TAILQ_NEXT(object, object_entry)) { 1865 vm_pindex_t idx, fidx; 1866 vm_pindex_t osize; 1867 vm_paddr_t pa = -1, padiff; 1868 int rcount; 1869 vm_page_t m; 1870 1871 if (object->type == OBJT_MARKER) 1872 continue; 1873 db_printf("new object: %p\n", (void *)object); 1874 if ( nl > 18) { 1875 c = cngetc(); 1876 if (c != ' ') 1877 return; 1878 nl = 0; 1879 } 1880 nl++; 1881 rcount = 0; 1882 fidx = 0; 1883 osize = object->size; 1884 if (osize > 128) 1885 osize = 128; 1886 for (idx = 0; idx < osize; idx++) { 1887 m = vm_page_lookup(object, idx); 1888 if (m == NULL) { 1889 if (rcount) { 1890 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1891 (long)fidx, rcount, (long)pa); 1892 if ( nl > 18) { 1893 c = cngetc(); 1894 if (c != ' ') 1895 return; 1896 nl = 0; 1897 } 1898 nl++; 1899 rcount = 0; 1900 } 1901 continue; 1902 } 1903 1904 if (rcount && 1905 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1906 ++rcount; 1907 continue; 1908 } 1909 if (rcount) { 1910 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1911 padiff >>= PAGE_SHIFT; 1912 padiff &= PQ_L2_MASK; 1913 if (padiff == 0) { 1914 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1915 ++rcount; 1916 continue; 1917 } 1918 db_printf(" index(%ld)run(%d)pa(0x%lx)", 1919 (long)fidx, rcount, (long)pa); 1920 db_printf("pd(%ld)\n", (long)padiff); 1921 if ( nl > 18) { 1922 c = cngetc(); 1923 if (c != ' ') 1924 return; 1925 nl = 0; 1926 } 1927 nl++; 1928 } 1929 fidx = idx; 1930 pa = VM_PAGE_TO_PHYS(m); 1931 rcount = 1; 1932 } 1933 if (rcount) { 1934 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1935 (long)fidx, rcount, (long)pa); 1936 if ( nl > 18) { 1937 c = cngetc(); 1938 if (c != ' ') 1939 return; 1940 nl = 0; 1941 } 1942 nl++; 1943 } 1944 } 1945 } 1946 } 1947 #endif /* DDB */ 1948