1 /* 2 * Copyright (c) 1991, 1993, 2013 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * The Mach Operating System project at Carnegie-Mellon University. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 33 * 34 * 35 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 36 * All rights reserved. 37 * 38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 39 * 40 * Permission to use, copy, modify and distribute this software and 41 * its documentation is hereby granted, provided that both the copyright 42 * notice and this permission notice appear in all copies of the 43 * software, derivative works or modified versions, and any portions 44 * thereof, and that both notices appear in supporting documentation. 45 * 46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 49 * 50 * Carnegie Mellon requests users of this software to return to 51 * 52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 53 * School of Computer Science 54 * Carnegie Mellon University 55 * Pittsburgh PA 15213-3890 56 * 57 * any improvements or extensions that they make and grant Carnegie the 58 * rights to redistribute these changes. 59 * 60 * $FreeBSD: src/sys/vm/vm_object.c,v 1.171.2.8 2003/05/26 19:17:56 alc Exp $ 61 */ 62 63 /* 64 * Virtual memory object module. 65 */ 66 67 #include <sys/param.h> 68 #include <sys/systm.h> 69 #include <sys/proc.h> /* for curproc, pageproc */ 70 #include <sys/thread.h> 71 #include <sys/vnode.h> 72 #include <sys/vmmeter.h> 73 #include <sys/mman.h> 74 #include <sys/mount.h> 75 #include <sys/kernel.h> 76 #include <sys/sysctl.h> 77 #include <sys/refcount.h> 78 79 #include <vm/vm.h> 80 #include <vm/vm_param.h> 81 #include <vm/pmap.h> 82 #include <vm/vm_map.h> 83 #include <vm/vm_object.h> 84 #include <vm/vm_page.h> 85 #include <vm/vm_pageout.h> 86 #include <vm/vm_pager.h> 87 #include <vm/swap_pager.h> 88 #include <vm/vm_kern.h> 89 #include <vm/vm_extern.h> 90 #include <vm/vm_zone.h> 91 92 #include <vm/vm_page2.h> 93 94 #include <machine/specialreg.h> 95 96 #define EASY_SCAN_FACTOR 8 97 98 static void vm_object_page_collect_flush(vm_object_t object, vm_page_t p, 99 int pagerflags); 100 static void vm_object_lock_init(vm_object_t); 101 102 /* 103 * Virtual memory objects maintain the actual data 104 * associated with allocated virtual memory. A given 105 * page of memory exists within exactly one object. 106 * 107 * An object is only deallocated when all "references" 108 * are given up. Only one "reference" to a given 109 * region of an object should be writeable. 110 * 111 * Associated with each object is a list of all resident 112 * memory pages belonging to that object; this list is 113 * maintained by the "vm_page" module, and locked by the object's 114 * lock. 115 * 116 * Each object also records a "pager" routine which is 117 * used to retrieve (and store) pages to the proper backing 118 * storage. In addition, objects may be backed by other 119 * objects from which they were virtual-copied. 120 * 121 * The only items within the object structure which are 122 * modified after time of creation are: 123 * reference count locked by object's lock 124 * pager routine locked by object's lock 125 * 126 */ 127 128 struct vm_object kernel_object; 129 130 struct vm_object_hash vm_object_hash[VMOBJ_HSIZE]; 131 132 MALLOC_DEFINE(M_VM_OBJECT, "vm_object", "vm_object structures"); 133 134 #define VMOBJ_HASH_PRIME1 66555444443333333ULL 135 #define VMOBJ_HASH_PRIME2 989042931893ULL 136 137 int vm_object_debug; 138 SYSCTL_INT(_vm, OID_AUTO, object_debug, CTLFLAG_RW, &vm_object_debug, 0, ""); 139 140 static __inline 141 struct vm_object_hash * 142 vmobj_hash(vm_object_t obj) 143 { 144 uintptr_t hash1; 145 uintptr_t hash2; 146 147 hash1 = (uintptr_t)obj + ((uintptr_t)obj >> 18); 148 hash1 %= VMOBJ_HASH_PRIME1; 149 hash2 = ((uintptr_t)obj >> 8) + ((uintptr_t)obj >> 24); 150 hash2 %= VMOBJ_HASH_PRIME2; 151 return (&vm_object_hash[(hash1 ^ hash2) & VMOBJ_HMASK]); 152 } 153 154 #if defined(DEBUG_LOCKS) 155 156 #define vm_object_vndeallocate(obj, vpp) \ 157 debugvm_object_vndeallocate(obj, vpp, __FILE__, __LINE__) 158 159 /* 160 * Debug helper to track hold/drop/ref/deallocate calls. 161 */ 162 static void 163 debugvm_object_add(vm_object_t obj, char *file, int line, int addrem) 164 { 165 int i; 166 167 i = atomic_fetchadd_int(&obj->debug_index, 1); 168 i = i & (VMOBJ_DEBUG_ARRAY_SIZE - 1); 169 ksnprintf(obj->debug_hold_thrs[i], 170 sizeof(obj->debug_hold_thrs[i]), 171 "%c%d:(%d):%s", 172 (addrem == -1 ? '-' : (addrem == 1 ? '+' : '=')), 173 (curthread->td_proc ? curthread->td_proc->p_pid : -1), 174 obj->ref_count, 175 curthread->td_comm); 176 obj->debug_hold_file[i] = file; 177 obj->debug_hold_line[i] = line; 178 #if 0 179 /* Uncomment for debugging obj refs/derefs in reproducable cases */ 180 if (strcmp(curthread->td_comm, "sshd") == 0) { 181 kprintf("%d %p refs=%d ar=%d file: %s/%d\n", 182 (curthread->td_proc ? curthread->td_proc->p_pid : -1), 183 obj, obj->ref_count, addrem, file, line); 184 } 185 #endif 186 } 187 188 #endif 189 190 /* 191 * Misc low level routines 192 */ 193 static void 194 vm_object_lock_init(vm_object_t obj) 195 { 196 #if defined(DEBUG_LOCKS) 197 int i; 198 199 obj->debug_index = 0; 200 for (i = 0; i < VMOBJ_DEBUG_ARRAY_SIZE; i++) { 201 obj->debug_hold_thrs[i][0] = 0; 202 obj->debug_hold_file[i] = NULL; 203 obj->debug_hold_line[i] = 0; 204 } 205 #endif 206 } 207 208 void 209 vm_object_lock_swap(void) 210 { 211 lwkt_token_swap(); 212 } 213 214 void 215 vm_object_lock(vm_object_t obj) 216 { 217 lwkt_gettoken(&obj->token); 218 } 219 220 /* 221 * Returns TRUE on sucesss 222 */ 223 static int 224 vm_object_lock_try(vm_object_t obj) 225 { 226 return(lwkt_trytoken(&obj->token)); 227 } 228 229 void 230 vm_object_lock_shared(vm_object_t obj) 231 { 232 lwkt_gettoken_shared(&obj->token); 233 } 234 235 void 236 vm_object_unlock(vm_object_t obj) 237 { 238 lwkt_reltoken(&obj->token); 239 } 240 241 void 242 vm_object_upgrade(vm_object_t obj) 243 { 244 lwkt_reltoken(&obj->token); 245 lwkt_gettoken(&obj->token); 246 } 247 248 void 249 vm_object_downgrade(vm_object_t obj) 250 { 251 lwkt_reltoken(&obj->token); 252 lwkt_gettoken_shared(&obj->token); 253 } 254 255 static __inline void 256 vm_object_assert_held(vm_object_t obj) 257 { 258 ASSERT_LWKT_TOKEN_HELD(&obj->token); 259 } 260 261 int 262 vm_quickcolor(void) 263 { 264 globaldata_t gd = mycpu; 265 int pg_color; 266 267 pg_color = (int)(intptr_t)gd->gd_curthread >> 10; 268 pg_color += gd->gd_quick_color; 269 gd->gd_quick_color += PQ_PRIME2; 270 271 return pg_color; 272 } 273 274 void 275 VMOBJDEBUG(vm_object_hold)(vm_object_t obj VMOBJDBARGS) 276 { 277 KKASSERT(obj != NULL); 278 279 /* 280 * Object must be held (object allocation is stable due to callers 281 * context, typically already holding the token on a parent object) 282 * prior to potentially blocking on the lock, otherwise the object 283 * can get ripped away from us. 284 */ 285 refcount_acquire(&obj->hold_count); 286 vm_object_lock(obj); 287 288 #if defined(DEBUG_LOCKS) 289 debugvm_object_add(obj, file, line, 1); 290 #endif 291 } 292 293 int 294 VMOBJDEBUG(vm_object_hold_try)(vm_object_t obj VMOBJDBARGS) 295 { 296 KKASSERT(obj != NULL); 297 298 /* 299 * Object must be held (object allocation is stable due to callers 300 * context, typically already holding the token on a parent object) 301 * prior to potentially blocking on the lock, otherwise the object 302 * can get ripped away from us. 303 */ 304 refcount_acquire(&obj->hold_count); 305 if (vm_object_lock_try(obj) == 0) { 306 if (refcount_release(&obj->hold_count)) { 307 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) 308 kfree(obj, M_VM_OBJECT); 309 } 310 return(0); 311 } 312 313 #if defined(DEBUG_LOCKS) 314 debugvm_object_add(obj, file, line, 1); 315 #endif 316 return(1); 317 } 318 319 void 320 VMOBJDEBUG(vm_object_hold_shared)(vm_object_t obj VMOBJDBARGS) 321 { 322 KKASSERT(obj != NULL); 323 324 /* 325 * Object must be held (object allocation is stable due to callers 326 * context, typically already holding the token on a parent object) 327 * prior to potentially blocking on the lock, otherwise the object 328 * can get ripped away from us. 329 */ 330 refcount_acquire(&obj->hold_count); 331 vm_object_lock_shared(obj); 332 333 #if defined(DEBUG_LOCKS) 334 debugvm_object_add(obj, file, line, 1); 335 #endif 336 } 337 338 /* 339 * Drop the token and hold_count on the object. 340 * 341 * WARNING! Token might be shared. 342 */ 343 void 344 VMOBJDEBUG(vm_object_drop)(vm_object_t obj VMOBJDBARGS) 345 { 346 if (obj == NULL) 347 return; 348 349 /* 350 * No new holders should be possible once we drop hold_count 1->0 as 351 * there is no longer any way to reference the object. 352 */ 353 KKASSERT(obj->hold_count > 0); 354 if (refcount_release(&obj->hold_count)) { 355 #if defined(DEBUG_LOCKS) 356 debugvm_object_add(obj, file, line, -1); 357 #endif 358 359 if (obj->ref_count == 0 && (obj->flags & OBJ_DEAD)) { 360 vm_object_unlock(obj); 361 kfree(obj, M_VM_OBJECT); 362 } else { 363 vm_object_unlock(obj); 364 } 365 } else { 366 #if defined(DEBUG_LOCKS) 367 debugvm_object_add(obj, file, line, -1); 368 #endif 369 vm_object_unlock(obj); 370 } 371 } 372 373 /* 374 * Initialize a freshly allocated object, returning a held object. 375 * 376 * Used only by vm_object_allocate(), zinitna() and vm_object_init(). 377 * 378 * No requirements. 379 */ 380 void 381 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) 382 { 383 struct vm_object_hash *hash; 384 385 RB_INIT(&object->rb_memq); 386 lwkt_token_init(&object->token, "vmobj"); 387 388 TAILQ_INIT(&object->backing_list); 389 lockinit(&object->backing_lk, "baclk", 0, 0); 390 391 object->type = type; 392 object->size = size; 393 object->ref_count = 1; 394 object->memattr = VM_MEMATTR_DEFAULT; 395 object->hold_count = 0; 396 object->flags = 0; 397 if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) 398 vm_object_set_flag(object, OBJ_ONEMAPPING); 399 object->paging_in_progress = 0; 400 object->resident_page_count = 0; 401 /* cpu localization twist */ 402 object->pg_color = vm_quickcolor(); 403 object->handle = NULL; 404 405 atomic_add_int(&object->generation, 1); 406 object->swblock_count = 0; 407 RB_INIT(&object->swblock_root); 408 vm_object_lock_init(object); 409 pmap_object_init(object); 410 411 vm_object_hold(object); 412 413 hash = vmobj_hash(object); 414 lwkt_gettoken(&hash->token); 415 TAILQ_INSERT_TAIL(&hash->list, object, object_entry); 416 lwkt_reltoken(&hash->token); 417 } 418 419 /* 420 * Initialize a VM object. 421 */ 422 void 423 vm_object_init(vm_object_t object, vm_pindex_t size) 424 { 425 _vm_object_allocate(OBJT_DEFAULT, size, object); 426 vm_object_drop(object); 427 } 428 429 /* 430 * Initialize the VM objects module. 431 * 432 * Called from the low level boot code only. Note that this occurs before 433 * kmalloc is initialized so we cannot allocate any VM objects. 434 */ 435 void 436 vm_object_init1(void) 437 { 438 int i; 439 440 for (i = 0; i < VMOBJ_HSIZE; ++i) { 441 TAILQ_INIT(&vm_object_hash[i].list); 442 lwkt_token_init(&vm_object_hash[i].token, "vmobjlst"); 443 } 444 445 _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(KvaEnd), 446 &kernel_object); 447 vm_object_drop(&kernel_object); 448 } 449 450 void 451 vm_object_init2(void) 452 { 453 kmalloc_set_unlimited(M_VM_OBJECT); 454 } 455 456 /* 457 * Allocate and return a new object of the specified type and size. 458 * 459 * No requirements. 460 */ 461 vm_object_t 462 vm_object_allocate(objtype_t type, vm_pindex_t size) 463 { 464 vm_object_t obj; 465 466 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 467 _vm_object_allocate(type, size, obj); 468 vm_object_drop(obj); 469 470 return (obj); 471 } 472 473 /* 474 * This version returns a held object, allowing further atomic initialization 475 * of the object. 476 */ 477 vm_object_t 478 vm_object_allocate_hold(objtype_t type, vm_pindex_t size) 479 { 480 vm_object_t obj; 481 482 obj = kmalloc(sizeof(*obj), M_VM_OBJECT, M_INTWAIT|M_ZERO); 483 _vm_object_allocate(type, size, obj); 484 485 return (obj); 486 } 487 488 /* 489 * Add an additional reference to a vm_object. The object must already be 490 * held. The original non-lock version is no longer supported. The object 491 * must NOT be chain locked by anyone at the time the reference is added. 492 * 493 * The object must be held, but may be held shared if desired (hence why 494 * we use an atomic op). 495 */ 496 void 497 VMOBJDEBUG(vm_object_reference_locked)(vm_object_t object VMOBJDBARGS) 498 { 499 KKASSERT(object != NULL); 500 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 501 atomic_add_int(&object->ref_count, 1); 502 if (object->type == OBJT_VNODE) { 503 vref(object->handle); 504 /* XXX what if the vnode is being destroyed? */ 505 } 506 #if defined(DEBUG_LOCKS) 507 debugvm_object_add(object, file, line, 1); 508 #endif 509 } 510 511 /* 512 * This version is only allowed in situations where the caller 513 * already knows that the object is deterministically referenced 514 * (usually because its taken from a ref'd vnode, or during a map_entry 515 * replication). 516 */ 517 void 518 VMOBJDEBUG(vm_object_reference_quick)(vm_object_t object VMOBJDBARGS) 519 { 520 KKASSERT(object->type == OBJT_VNODE || object->ref_count > 0); 521 atomic_add_int(&object->ref_count, 1); 522 if (object->type == OBJT_VNODE) 523 vref(object->handle); 524 #if defined(DEBUG_LOCKS) 525 debugvm_object_add(object, file, line, 1); 526 #endif 527 } 528 529 /* 530 * Dereference an object and its underlying vnode. The object may be 531 * held shared. On return the object will remain held. 532 * 533 * This function may return a vnode in *vpp which the caller must release 534 * after the caller drops its own lock. If vpp is NULL, we assume that 535 * the caller was holding an exclusive lock on the object and we vrele() 536 * the vp ourselves. 537 */ 538 static void 539 VMOBJDEBUG(vm_object_vndeallocate)(vm_object_t object, struct vnode **vpp 540 VMOBJDBARGS) 541 { 542 struct vnode *vp = (struct vnode *) object->handle; 543 544 KASSERT(object->type == OBJT_VNODE, 545 ("vm_object_vndeallocate: not a vnode object")); 546 KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); 547 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 548 #ifdef INVARIANTS 549 if (object->ref_count == 0) { 550 vprint("vm_object_vndeallocate", vp); 551 panic("vm_object_vndeallocate: bad object reference count"); 552 } 553 #endif 554 for (;;) { 555 int count = object->ref_count; 556 cpu_ccfence(); 557 if (count == 1) { 558 vm_object_upgrade(object); 559 if (atomic_cmpset_int(&object->ref_count, count, 0)) { 560 vclrflags(vp, VTEXT); 561 break; 562 } 563 } else { 564 if (atomic_cmpset_int(&object->ref_count, 565 count, count - 1)) { 566 break; 567 } 568 } 569 /* retry */ 570 } 571 #if defined(DEBUG_LOCKS) 572 debugvm_object_add(object, file, line, -1); 573 #endif 574 575 /* 576 * vrele or return the vp to vrele. We can only safely vrele(vp) 577 * if the object was locked exclusively. But there are two races 578 * here. 579 * 580 * We had to upgrade the object above to safely clear VTEXT 581 * but the alternative path where the shared lock is retained 582 * can STILL race to 0 in other paths and cause our own vrele() 583 * to terminate the vnode. We can't allow that if the VM object 584 * is still locked shared. 585 */ 586 if (vpp) 587 *vpp = vp; 588 else 589 vrele(vp); 590 } 591 592 /* 593 * Release a reference to the specified object, gained either through a 594 * vm_object_allocate or a vm_object_reference call. When all references 595 * are gone, storage associated with this object may be relinquished. 596 * 597 * The caller does not have to hold the object locked but must have control 598 * over the reference in question in order to guarantee that the object 599 * does not get ripped out from under us. 600 * 601 * XXX Currently all deallocations require an exclusive lock. 602 */ 603 void 604 VMOBJDEBUG(vm_object_deallocate)(vm_object_t object VMOBJDBARGS) 605 { 606 struct vnode *vp; 607 int count; 608 609 if (object == NULL) 610 return; 611 612 for (;;) { 613 count = object->ref_count; 614 cpu_ccfence(); 615 616 /* 617 * If decrementing the count enters into special handling 618 * territory (0, 1, or 2) we have to do it the hard way. 619 * Fortunate though, objects with only a few refs like this 620 * are not likely to be heavily contended anyway. 621 * 622 * For vnode objects we only care about 1->0 transitions. 623 */ 624 if (count <= 3 || (object->type == OBJT_VNODE && count <= 1)) { 625 #if defined(DEBUG_LOCKS) 626 debugvm_object_add(object, file, line, 0); 627 #endif 628 vm_object_hold(object); 629 vm_object_deallocate_locked(object); 630 vm_object_drop(object); 631 break; 632 } 633 634 /* 635 * Try to decrement ref_count without acquiring a hold on 636 * the object. This is particularly important for the exec*() 637 * and exit*() code paths because the program binary may 638 * have a great deal of sharing and an exclusive lock will 639 * crowbar performance in those circumstances. 640 */ 641 if (object->type == OBJT_VNODE) { 642 vp = (struct vnode *)object->handle; 643 if (atomic_cmpset_int(&object->ref_count, 644 count, count - 1)) { 645 #if defined(DEBUG_LOCKS) 646 debugvm_object_add(object, file, line, -1); 647 #endif 648 649 vrele(vp); 650 break; 651 } 652 /* retry */ 653 } else { 654 if (atomic_cmpset_int(&object->ref_count, 655 count, count - 1)) { 656 #if defined(DEBUG_LOCKS) 657 debugvm_object_add(object, file, line, -1); 658 #endif 659 break; 660 } 661 /* retry */ 662 } 663 /* retry */ 664 } 665 } 666 667 void 668 VMOBJDEBUG(vm_object_deallocate_locked)(vm_object_t object VMOBJDBARGS) 669 { 670 /* 671 * Degenerate case 672 */ 673 if (object == NULL) 674 return; 675 676 /* 677 * vnode case, caller either locked the object exclusively 678 * or this is a recursion with must_drop != 0 and the vnode 679 * object will be locked shared. 680 * 681 * If locked shared we have to drop the object before we can 682 * call vrele() or risk a shared/exclusive livelock. 683 */ 684 if (object->type == OBJT_VNODE) { 685 ASSERT_LWKT_TOKEN_HELD(&object->token); 686 vm_object_vndeallocate(object, NULL); 687 return; 688 } 689 ASSERT_LWKT_TOKEN_HELD_EXCL(&object->token); 690 691 /* 692 * Normal case (object is locked exclusively) 693 */ 694 if (object->ref_count == 0) { 695 panic("vm_object_deallocate: object deallocated " 696 "too many times: %d", object->type); 697 } 698 if (object->ref_count > 2) { 699 atomic_add_int(&object->ref_count, -1); 700 #if defined(DEBUG_LOCKS) 701 debugvm_object_add(object, file, line, -1); 702 #endif 703 return; 704 } 705 706 /* 707 * Drop the ref and handle termination on the 1->0 transition. 708 * We may have blocked above so we have to recheck. 709 */ 710 KKASSERT(object->ref_count != 0); 711 if (object->ref_count >= 2) { 712 atomic_add_int(&object->ref_count, -1); 713 #if defined(DEBUG_LOCKS) 714 debugvm_object_add(object, file, line, -1); 715 #endif 716 return; 717 } 718 719 atomic_add_int(&object->ref_count, -1); 720 if ((object->flags & OBJ_DEAD) == 0) 721 vm_object_terminate(object); 722 } 723 724 /* 725 * Destroy the specified object, freeing up related resources. 726 * 727 * The object must have zero references. 728 * 729 * The object must held. The caller is responsible for dropping the object 730 * after terminate returns. Terminate does NOT drop the object. 731 */ 732 static int vm_object_terminate_callback(vm_page_t p, void *data); 733 734 void 735 vm_object_terminate(vm_object_t object) 736 { 737 struct rb_vm_page_scan_info info; 738 struct vm_object_hash *hash; 739 740 /* 741 * Make sure no one uses us. Once we set OBJ_DEAD we should be 742 * able to safely block. 743 */ 744 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 745 KKASSERT((object->flags & OBJ_DEAD) == 0); 746 vm_object_set_flag(object, OBJ_DEAD); 747 748 /* 749 * Wait for the pageout daemon to be done with the object 750 */ 751 vm_object_pip_wait(object, "objtrm1"); 752 753 KASSERT(!object->paging_in_progress, 754 ("vm_object_terminate: pageout in progress")); 755 756 /* 757 * Clean and free the pages, as appropriate. All references to the 758 * object are gone, so we don't need to lock it. 759 */ 760 if (object->type == OBJT_VNODE) { 761 struct vnode *vp; 762 763 /* 764 * Clean pages and flush buffers. 765 * 766 * NOTE! TMPFS buffer flushes do not typically flush the 767 * actual page to swap as this would be highly 768 * inefficient, and normal filesystems usually wrap 769 * page flushes with buffer cache buffers. 770 * 771 * To deal with this we have to call vinvalbuf() both 772 * before and after the vm_object_page_clean(). 773 */ 774 vp = (struct vnode *) object->handle; 775 vinvalbuf(vp, V_SAVE, 0, 0); 776 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 777 vinvalbuf(vp, V_SAVE, 0, 0); 778 } 779 780 /* 781 * Wait for any I/O to complete, after which there had better not 782 * be any references left on the object. 783 */ 784 vm_object_pip_wait(object, "objtrm2"); 785 786 if (object->ref_count != 0) { 787 panic("vm_object_terminate: object with references, " 788 "ref_count=%d", object->ref_count); 789 } 790 791 /* 792 * Cleanup any shared pmaps associated with this object. 793 */ 794 pmap_object_free(object); 795 796 /* 797 * Now free any remaining pages. For internal objects, this also 798 * removes them from paging queues. Don't free wired pages, just 799 * remove them from the object. 800 */ 801 info.count = 0; 802 info.object = object; 803 do { 804 info.error = 0; 805 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 806 vm_object_terminate_callback, &info); 807 } while (info.error); 808 809 /* 810 * Let the pager know object is dead. 811 */ 812 vm_pager_deallocate(object); 813 814 /* 815 * Wait for the object hold count to hit 1, clean out pages as 816 * we go. vmobj_token interlocks any race conditions that might 817 * pick the object up from the vm_object_list after we have cleared 818 * rb_memq. 819 */ 820 for (;;) { 821 if (RB_ROOT(&object->rb_memq) == NULL) 822 break; 823 kprintf("vm_object_terminate: Warning, object %p " 824 "still has %ld pages\n", 825 object, object->resident_page_count); 826 vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL, 827 vm_object_terminate_callback, &info); 828 } 829 830 /* 831 * There had better not be any pages left 832 */ 833 KKASSERT(object->resident_page_count == 0); 834 835 /* 836 * Remove the object from the global object list. 837 */ 838 hash = vmobj_hash(object); 839 lwkt_gettoken(&hash->token); 840 TAILQ_REMOVE(&hash->list, object, object_entry); 841 lwkt_reltoken(&hash->token); 842 843 if (object->ref_count != 0) { 844 panic("vm_object_terminate2: object with references, " 845 "ref_count=%d", object->ref_count); 846 } 847 848 /* 849 * NOTE: The object hold_count is at least 1, so we cannot kfree() 850 * the object here. See vm_object_drop(). 851 */ 852 } 853 854 /* 855 * The caller must hold the object. 856 */ 857 static int 858 vm_object_terminate_callback(vm_page_t p, void *data) 859 { 860 struct rb_vm_page_scan_info *info = data; 861 vm_object_t object; 862 863 object = p->object; 864 KKASSERT(object == info->object); 865 if (vm_page_busy_try(p, TRUE)) { 866 vm_page_sleep_busy(p, TRUE, "vmotrm"); 867 info->error = 1; 868 return 0; 869 } 870 if (object != p->object) { 871 /* XXX remove once we determine it can't happen */ 872 kprintf("vm_object_terminate: Warning: Encountered " 873 "busied page %p on queue %d\n", p, p->queue); 874 vm_page_wakeup(p); 875 info->error = 1; 876 } else if (p->wire_count == 0) { 877 /* 878 * NOTE: p->dirty and PG_NEED_COMMIT are ignored. 879 */ 880 vm_page_free(p); 881 mycpu->gd_cnt.v_pfree++; 882 } else { 883 if (p->queue != PQ_NONE) { 884 kprintf("vm_object_terminate: Warning: Encountered " 885 "wired page %p on queue %d\n", p, p->queue); 886 if (vm_object_debug > 0) { 887 --vm_object_debug; 888 print_backtrace(10); 889 } 890 } 891 vm_page_remove(p); 892 vm_page_wakeup(p); 893 } 894 895 /* 896 * Must be at end to avoid SMP races, caller holds object token 897 */ 898 if ((++info->count & 63) == 0) 899 lwkt_user_yield(); 900 return(0); 901 } 902 903 /* 904 * Clean all dirty pages in the specified range of object. Leaves page 905 * on whatever queue it is currently on. If NOSYNC is set then do not 906 * write out pages with PG_NOSYNC set (originally comes from MAP_NOSYNC), 907 * leaving the object dirty. 908 * 909 * When stuffing pages asynchronously, allow clustering. XXX we need a 910 * synchronous clustering mode implementation. 911 * 912 * Odd semantics: if start == end, we clean everything. 913 * 914 * The object must be locked? XXX 915 */ 916 static int vm_object_page_clean_pass1(struct vm_page *p, void *data); 917 static int vm_object_page_clean_pass2(struct vm_page *p, void *data); 918 919 void 920 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 921 int flags) 922 { 923 struct rb_vm_page_scan_info info; 924 struct vnode *vp; 925 int wholescan; 926 int pagerflags; 927 int generation; 928 929 vm_object_hold(object); 930 if (object->type != OBJT_VNODE || 931 (object->flags & OBJ_MIGHTBEDIRTY) == 0) { 932 vm_object_drop(object); 933 return; 934 } 935 936 pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? 937 VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; 938 pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; 939 940 vp = object->handle; 941 942 /* 943 * Interlock other major object operations. This allows us to 944 * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY. 945 */ 946 vm_object_set_flag(object, OBJ_CLEANING); 947 948 /* 949 * Handle 'entire object' case 950 */ 951 info.start_pindex = start; 952 if (end == 0) { 953 info.end_pindex = object->size - 1; 954 } else { 955 info.end_pindex = end - 1; 956 } 957 wholescan = (start == 0 && info.end_pindex == object->size - 1); 958 info.limit = flags; 959 info.pagerflags = pagerflags; 960 info.object = object; 961 962 /* 963 * If cleaning the entire object do a pass to mark the pages read-only. 964 * If everything worked out ok, clear OBJ_WRITEABLE and 965 * OBJ_MIGHTBEDIRTY. 966 */ 967 if (wholescan) { 968 info.error = 0; 969 info.count = 0; 970 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 971 vm_object_page_clean_pass1, &info); 972 if (info.error == 0) { 973 vm_object_clear_flag(object, 974 OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 975 if (object->type == OBJT_VNODE && 976 (vp = (struct vnode *)object->handle) != NULL) { 977 /* 978 * Use new-style interface to clear VISDIRTY 979 * because the vnode is not necessarily removed 980 * from the syncer list(s) as often as it was 981 * under the old interface, which can leave 982 * the vnode on the syncer list after reclaim. 983 */ 984 vclrobjdirty(vp); 985 } 986 } 987 } 988 989 /* 990 * Do a pass to clean all the dirty pages we find. 991 */ 992 do { 993 info.error = 0; 994 info.count = 0; 995 generation = object->generation; 996 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 997 vm_object_page_clean_pass2, &info); 998 } while (info.error || generation != object->generation); 999 1000 vm_object_clear_flag(object, OBJ_CLEANING); 1001 vm_object_drop(object); 1002 } 1003 1004 /* 1005 * The caller must hold the object. 1006 */ 1007 static 1008 int 1009 vm_object_page_clean_pass1(struct vm_page *p, void *data) 1010 { 1011 struct rb_vm_page_scan_info *info = data; 1012 1013 KKASSERT(p->object == info->object); 1014 1015 vm_page_flag_set(p, PG_CLEANCHK); 1016 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1017 info->error = 1; 1018 } else if (vm_page_busy_try(p, FALSE)) { 1019 info->error = 1; 1020 } else { 1021 KKASSERT(p->object == info->object); 1022 vm_page_protect(p, VM_PROT_READ); 1023 vm_page_wakeup(p); 1024 } 1025 1026 /* 1027 * Must be at end to avoid SMP races, caller holds object token 1028 */ 1029 if ((++info->count & 63) == 0) 1030 lwkt_user_yield(); 1031 return(0); 1032 } 1033 1034 /* 1035 * The caller must hold the object 1036 */ 1037 static 1038 int 1039 vm_object_page_clean_pass2(struct vm_page *p, void *data) 1040 { 1041 struct rb_vm_page_scan_info *info = data; 1042 int generation; 1043 1044 KKASSERT(p->object == info->object); 1045 1046 /* 1047 * Do not mess with pages that were inserted after we started 1048 * the cleaning pass. 1049 */ 1050 if ((p->flags & PG_CLEANCHK) == 0) 1051 goto done; 1052 1053 generation = info->object->generation; 1054 1055 if (vm_page_busy_try(p, TRUE)) { 1056 vm_page_sleep_busy(p, TRUE, "vpcwai"); 1057 info->error = 1; 1058 goto done; 1059 } 1060 1061 KKASSERT(p->object == info->object && 1062 info->object->generation == generation); 1063 1064 /* 1065 * Before wasting time traversing the pmaps, check for trivial 1066 * cases where the page cannot be dirty. 1067 */ 1068 if (p->valid == 0 || (p->queue - p->pc) == PQ_CACHE) { 1069 KKASSERT((p->dirty & p->valid) == 0 && 1070 (p->flags & PG_NEED_COMMIT) == 0); 1071 vm_page_wakeup(p); 1072 goto done; 1073 } 1074 1075 /* 1076 * Check whether the page is dirty or not. The page has been set 1077 * to be read-only so the check will not race a user dirtying the 1078 * page. 1079 */ 1080 vm_page_test_dirty(p); 1081 if ((p->dirty & p->valid) == 0 && (p->flags & PG_NEED_COMMIT) == 0) { 1082 vm_page_flag_clear(p, PG_CLEANCHK); 1083 vm_page_wakeup(p); 1084 goto done; 1085 } 1086 1087 /* 1088 * If we have been asked to skip nosync pages and this is a 1089 * nosync page, skip it. Note that the object flags were 1090 * not cleared in this case (because pass1 will have returned an 1091 * error), so we do not have to set them. 1092 */ 1093 if ((info->limit & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) { 1094 vm_page_flag_clear(p, PG_CLEANCHK); 1095 vm_page_wakeup(p); 1096 goto done; 1097 } 1098 1099 /* 1100 * Flush as many pages as we can. PG_CLEANCHK will be cleared on 1101 * the pages that get successfully flushed. Set info->error if 1102 * we raced an object modification. 1103 */ 1104 vm_object_page_collect_flush(info->object, p, info->pagerflags); 1105 /* vm_wait_nominal(); this can deadlock the system in syncer/pageout */ 1106 1107 /* 1108 * Must be at end to avoid SMP races, caller holds object token 1109 */ 1110 done: 1111 if ((++info->count & 63) == 0) 1112 lwkt_user_yield(); 1113 return(0); 1114 } 1115 1116 /* 1117 * Collect the specified page and nearby pages and flush them out. 1118 * The number of pages flushed is returned. The passed page is busied 1119 * by the caller and we are responsible for its disposition. 1120 * 1121 * The caller must hold the object. 1122 */ 1123 static void 1124 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags) 1125 { 1126 int error; 1127 int is; 1128 int ib; 1129 int i; 1130 int page_base; 1131 vm_pindex_t pi; 1132 vm_page_t ma[BLIST_MAX_ALLOC]; 1133 1134 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 1135 1136 pi = p->pindex; 1137 page_base = pi % BLIST_MAX_ALLOC; 1138 ma[page_base] = p; 1139 ib = page_base - 1; 1140 is = page_base + 1; 1141 1142 while (ib >= 0) { 1143 vm_page_t tp; 1144 1145 tp = vm_page_lookup_busy_try(object, pi - page_base + ib, 1146 TRUE, &error); 1147 if (error) 1148 break; 1149 if (tp == NULL) 1150 break; 1151 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1152 (tp->flags & PG_CLEANCHK) == 0) { 1153 vm_page_wakeup(tp); 1154 break; 1155 } 1156 if ((tp->queue - tp->pc) == PQ_CACHE) { 1157 vm_page_flag_clear(tp, PG_CLEANCHK); 1158 vm_page_wakeup(tp); 1159 break; 1160 } 1161 vm_page_test_dirty(tp); 1162 if ((tp->dirty & tp->valid) == 0 && 1163 (tp->flags & PG_NEED_COMMIT) == 0) { 1164 vm_page_flag_clear(tp, PG_CLEANCHK); 1165 vm_page_wakeup(tp); 1166 break; 1167 } 1168 ma[ib] = tp; 1169 --ib; 1170 } 1171 ++ib; /* fixup */ 1172 1173 while (is < BLIST_MAX_ALLOC && 1174 pi - page_base + is < object->size) { 1175 vm_page_t tp; 1176 1177 tp = vm_page_lookup_busy_try(object, pi - page_base + is, 1178 TRUE, &error); 1179 if (error) 1180 break; 1181 if (tp == NULL) 1182 break; 1183 if ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && 1184 (tp->flags & PG_CLEANCHK) == 0) { 1185 vm_page_wakeup(tp); 1186 break; 1187 } 1188 if ((tp->queue - tp->pc) == PQ_CACHE) { 1189 vm_page_flag_clear(tp, PG_CLEANCHK); 1190 vm_page_wakeup(tp); 1191 break; 1192 } 1193 vm_page_test_dirty(tp); 1194 if ((tp->dirty & tp->valid) == 0 && 1195 (tp->flags & PG_NEED_COMMIT) == 0) { 1196 vm_page_flag_clear(tp, PG_CLEANCHK); 1197 vm_page_wakeup(tp); 1198 break; 1199 } 1200 ma[is] = tp; 1201 ++is; 1202 } 1203 1204 /* 1205 * All pages in the ma[] array are busied now 1206 */ 1207 for (i = ib; i < is; ++i) { 1208 vm_page_flag_clear(ma[i], PG_CLEANCHK); 1209 vm_page_hold(ma[i]); /* XXX need this any more? */ 1210 } 1211 vm_pageout_flush(&ma[ib], is - ib, pagerflags); 1212 for (i = ib; i < is; ++i) /* XXX need this any more? */ 1213 vm_page_unhold(ma[i]); 1214 } 1215 1216 /* 1217 * Implements the madvise function at the object/page level. 1218 * 1219 * MADV_WILLNEED (any object) 1220 * 1221 * Activate the specified pages if they are resident. 1222 * 1223 * MADV_DONTNEED (any object) 1224 * 1225 * Deactivate the specified pages if they are resident. 1226 * 1227 * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, OBJ_ONEMAPPING only) 1228 * 1229 * Deactivate and clean the specified pages if they are 1230 * resident. This permits the process to reuse the pages 1231 * without faulting or the kernel to reclaim the pages 1232 * without I/O. 1233 * 1234 * No requirements. 1235 */ 1236 void 1237 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, 1238 vm_pindex_t count, int advise) 1239 { 1240 vm_pindex_t end; 1241 vm_page_t m; 1242 int error; 1243 1244 if (object == NULL) 1245 return; 1246 1247 end = pindex + count; 1248 1249 vm_object_hold(object); 1250 1251 /* 1252 * Locate and adjust resident pages. This only applies to the 1253 * primary object in the mapping. 1254 */ 1255 for (; pindex < end; pindex += 1) { 1256 relookup: 1257 /* 1258 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages 1259 * and those pages must be OBJ_ONEMAPPING. 1260 */ 1261 if (advise == MADV_FREE) { 1262 if ((object->type != OBJT_DEFAULT && 1263 object->type != OBJT_SWAP) || 1264 (object->flags & OBJ_ONEMAPPING) == 0) { 1265 continue; 1266 } 1267 } 1268 1269 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error); 1270 1271 if (error) { 1272 vm_page_sleep_busy(m, TRUE, "madvpo"); 1273 goto relookup; 1274 } 1275 if (m == NULL) { 1276 /* 1277 * There may be swap even if there is no backing page 1278 */ 1279 if (advise == MADV_FREE && object->type == OBJT_SWAP) 1280 swap_pager_freespace(object, pindex, 1); 1281 continue; 1282 } 1283 1284 /* 1285 * If the page is not in a normal active state, we skip it. 1286 * If the page is not managed there are no page queues to 1287 * mess with. Things can break if we mess with pages in 1288 * any of the below states. 1289 */ 1290 if (m->wire_count || 1291 (m->flags & (PG_FICTITIOUS | PG_UNQUEUED | 1292 PG_NEED_COMMIT)) || 1293 m->valid != VM_PAGE_BITS_ALL 1294 ) { 1295 vm_page_wakeup(m); 1296 continue; 1297 } 1298 1299 /* 1300 * Theoretically once a page is known not to be busy, an 1301 * interrupt cannot come along and rip it out from under us. 1302 */ 1303 if (advise == MADV_WILLNEED) { 1304 vm_page_activate(m); 1305 } else if (advise == MADV_DONTNEED) { 1306 vm_page_dontneed(m); 1307 } else if (advise == MADV_FREE) { 1308 /* 1309 * Mark the page clean. This will allow the page 1310 * to be freed up by the system. However, such pages 1311 * are often reused quickly by malloc()/free() 1312 * so we do not do anything that would cause 1313 * a page fault if we can help it. 1314 * 1315 * Specifically, we do not try to actually free 1316 * the page now nor do we try to put it in the 1317 * cache (which would cause a page fault on reuse). 1318 * 1319 * But we do make the page is freeable as we 1320 * can without actually taking the step of unmapping 1321 * it. 1322 */ 1323 pmap_clear_modify(m); 1324 m->dirty = 0; 1325 m->act_count = 0; 1326 vm_page_dontneed(m); 1327 if (object->type == OBJT_SWAP) 1328 swap_pager_freespace(object, pindex, 1); 1329 } 1330 vm_page_wakeup(m); 1331 } 1332 vm_object_drop(object); 1333 } 1334 1335 /* 1336 * Removes all physical pages in the specified object range from the 1337 * object's list of pages. 1338 * 1339 * No requirements. 1340 */ 1341 static int vm_object_page_remove_callback(vm_page_t p, void *data); 1342 1343 void 1344 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, 1345 boolean_t clean_only) 1346 { 1347 struct rb_vm_page_scan_info info; 1348 int all; 1349 1350 /* 1351 * Degenerate cases and assertions 1352 */ 1353 vm_object_hold(object); 1354 if (object == NULL || 1355 (object->resident_page_count == 0 && object->swblock_count == 0)) { 1356 vm_object_drop(object); 1357 return; 1358 } 1359 KASSERT(object->type != OBJT_PHYS, 1360 ("attempt to remove pages from a physical object")); 1361 1362 /* 1363 * Indicate that paging is occuring on the object 1364 */ 1365 vm_object_pip_add(object, 1); 1366 1367 /* 1368 * Figure out the actual removal range and whether we are removing 1369 * the entire contents of the object or not. If removing the entire 1370 * contents, be sure to get all pages, even those that might be 1371 * beyond the end of the object. 1372 */ 1373 info.object = object; 1374 info.start_pindex = start; 1375 if (end == 0) 1376 info.end_pindex = (vm_pindex_t)-1; 1377 else 1378 info.end_pindex = end - 1; 1379 info.limit = clean_only; 1380 info.count = 0; 1381 all = (start == 0 && info.end_pindex >= object->size - 1); 1382 1383 /* 1384 * Loop until we are sure we have gotten them all. 1385 */ 1386 do { 1387 info.error = 0; 1388 vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp, 1389 vm_object_page_remove_callback, &info); 1390 } while (info.error); 1391 1392 /* 1393 * Remove any related swap if throwing away pages, or for 1394 * non-swap objects (the swap is a clean copy in that case). 1395 */ 1396 if (object->type != OBJT_SWAP || clean_only == FALSE) { 1397 if (all) 1398 swap_pager_freespace_all(object); 1399 else 1400 swap_pager_freespace(object, info.start_pindex, 1401 info.end_pindex - info.start_pindex + 1); 1402 } 1403 1404 /* 1405 * Cleanup 1406 */ 1407 vm_object_pip_wakeup(object); 1408 vm_object_drop(object); 1409 } 1410 1411 /* 1412 * The caller must hold the object. 1413 * 1414 * NOTE: User yields are allowed when removing more than one page, but not 1415 * allowed if only removing one page (the path for single page removals 1416 * might hold a spinlock). 1417 */ 1418 static int 1419 vm_object_page_remove_callback(vm_page_t p, void *data) 1420 { 1421 struct rb_vm_page_scan_info *info = data; 1422 1423 if (info->object != p->object || 1424 p->pindex < info->start_pindex || 1425 p->pindex > info->end_pindex) { 1426 kprintf("vm_object_page_remove_callbackA: obj/pg race %p/%p\n", 1427 info->object, p); 1428 return(0); 1429 } 1430 if (vm_page_busy_try(p, TRUE)) { 1431 vm_page_sleep_busy(p, TRUE, "vmopar"); 1432 info->error = 1; 1433 return(0); 1434 } 1435 if (info->object != p->object) { 1436 /* this should never happen */ 1437 kprintf("vm_object_page_remove_callbackB: obj/pg race %p/%p\n", 1438 info->object, p); 1439 vm_page_wakeup(p); 1440 return(0); 1441 } 1442 1443 /* 1444 * Wired pages cannot be destroyed, but they can be invalidated 1445 * and we do so if clean_only (limit) is not set. 1446 * 1447 * WARNING! The page may be wired due to being part of a buffer 1448 * cache buffer, and the buffer might be marked B_CACHE. 1449 * This is fine as part of a truncation but VFSs must be 1450 * sure to fix the buffer up when re-extending the file. 1451 * 1452 * NOTE! PG_NEED_COMMIT is ignored. 1453 */ 1454 if (p->wire_count != 0) { 1455 vm_page_protect(p, VM_PROT_NONE); 1456 if (info->limit == 0) 1457 p->valid = 0; 1458 vm_page_wakeup(p); 1459 goto done; 1460 } 1461 1462 /* 1463 * limit is our clean_only flag. If set and the page is dirty or 1464 * requires a commit, do not free it. If set and the page is being 1465 * held by someone, do not free it. 1466 */ 1467 if (info->limit && p->valid) { 1468 vm_page_test_dirty(p); 1469 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1470 vm_page_wakeup(p); 1471 goto done; 1472 } 1473 } 1474 1475 /* 1476 * Destroy the page. But we have to re-test whether its dirty after 1477 * removing it from its pmaps. 1478 */ 1479 vm_page_protect(p, VM_PROT_NONE); 1480 if (info->limit && p->valid) { 1481 vm_page_test_dirty(p); 1482 if ((p->valid & p->dirty) || (p->flags & PG_NEED_COMMIT)) { 1483 vm_page_wakeup(p); 1484 goto done; 1485 } 1486 } 1487 vm_page_free(p); 1488 1489 /* 1490 * Must be at end to avoid SMP races, caller holds object token 1491 */ 1492 done: 1493 if ((++info->count & 63) == 0) 1494 lwkt_user_yield(); 1495 1496 return(0); 1497 } 1498 1499 /* 1500 * Try to extend prev_object into an adjoining region of virtual 1501 * memory, return TRUE on success. 1502 * 1503 * The caller does not need to hold (prev_object) but must have a stable 1504 * pointer to it (typically by holding the vm_map locked). 1505 * 1506 * This function only works for anonymous memory objects which either 1507 * have (a) one reference or (b) we are extending the object's size. 1508 * Otherwise the related VM pages we want to use for the object might 1509 * be in use by another mapping. 1510 */ 1511 boolean_t 1512 vm_object_coalesce(vm_object_t prev_object, vm_pindex_t prev_pindex, 1513 vm_size_t prev_size, vm_size_t next_size) 1514 { 1515 vm_pindex_t next_pindex; 1516 1517 if (prev_object == NULL) 1518 return (TRUE); 1519 1520 vm_object_hold(prev_object); 1521 1522 if (prev_object->type != OBJT_DEFAULT && 1523 prev_object->type != OBJT_SWAP) { 1524 vm_object_drop(prev_object); 1525 return (FALSE); 1526 } 1527 1528 #if 0 1529 /* caller now checks this */ 1530 /* 1531 * Try to collapse the object first 1532 */ 1533 vm_object_collapse(prev_object, NULL); 1534 #endif 1535 1536 #if 0 1537 /* caller now checks this */ 1538 /* 1539 * We can't coalesce if we shadow another object (figuring out the 1540 * relationships become too complex). 1541 */ 1542 if (prev_object->backing_object != NULL) { 1543 vm_object_chain_release(prev_object); 1544 vm_object_drop(prev_object); 1545 return (FALSE); 1546 } 1547 #endif 1548 1549 prev_size >>= PAGE_SHIFT; 1550 next_size >>= PAGE_SHIFT; 1551 next_pindex = prev_pindex + prev_size; 1552 1553 /* 1554 * We can't if the object has more than one ref count unless we 1555 * are extending it into newly minted space. 1556 */ 1557 if (prev_object->ref_count > 1 && 1558 prev_object->size != next_pindex) { 1559 vm_object_drop(prev_object); 1560 return (FALSE); 1561 } 1562 1563 /* 1564 * Remove any pages that may still be in the object from a previous 1565 * deallocation. 1566 */ 1567 if (next_pindex < prev_object->size) { 1568 vm_object_page_remove(prev_object, 1569 next_pindex, 1570 next_pindex + next_size, FALSE); 1571 if (prev_object->type == OBJT_SWAP) 1572 swap_pager_freespace(prev_object, 1573 next_pindex, next_size); 1574 } 1575 1576 /* 1577 * Extend the object if necessary. 1578 */ 1579 if (next_pindex + next_size > prev_object->size) 1580 prev_object->size = next_pindex + next_size; 1581 vm_object_drop(prev_object); 1582 1583 return (TRUE); 1584 } 1585 1586 /* 1587 * Make the object writable and flag is being possibly dirty. 1588 * 1589 * The object might not be held (or might be held but held shared), 1590 * the related vnode is probably not held either. Object and vnode are 1591 * stable by virtue of the vm_page busied by the caller preventing 1592 * destruction. 1593 * 1594 * If the related mount is flagged MNTK_THR_SYNC we need to call 1595 * vsetobjdirty(). Filesystems using this option usually shortcut 1596 * synchronization by only scanning the syncer list. 1597 */ 1598 void 1599 vm_object_set_writeable_dirty(vm_object_t object) 1600 { 1601 struct vnode *vp; 1602 1603 /*vm_object_assert_held(object);*/ 1604 /* 1605 * Avoid contention in vm fault path by checking the state before 1606 * issuing an atomic op on it. 1607 */ 1608 if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) != 1609 (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) { 1610 vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); 1611 } 1612 if (object->type == OBJT_VNODE && 1613 (vp = (struct vnode *)object->handle) != NULL) { 1614 if ((vp->v_flag & VOBJDIRTY) == 0) { 1615 if (vp->v_mount && 1616 (vp->v_mount->mnt_kern_flag & MNTK_THR_SYNC)) { 1617 /* 1618 * New style THR_SYNC places vnodes on the 1619 * syncer list more deterministically. 1620 */ 1621 vsetobjdirty(vp); 1622 } else { 1623 /* 1624 * Old style scan would not necessarily place 1625 * a vnode on the syncer list when possibly 1626 * modified via mmap. 1627 */ 1628 vsetflags(vp, VOBJDIRTY); 1629 } 1630 } 1631 } 1632 } 1633 1634 #include "opt_ddb.h" 1635 #ifdef DDB 1636 #include <sys/cons.h> 1637 1638 #include <ddb/ddb.h> 1639 1640 static int _vm_object_in_map (vm_map_t map, vm_object_t object, 1641 vm_map_entry_t entry); 1642 static int vm_object_in_map (vm_object_t object); 1643 1644 /* 1645 * The caller must hold the object. 1646 */ 1647 static int 1648 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) 1649 { 1650 vm_map_backing_t ba; 1651 vm_map_t tmpm; 1652 vm_map_entry_t tmpe; 1653 int entcount; 1654 1655 if (map == NULL) 1656 return 0; 1657 if (entry == NULL) { 1658 tmpe = RB_MIN(vm_map_rb_tree, &map->rb_root); 1659 entcount = map->nentries; 1660 while (entcount-- && tmpe) { 1661 if( _vm_object_in_map(map, object, tmpe)) { 1662 return 1; 1663 } 1664 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1665 } 1666 return (0); 1667 } 1668 switch(entry->maptype) { 1669 case VM_MAPTYPE_SUBMAP: 1670 tmpm = entry->ba.sub_map; 1671 tmpe = RB_MIN(vm_map_rb_tree, &tmpm->rb_root); 1672 entcount = tmpm->nentries; 1673 while (entcount-- && tmpe) { 1674 if( _vm_object_in_map(tmpm, object, tmpe)) { 1675 return 1; 1676 } 1677 tmpe = vm_map_rb_tree_RB_NEXT(tmpe); 1678 } 1679 break; 1680 case VM_MAPTYPE_NORMAL: 1681 case VM_MAPTYPE_VPAGETABLE: 1682 ba = &entry->ba; 1683 while (ba) { 1684 if (ba->object == object) 1685 return TRUE; 1686 ba = ba->backing_ba; 1687 } 1688 break; 1689 default: 1690 break; 1691 } 1692 return 0; 1693 } 1694 1695 static int vm_object_in_map_callback(struct proc *p, void *data); 1696 1697 struct vm_object_in_map_info { 1698 vm_object_t object; 1699 int rv; 1700 }; 1701 1702 /* 1703 * Debugging only 1704 */ 1705 static int 1706 vm_object_in_map(vm_object_t object) 1707 { 1708 struct vm_object_in_map_info info; 1709 1710 info.rv = 0; 1711 info.object = object; 1712 1713 allproc_scan(vm_object_in_map_callback, &info, 0); 1714 if (info.rv) 1715 return 1; 1716 if( _vm_object_in_map(&kernel_map, object, 0)) 1717 return 1; 1718 if( _vm_object_in_map(&pager_map, object, 0)) 1719 return 1; 1720 if( _vm_object_in_map(&buffer_map, object, 0)) 1721 return 1; 1722 return 0; 1723 } 1724 1725 /* 1726 * Debugging only 1727 */ 1728 static int 1729 vm_object_in_map_callback(struct proc *p, void *data) 1730 { 1731 struct vm_object_in_map_info *info = data; 1732 1733 if (p->p_vmspace) { 1734 if (_vm_object_in_map(&p->p_vmspace->vm_map, info->object, 0)) { 1735 info->rv = 1; 1736 return -1; 1737 } 1738 } 1739 return (0); 1740 } 1741 1742 DB_SHOW_COMMAND(vmochk, vm_object_check) 1743 { 1744 struct vm_object_hash *hash; 1745 vm_object_t object; 1746 int n; 1747 1748 /* 1749 * make sure that internal objs are in a map somewhere 1750 * and none have zero ref counts. 1751 */ 1752 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1753 hash = &vm_object_hash[n]; 1754 for (object = TAILQ_FIRST(&hash->list); 1755 object != NULL; 1756 object = TAILQ_NEXT(object, object_entry)) { 1757 if (object->type == OBJT_MARKER) 1758 continue; 1759 if (object->handle != NULL || 1760 (object->type != OBJT_DEFAULT && 1761 object->type != OBJT_SWAP)) { 1762 continue; 1763 } 1764 if (object->ref_count == 0) { 1765 db_printf("vmochk: internal obj has " 1766 "zero ref count: %ld\n", 1767 (long)object->size); 1768 } 1769 if (vm_object_in_map(object)) 1770 continue; 1771 db_printf("vmochk: internal obj is not in a map: " 1772 "ref: %d, size: %lu: 0x%lx\n", 1773 object->ref_count, (u_long)object->size, 1774 (u_long)object->size); 1775 } 1776 } 1777 } 1778 1779 /* 1780 * Debugging only 1781 */ 1782 DB_SHOW_COMMAND(object, vm_object_print_static) 1783 { 1784 /* XXX convert args. */ 1785 vm_object_t object = (vm_object_t)addr; 1786 boolean_t full = have_addr; 1787 1788 vm_page_t p; 1789 1790 /* XXX count is an (unused) arg. Avoid shadowing it. */ 1791 #define count was_count 1792 1793 int count; 1794 1795 if (object == NULL) 1796 return; 1797 1798 db_iprintf( 1799 "Object %p: type=%d, size=0x%lx, res=%ld, ref=%d, flags=0x%x\n", 1800 object, (int)object->type, (u_long)object->size, 1801 object->resident_page_count, object->ref_count, object->flags); 1802 /* 1803 * XXX no %qd in kernel. Truncate object->backing_object_offset. 1804 */ 1805 db_iprintf("\n"); 1806 1807 if (!full) 1808 return; 1809 1810 db_indent += 2; 1811 count = 0; 1812 RB_FOREACH(p, vm_page_rb_tree, &object->rb_memq) { 1813 if (count == 0) 1814 db_iprintf("memory:="); 1815 else if (count == 6) { 1816 db_printf("\n"); 1817 db_iprintf(" ..."); 1818 count = 0; 1819 } else 1820 db_printf(","); 1821 count++; 1822 1823 db_printf("(off=0x%lx,page=0x%lx)", 1824 (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); 1825 } 1826 if (count != 0) 1827 db_printf("\n"); 1828 db_indent -= 2; 1829 } 1830 1831 /* XXX. */ 1832 #undef count 1833 1834 /* 1835 * XXX need this non-static entry for calling from vm_map_print. 1836 * 1837 * Debugging only 1838 */ 1839 void 1840 vm_object_print(/* db_expr_t */ long addr, 1841 boolean_t have_addr, 1842 /* db_expr_t */ long count, 1843 char *modif) 1844 { 1845 vm_object_print_static(addr, have_addr, count, modif); 1846 } 1847 1848 /* 1849 * Debugging only 1850 */ 1851 DB_SHOW_COMMAND(vmopag, vm_object_print_pages) 1852 { 1853 struct vm_object_hash *hash; 1854 vm_object_t object; 1855 int nl = 0; 1856 int c; 1857 int n; 1858 1859 for (n = 0; n < VMOBJ_HSIZE; ++n) { 1860 hash = &vm_object_hash[n]; 1861 for (object = TAILQ_FIRST(&hash->list); 1862 object != NULL; 1863 object = TAILQ_NEXT(object, object_entry)) { 1864 vm_pindex_t idx, fidx; 1865 vm_pindex_t osize; 1866 vm_paddr_t pa = -1, padiff; 1867 int rcount; 1868 vm_page_t m; 1869 1870 if (object->type == OBJT_MARKER) 1871 continue; 1872 db_printf("new object: %p\n", (void *)object); 1873 if ( nl > 18) { 1874 c = cngetc(); 1875 if (c != ' ') 1876 return; 1877 nl = 0; 1878 } 1879 nl++; 1880 rcount = 0; 1881 fidx = 0; 1882 osize = object->size; 1883 if (osize > 128) 1884 osize = 128; 1885 for (idx = 0; idx < osize; idx++) { 1886 m = vm_page_lookup(object, idx); 1887 if (m == NULL) { 1888 if (rcount) { 1889 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1890 (long)fidx, rcount, (long)pa); 1891 if ( nl > 18) { 1892 c = cngetc(); 1893 if (c != ' ') 1894 return; 1895 nl = 0; 1896 } 1897 nl++; 1898 rcount = 0; 1899 } 1900 continue; 1901 } 1902 1903 if (rcount && 1904 (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { 1905 ++rcount; 1906 continue; 1907 } 1908 if (rcount) { 1909 padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); 1910 padiff >>= PAGE_SHIFT; 1911 padiff &= PQ_L2_MASK; 1912 if (padiff == 0) { 1913 pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; 1914 ++rcount; 1915 continue; 1916 } 1917 db_printf(" index(%ld)run(%d)pa(0x%lx)", 1918 (long)fidx, rcount, (long)pa); 1919 db_printf("pd(%ld)\n", (long)padiff); 1920 if ( nl > 18) { 1921 c = cngetc(); 1922 if (c != ' ') 1923 return; 1924 nl = 0; 1925 } 1926 nl++; 1927 } 1928 fidx = idx; 1929 pa = VM_PAGE_TO_PHYS(m); 1930 rcount = 1; 1931 } 1932 if (rcount) { 1933 db_printf(" index(%ld)run(%d)pa(0x%lx)\n", 1934 (long)fidx, rcount, (long)pa); 1935 if ( nl > 18) { 1936 c = cngetc(); 1937 if (c != ' ') 1938 return; 1939 nl = 0; 1940 } 1941 nl++; 1942 } 1943 } 1944 } 1945 } 1946 #endif /* DDB */ 1947