1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2015, Joyent, Inc. All rights reserved. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * VM - anonymous pages. 41 * 42 * This layer sits immediately above the vm_swap layer. It manages 43 * physical pages that have no permanent identity in the file system 44 * name space, using the services of the vm_swap layer to allocate 45 * backing storage for these pages. Since these pages have no external 46 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 80 * maintain arrays of pointers to anon structs rather than maintaining 81 * anon structs themselves. The (struct anon **) arguments mentioned 82 * above are pointers to entries in these arrays. It is these arrays 83 * that capture the mapping between offsets within a given segment and 84 * the corresponding anonymous backing storage address. 85 */ 86 87 #ifdef DEBUG 88 #define ANON_DEBUG 89 #endif 90 91 #include <sys/types.h> 92 #include <sys/t_lock.h> 93 #include <sys/param.h> 94 #include <sys/systm.h> 95 #include <sys/mman.h> 96 #include <sys/cred.h> 97 #include <sys/thread.h> 98 #include <sys/vnode.h> 99 #include <sys/cpuvar.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/vtrace.h> 103 #include <sys/kmem.h> 104 #include <sys/sysmacros.h> 105 #include <sys/bitmap.h> 106 #include <sys/vmsystm.h> 107 #include <sys/tuneable.h> 108 #include <sys/debug.h> 109 #include <sys/fs/swapnode.h> 110 #include <sys/tnf_probe.h> 111 #include <sys/lgrp.h> 112 #include <sys/policy.h> 113 #include <sys/condvar_impl.h> 114 #include <sys/mutex_impl.h> 115 #include <sys/rctl.h> 116 117 #include <vm/as.h> 118 #include <vm/hat.h> 119 #include <vm/anon.h> 120 #include <vm/page.h> 121 #include <vm/vpage.h> 122 #include <vm/seg.h> 123 #include <vm/rm.h> 124 125 #include <fs/fs_subr.h> 126 127 struct vnode *anon_vp; 128 129 int anon_debug; 130 131 kmutex_t anoninfo_lock; 132 struct k_anoninfo k_anoninfo; 133 ani_free_t *ani_free_pool; 134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 135 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 136 137 /* 138 * Global hash table for (vp, off) -> anon slot 139 */ 140 extern int swap_maxcontig; 141 size_t anon_hash_size; 142 unsigned int anon_hash_shift; 143 struct anon **anon_hash; 144 145 static struct kmem_cache *anon_cache; 146 static struct kmem_cache *anonmap_cache; 147 148 pad_mutex_t *anonhash_lock; 149 150 /* 151 * Used to make the increment of all refcnts of all anon slots of a large 152 * page appear to be 
atomic. The lock is grabbed for the first anon slot of 153 * a large page. 154 */ 155 pad_mutex_t *anonpages_hash_lock; 156 157 #define APH_MUTEX(vp, off) \ 158 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \ 159 (AH_LOCK_SIZE - 1))].pad_mutex) 160 161 #ifdef VM_STATS 162 static struct anonvmstats_str { 163 ulong_t getpages[30]; 164 ulong_t privatepages[10]; 165 ulong_t demotepages[9]; 166 ulong_t decrefpages[9]; 167 ulong_t dupfillholes[4]; 168 ulong_t freepages[1]; 169 } anonvmstats; 170 #endif /* VM_STATS */ 171 172 /*ARGSUSED*/ 173 static int 174 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 175 { 176 struct anon_map *amp = buf; 177 178 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 179 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 180 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 181 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 182 return (0); 183 } 184 185 /*ARGSUSED1*/ 186 static void 187 anonmap_cache_destructor(void *buf, void *cdrarg) 188 { 189 struct anon_map *amp = buf; 190 191 rw_destroy(&->a_rwlock); 192 cv_destroy(&->a_purgecv); 193 mutex_destroy(&->a_pmtx); 194 mutex_destroy(&->a_purgemtx); 195 } 196 197 void 198 anon_init(void) 199 { 200 int i; 201 pad_mutex_t *tmp; 202 203 /* These both need to be powers of 2 so round up to the next power */ 204 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1); 205 anon_hash_size = 1L << anon_hash_shift; 206 207 /* 208 * We need to align the anonhash_lock and anonpages_hash_lock arrays 209 * to a 64B boundary to avoid false sharing. We add 63B to our 210 * allocation so that we can get a 64B aligned address to use. 211 * We allocate both of these together to avoid wasting an additional 212 * 63B. 
213 */ 214 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63, 215 KM_SLEEP); 216 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64); 217 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE; 218 219 for (i = 0; i < AH_LOCK_SIZE; i++) { 220 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT, 221 NULL); 222 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL, 223 MUTEX_DEFAULT, NULL); 224 } 225 226 for (i = 0; i < ANON_LOCKSIZE; i++) { 227 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 228 MUTEX_DEFAULT, NULL); 229 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 230 } 231 232 anon_hash = (struct anon **) 233 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 234 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 235 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL); 236 anonmap_cache = kmem_cache_create("anonmap_cache", 237 sizeof (struct anon_map), 0, 238 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 239 NULL, NULL, 0); 240 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 241 242 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP); 243 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */ 244 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64); 245 246 anon_vp = vn_alloc(KM_SLEEP); 247 vn_setops(anon_vp, swap_vnodeops); 248 anon_vp->v_type = VREG; 249 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 250 } 251 252 /* 253 * Global anon slot hash table manipulation. 
254 */ 255 256 static void 257 anon_addhash(struct anon *ap) 258 { 259 int index; 260 261 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 262 index = ANON_HASH(ap->an_vp, ap->an_off); 263 ap->an_hash = anon_hash[index]; 264 anon_hash[index] = ap; 265 } 266 267 static void 268 anon_rmhash(struct anon *ap) 269 { 270 struct anon **app; 271 272 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 273 274 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 275 *app; app = &((*app)->an_hash)) { 276 if (*app == ap) { 277 *app = ap->an_hash; 278 break; 279 } 280 } 281 } 282 283 /* 284 * The anon array interfaces. Functions allocating, 285 * freeing array of pointers, and returning/setting 286 * entries in the array of pointers for a given offset. 287 * 288 * Create the list of pointers 289 */ 290 struct anon_hdr * 291 anon_create(pgcnt_t npages, int flags) 292 { 293 struct anon_hdr *ahp; 294 ulong_t nchunks; 295 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 296 297 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 298 return (NULL); 299 } 300 301 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 302 /* 303 * Single level case. 304 */ 305 ahp->size = npages; 306 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 307 308 if (flags & ANON_ALLOC_FORCE) 309 ahp->flags |= ANON_ALLOC_FORCE; 310 311 ahp->array_chunk = kmem_zalloc( 312 ahp->size * sizeof (struct anon *), kmemflags); 313 314 if (ahp->array_chunk == NULL) { 315 kmem_free(ahp, sizeof (struct anon_hdr)); 316 return (NULL); 317 } 318 } else { 319 /* 320 * 2 Level case. 321 * anon hdr size needs to be rounded off to be a multiple 322 * of ANON_CHUNK_SIZE. This is important as various anon 323 * related functions depend on this. 324 * NOTE - 325 * anon_grow() makes anon hdr size a multiple of 326 * ANON_CHUNK_SIZE. 327 * amp size is <= anon hdr size. 328 * anon_index + seg_pgs <= anon hdr size. 
329 */ 330 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 331 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 332 333 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 334 kmemflags); 335 336 if (ahp->array_chunk == NULL) { 337 kmem_free(ahp, sizeof (struct anon_hdr)); 338 return (NULL); 339 } 340 } 341 return (ahp); 342 } 343 344 /* 345 * Free the array of pointers 346 */ 347 void 348 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 349 { 350 ulong_t i; 351 void **ppp; 352 ulong_t nchunks; 353 354 ASSERT(npages <= ahp->size); 355 356 /* 357 * Single level case. 358 */ 359 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 360 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 361 } else { 362 /* 363 * 2 level case. 364 */ 365 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 366 for (i = 0; i < nchunks; i++) { 367 ppp = &ahp->array_chunk[i]; 368 if (*ppp != NULL) 369 kmem_free(*ppp, PAGESIZE); 370 } 371 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 372 } 373 mutex_destroy(&ahp->serial_lock); 374 kmem_free(ahp, sizeof (struct anon_hdr)); 375 } 376 377 /* 378 * Return the pointer from the list for a 379 * specified anon index. 380 */ 381 struct anon * 382 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 383 { 384 struct anon **app; 385 386 ASSERT(an_idx < ahp->size); 387 388 /* 389 * Single level case. 390 */ 391 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 392 return ((struct anon *) 393 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 394 } else { 395 396 /* 397 * 2 level case. 398 */ 399 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 400 if (app) { 401 return ((struct anon *) 402 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 403 ANON_PTRMASK)); 404 } else { 405 return (NULL); 406 } 407 } 408 } 409 410 /* 411 * Return the anon pointer for the first valid entry in the anon list, 412 * starting from the given index. 
413 */ 414 struct anon * 415 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 416 { 417 struct anon *ap; 418 struct anon **app; 419 ulong_t chunkoff; 420 ulong_t i; 421 ulong_t j; 422 pgcnt_t size; 423 424 i = *index; 425 size = ahp->size; 426 427 ASSERT(i < size); 428 429 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 430 /* 431 * 1 level case 432 */ 433 while (i < size) { 434 ap = (struct anon *) 435 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 436 if (ap) { 437 *index = i; 438 return (ap); 439 } 440 i++; 441 } 442 } else { 443 /* 444 * 2 level case 445 */ 446 chunkoff = i & ANON_CHUNK_OFF; 447 while (i < size) { 448 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 449 if (app) 450 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 451 ap = (struct anon *) 452 ((uintptr_t)app[j] & ANON_PTRMASK); 453 if (ap) { 454 *index = i + (j - chunkoff); 455 return (ap); 456 } 457 } 458 chunkoff = 0; 459 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 460 } 461 } 462 *index = size; 463 return (NULL); 464 } 465 466 /* 467 * Set list entry with a given pointer for a specified offset 468 */ 469 int 470 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 471 { 472 void **ppp; 473 struct anon **app; 474 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 475 uintptr_t *ap_addr; 476 477 ASSERT(an_idx < ahp->size); 478 479 /* 480 * Single level case. 481 */ 482 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 483 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 484 } else { 485 486 /* 487 * 2 level case. 
488 */ 489 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 490 491 ASSERT(ppp != NULL); 492 if (*ppp == NULL) { 493 mutex_enter(&ahp->serial_lock); 494 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 495 if (*ppp == NULL) { 496 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 497 if (*ppp == NULL) { 498 mutex_exit(&ahp->serial_lock); 499 return (ENOMEM); 500 } 501 } 502 mutex_exit(&ahp->serial_lock); 503 } 504 app = *ppp; 505 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 506 } 507 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 508 return (0); 509 } 510 511 /* 512 * Copy anon array into a given new anon array 513 */ 514 int 515 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 516 struct anon_hdr *dahp, ulong_t d_idx, 517 pgcnt_t npages, int flags) 518 { 519 void **sapp, **dapp; 520 void *ap; 521 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 522 523 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 524 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 525 526 /* 527 * Both arrays are 1 level. 528 */ 529 if (((sahp->size <= ANON_CHUNK_SIZE) && 530 (dahp->size <= ANON_CHUNK_SIZE)) || 531 ((sahp->flags & ANON_ALLOC_FORCE) && 532 (dahp->flags & ANON_ALLOC_FORCE))) { 533 534 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 535 npages * sizeof (struct anon *)); 536 return (0); 537 } 538 539 /* 540 * Both arrays are 2 levels. 
541 */ 542 if (sahp->size > ANON_CHUNK_SIZE && 543 dahp->size > ANON_CHUNK_SIZE && 544 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 545 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 546 547 ulong_t sapidx, dapidx; 548 ulong_t *sap, *dap; 549 ulong_t chknp; 550 551 while (npages != 0) { 552 553 sapidx = s_idx & ANON_CHUNK_OFF; 554 dapidx = d_idx & ANON_CHUNK_OFF; 555 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 556 if (chknp > npages) 557 chknp = npages; 558 559 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 560 if ((sap = *sapp) != NULL) { 561 dapp = &dahp->array_chunk[d_idx 562 >> ANON_CHUNK_SHIFT]; 563 if ((dap = *dapp) == NULL) { 564 *dapp = kmem_zalloc(PAGESIZE, 565 kmemflags); 566 if ((dap = *dapp) == NULL) 567 return (ENOMEM); 568 } 569 bcopy((sap + sapidx), (dap + dapidx), 570 chknp << ANON_PTRSHIFT); 571 } 572 s_idx += chknp; 573 d_idx += chknp; 574 npages -= chknp; 575 } 576 return (0); 577 } 578 579 /* 580 * At least one of the arrays is 2 level. 581 */ 582 while (npages--) { 583 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 584 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 585 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 586 return (ENOMEM); 587 } 588 s_idx++; 589 d_idx++; 590 } 591 return (0); 592 } 593 594 595 /* 596 * ANON_INITBUF is a convenience macro for anon_grow() below. It 597 * takes a buffer dst, which is at least as large as buffer src. It 598 * does a bcopy from src into dst, and then bzeros the extra bytes 599 * of dst. If tail is set, the data in src is tail aligned within 600 * dst instead of head aligned. 
601 */ 602 603 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 604 if (tail) { \ 605 bzero((dst), (dstsize) - (srclen)); \ 606 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 607 } else { \ 608 bcopy((src), (dst), (srclen)); \ 609 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 610 } 611 612 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 613 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 614 615 /* 616 * anon_grow() is used to efficiently extend an existing anon array. 617 * startidx_p points to the index into the anon array of the first page 618 * that is in use. oldseg_pgs is the number of pages in use, starting at 619 * *startidx_p. newpages is the number of additional pages desired. 620 * 621 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 622 * 623 * The growth is done by creating a new top level of the anon array, 624 * and (if the array is 2-level) reusing the existing second level arrays. 625 * 626 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 627 * 628 * Returns the new number of pages in the anon array. 629 */ 630 pgcnt_t 631 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 632 pgcnt_t newseg_pgs, int flags) 633 { 634 ulong_t startidx = startidx_p ? *startidx_p : 0; 635 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 636 pgcnt_t oelems, nelems, totpages; 637 void **level1; 638 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 639 int growdown = (flags & ANON_GROWDOWN); 640 size_t newarrsz, oldarrsz; 641 void *level2; 642 643 ASSERT(!(startidx_p == NULL && growdown)); 644 ASSERT(startidx + oldseg_pgs <= ahp->size); 645 646 /* 647 * Determine the total number of pages needed in the new 648 * anon array. If growing down, totpages is all pages from 649 * startidx through the end of the array, plus <newseg_pgs> 650 * pages. If growing up, keep all pages from page 0 through 651 * the last page currently in use, plus <newseg_pgs> pages. 
652 */ 653 if (growdown) 654 totpages = oldamp_pgs - startidx + newseg_pgs; 655 else 656 totpages = startidx + oldseg_pgs + newseg_pgs; 657 658 /* If the array is already large enough, just return. */ 659 660 if (oldamp_pgs >= totpages) { 661 if (growdown) 662 *startidx_p = oldamp_pgs - totpages; 663 return (oldamp_pgs); 664 } 665 666 /* 667 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 668 * by the corresponding arrays. 669 * oelems/nelems are the number of pointers in the top level arrays 670 * which may be either level 1 or level 2. 671 * Will the new anon array be one level or two levels? 672 */ 673 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 674 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 675 oelems = oldamp_pgs; 676 nelems = newamp_pgs; 677 } else { 678 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 679 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 680 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 681 } 682 683 newarrsz = nelems * sizeof (void *); 684 level1 = kmem_alloc(newarrsz, kmemflags); 685 if (level1 == NULL) 686 return (0); 687 688 /* Are we converting from a one level to a two level anon array? */ 689 690 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 691 !(ahp->flags & ANON_ALLOC_FORCE)) { 692 693 /* 694 * Yes, we're converting to a two level. Reuse old level 1 695 * as new level 2 if it is exactly PAGESIZE. Otherwise 696 * alloc a new level 2 and copy the old level 1 data into it. 
697 */ 698 if (oldamp_pgs == ANON_CHUNK_SIZE) { 699 level2 = (void *)ahp->array_chunk; 700 } else { 701 level2 = kmem_alloc(PAGESIZE, kmemflags); 702 if (level2 == NULL) { 703 kmem_free(level1, newarrsz); 704 return (0); 705 } 706 oldarrsz = oldamp_pgs * sizeof (void *); 707 708 ANON_INITBUF(ahp->array_chunk, oldarrsz, 709 level2, PAGESIZE, growdown); 710 kmem_free(ahp->array_chunk, oldarrsz); 711 } 712 bzero(level1, newarrsz); 713 if (growdown) 714 level1[nelems - 1] = level2; 715 else 716 level1[0] = level2; 717 } else { 718 oldarrsz = oelems * sizeof (void *); 719 720 ANON_INITBUF(ahp->array_chunk, oldarrsz, 721 level1, newarrsz, growdown); 722 kmem_free(ahp->array_chunk, oldarrsz); 723 } 724 725 ahp->array_chunk = level1; 726 ahp->size = newamp_pgs; 727 if (growdown) 728 *startidx_p = newamp_pgs - totpages; 729 730 return (newamp_pgs); 731 } 732 733 734 /* 735 * Called to sync ani_free value. 736 */ 737 738 void 739 set_anoninfo(void) 740 { 741 processorid_t ix, max_seqid; 742 pgcnt_t total = 0; 743 static clock_t last_time; 744 clock_t new_time; 745 746 if (ani_free_pool == NULL) 747 return; 748 749 /* 750 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to 751 * identify the maximum number of CPUs were ever online. 752 */ 753 new_time = ddi_get_lbolt(); 754 if (new_time > last_time) { 755 756 max_seqid = max_cpu_seqid_ever; 757 ASSERT(ANI_MAX_POOL > max_seqid); 758 for (ix = 0; ix <= max_seqid; ix++) 759 total += ani_free_pool[ix].ani_count; 760 761 last_time = new_time; 762 k_anoninfo.ani_free = total; 763 } 764 } 765 766 /* 767 * Reserve anon space. 768 * 769 * It's no longer simply a matter of incrementing ani_resv to 770 * reserve swap space, we need to check memory-based as well 771 * as disk-backed (physical) swap. The following algorithm 772 * is used: 773 * Check the space on physical swap 774 * i.e. 
amount needed < ani_max - ani_phys_resv
 *	If we are swapping on swapfs check
 *		amount needed < (availrmem - swapfs_minfree)
 * Since the algorithm to check for the quantity of swap space is
 * almost the same as that for reserving it, we'll just use anon_resvmem
 * with a flag to decrement availrmem.
 *
 * size is in bytes and is rounded up to whole pages.  If takemem is
 * false the reservation is only tested: the global counters are left
 * as they were and any zone swap charge is given straight back.  If
 * zone is non-NULL the zone's swap resource control is charged first.
 * If tryhard is set, page_reclaim_mem() is called (with anoninfo_lock
 * dropped) before memory-backed swap is checked.
 *
 * Return non-zero on success.
 */
int
anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
{
	pgcnt_t npages = btopr(size);
	pgcnt_t mswap_pages = 0;
	pgcnt_t pswap_pages = 0;
	proc_t *p = curproc;

	if (zone != NULL) {
		/* test zone.max-swap resource control */
		mutex_enter(&p->p_lock);
		if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
			mutex_exit(&p->p_lock);

			/* Only real reservations count as failures. */
			if (takemem)
				atomic_add_64(&zone->zone_anon_alloc_fail, 1);

			return (0);
		}

		/* A pure test must not leave the zone charged. */
		if (!takemem)
			rctl_decr_swap(zone, ptob(npages));

		mutex_exit(&p->p_lock);
	}
	mutex_enter(&anoninfo_lock);

	/*
	 * pswap_pages is the number of pages we can take from
	 * physical (i.e. disk-backed) swap.
	 */
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
	pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;

	ANON_PRINT(A_RESV,
	    ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
	    npages, takemem, pswap_pages, (void *)caller()));

	if (npages <= pswap_pages) {
		/*
		 * we have enough space on a physical swap
		 */
		if (takemem)
			k_anoninfo.ani_phys_resv += npages;
		mutex_exit(&anoninfo_lock);
		return (1);
	} else if (pswap_pages != 0) {
		/*
		 * we have some space on a physical swap
		 */
		if (takemem) {
			/*
			 * use up remainder of phys swap
			 */
			k_anoninfo.ani_phys_resv += pswap_pages;
			ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
		}
	}
	/*
	 * since (npages > pswap_pages) we need mem swap
	 * mswap_pages is the number of pages needed from availrmem
	 */
	ASSERT(npages > pswap_pages);
	mswap_pages = npages - pswap_pages;

	ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
	    mswap_pages));

	/*
	 * priv processes can reserve memory as swap as long as availrmem
	 * remains greater than swapfs_minfree; in the case of non-priv
	 * processes, memory can be reserved as swap only if availrmem
	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
	 * swapfs_reserve amount of memswap is not available to non-priv
	 * processes. This protects daemons such as automounter dying
	 * as a result of application processes eating away almost entire
	 * membased swap. This safeguard becomes useless if apps are run
	 * with root access.
	 *
	 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
	 *
	 * NOTE(review): secpolicy_resource_anon_mem() is tested for
	 * non-zero below while secpolicy_resource() is tested for == 0;
	 * confirm the two routines really have opposite return senses.
	 */
	if (tryhard) {
		pgcnt_t floor_pages;

		if (secpolicy_resource_anon_mem(CRED())) {
			floor_pages = swapfs_minfree;
		} else {
			floor_pages = swapfs_minfree + swapfs_reserve;
		}

		/*
		 * anoninfo_lock is not held across the reclaim call.  Any
		 * physical swap taken above stays reserved meanwhile and
		 * is handed back in the failure path at the bottom.
		 */
		mutex_exit(&anoninfo_lock);
		(void) page_reclaim_mem(mswap_pages, floor_pages, 0);
		mutex_enter(&anoninfo_lock);
	}

	mutex_enter(&freemem_lock);
	if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
	    (availrmem > (swapfs_minfree + mswap_pages) &&
	    secpolicy_resource(CRED()) == 0)) {

		if (takemem) {
			/*
			 * Take the memory from the rest of the system.
			 */
			availrmem -= mswap_pages;
			mutex_exit(&freemem_lock);
			k_anoninfo.ani_mem_resv += mswap_pages;
			ANI_ADD(mswap_pages);
			ANON_PRINT((A_RESV | A_MRESV),
			    ("anon_resvmem: took %ld pages of availrmem\n",
			    mswap_pages));
		} else {
			mutex_exit(&freemem_lock);
		}

		ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
		mutex_exit(&anoninfo_lock);
		return (1);
	} else {
		/*
		 * Fail if not enough memory
		 */
		if (takemem) {
			/* Give back the physical swap taken above. */
			k_anoninfo.ani_phys_resv -= pswap_pages;
		}

		mutex_exit(&freemem_lock);
		mutex_exit(&anoninfo_lock);
		ANON_PRINT(A_RESV,
		    ("anon_resvmem: not enough space from swapfs\n"));
		/* Undo the zone charge made at the top of the function. */
		if (zone != NULL && takemem)
			rctl_decr_swap(zone, ptob(npages));
		return (0);
	}
}

/*
 * Give back an anon reservation.
 *
 * size is in bytes and is rounded up to whole pages.  If zone is
 * non-NULL the zone's swap charge is reduced by the same amount.
 * Pages are returned to memory-backed swap (availrmem) first, as far
 * as unlocked ani_mem_resv allows; the remainder is returned to
 * physical swap.
 */
void
anon_unresvmem(size_t size, zone_t *zone)
{
	pgcnt_t npages = btopr(size);
	spgcnt_t mem_free_pages = 0;
	pgcnt_t phys_free_slots;
#ifdef	ANON_DEBUG
	pgcnt_t mem_resv;
#endif
	if (zone != NULL)
		rctl_decr_swap(zone, ptob(npages));

	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);

	/*
	 * If some of this reservation belonged to swapfs
	 * give it back to availrmem.
	 * ani_mem_resv is the amount of availrmem swapfs has reserved.
	 * but some of that memory could be locked by segspt so we can only
	 * return non locked ani_mem_resv back to availrmem
	 */
	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
		/*
		 * The debug message prints MIN(ani_mem_resv, npages), which
		 * can differ from the mem_free_pages actually returned below.
		 */
		ANON_PRINT((A_RESV | A_MRESV),
		    ("anon_unresv: growing availrmem by %ld pages\n",
		    MIN(k_anoninfo.ani_mem_resv, npages)));

		mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
		    k_anoninfo.ani_locked_swap), npages);
		mutex_enter(&freemem_lock);
		availrmem += mem_free_pages;
		mutex_exit(&freemem_lock);
		k_anoninfo.ani_mem_resv -= mem_free_pages;

		ANI_ADD(-mem_free_pages);
	}
	/*
	 * The remainder of the pages is returned to phys swap
	 */
	ASSERT(npages >= mem_free_pages);
	phys_free_slots = npages - mem_free_pages;

	if (phys_free_slots) {
		k_anoninfo.ani_phys_resv -= phys_free_slots;
	}

#ifdef	ANON_DEBUG
	/* Snapshot for the debug message printed after the lock is dropped. */
	mem_resv = k_anoninfo.ani_mem_resv;
#endif

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	mutex_exit(&anoninfo_lock);

	ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
	    npages, mem_resv, (void *)caller()));
}

/*
 * Allocate an anon slot, give it an initial reference count of 1 and
 * enter it into the global anon hash.
 *
 * If vp is NULL the slot's identity is assigned by swap_alloc();
 * otherwise (vp, off) is used as given.  The hash mutex is held only
 * around the hash insertion and has been dropped again by the time
 * this function returns.  The allocation uses KM_SLEEP.
 */
struct anon *
anon_alloc(struct vnode *vp, anoff_t off)
{
	struct anon	*ap;
	kmutex_t	*ahm;

	ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
	if (vp == NULL) {
		swap_alloc(ap);
	} else {
		ap->an_vp = vp;
		ap->an_off = off;
	}
	ap->an_refcnt = 1;
	ap->an_pvp = NULL;
	ap->an_poff = 0;
	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
	mutex_enter(ahm);
	anon_addhash(ap);
	mutex_exit(ahm);
	ANI_ADD(-1);
	ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
	    (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
	return (ap);
}

/*
 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
 * such pages don't consume any physical swap resources needed for swapping
 * unlocked pages.
 *
 * pp must be locked and must be the swapfs page named by ap.  If the
 * slot has a physical swap location (an_pvp != NULL) that location is
 * freed, an_pvp/an_poff are cleared and hat_setmod() is called on the
 * page (presumably so the page counts as modified now that its
 * backing-store copy is gone).  Otherwise nothing is changed.
 */
void
anon_swap_free(struct anon *ap, page_t *pp)
{
	kmutex_t *ahm;

	ASSERT(ap != NULL);
	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(IS_SWAPFSVP(pp->p_vnode));
	ASSERT(ap->an_refcnt != 0);
	ASSERT(pp->p_vnode == ap->an_vp);
	ASSERT(pp->p_offset == ap->an_off);

	/* Unlocked fast path; re-tested below under the hash mutex. */
	if (ap->an_pvp == NULL)
		return;

	page_io_lock(pp);
	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
	mutex_enter(ahm);

	ASSERT(ap->an_refcnt != 0);
	ASSERT(pp->p_vnode == ap->an_vp);
	ASSERT(pp->p_offset == ap->an_off);

	if (ap->an_pvp != NULL) {
		swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
		ap->an_pvp = NULL;
		ap->an_poff = 0;
		mutex_exit(ahm);
		hat_setmod(pp);
	} else {
		mutex_exit(ahm);
	}
	page_io_unlock(pp);
}

/*
 * Decrement the reference count of an anon page.
 * If reference count goes to zero, free it and
 * its associated page (if any).
 *
 * Panics if the count is already zero on entry.
 */
void
anon_decref(struct anon *ap)
{
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;

	ahm = AH_MUTEX(ap->an_vp, ap->an_off);
	mutex_enter(ahm);
	ASSERT(ap->an_refcnt != 0);
	if (ap->an_refcnt == 0)
		panic("anon_decref: slot count 0");
	if (--ap->an_refcnt == 0) {
		/* Last reference: capture the identity, then unhash. */
		swap_xlate(ap, &vp, &off);
		anon_rmhash(ap);
		if (ap->an_pvp != NULL)
			swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
		mutex_exit(ahm);

		/*
		 * If there is a page for this anon slot we will need to
		 * call VN_DISPOSE to get rid of the vp association and
		 * put the page back on the free list as really free.
		 * Acquire the "exclusive" lock to ensure that any
		 * pending i/o always completes before the swap slot
		 * is freed.
		 */
		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
		if (pp != NULL) {
			/*LINTED: constant in conditional context */
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		}
		ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
		    (void *)ap, (void *)ap->an_vp));

		kmem_cache_free(anon_cache, ap);

		ANI_ADD(1);
	} else {
		mutex_exit(ahm);
	}
}


/*
 * check an_refcnt of the root anon slot (anon_index argument is aligned at
 * seg->s_szc level) to determine whether COW processing is required.
 * anonpages_hash_lock[] held on the root ap ensures that if root's
 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
 * later since this process can't fork while its AS lock is held).
 *
 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
 * An empty root slot (no anon struct at anon_index) also returns 0.
 */
int
anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
{
	struct anon	*ap;
	kmutex_t	*ahmpages = NULL;

	ap = anon_get_ptr(ahp, anon_index);
	if (ap == NULL)
		return (0);

	/* The count is sampled under the large-page (APH) mutex. */
	ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
	mutex_enter(ahmpages);
	ASSERT(ap->an_refcnt >= 1);
	if (ap->an_refcnt == 1) {
		mutex_exit(ahmpages);
		return (0);
	}
	mutex_exit(ahmpages);
	return (1);
}
/*
 * Check 'nslots' anon slots for refcnt > 1.
 *
 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
 * returns 0.
*/
/*
 * Empty (NULL) slots are skipped.  This function reads an_refcnt without
 * taking any mutex itself; callers that need a stable answer are presumably
 * expected to hold a suitable lock - confirm against the call sites.
 */
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference on every anon slot of the large page region that starts
 * at an_idx (page size code szc, which must be non-zero) and clear the
 * corresponding entries in ahp.
 *
 * If the first slot of the region exists and its refcnt is greater than 1,
 * the anonpages hash mutex of that slot stays held for the whole walk and
 * each slot's refcnt is simply decremented.  Otherwise each slot is torn
 * down completely: it is removed from the anon hash, its physical swap is
 * released, the backing page(s) are destroyed or disposed of, and the anon
 * structure is returned to anon_cache.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	/* Only referenced from ASSERTs, hence DEBUG-only. */
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		/* The assignment inside ASSERT records refcnt on DEBUG only. */
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			/*
			 * Last reference: the region is not shared, so the
			 * region-wide mutex is dropped.  From here on
			 * ahmpages == NULL means "free everything".
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/* No page, or a base (PAGESIZE) page. */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				if (pp == NULL) {
					/*
					 * Look again now that the slot is
					 * unhashed; a page may have appeared
					 * since the first lookup.
					 */
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					ASSERT(pp == NULL || pp->p_szc == 0);
				}
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * Large page: collect all of its constituent
				 * pages in ppa[] and release them as a unit.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				/*
				 * Lock and unload the remaining constituent
				 * pages.  NOTE(review): this loop starts at
				 * i + 1, so the first page is neither passed
				 * to hat_pageunload() nor checked for a
				 * non-default dispose op here.
				 */
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					/*
					 * A physical swap vnode whose dispose
					 * op is not fs_dispose forces the
					 * per-page VN_DISPOSE path below.
					 */
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				/* Free every anon slot of this large page. */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = AH_MUTEX(ap->an_vp, ap->an_off);
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					/*
					 * Demote every constituent to szc 0
					 * first, then dispose of the pages
					 * one at a time.
					 */
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				i += curpgcnt;
			}
		} else {
			/*
			 * Region is still shared: just drop this reference.
			 * The slot and its page stay alive.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}

/*
 * Duplicate references to size bytes worth of anon pages.
 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that the procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses that
 * *old actually refers to.
1339 */ 1340 void 1341 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1342 ulong_t new_idx, size_t size) 1343 { 1344 spgcnt_t npages; 1345 kmutex_t *ahm; 1346 struct anon *ap; 1347 ulong_t off; 1348 ulong_t index; 1349 1350 npages = btopr(size); 1351 while (npages > 0) { 1352 index = old_idx; 1353 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1354 break; 1355 1356 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1357 off = index - old_idx; 1358 npages -= off; 1359 if (npages <= 0) 1360 break; 1361 1362 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1363 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1364 1365 mutex_enter(ahm); 1366 ap->an_refcnt++; 1367 mutex_exit(ahm); 1368 1369 off++; 1370 new_idx += off; 1371 old_idx += off; 1372 npages--; 1373 } 1374 } 1375 1376 /* 1377 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1378 * slots) within any large page region. That means if a large page region is 1379 * empty in the old array it will skip it. If there are 1 or more valid slots 1380 * in the large page region of the old array it will make sure to fill in any 1381 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1382 * page region should either have no valid anon slots or all slots should be 1383 * valid. 
1384 */ 1385 void 1386 anon_dup_fill_holes( 1387 struct anon_hdr *old, 1388 ulong_t old_idx, 1389 struct anon_hdr *new, 1390 ulong_t new_idx, 1391 size_t size, 1392 uint_t szc, 1393 int noalloc) 1394 { 1395 struct anon *ap; 1396 spgcnt_t npages; 1397 kmutex_t *ahm, *ahmpages = NULL; 1398 pgcnt_t pgcnt, i; 1399 ulong_t index, off; 1400 #ifdef DEBUG 1401 int refcnt; 1402 #endif 1403 1404 ASSERT(szc != 0); 1405 pgcnt = page_get_pagecnt(szc); 1406 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1407 npages = btopr(size); 1408 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1409 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1410 1411 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1412 1413 while (npages > 0) { 1414 index = old_idx; 1415 1416 /* 1417 * Find the next valid slot. 1418 */ 1419 if (anon_get_next_ptr(old, &index) == NULL) 1420 break; 1421 1422 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1423 /* 1424 * Now backup index to the beginning of the 1425 * current large page region of the old array. 1426 */ 1427 index = P2ALIGN(index, pgcnt); 1428 off = index - old_idx; 1429 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1430 npages -= off; 1431 if (npages <= 0) 1432 break; 1433 1434 /* 1435 * Fill and copy a large page regions worth 1436 * of anon slots. 1437 */ 1438 for (i = 0; i < pgcnt; i++) { 1439 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1440 if (noalloc) { 1441 panic("anon_dup_fill_holes: " 1442 "empty anon slot\n"); 1443 } 1444 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1445 ap = anon_alloc(NULL, 0); 1446 (void) anon_set_ptr(old, index + i, ap, 1447 ANON_SLEEP); 1448 } else if (i == 0) { 1449 /* 1450 * make the increment of all refcnts of all 1451 * anon slots of a large page appear atomic by 1452 * getting an anonpages_hash_lock for the 1453 * first anon slot of a large page. 
1454 */ 1455 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1456 1457 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1458 mutex_enter(ahmpages); 1459 /*LINTED*/ 1460 ASSERT(refcnt = ap->an_refcnt); 1461 1462 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1463 anonvmstats.dupfillholes[3]); 1464 } 1465 (void) anon_set_ptr(new, new_idx + off + i, ap, 1466 ANON_SLEEP); 1467 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1468 mutex_enter(ahm); 1469 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1470 ASSERT(i == 0 || ahmpages == NULL || 1471 refcnt == ap->an_refcnt); 1472 ap->an_refcnt++; 1473 mutex_exit(ahm); 1474 } 1475 if (ahmpages != NULL) { 1476 mutex_exit(ahmpages); 1477 ahmpages = NULL; 1478 } 1479 off += pgcnt; 1480 new_idx += off; 1481 old_idx += off; 1482 npages -= pgcnt; 1483 } 1484 } 1485 1486 /* 1487 * Used when a segment with a vnode changes szc. similarly to 1488 * anon_dup_fill_holes() makes sure each large page region either has no anon 1489 * slots or all of them. but new slots are created by COWing the file 1490 * pages. on entrance no anon slots should be shared. 1491 */ 1492 int 1493 anon_fill_cow_holes( 1494 struct seg *seg, 1495 caddr_t addr, 1496 struct anon_hdr *ahp, 1497 ulong_t an_idx, 1498 struct vnode *vp, 1499 u_offset_t vp_off, 1500 size_t size, 1501 uint_t szc, 1502 uint_t prot, 1503 struct vpage vpage[], 1504 struct cred *cred) 1505 { 1506 struct anon *ap; 1507 spgcnt_t npages; 1508 pgcnt_t pgcnt, i; 1509 ulong_t index, off; 1510 int err = 0; 1511 int pageflags = 0; 1512 1513 ASSERT(szc != 0); 1514 pgcnt = page_get_pagecnt(szc); 1515 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1516 npages = btopr(size); 1517 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1518 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1519 1520 while (npages > 0) { 1521 index = an_idx; 1522 1523 /* 1524 * Find the next valid slot. 
1525 */ 1526 if (anon_get_next_ptr(ahp, &index) == NULL) { 1527 break; 1528 } 1529 1530 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1531 /* 1532 * Now backup index to the beginning of the 1533 * current large page region of the anon array. 1534 */ 1535 index = P2ALIGN(index, pgcnt); 1536 off = index - an_idx; 1537 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1538 npages -= off; 1539 if (npages <= 0) 1540 break; 1541 an_idx += off; 1542 vp_off += ptob(off); 1543 addr += ptob(off); 1544 if (vpage != NULL) { 1545 vpage += off; 1546 } 1547 1548 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1549 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1550 page_t *pl[1 + 1]; 1551 page_t *pp; 1552 1553 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1554 pl, PAGESIZE, seg, addr, S_READ, cred, 1555 NULL); 1556 if (err) { 1557 break; 1558 } 1559 if (vpage != NULL) { 1560 prot = VPP_PROT(vpage); 1561 pageflags = VPP_ISPPLOCK(vpage) ? 1562 LOCK_PAGE : 0; 1563 } 1564 pp = anon_private(&ap, seg, addr, prot, pl[0], 1565 pageflags, cred); 1566 if (pp == NULL) { 1567 err = ENOMEM; 1568 break; 1569 } 1570 (void) anon_set_ptr(ahp, an_idx, ap, 1571 ANON_SLEEP); 1572 page_unlock(pp); 1573 } 1574 ASSERT(ap->an_refcnt == 1); 1575 addr += PAGESIZE; 1576 if (vpage != NULL) { 1577 vpage++; 1578 } 1579 } 1580 npages -= pgcnt; 1581 } 1582 1583 return (err); 1584 } 1585 1586 /* 1587 * Free a group of "size" anon pages, size in bytes, 1588 * and clear out the pointers to the anon entries. 
1589 */ 1590 void 1591 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1592 { 1593 spgcnt_t npages; 1594 struct anon *ap; 1595 ulong_t old; 1596 1597 npages = btopr(size); 1598 1599 while (npages > 0) { 1600 old = index; 1601 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1602 break; 1603 1604 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1605 npages -= index - old; 1606 if (npages <= 0) 1607 break; 1608 1609 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1610 anon_decref(ap); 1611 /* 1612 * Bump index and decrement page count 1613 */ 1614 index++; 1615 npages--; 1616 } 1617 } 1618 1619 void 1620 anon_free_pages( 1621 struct anon_hdr *ahp, 1622 ulong_t an_idx, 1623 size_t size, 1624 uint_t szc) 1625 { 1626 spgcnt_t npages; 1627 pgcnt_t pgcnt; 1628 ulong_t index, off; 1629 1630 ASSERT(szc != 0); 1631 pgcnt = page_get_pagecnt(szc); 1632 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1633 npages = btopr(size); 1634 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1635 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1636 ASSERT(an_idx < ahp->size); 1637 1638 VM_STAT_ADD(anonvmstats.freepages[0]); 1639 1640 while (npages > 0) { 1641 index = an_idx; 1642 1643 /* 1644 * Find the next valid slot. 1645 */ 1646 if (anon_get_next_ptr(ahp, &index) == NULL) 1647 break; 1648 1649 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1650 /* 1651 * Now backup index to the beginning of the 1652 * current large page region of the old array. 
1653 */ 1654 index = P2ALIGN(index, pgcnt); 1655 off = index - an_idx; 1656 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1657 npages -= off; 1658 if (npages <= 0) 1659 break; 1660 1661 anon_decref_pages(ahp, index, szc); 1662 1663 off += pgcnt; 1664 an_idx += off; 1665 npages -= pgcnt; 1666 } 1667 } 1668 1669 /* 1670 * Make anonymous pages discardable 1671 */ 1672 int 1673 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, 1674 uint_t behav, pgcnt_t *purged) 1675 { 1676 spgcnt_t npages = btopr(size); 1677 struct anon *ap; 1678 struct vnode *vp; 1679 anoff_t off; 1680 page_t *pp, *root_pp; 1681 kmutex_t *ahm; 1682 pgcnt_t pgcnt, npurged = 0; 1683 ulong_t old_idx, idx, i; 1684 struct anon_hdr *ahp = amp->ahp; 1685 anon_sync_obj_t cookie; 1686 int err = 0; 1687 1688 VERIFY(behav == MADV_FREE || behav == MADV_PURGE); 1689 ASSERT(RW_READ_HELD(&->a_rwlock)); 1690 pgcnt = 1; 1691 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1692 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1693 1694 /* 1695 * get anon pointer and index for the first valid entry 1696 * in the anon list, starting from "index" 1697 */ 1698 old_idx = index; 1699 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1700 break; 1701 1702 /* 1703 * decrement npages by number of NULL anon slots we skipped 1704 */ 1705 npages -= index - old_idx; 1706 if (npages <= 0) 1707 break; 1708 1709 anon_array_enter(amp, index, &cookie); 1710 ap = anon_get_ptr(ahp, index); 1711 ASSERT(ap != NULL); 1712 1713 /* 1714 * Get anonymous page and try to lock it SE_EXCL; 1715 * if we couldn't grab the lock we skip to next page. 1716 */ 1717 swap_xlate(ap, &vp, &off); 1718 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1719 if (pp == NULL) { 1720 segadvstat.MADV_FREE_miss.value.ul++; 1721 pgcnt = 1; 1722 anon_array_exit(&cookie); 1723 continue; 1724 } 1725 pgcnt = page_get_pagecnt(pp->p_szc); 1726 1727 /* 1728 * we cannot free a page which is permanently locked. 
1729 * The page_struct_lock need not be acquired to examine 1730 * these fields since the page has an "exclusive" lock. 1731 */ 1732 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1733 page_unlock(pp); 1734 segadvstat.MADV_FREE_miss.value.ul++; 1735 anon_array_exit(&cookie); 1736 err = EBUSY; 1737 continue; 1738 } 1739 1740 ahm = AH_MUTEX(vp, off); 1741 mutex_enter(ahm); 1742 ASSERT(ap->an_refcnt != 0); 1743 /* 1744 * skip this one if copy-on-write is not yet broken. 1745 */ 1746 if (ap->an_refcnt > 1) { 1747 mutex_exit(ahm); 1748 page_unlock(pp); 1749 segadvstat.MADV_FREE_miss.value.ul++; 1750 anon_array_exit(&cookie); 1751 continue; 1752 } 1753 1754 if (behav == MADV_PURGE && pp->p_szc != 0) { 1755 /* 1756 * If we're purging and we have a large page, simplify 1757 * things a bit by demoting ourselves into the base 1758 * page case. 1759 */ 1760 (void) page_try_demote_pages(pp); 1761 } 1762 1763 if (pp->p_szc == 0) { 1764 pgcnt = 1; 1765 1766 /* 1767 * free swap slot; 1768 */ 1769 if (ap->an_pvp) { 1770 swap_phys_free(ap->an_pvp, ap->an_poff, 1771 PAGESIZE); 1772 ap->an_pvp = NULL; 1773 ap->an_poff = 0; 1774 } 1775 1776 if (behav == MADV_PURGE) { 1777 /* 1778 * If we're purging (instead of merely freeing), 1779 * rip out this anon structure entirely to 1780 * assure that any subsequent fault pulls from 1781 * the backing vnode (if any). 1782 */ 1783 if (--ap->an_refcnt == 0) 1784 anon_rmhash(ap); 1785 1786 mutex_exit(ahm); 1787 (void) anon_set_ptr(ahp, index, 1788 NULL, ANON_SLEEP); 1789 npurged++; 1790 ANI_ADD(1); 1791 kmem_cache_free(anon_cache, ap); 1792 } else { 1793 mutex_exit(ahm); 1794 } 1795 1796 segadvstat.MADV_FREE_hit.value.ul++; 1797 1798 /* 1799 * while we are at it, unload all the translations 1800 * and attempt to free the page. 1801 */ 1802 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1803 /*LINTED: constant in conditional context */ 1804 VN_DISPOSE(pp, 1805 behav == MADV_FREE ? 
B_FREE : B_INVAL, 0, kcred); 1806 1807 anon_array_exit(&cookie); 1808 continue; 1809 } 1810 1811 pgcnt = page_get_pagecnt(pp->p_szc); 1812 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1813 if (!page_try_demote_pages(pp)) { 1814 mutex_exit(ahm); 1815 page_unlock(pp); 1816 segadvstat.MADV_FREE_miss.value.ul++; 1817 anon_array_exit(&cookie); 1818 err = EBUSY; 1819 continue; 1820 } else { 1821 pgcnt = 1; 1822 if (ap->an_pvp) { 1823 swap_phys_free(ap->an_pvp, 1824 ap->an_poff, PAGESIZE); 1825 ap->an_pvp = NULL; 1826 ap->an_poff = 0; 1827 } 1828 mutex_exit(ahm); 1829 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1830 /*LINTED*/ 1831 VN_DISPOSE(pp, B_FREE, 0, kcred); 1832 segadvstat.MADV_FREE_hit.value.ul++; 1833 anon_array_exit(&cookie); 1834 continue; 1835 } 1836 } 1837 mutex_exit(ahm); 1838 root_pp = pp; 1839 1840 /* 1841 * try to lock remaining pages 1842 */ 1843 for (idx = 1; idx < pgcnt; idx++) { 1844 pp++; 1845 if (!page_trylock(pp, SE_EXCL)) 1846 break; 1847 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1848 page_unlock(pp); 1849 break; 1850 } 1851 } 1852 1853 if (idx == pgcnt) { 1854 for (i = 0; i < pgcnt; i++) { 1855 ap = anon_get_ptr(ahp, index + i); 1856 if (ap == NULL) 1857 break; 1858 swap_xlate(ap, &vp, &off); 1859 ahm = AH_MUTEX(vp, off); 1860 mutex_enter(ahm); 1861 ASSERT(ap->an_refcnt != 0); 1862 1863 /* 1864 * skip this one if copy-on-write 1865 * is not yet broken. 
1866 */ 1867 if (ap->an_refcnt > 1) { 1868 mutex_exit(ahm); 1869 goto skiplp; 1870 } 1871 if (ap->an_pvp) { 1872 swap_phys_free(ap->an_pvp, 1873 ap->an_poff, PAGESIZE); 1874 ap->an_pvp = NULL; 1875 ap->an_poff = 0; 1876 } 1877 mutex_exit(ahm); 1878 } 1879 page_destroy_pages(root_pp); 1880 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1881 anon_array_exit(&cookie); 1882 continue; 1883 } 1884 skiplp: 1885 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1886 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1887 page_unlock(pp); 1888 anon_array_exit(&cookie); 1889 } 1890 1891 if (purged != NULL) 1892 *purged = npurged; 1893 1894 return (err); 1895 } 1896 1897 /* 1898 * Return the kept page(s) and protections back to the segment driver. 1899 */ 1900 int 1901 anon_getpage( 1902 struct anon **app, 1903 uint_t *protp, 1904 page_t *pl[], 1905 size_t plsz, 1906 struct seg *seg, 1907 caddr_t addr, 1908 enum seg_rw rw, 1909 struct cred *cred) 1910 { 1911 page_t *pp; 1912 struct anon *ap = *app; 1913 struct vnode *vp; 1914 anoff_t off; 1915 int err; 1916 kmutex_t *ahm; 1917 1918 swap_xlate(ap, &vp, &off); 1919 1920 /* 1921 * Lookup the page. If page is being paged in, 1922 * wait for it to finish as we must return a list of 1923 * pages since this routine acts like the VOP_GETPAGE 1924 * routine does. 1925 */ 1926 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1927 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1928 mutex_enter(ahm); 1929 if (ap->an_refcnt == 1) 1930 *protp = PROT_ALL; 1931 else 1932 *protp = PROT_ALL & ~PROT_WRITE; 1933 mutex_exit(ahm); 1934 pl[0] = pp; 1935 pl[1] = NULL; 1936 return (0); 1937 } 1938 1939 /* 1940 * Simply treat it as a vnode fault on the anon vp. 
1941 */ 1942 1943 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1944 "anon_getpage:seg %x addr %x vp %x", 1945 seg, addr, vp); 1946 1947 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1948 seg, addr, rw, cred, NULL); 1949 1950 if (err == 0 && pl != NULL) { 1951 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1952 mutex_enter(ahm); 1953 if (ap->an_refcnt != 1) 1954 *protp &= ~PROT_WRITE; /* make read-only */ 1955 mutex_exit(ahm); 1956 } 1957 return (err); 1958 } 1959 1960 /* 1961 * Creates or returns kept pages to the segment driver. returns -1 if a large 1962 * page cannot be allocated. returns -2 if some other process has allocated a 1963 * larger page. 1964 * 1965 * For cowfault it will allocate any size pages to fill the requested area to 1966 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1967 * slots within a large page with other processes). This policy greatly 1968 * simplifies large page freeing (which is only freed when all anon slot 1969 * refcnts are 0). 
*/
/*
 * Return values, as implemented below:
 *	0	the page(s) are returned in ppa[]
 *	> 0	an errno from anon_getpage()/swap_getconpage(), or ENOMEM
 *		when anon_zero() fails
 *	-1	a large page could not be allocated (or relocation failed)
 *		and no COW copy of the region is required
 *	-2	a larger page already exists while szc < seg->s_szc; the
 *		suggested page size code is stored in *ppa_szc
 * On a COW fault (brkcow set and the pages are not writable) the result of
 * anon_map_privatepages() is returned instead.
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct anon	*ap;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pp, *pl[2], *conpp = NULL;
	caddr_t		vaddr;
	ulong_t		pg_idx, an_idx, i;
	spgcnt_t	nreloc = 0;
	int		prealloc = 1;
	int		err, slotcreate;
	uint_t		vpprot;
	/* Set when the caller asked for less than the segment's page size. */
	int		upsize = (szc < seg->s_szc);

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/* Base page request: a single slot, no preallocation involved. */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0 && upsize) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0, pgflags) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 || szc < seg->s_szc ||
			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
		docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 *
			 * Also entered via "goto docow" from the io_err path
			 * at the bottom of this function.
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/* Undo the locks taken so far. */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getconpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page found is smaller than requested, or larger while
		 * the caller could use a larger one: report -2 for the
		 * larger case, otherwise start over with preallocation.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * Constituent pages must be physically contiguous, of equal
		 * size, and the first one aligned to the large page size.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		/* Any shared slot makes the whole region read-only. */
		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, pgflags, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* Nothing was used yet: return the whole large page. */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || szc < seg->s_szc ||
	    !anon_szcshare(amp->ahp, start_idx)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
 */
/*
 * Parameters, as used by the code below:
 *	app	 - in/out anon slot pointer.  A newly allocated anon is stored
 *		   through it; on failure the original value is put back.
 *	seg/addr - passed to VOP_GETPAGE; addr is also the address whose old
 *		   translation is unloaded.
 *	prot	 - only PROT_WRITE is examined, and only when LOCK_PAGE is set.
 *	opp	 - the original page; asserted SE_EXCL when STEAL_PAGE is set,
 *		   otherwise asserted locked.
 *	oppflags - STEAL_PAGE renames opp to the new anon identity instead of
 *		   copying it; LOCK_PAGE moves opp's lock claim to the new page.
 *	cred	 - passed through to VOP_GETPAGE.
 *
 * Returns the private page after page_downgrade(), or NULL when the page
 * could not be created or availrmem could not be claimed; in the NULL case
 * *app is restored and opp is unlocked.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	/* Install the new anon in the caller's slot and find its identity. */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		/* No copy needed: give the original page the new identity. */
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/*
	 * Failure: put the caller's slot back, drop the new page (if one
	 * was created) and the new anon, and unlock the original page.
	 */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Large-page counterpart of anon_private(): give the pgcnt anon slots
 * starting at start_idx private copies of the pages in ppa[].
 *
 * anypgsz == -1 means "do not preallocate a large page"; otherwise
 * page_alloc_pages() is tried and, if it fails, the copy falls back to
 * whatever pages swap_getconpage() creates.  vpage, when not NULL, is
 * consulted for per-page lock state via VPP_ISPPLOCK().
 *
 * Return values produced below:
 *	0	- ppa[] now holds the new pages, or the first slot had a single
 *		  reference and ppa[0] is already of size szc (ppa[] untouched).
 *	-1	- the first slot had a single reference but ppa[0] is smaller
 *		  than szc; every page in ppa[] has been unlocked.
 *	ENOMEM	- availrmem could not be reduced by pgcnt for locked,
 *		  read-only pages; every non-NULL page in ppa[] was unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * Slot is not shared, so no copy is made: give back
			 * any preallocated page and report via 0 / -1.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then its better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/* Undo everything acquired so far and fail. */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		/*
		 * DEBUG only: remember the first slot's refcnt and check
		 * that every later slot carries the same count.
		 */
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/* This page was not locked: return its up-front claim. */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and down grade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/* Preallocated pages are downgraded together after the loop. */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 *
 * On success the new anon is stored in *app and the zeroed page is
 * returned after page_downgrade().  If VOP_GETPAGE fails, *app is set
 * to NULL, the anon is released and NULL is returned.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}


/*
 * Allocate array of private zero-filled anon pages for empty slots
 * and kept pages for non empty slots within given range.
 *
 * NOTE: This routine will try and use large pages
 *	if available and supported by underlying platform.
2842 */ 2843 int 2844 anon_map_createpages( 2845 struct anon_map *amp, 2846 ulong_t start_index, 2847 size_t len, 2848 page_t *ppa[], 2849 struct seg *seg, 2850 caddr_t addr, 2851 enum seg_rw rw, 2852 struct cred *cred) 2853 { 2854 2855 struct anon *ap; 2856 struct vnode *ap_vp; 2857 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2858 int err = 0; 2859 ulong_t p_index, index; 2860 pgcnt_t npgs, pg_cnt; 2861 spgcnt_t nreloc = 0; 2862 uint_t l_szc, szc, prot; 2863 anoff_t ap_off; 2864 size_t pgsz; 2865 lgrp_t *lgrp; 2866 kmutex_t *ahm; 2867 2868 /* 2869 * XXX For now only handle S_CREATE. 2870 */ 2871 ASSERT(rw == S_CREATE); 2872 2873 index = start_index; 2874 p_index = 0; 2875 npgs = btopr(len); 2876 2877 /* 2878 * If this platform supports multiple page sizes 2879 * then try and allocate directly from the free 2880 * list for pages larger than PAGESIZE. 2881 * 2882 * NOTE:When we have page_create_ru we can stop 2883 * directly allocating from the freelist. 2884 */ 2885 l_szc = seg->s_szc; 2886 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2887 while (npgs) { 2888 2889 /* 2890 * if anon slot already exists 2891 * (means page has been created) 2892 * so 1) look up the page 2893 * 2) if the page is still in memory, get it. 2894 * 3) if not, create a page and 2895 * page in from physical swap device. 2896 * These are done in anon_getpage(). 2897 */ 2898 ap = anon_get_ptr(amp->ahp, index); 2899 if (ap) { 2900 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2901 seg, addr, S_READ, cred); 2902 if (err) { 2903 ANON_LOCK_EXIT(&->a_rwlock); 2904 panic("anon_map_createpages: anon_getpage"); 2905 } 2906 pp = anon_pl[0]; 2907 ppa[p_index++] = pp; 2908 2909 /* 2910 * an_pvp can become non-NULL after SysV's page was 2911 * paged out before ISM was attached to this SysV 2912 * shared memory segment. So free swap slot if needed. 
2913 */ 2914 if (ap->an_pvp != NULL) { 2915 page_io_lock(pp); 2916 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 2917 mutex_enter(ahm); 2918 if (ap->an_pvp != NULL) { 2919 swap_phys_free(ap->an_pvp, 2920 ap->an_poff, PAGESIZE); 2921 ap->an_pvp = NULL; 2922 ap->an_poff = 0; 2923 mutex_exit(ahm); 2924 hat_setmod(pp); 2925 } else { 2926 mutex_exit(ahm); 2927 } 2928 page_io_unlock(pp); 2929 } 2930 2931 addr += PAGESIZE; 2932 index++; 2933 npgs--; 2934 continue; 2935 } 2936 /* 2937 * Now try and allocate the largest page possible 2938 * for the current address and range. 2939 * Keep dropping down in page size until: 2940 * 2941 * 1) Properly aligned 2942 * 2) Does not overlap existing anon pages 2943 * 3) Fits in remaining range. 2944 * 4) able to allocate one. 2945 * 2946 * NOTE: XXX When page_create_ru is completed this code 2947 * will change. 2948 */ 2949 szc = l_szc; 2950 pplist = NULL; 2951 pg_cnt = 0; 2952 while (szc) { 2953 pgsz = page_get_pagesize(szc); 2954 pg_cnt = pgsz >> PAGESHIFT; 2955 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2956 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2957 /* 2958 * XXX 2959 * Since we are faking page_create() 2960 * we also need to do the freemem and 2961 * pcf accounting. 2962 */ 2963 (void) page_create_wait(pg_cnt, PG_WAIT); 2964 2965 /* 2966 * Get lgroup to allocate next page of shared 2967 * memory from and use it to specify where to 2968 * allocate the physical memory 2969 */ 2970 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2971 2972 pplist = page_get_freelist( 2973 anon_vp, (u_offset_t)0, seg, 2974 addr, pgsz, 0, lgrp); 2975 2976 if (pplist == NULL) { 2977 page_create_putback(pg_cnt); 2978 } 2979 2980 /* 2981 * If a request for a page of size 2982 * larger than PAGESIZE failed 2983 * then don't try that size anymore. 
2984 */ 2985 if (pplist == NULL) { 2986 l_szc = szc - 1; 2987 } else { 2988 break; 2989 } 2990 } 2991 szc--; 2992 } 2993 2994 /* 2995 * If just using PAGESIZE pages then don't 2996 * directly allocate from the free list. 2997 */ 2998 if (pplist == NULL) { 2999 ASSERT(szc == 0); 3000 pp = anon_zero(seg, addr, &ap, cred); 3001 if (pp == NULL) { 3002 ANON_LOCK_EXIT(&->a_rwlock); 3003 panic("anon_map_createpages: anon_zero"); 3004 } 3005 ppa[p_index++] = pp; 3006 3007 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3008 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3009 3010 addr += PAGESIZE; 3011 index++; 3012 npgs--; 3013 continue; 3014 } 3015 3016 /* 3017 * pplist is a list of pg_cnt PAGESIZE pages. 3018 * These pages are locked SE_EXCL since they 3019 * came directly off the free list. 3020 */ 3021 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 3022 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 3023 ASSERT(conpp == NULL); 3024 while (pg_cnt--) { 3025 3026 ap = anon_alloc(NULL, 0); 3027 swap_xlate(ap, &ap_vp, &ap_off); 3028 3029 ASSERT(pplist != NULL); 3030 pp = pplist; 3031 page_sub(&pplist, pp); 3032 PP_CLRFREE(pp); 3033 PP_CLRAGED(pp); 3034 conpp = pp; 3035 3036 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 3037 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 3038 &nreloc, seg, addr, S_CREATE, cred); 3039 3040 if (err) { 3041 ANON_LOCK_EXIT(&->a_rwlock); 3042 panic("anon_map_createpages: S_CREATE"); 3043 } 3044 3045 ASSERT(anon_pl[0] == pp); 3046 ASSERT(nreloc == 1); 3047 pagezero(pp, 0, PAGESIZE); 3048 CPU_STATS_ADD_K(vm, zfod, 1); 3049 hat_setrefmod(pp); 3050 3051 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3052 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3053 3054 ppa[p_index++] = pp; 3055 3056 addr += PAGESIZE; 3057 index++; 3058 npgs--; 3059 } 3060 conpp = NULL; 3061 pg_cnt = pgsz >> PAGESHIFT; 3062 p_index = p_index - pg_cnt; 3063 while (pg_cnt--) { 3064 page_downgrade(ppa[p_index++]); 3065 } 3066 } 3067 ANON_LOCK_EXIT(&->a_rwlock); 3068 return (0); 3069 
} 3070 3071 static int 3072 anon_try_demote_pages( 3073 struct anon_hdr *ahp, 3074 ulong_t sidx, 3075 uint_t szc, 3076 page_t **ppa, 3077 int private) 3078 { 3079 struct anon *ap; 3080 pgcnt_t pgcnt = page_get_pagecnt(szc); 3081 page_t *pp; 3082 pgcnt_t i; 3083 kmutex_t *ahmpages = NULL; 3084 int root = 0; 3085 pgcnt_t npgs; 3086 pgcnt_t curnpgs = 0; 3087 size_t ppasize = 0; 3088 3089 ASSERT(szc != 0); 3090 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3091 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3092 ASSERT(sidx < ahp->size); 3093 3094 if (ppa == NULL) { 3095 ppasize = pgcnt * sizeof (page_t *); 3096 ppa = kmem_alloc(ppasize, KM_SLEEP); 3097 } 3098 3099 ap = anon_get_ptr(ahp, sidx); 3100 if (ap != NULL && private) { 3101 VM_STAT_ADD(anonvmstats.demotepages[1]); 3102 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 3103 mutex_enter(ahmpages); 3104 } 3105 3106 if (ap != NULL && ap->an_refcnt > 1) { 3107 if (ahmpages != NULL) { 3108 VM_STAT_ADD(anonvmstats.demotepages[2]); 3109 mutex_exit(ahmpages); 3110 } 3111 if (ppasize != 0) { 3112 kmem_free(ppa, ppasize); 3113 } 3114 return (0); 3115 } 3116 if (ahmpages != NULL) { 3117 mutex_exit(ahmpages); 3118 } 3119 if (ahp->size - sidx < pgcnt) { 3120 ASSERT(private == 0); 3121 pgcnt = ahp->size - sidx; 3122 } 3123 for (i = 0; i < pgcnt; i++, sidx++) { 3124 ap = anon_get_ptr(ahp, sidx); 3125 if (ap != NULL) { 3126 if (ap->an_refcnt != 1) { 3127 panic("anon_try_demote_pages: an_refcnt != 1"); 3128 } 3129 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3130 SE_EXCL); 3131 if (pp != NULL) { 3132 (void) hat_pageunload(pp, 3133 HAT_FORCE_PGUNLOAD); 3134 } 3135 } else { 3136 ppa[i] = NULL; 3137 } 3138 } 3139 for (i = 0; i < pgcnt; i++) { 3140 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3141 ASSERT(pp->p_szc <= szc); 3142 if (!root) { 3143 VM_STAT_ADD(anonvmstats.demotepages[3]); 3144 if (curnpgs != 0) 3145 panic("anon_try_demote_pages: " 3146 "bad large page"); 3147 3148 root = 1; 3149 curnpgs = npgs = 3150 page_get_pagecnt(pp->p_szc); 3151 3152 
ASSERT(npgs <= pgcnt); 3153 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3154 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3155 } else { 3156 ASSERT(i > 0); 3157 ASSERT(page_pptonum(pp) - 1 == 3158 page_pptonum(ppa[i - 1])); 3159 if ((page_pptonum(pp) & (npgs - 1)) == 3160 npgs - 1) 3161 root = 0; 3162 } 3163 ASSERT(PAGE_EXCL(pp)); 3164 pp->p_szc = 0; 3165 ASSERT(curnpgs > 0); 3166 curnpgs--; 3167 } 3168 } 3169 if (root != 0 || curnpgs != 0) 3170 panic("anon_try_demote_pages: bad large page"); 3171 3172 for (i = 0; i < pgcnt; i++) { 3173 if ((pp = ppa[i]) != NULL) { 3174 ASSERT(!hat_page_is_mapped(pp)); 3175 ASSERT(pp->p_szc == 0); 3176 page_unlock(pp); 3177 } 3178 } 3179 if (ppasize != 0) { 3180 kmem_free(ppa, ppasize); 3181 } 3182 return (1); 3183 } 3184 3185 /* 3186 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3187 */ 3188 int 3189 anon_map_demotepages( 3190 struct anon_map *amp, 3191 ulong_t start_idx, 3192 struct seg *seg, 3193 caddr_t addr, 3194 uint_t prot, 3195 struct vpage vpage[], 3196 struct cred *cred) 3197 { 3198 struct anon *ap; 3199 uint_t szc = seg->s_szc; 3200 pgcnt_t pgcnt = page_get_pagecnt(szc); 3201 size_t ppasize = pgcnt * sizeof (page_t *); 3202 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3203 page_t *pp; 3204 page_t *pl[2]; 3205 pgcnt_t i, pg_idx; 3206 ulong_t an_idx; 3207 caddr_t vaddr; 3208 int err; 3209 int retry = 0; 3210 uint_t vpprot; 3211 3212 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3213 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3214 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3215 ASSERT(ppa != NULL); 3216 ASSERT(szc != 0); 3217 ASSERT(szc == amp->a_szc); 3218 3219 VM_STAT_ADD(anonvmstats.demotepages[0]); 3220 3221 top: 3222 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3223 kmem_free(ppa, ppasize); 3224 return (0); 3225 } 3226 3227 VM_STAT_ADD(anonvmstats.demotepages[4]); 3228 3229 ASSERT(retry == 0); /* we can be here only once */ 3230 3231 vaddr = addr; 3232 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3233 
pg_idx++, an_idx++, vaddr += PAGESIZE) { 3234 ap = anon_get_ptr(amp->ahp, an_idx); 3235 if (ap == NULL) 3236 panic("anon_map_demotepages: no anon slot"); 3237 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3238 S_READ, cred); 3239 if (err) { 3240 for (i = 0; i < pg_idx; i++) { 3241 if ((pp = ppa[i]) != NULL) 3242 page_unlock(pp); 3243 } 3244 kmem_free(ppa, ppasize); 3245 return (err); 3246 } 3247 ppa[pg_idx] = pl[0]; 3248 } 3249 3250 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3251 vpage, -1, 0, cred); 3252 if (err > 0) { 3253 VM_STAT_ADD(anonvmstats.demotepages[5]); 3254 kmem_free(ppa, ppasize); 3255 return (err); 3256 } 3257 ASSERT(err == 0 || err == -1); 3258 if (err == -1) { 3259 VM_STAT_ADD(anonvmstats.demotepages[6]); 3260 retry = 1; 3261 goto top; 3262 } 3263 for (i = 0; i < pgcnt; i++) { 3264 ASSERT(ppa[i] != NULL); 3265 if (ppa[i]->p_szc != 0) 3266 retry = 1; 3267 page_unlock(ppa[i]); 3268 } 3269 if (retry) { 3270 VM_STAT_ADD(anonvmstats.demotepages[7]); 3271 goto top; 3272 } 3273 3274 VM_STAT_ADD(anonvmstats.demotepages[8]); 3275 3276 kmem_free(ppa, ppasize); 3277 3278 return (0); 3279 } 3280 3281 /* 3282 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3283 * structures with private anon maps. Therefore all anon structures should 3284 * have at most one reference at this point. This means underlying pages can 3285 * be exclusively locked and demoted or freed. If not freeing the entire 3286 * large pages demote the ends of the region we free to be able to free 3287 * subpages. Page roots correspond to aligned index positions in anon map. 
3288 */ 3289 void 3290 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3291 { 3292 ulong_t eidx = sidx + btopr(len); 3293 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3294 struct anon_hdr *ahp = amp->ahp; 3295 ulong_t tidx; 3296 size_t size; 3297 ulong_t sidx_aligned; 3298 ulong_t eidx_aligned; 3299 3300 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3301 ASSERT(amp->refcnt <= 1); 3302 ASSERT(amp->a_szc > 0); 3303 ASSERT(eidx <= ahp->size); 3304 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3305 3306 if (len == 0) { /* XXX */ 3307 return; 3308 } 3309 3310 sidx_aligned = P2ALIGN(sidx, pages); 3311 if (sidx_aligned != sidx || 3312 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3313 if (!anon_try_demote_pages(ahp, sidx_aligned, 3314 amp->a_szc, NULL, 0)) { 3315 panic("anon_shmap_free_pages: demote failed"); 3316 } 3317 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3318 P2NPHASE(sidx, pages); 3319 size <<= PAGESHIFT; 3320 anon_free(ahp, sidx, size); 3321 sidx = sidx_aligned + pages; 3322 if (eidx <= sidx) { 3323 return; 3324 } 3325 } 3326 eidx_aligned = P2ALIGN(eidx, pages); 3327 if (sidx < eidx_aligned) { 3328 anon_free_pages(ahp, sidx, 3329 (eidx_aligned - sidx) << PAGESHIFT, 3330 amp->a_szc); 3331 sidx = eidx_aligned; 3332 } 3333 ASSERT(sidx == eidx_aligned); 3334 if (eidx == eidx_aligned) { 3335 return; 3336 } 3337 tidx = eidx; 3338 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3339 tidx - sidx < pages) { 3340 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3341 panic("anon_shmap_free_pages: demote failed"); 3342 } 3343 size = (eidx - sidx) << PAGESHIFT; 3344 anon_free(ahp, sidx, size); 3345 } else { 3346 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3347 } 3348 } 3349 3350 /* 3351 * This routine should be called with amp's writer lock when there're no other 3352 * users of amp. All pcache entries of this amp must have been already 3353 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3354 * attaching to this amp. 3355 */ 3356 void 3357 anonmap_purge(struct anon_map *amp) 3358 { 3359 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3360 ASSERT(amp->refcnt <= 1); 3361 3362 if (amp->a_softlockcnt != 0) { 3363 seg_ppurge(NULL, amp, 0); 3364 } 3365 3366 /* 3367 * Since all pcache entries were already inactive before this routine 3368 * was called seg_ppurge() couldn't return while there're still 3369 * entries that can be found via the list anchored at a_phead. So we 3370 * can assert this list is empty now. a_softlockcnt may be still non 0 3371 * if asynchronous thread that manages pcache already removed pcache 3372 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3373 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3374 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3375 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3376 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3377 * barrier that prevents anonmap_purge() to complete while 3378 * shamp_reclaim() may still be referencing this amp. 3379 */ 3380 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3381 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3382 3383 mutex_enter(&->a_purgemtx); 3384 while (amp->a_softlockcnt != 0) { 3385 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3386 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3387 amp->a_purgewait = 1; 3388 cv_wait(&->a_purgecv, &->a_purgemtx); 3389 } 3390 mutex_exit(&->a_purgemtx); 3391 3392 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3393 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3394 ASSERT(amp->a_softlockcnt == 0); 3395 } 3396 3397 /* 3398 * Allocate and initialize an anon_map structure for seg 3399 * associating the given swap reservation with the new anon_map. 
3400 */ 3401 struct anon_map * 3402 anonmap_alloc(size_t size, size_t swresv, int flags) 3403 { 3404 struct anon_map *amp; 3405 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3406 3407 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3408 if (amp == NULL) { 3409 ASSERT(kmflags == KM_NOSLEEP); 3410 return (NULL); 3411 } 3412 3413 amp->ahp = anon_create(btopr(size), flags); 3414 if (amp->ahp == NULL) { 3415 ASSERT(flags == ANON_NOSLEEP); 3416 kmem_cache_free(anonmap_cache, amp); 3417 return (NULL); 3418 } 3419 amp->refcnt = 1; 3420 amp->size = size; 3421 amp->swresv = swresv; 3422 amp->locality = 0; 3423 amp->a_szc = 0; 3424 amp->a_sp = NULL; 3425 amp->a_softlockcnt = 0; 3426 amp->a_purgewait = 0; 3427 amp->a_phead.p_lnext = &->a_phead; 3428 amp->a_phead.p_lprev = &->a_phead; 3429 3430 return (amp); 3431 } 3432 3433 void 3434 anonmap_free(struct anon_map *amp) 3435 { 3436 ASSERT(amp->ahp != NULL); 3437 ASSERT(amp->refcnt == 0); 3438 ASSERT(amp->a_softlockcnt == 0); 3439 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3440 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3441 3442 lgrp_shm_policy_fini(amp, NULL); 3443 anon_release(amp->ahp, btopr(amp->size)); 3444 kmem_cache_free(anonmap_cache, amp); 3445 } 3446 3447 /* 3448 * Returns true if the app array has some empty slots. 3449 * The offp and lenp parameters are in/out parameters. On entry 3450 * these values represent the starting offset and length of the 3451 * mapping. When true is returned, these values may be modified 3452 * to be the largest range which includes empty slots. 
3453 */ 3454 int 3455 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3456 size_t *lenp) 3457 { 3458 ulong_t i, el; 3459 ssize_t low, high; 3460 struct anon *ap; 3461 3462 low = -1; 3463 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3464 ap = anon_get_ptr(ahp, anon_idx); 3465 if (ap == NULL) { 3466 if (low == -1) 3467 low = i; 3468 high = i; 3469 } 3470 } 3471 if (low != -1) { 3472 /* 3473 * Found at least one non-anon page. 3474 * Set up the off and len return values. 3475 */ 3476 if (low != 0) 3477 *offp += low; 3478 *lenp = high - low + PAGESIZE; 3479 return (1); 3480 } 3481 return (0); 3482 } 3483 3484 /* 3485 * Return a count of the number of existing anon pages in the anon array 3486 * app in the range (off, off+len). The array and slots must be guaranteed 3487 * stable by the caller. 3488 */ 3489 pgcnt_t 3490 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3491 { 3492 pgcnt_t cnt = 0; 3493 3494 while (nslots-- > 0) { 3495 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3496 cnt++; 3497 anon_index++; 3498 } 3499 return (cnt); 3500 } 3501 3502 /* 3503 * Move reserved phys swap into memory swap (unreserve phys swap 3504 * and reserve mem swap by the same amount). 
3505 * Used by segspt when it needs to lock reserved swap npages in memory 3506 */ 3507 int 3508 anon_swap_adjust(pgcnt_t npages) 3509 { 3510 pgcnt_t unlocked_mem_swap; 3511 3512 mutex_enter(&anoninfo_lock); 3513 3514 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3515 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3516 3517 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3518 - k_anoninfo.ani_locked_swap; 3519 if (npages > unlocked_mem_swap) { 3520 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3521 3522 /* 3523 * if there is not enough unlocked mem swap we take missing 3524 * amount from phys swap and give it to mem swap 3525 */ 3526 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3527 mutex_exit(&anoninfo_lock); 3528 return (ENOMEM); 3529 } 3530 3531 k_anoninfo.ani_mem_resv += adjusted_swap; 3532 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3533 k_anoninfo.ani_phys_resv -= adjusted_swap; 3534 3535 ANI_ADD(adjusted_swap); 3536 } 3537 k_anoninfo.ani_locked_swap += npages; 3538 3539 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3540 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3541 3542 mutex_exit(&anoninfo_lock); 3543 3544 return (0); 3545 } 3546 3547 /* 3548 * 'unlocked' reserved mem swap so when it is unreserved it 3549 * can be moved back phys (disk) swap 3550 */ 3551 void 3552 anon_swap_restore(pgcnt_t npages) 3553 { 3554 mutex_enter(&anoninfo_lock); 3555 3556 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3557 3558 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3559 k_anoninfo.ani_locked_swap -= npages; 3560 3561 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3562 3563 mutex_exit(&anoninfo_lock); 3564 } 3565 3566 /* 3567 * Return the pointer from the list for a 3568 * specified anon index. 
3569 */ 3570 ulong_t * 3571 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3572 { 3573 struct anon **app; 3574 void **ppp; 3575 3576 ASSERT(an_idx < ahp->size); 3577 3578 /* 3579 * Single level case. 3580 */ 3581 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3582 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3583 } else { 3584 3585 /* 3586 * 2 level case. 3587 */ 3588 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3589 if (*ppp == NULL) { 3590 mutex_enter(&ahp->serial_lock); 3591 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3592 if (*ppp == NULL) 3593 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3594 mutex_exit(&ahp->serial_lock); 3595 } 3596 app = *ppp; 3597 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3598 } 3599 } 3600 3601 void 3602 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3603 { 3604 ulong_t *ap_slot; 3605 kmutex_t *mtx; 3606 kcondvar_t *cv; 3607 int hash; 3608 3609 /* 3610 * Use szc to determine anon slot(s) to appear atomic. 3611 * If szc = 0, then lock the anon slot and mark it busy. 3612 * If szc > 0, then lock the range of slots by getting the 3613 * anon_array_lock for the first anon slot, and mark only the 3614 * first anon slot busy to represent whole range being busy. 3615 */ 3616 3617 ASSERT(RW_READ_HELD(&->a_rwlock)); 3618 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3619 hash = ANON_ARRAY_HASH(amp, an_idx); 3620 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3621 sobj->sync_cv = cv = &anon_array_cv[hash]; 3622 mutex_enter(mtx); 3623 ap_slot = anon_get_slot(amp->ahp, an_idx); 3624 while (ANON_ISBUSY(ap_slot)) 3625 cv_wait(cv, mtx); 3626 ANON_SETBUSY(ap_slot); 3627 sobj->sync_data = ap_slot; 3628 mutex_exit(mtx); 3629 } 3630 3631 int 3632 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3633 anon_sync_obj_t *sobj) 3634 { 3635 ulong_t *ap_slot; 3636 kmutex_t *mtx; 3637 int hash; 3638 3639 /* 3640 * Try to lock a range of anon slots. 
3641 * Use szc to determine anon slot(s) to appear atomic. 3642 * If szc = 0, then lock the anon slot and mark it busy. 3643 * If szc > 0, then lock the range of slots by getting the 3644 * anon_array_lock for the first anon slot, and mark only the 3645 * first anon slot busy to represent whole range being busy. 3646 * Fail if the mutex or the anon_array are busy. 3647 */ 3648 3649 ASSERT(RW_READ_HELD(&->a_rwlock)); 3650 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3651 hash = ANON_ARRAY_HASH(amp, an_idx); 3652 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3653 sobj->sync_cv = &anon_array_cv[hash]; 3654 if (!mutex_tryenter(mtx)) { 3655 return (EWOULDBLOCK); 3656 } 3657 ap_slot = anon_get_slot(amp->ahp, an_idx); 3658 if (ANON_ISBUSY(ap_slot)) { 3659 mutex_exit(mtx); 3660 return (EWOULDBLOCK); 3661 } 3662 ANON_SETBUSY(ap_slot); 3663 sobj->sync_data = ap_slot; 3664 mutex_exit(mtx); 3665 return (0); 3666 } 3667 3668 void 3669 anon_array_exit(anon_sync_obj_t *sobj) 3670 { 3671 mutex_enter(sobj->sync_mutex); 3672 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3673 ANON_CLRBUSY(sobj->sync_data); 3674 if (CV_HAS_WAITERS(sobj->sync_cv)) 3675 cv_broadcast(sobj->sync_cv); 3676 mutex_exit(sobj->sync_mutex); 3677 } 3678