/*
 * KERN_KMALLOC.C - Kernel memory allocator
 *
 * Copyright (c) 2021 The DragonFly Project, All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the kmalloc_obj allocator.  This is a type-stable
 * allocator that uses the same base structures (e.g. malloc_type) plus
 * some extensions to efficiently implement single-type zones.
 *
 * All memory management is zone based.  When a zone is destroyed, all of
 * its memory is returned to the system with no fragmentation.
 *
 * A mini-slab allocator hangs directly off the zone structure (malloc_type).
 * Since the object zones are single-size-only, the slab allocator is very
 * simple and currently utilizes just two per-zone/per-cpu slabs (active and
 * alternate) before kicking up to the per-zone cache.  Beyond that we just
 * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
 * kernel_map mappings and unmappings.
 *
 * The advantage of this is that zones don't stomp over each other and cause
 * excessive fragmentation in the slabs.  For example, when you umount a
 * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
 * is returned to the system.
 */
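/*
 * Illustrative only (not part of the allocator): a minimal sketch of how a
 * subsystem might consume the fixed-size object API implemented in this
 * file.  Real callers are ordinarily reached through thin inline/macro
 * wrappers declared in sys/malloc.h rather than calling _kmalloc_obj() and
 * _kfree_obj() directly; the structure, helper functions, and 'zone'
 * argument below are hypothetical and shown only to make the fixed-size
 * contract concrete.
 */
#if 0
struct myobj {
	int	refs;
	char	name[32];
};

static struct myobj *
myobj_create(struct malloc_type *zone)	/* zone created for sizeof(struct myobj) objects */
{
	struct myobj *p;

	/*
	 * Every allocation from an object zone is the zone's fixed object
	 * size (asserted below in _kmalloc_obj()); M_WAITOK, M_NULLOK, and
	 * M_ZERO behave as with kmalloc().
	 */
	p = _kmalloc_obj(sizeof(*p), zone, M_WAITOK | M_ZERO);
	return p;
}

static void
myobj_destroy(struct malloc_type *zone, struct myobj *p)
{
	_kfree_obj(p, zone);
}
#endif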
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/spinlock.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/spinlock2.h>
#include <sys/thread2.h>
#include <sys/exislock2.h>
#include <vm/vm_page2.h>

#define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
#define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags

#if !defined(KTR_MEMORY)
#define KTR_MEMORY	KTR_ALL
#endif
KTR_INFO_MASTER(mem_obj);
KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
#if 0
KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
#endif
KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");

#define logmemory(name, ptr, type, size, flags)			\
	KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
#define logmemory_quick(name)					\
	KTR_LOG(mem_obj_ ## name)

__read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
__read_frequently static int kzone_bretire = 4;
SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
__read_frequently static int kzone_debug;
SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");

__read_frequently struct kmalloc_slab kslab_dummy;

static void malloc_slab_destroy(struct malloc_type *type,
			struct kmalloc_slab **slabp);
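/*
 * Illustrative only: the hand-off protocol used for the per-cpu 'free slab'
 * cache manipulated by gslab_cache()/gslab_alloc() below.  Any cpu may
 * donate a chain of slabs by atomically prepending it to the target cpu's
 * remote_free_slabs head with an fcmpset loop; only the owning cpu consumes,
 * and it does so by atomically swapping the entire list head to NULL, so no
 * two parties ever manipulate next pointers on the remote list at the same
 * time.  The field names mirror the real code; the helper functions
 * themselves are hypothetical.
 */
#if 0
static void
example_donate(globaldata_t rgd, struct kmalloc_slab *chain,
	       struct kmalloc_slab **chain_tailp)
{
	struct kmalloc_slab *head;

	head = rgd->gd_kmslab.remote_free_slabs;
	for (;;) {
		*chain_tailp = head;	/* chain tail -> current head */
		cpu_ccfence();
		if (atomic_fcmpset_ptr(&rgd->gd_kmslab.remote_free_slabs,
				       &head, chain)) {
			break;
		}
		/* head reloaded by the failed fcmpset, retry */
	}
}

static struct kmalloc_slab *
example_consume(globaldata_t gd)	/* current cpu only */
{
	return atomic_swap_ptr(
		(volatile void **)&gd->gd_kmslab.remote_free_slabs, NULL);
}
#endif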
/*
 * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
 * which we cannot cache will be returned.
 *
 * free_slabs		- Current structure may only be accessed by current cpu
 * remote_free_slabs	- Only atomic swap operations are allowed.
 * free_count		- Only atomic operations are allowed.
 *
 * If the count is sufficient to cache the entire list, NULL is returned.
 * Otherwise the portion that was not cached is returned.
 */
static __noinline
struct kmalloc_slab *
gslab_cache(struct kmalloc_slab *slab)
{
	struct kmalloc_slab *save;
	struct kmalloc_slab *next;
	struct kmalloc_slab *res;
	struct kmalloc_slab **resp;
	struct kmalloc_slab **slabp;
	globaldata_t rgd;
	size_t count;
	int cpuid;

	res = NULL;
	resp = &res;
	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

	/*
	 * Given the slab list, get the cpuid and clip off as many matching
	 * elements as fit in the cache.
	 */
	while (slab) {
		cpuid = slab->orig_cpuid;
		rgd = globaldata_find(cpuid);

		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		/*
		 * Doesn't fit in cache, put on return list.
		 */
		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
			*resp = slab;
			resp = &slab->next;
			slab = slab->next;
			continue;
		}

		/*
		 * Collect.  We aren't required to match-up the original cpu
		 * with the disposal cpu, but it's a good idea to retain
		 * memory locality.
		 *
		 * The slabs we collect are going into the global cache,
		 * remove the type association.
		 */
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		slabp = &slab->next;
		count = 1;
		slab->type = NULL;

		while ((next = *slabp) != NULL &&
		       next->orig_cpuid == cpuid &&
		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
		{
			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
			next->type = NULL;
			++count;
			slabp = &next->next;
		}

		/*
		 * Safety: unhook before next.  next is not included in the
		 * list (starting with slab) that is being prepended
		 * to remote_free_slabs.
		 */
		*slabp = NULL;

		/*
		 * Now atomically prepend slab...*slabp to remote_free_slabs.
		 * Pump the count first (it's ok if the actual chain length
		 * races the count update).
		 *
		 * NOTE: In the loop, (save) is updated by fcmpset.
		 */
		atomic_add_long(&rgd->gd_kmslab.free_count, count);
		save = rgd->gd_kmslab.remote_free_slabs;
		for (;;) {
			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
			*slabp = save;	/* end of slab list chain to... */
			cpu_ccfence();
			if (atomic_fcmpset_ptr(
				&rgd->gd_kmslab.remote_free_slabs,
				&save, slab))
			{
				break;
			}
		}

		/*
		 * Setup for next loop
		 */
		slab = next;
	}

	/*
	 * Terminate the result list and return it
	 */
	*resp = NULL;

	return res;
}

/*
 * May only be called on current cpu.  Pull a free slab from the
 * pcpu cache.  If we run out, move any slabs that have built-up
 * from remote cpus.
 *
 * We are only allowed to swap the remote_free_slabs head, we cannot
 * manipulate any next pointers while structures are sitting on that list.
 */
static __inline
struct kmalloc_slab *
gslab_alloc(globaldata_t gd)
{
	struct kmalloc_slab *slab;

	slab = gd->gd_kmslab.free_slabs;
	if (slab == NULL) {
		slab = atomic_swap_ptr(
			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
			NULL);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	if (slab) {
		gd->gd_kmslab.free_slabs = slab->next;
		slab->next = NULL;
		atomic_add_long(&gd->gd_kmslab.free_count, -1);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	return slab;
}

void
malloc_mgt_init(struct malloc_type *type __unused,
		struct kmalloc_mgt *mgt, size_t size)
{
	size_t offset;
	size_t count;

	bzero(mgt, sizeof(*mgt));
	spin_init(&mgt->spin, "kmmgt");

	/*
	 * Allows us to avoid a conditional.  The dummy slabs are empty
	 * and have no objects.
	 */
	mgt->active = &kslab_dummy;
	mgt->alternate = &kslab_dummy;
	mgt->empty_tailp = &mgt->empty;

	/*
	 * Figure out the count by taking into account the size of the fobjs[]
	 * array by adding it to the object size.
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[0]);
	offset = __VM_CACHELINE_ALIGN(offset);
	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));

	/*
	 * However, the fobjs[] array itself must be aligned, so we might
	 * have to reduce the count by 1.  (We can do this because 'size'
	 * is already aligned as well).
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[count]);
	offset = __VM_CACHELINE_ALIGN(offset);

	if (offset + size * count > KMALLOC_SLAB_SIZE) {
		--count;
		offset = offsetof(struct kmalloc_slab, fobjs[count]);
		offset = __VM_CACHELINE_ALIGN(offset);
		KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
	}

	mgt->slab_offset = offset;
	mgt->slab_count	 = count;
}

void
malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
{
	struct kmalloc_slab **slabp;

	spin_init(&dst->spin, "kmmgt");
	slabp = &dst->empty;

	while (*slabp) {
		slabp = &(*slabp)->next;
	}
	dst->empty_tailp = slabp;
}

void
malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
{
	if (mgt->active != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->active);
	mgt->active = NULL;

	if (mgt->alternate != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->alternate);
	mgt->alternate = NULL;

	malloc_slab_destroy(type, &mgt->partial);
	malloc_slab_destroy(type, &mgt->full);
	malloc_slab_destroy(type, &mgt->empty);
	mgt->npartial = 0;
	mgt->nfull = 0;
	mgt->nempty = 0;
	mgt->empty_tailp = &mgt->empty;

	spin_uninit(&mgt->spin);
}
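/*
 * Illustrative only: the slab layout that malloc_mgt_init() above computes.
 * Each KMALLOC_SLAB_SIZE-byte, KMALLOC_SLAB_SIZE-aligned slab is
 * self-describing; the header (including the fobjs[] free-pointer ring
 * sized to slab_count entries) comes first, and the objects start at the
 * cacheline-aligned slab_offset:
 *
 *	+---------------------------------------------------+
 *	| struct kmalloc_slab hdr + fobjs[slab_count] | pad  |
 *	+---------------------------------------------------+  <- slab_offset
 *	| obj 0 | obj 1 | ... | obj slab_count-1 | unused    |
 *	+---------------------------------------------------+
 *
 * slab_count is first estimated as
 *	(KMALLOC_SLAB_SIZE - aligned_hdr) / (objsize + sizeof(void *))
 * and then reduced by one if, after aligning fobjs[slab_count], the objects
 * would no longer fit.  Because slabs are KMALLOC_SLAB_SIZE-aligned, an
 * object pointer can always be converted back to its slab by masking with
 * ~KMALLOC_SLAB_MASK, so no external lookup is needed on free.
 */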
/*
 * Destroy a list of slabs.  Attempt to cache the slabs on the specified
 * (possibly remote) cpu.  This allows slabs that were operating on a
 * particular cpu to be disposed of back to that same cpu.
 */
static void
malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
{
	struct kmalloc_slab *slab;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;

	if (*slabp == NULL)
		return;

	/*
	 * Collect all slabs that can actually be destroyed, complain
	 * about the rest.
	 */
	base = NULL;
	basep = &base;
	while ((slab = *slabp) != NULL) {
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			*slabp = slab->next;	/* unlink */
			*basep = slab;		/* link into base list */
			basep = &slab->next;
		} else {
			kprintf("%s: slab %p %zd objects "
				"were still allocated\n",
				type->ks_shortdesc, slab,
				slab->ncount - delta);
			/* leave link intact and iterate */
			slabp = &slab->next;
		}
	}

	/*
	 * Terminate the base list of slabs that can be destroyed,
	 * then cache as many of them as possible.
	 */
	*basep = NULL;
	if (base == NULL)
		return;
	base = gslab_cache(base);

	/*
	 * Destroy the remainder
	 */
	while ((slab = base) != NULL) {
		base = slab->next;
		slab->next = (void *)(uintptr_t)-1;
		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
	}
}

/*
 * Poll a limited number of slabs on the empty list and move them
 * to the appropriate full or partial list.  Slabs left on the empty
 * list are rotated to the tail.
 *
 * If gcache is non-zero this function will try to place full slabs into
 * the globaldata cache, if it isn't already too full.
 *
 * The mgt is spin-locked.
 *
 * Returns non-zero if the ggm updates possibly made slabs available for
 * allocation.
 */
static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
{
	struct kmalloc_slab *marker;
	struct kmalloc_slab *slab;
	size_t delta;
	int got_something;

	if (ggm->empty == NULL)
		return 0;

	got_something = 0;
	marker = ggm->empty;

	while (count-- && (slab = ggm->empty) != NULL) {
		/*
		 * Unlink from empty
		 */
		ggm->empty = slab->next;
		slab->next = NULL;
		--ggm->nempty;
		if (ggm->empty_tailp == &slab->next)
			ggm->empty_tailp = &ggm->empty;

		/*
		 * Check partial, full, and empty.  We rotate
		 * empty entries to the end of the empty list.
		 *
		 * NOTE: For a fully-freeable slab we also have
		 *	 to check xindex.
		 */
		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			/*
			 * Stuff into the full list.  This requires setting
			 * the exis sequence number via exis_terminate().
			 */
			KKASSERT(slab->next == NULL);
			exis_terminate(&slab->exis);
			slab->next = ggm->full;
			ggm->full = slab;
			got_something = 1;
			++ggm->nfull;
		} else if (delta) {
			/*
			 * Partially full
			 */
			KKASSERT(slab->next == NULL);
			slab->next = ggm->partial;
			ggm->partial = slab;
			got_something = 1;
			++ggm->npartial;
		} else {
			/*
			 * Empty
			 */
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
			if (ggm->empty == marker)
				break;
		}
	}
	return got_something;
}
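/*
 * Illustrative only: what the EXIS sequencing used above and in
 * malloc_mgt_poll() buys consumers.  Because a fully-free slab must remain
 * on the full list for a sufficient number of pseudo_ticks before it can be
 * retired, the memory backing a zone's objects stays mapped (and
 * type-stable) for a grace period after kfree_obj().  A hypothetical
 * lockless reader can therefore still dereference a pointer that raced a
 * free, provided it revalidates before trusting the result.  The structure,
 * fields, and helper below are invented purely for illustration.
 */
#if 0
struct mynode {
	long	gen;		/* bumped on free/reuse */
	int	key;
	int	value;
};

static int
mynode_lookup_value(struct mynode *n, int key, int *valuep)
{
	long gen;

	gen = n->gen;			/* snapshot generation */
	cpu_ccfence();
	if (n->key != key)		/* memory may be stale, but mapped */
		return 0;
	*valuep = n->value;
	cpu_ccfence();
	return (n->gen == gen);		/* revalidate before trusting */
}
#endif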
/*
 * Called once a second with the zone interlocked against destruction.
 *
 * Returns non-zero to tell the caller to iterate to the next type,
 * else the caller should stay on the current type.
 */
int
malloc_mgt_poll(struct malloc_type *type)
{
	struct kmalloc_mgt *ggm;
	struct kmalloc_slab *slab;
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;
	int donext;
	int count;
	int retired;

	if ((type->ks_flags & KSF_OBJSIZE) == 0)
		return 1;

	/*
	 * Check the partial, full, and empty lists for full freeable slabs
	 * in excess of desired caching count.
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

	/*
	 * Move empty slabs to partial or full as appropriate.  We
	 * don't bother checking partial slabs to see if they are full
	 * for now.
	 */
	malloc_mgt_poll_empty_locked(ggm, 16);

	/*
	 * Ok, clean out some of the full slabs from the full list
	 */
	base = NULL;
	basep = &base;
	count = ggm->nfull;
	retired = 0;
	cpu_ccfence();

	if (count > KMALLOC_MAXFREEMAGS) {
		slabp = &ggm->full;
		count -= KMALLOC_MAXFREEMAGS;
		if (count > 16)
			count = 16;

		while (count && (slab = *slabp) != NULL) {
			delta = slab->findex - slab->aindex;
			if (delta == slab->ncount &&
			    slab->xindex == slab->findex &&
			    exis_freeable(&slab->exis))
			{
				/*
				 * (1) No allocated entries in the structure,
				 *     this should always be the case from the
				 *     full list.
				 *
				 * (2) kfree_obj() has fully completed.  Just
				 *     checking findex is not sufficient since
				 *     it is incremented to reserve the slot
				 *     before the element is loaded into it.
				 *
				 * (3) The slab has been on the full list for
				 *     a sufficient number of EXIS
				 *     pseudo_ticks, for type-safety.
				 */
				*slabp = slab->next;
				*basep = slab;
				basep = &slab->next;
				--ggm->nfull;
				++ggm->gcache_count;
				if (++retired == kzone_bretire)
					break;
			} else {
				slabp = &slab->next;
			}
			--count;
		}
		*basep = NULL;	/* terminate the retirement list */
		donext = (*slabp == NULL);
	} else {
		donext = 1;
	}
	spin_unlock(&ggm->spin);

	/*
	 * Clean out any slabs that we couldn't stow in the globaldata cache.
	 */
	if (retired) {
		if (kzone_debug) {
			kprintf("kmalloc_poll: %s retire %d\n",
				type->ks_shortdesc, retired);
		}
		base = gslab_cache(base);
		while ((slab = base) != NULL) {
			base = base->next;
			slab->next = NULL;
			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
		}
	}

	return donext;
}

/*
 * Optional bitmap double-free check.  This is typically turned on by
 * default for safety (sys/_malloc.h)
 */
#ifdef KMALLOC_CHECK_DOUBLE_FREE

static __inline void
bmap_set(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
	atomic_set_64(ptr, mask);
}

static __inline void
bmap_clr(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
	atomic_clear_64(ptr, mask);
}

#endif
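/*
 * Illustrative only: the per-slab free-object ring used by the fast paths
 * below.  fobjs[] holds ncount object pointers and is indexed modulo ncount
 * (slot = index % ncount); the indices themselves only ever increase, so
 * (findex - aindex) is always the number of free objects currently in the
 * ring (unsigned subtraction handles wrap).
 *
 *	aindex	consumer index: next slot _kmalloc_obj() will take from
 *	findex	producer index: next slot _kfree_obj() will reserve
 *	xindex	completion index: bumped only after the freed pointer has
 *		actually been stored, so a poller that sees
 *		xindex == findex knows every reserved slot is populated
 *
 * A freshly built slab has aindex == 0 and findex == xindex == ncount with
 * every slot filled; a slab is fully free again when
 * (findex - aindex) == ncount and xindex == findex.
 */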
/*
 * Cleanup a mgt structure.
 *
 * Always called from the current cpu, so we can manipulate the various
 * lists freely.
 *
 * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
 *	    and 'full'
 */
#if 0
static void
mgt_cleanup(struct kmalloc_mgt *mgt)
{
#if 0
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *slab;
	size_t delta;
	size_t total;
#endif
}
#endif

#ifdef SLAB_DEBUG
void *
_kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
		   const char *file, int line)
#else
void *
_kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
#endif
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	struct kmalloc_mgt *mgt;
	struct kmalloc_mgt *ggm;
	globaldata_t gd;
	void *obj;
	size_t delta;

	/*
	 * Check limits
	 */
	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
		long ttl;
		int n;

		for (n = ttl = 0; n < ncpus; ++n)
			ttl += type->ks_use[n].memuse;
		type->ks_loosememuse = ttl;	/* not MP synchronized */
		if ((ssize_t)ttl < 0)		/* deal with occasional race */
			ttl = 0;
		if (ttl >= type->ks_limit) {
			if (flags & M_NULLOK)
				return(NULL);
			panic("%s: malloc limit exceeded", type->ks_shortdesc);
		}
	}

	/*
	 * Setup
	 */
	crit_enter();
	logmemory_quick(malloc_beg);
	KKASSERT(size == type->ks_objsize);
	gd = mycpu;
	use = &type->ks_use[gd->gd_cpuid];

retry:
	/*
	 * Check active
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	mgt = &use->mgt;
	slab = mgt->active;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Check alternate.  If we find something, swap it with
	 * the active.
	 *
	 * NOTE: It is possible for exhausted slabs to recover entries
	 *	 via _kfree_obj(), so we just keep swapping until both
	 *	 are empty.
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	slab = mgt->alternate;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		mgt->alternate = mgt->active;
		mgt->active = slab;
		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Rotate a slab from the global mgt into the pcpu mgt.
	 *
	 *	G(partial, full) -> active -> alternate -> G(empty)
	 *
	 * We try to exhaust partials first to reduce fragmentation, then
	 * dig into the fulls.
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

rerotate:
	if (ggm->partial) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->partial;
		ggm->partial = ggm->partial->next;
		mgt->active->next = NULL;
		--ggm->npartial;
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	if (ggm->full) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->full;
		ggm->full = ggm->full->next;
		mgt->active->next = NULL;
		--ggm->nfull;
		exis_setlive(&mgt->active->exis);
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	/*
	 * We couldn't find anything, scan a limited number of empty entries
	 * looking for something with objects.  This will also free excess
	 * full lists that meet requirements.
	 */
	if (malloc_mgt_poll_empty_locked(ggm, 16))
		goto rerotate;

	/*
	 * Absolutely nothing is available, allocate a new slab and
	 * rotate it in.
	 *
	 * Try to get a slab from the global pcpu slab cache (very cheap).
	 * If that fails, allocate a new slab (very expensive).
	 */
	spin_unlock(&ggm->spin);

	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
				       M_WAITOK);
	}

	bzero(slab, sizeof(*slab));
	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
		 use->mgt.slab_offset);

	obj = (char *)slab + use->mgt.slab_offset;
	slab->type = type;
	slab->orig_cpuid = gd->gd_cpuid;
	slab->ncount = use->mgt.slab_count;
	slab->offset = use->mgt.slab_offset;
	slab->objsize = type->ks_objsize;
	slab->aindex = 0;
	slab->findex = slab->ncount;
	slab->xindex = slab->ncount;
	for (delta = 0; delta < slab->ncount; ++delta) {
		slab->fobjs[delta] = obj;
		obj = (char *)obj + type->ks_objsize;
	}

	/*
	 * Sanity check, assert that the last byte of last object is still
	 * in the slab.
	 */
#if 0
	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		  ~KMALLOC_SLAB_MASK) == 0);
#endif
	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		 ~KMALLOC_SLAB_MASK) == 0,
		("SLAB %p ncount %zd objsize %zd obj=%p\n",
		 slab, slab->ncount, slab->objsize, obj));
	slab->magic = KMALLOC_SLAB_MAGIC;
	spin_init(&slab->spin, "kmslb");

	/*
	 * Rotate it in, then retry.
	 *
	 *	(NEW)slab -> active -> alternate -> G(empty)
	 */
	spin_lock(&ggm->spin);
	if (mgt->alternate != &kslab_dummy) {
		struct kmalloc_slab *slab_tmp;

		slab_tmp = mgt->alternate;
		slab_tmp->next = NULL;
		*ggm->empty_tailp = slab_tmp;
		ggm->empty_tailp = &slab_tmp->next;
		++ggm->nempty;
	}
	mgt->alternate = mgt->active;	/* Might be dummy */
	mgt->active = slab;
	spin_unlock(&ggm->spin);

	goto retry;

	/*
	 * Found object, adjust statistics and return
	 */
found:
	++use->inuse;
	++use->calls;
	use->memuse += size;
	use->loosememuse += size;
	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
		/* not MP synchronized */
		type->ks_loosememuse += use->loosememuse;
		use->loosememuse = 0;
	}

	/*
	 * Handle remaining flags.  M_ZERO is typically not set because
	 * the inline macro deals with zeroing for constant sizes.
	 */
	if (__predict_false(flags & M_ZERO))
		bzero(obj, size);

	crit_exit();
	logmemory(malloc_end, NULL, type, size, flags);

	return(obj);
}

/*
 * Free a type-stable object.  We have the base structure and can
 * calculate the slab, but from this direction we don't know which
 * mgt structure or list the slab might be on.
 */
void
_kfree_obj(void *obj, struct malloc_type *type)
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	globaldata_t gd;
	size_t delta;
	size_t i;

	logmemory_quick(free_beg);
	gd = mycpu;

	/*
	 * Calculate the slab from the pointer
	 */
	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
	delta = slab->findex - slab->aindex;
	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);

	/*
	 * We can only safely adjust the statistics for the current cpu.
	 * Don't try to track down the original cpu.  The statistics will
	 * be collected and fixed up by vmstat -m (etc).
	 */
	use = &slab->type->ks_use[gd->gd_cpuid];
	--use->inuse;
	use->memuse -= slab->objsize;

	/*
	 * There MUST be free space in the slab since we are returning
	 * the obj to the same slab it was allocated from.
	 */
	i = atomic_fetchadd_long(&slab->findex, 1);
	i = i % slab->ncount;
	if (slab->fobjs[i] != NULL) {
		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
			slab->aindex, slab->findex, slab->ncount);
	}
#ifdef KMALLOC_CHECK_DOUBLE_FREE
	bmap_clr(slab, obj);
#endif
	KKASSERT(slab->fobjs[i] == NULL);
	slab->fobjs[i] = obj;
	atomic_add_long(&slab->xindex, 1);	/* synchronizer */

	logmemory_quick(free_end);
}