/*
 * KERN_KMALLOC.C	- Kernel memory allocator
 *
 * Copyright (c) 2021 The DragonFly Project, All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the kmalloc_obj allocator.  This is a type-stable
 * allocator that uses the same base structures (e.g. malloc_type) plus
 * some extensions to efficiently implement single-type zones.
 *
 * All memory management is zone based.  When a zone is destroyed, all of
 * its memory is returned to the system with no fragmentation.
 *
 * A mini-slab allocator hangs directly off the zone structure (malloc_type).
 * Since the object zones are single-size-only, the slab allocator is very
 * simple and currently utilizes just two per-zone/per-cpu slabs (active and
 * alternate) before kicking up to the per-zone cache.  Beyond that we just
 * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
 * kernel_map mappings and unmappings.
 *
 * The advantage of this is that zones don't stomp over each other and cause
 * excessive fragmentation in the slabs.  For example, when you umount a
 * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
 * is returned to the system.
 */
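/*
 * Illustrative usage sketch (not compiled).  A subsystem declares a
 * fixed-object-size kmalloc type (such types carry the KSF_OBJSIZE flag)
 * and then allocates and frees objects of exactly the registered size.
 * The wrapper names used here -- kmalloc_obj() and kfree_obj() -- and the
 * declaration form are assumed from sys/malloc.h; see that header and
 * kmalloc(9) for the authoritative interface.
 */
#if 0
struct foonode {
        struct foonode  *fn_next;
        int             fn_value;
};

MALLOC_DECLARE(M_FOONODE);      /* fixed-size type, objsize = sizeof(struct foonode) */

static struct foonode *
foonode_alloc(void)
{
        /* the size passed must match the object size registered for the type */
        return kmalloc_obj(sizeof(struct foonode), M_FOONODE, M_WAITOK | M_ZERO);
}

static void
foonode_free(struct foonode *fn)
{
        kfree_obj(fn, M_FOONODE);
}
#endif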

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/spinlock.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/spinlock2.h>
#include <sys/thread2.h>
#include <sys/exislock2.h>
#include <vm/vm_page2.h>

#define MEMORY_STRING   "ptr=%p type=%p size=%lu flags=%04x"
#define MEMORY_ARGS     void *ptr, void *type, unsigned long size, int flags

#if !defined(KTR_MEMORY)
#define KTR_MEMORY      KTR_ALL
#endif
KTR_INFO_MASTER(mem_obj);
KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
#if 0
KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
#endif
KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");

#define logmemory(name, ptr, type, size, flags)                         \
        KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
#define logmemory_quick(name)                                           \
        KTR_LOG(mem_obj_ ## name)

__read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
__read_frequently static int kzone_bretire = 4;
SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
__read_frequently static int kzone_debug;
SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");

__read_frequently struct kmalloc_slab kslab_dummy;

static void malloc_slab_destroy(struct malloc_type *type,
                        struct kmalloc_slab **slabp);

/*
 * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
 * which we cannot cache will be returned.
 *
 * free_slabs         - Current structure may only be accessed by current cpu
 * remote_free_slabs  - Only atomic swap operations are allowed.
 * free_count         - Only atomic operations are allowed.
 *
 * If the count is sufficient to cache the entire list, NULL is returned.
 * Otherwise the portion that was not cached is returned.
 */
static __noinline
struct kmalloc_slab *
gslab_cache(struct kmalloc_slab *slab)
{
        struct kmalloc_slab *save;
        struct kmalloc_slab *next;
        struct kmalloc_slab *res;
        struct kmalloc_slab **resp;
        struct kmalloc_slab **slabp;
        globaldata_t rgd;
        size_t count;
        int cpuid;

        res = NULL;
        resp = &res;
        KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

        /*
         * Given the slab list, get the cpuid and clip off as many matching
         * elements as fits in the cache.
         */
        while (slab) {
                cpuid = slab->orig_cpuid;
                rgd = globaldata_find(cpuid);

                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
                /*
                 * Doesn't fit in cache, put on return list.
                 */
                if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
                        *resp = slab;
                        resp = &slab->next;
                        slab = slab->next;
                        continue;
                }

                /*
                 * Collect.  We aren't required to match-up the original cpu
                 * with the disposal cpu, but it's a good idea to retain
                 * memory locality.
                 *
                 * The slabs we collect are going into the global cache,
                 * remove the type association.
                 */
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
                slabp = &slab->next;
                count = 1;
                slab->type = NULL;

                while ((next = *slabp) != NULL &&
                       next->orig_cpuid == cpuid &&
                       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
                {
                        KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
                        next->type = NULL;
                        ++count;
                        slabp = &next->next;
                }

                /*
                 * Safety, unhook before next, next is not included in the
                 * list starting with slab that is being pre-pended
                 * to remote_free_slabs.
                 */
                *slabp = NULL;

                /*
                 * Now atomically pre-pend slab...*slabp to remote_free_slabs.
                 * Pump the count first (it's ok if the actual chain length
                 * races the count update).
                 *
                 * NOTE: In the loop, (save) is updated by fcmpset.
                 */
                atomic_add_long(&rgd->gd_kmslab.free_count, count);
                save = rgd->gd_kmslab.remote_free_slabs;
                for (;;) {
                        KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
                        *slabp = save;  /* end of slab list chain to... */
                        cpu_ccfence();
                        if (atomic_fcmpset_ptr(
                                &rgd->gd_kmslab.remote_free_slabs,
                                &save, slab))
                        {
                                break;
                        }
                }

                /*
                 * Setup for next loop
                 */
                slab = next;
        }

        /*
         * Terminate the result list and return it
         */
        *resp = NULL;

        return res;
}
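/*
 * A note on the remote_free_slabs handoff: gslab_cache() above is the
 * producer side (an atomic list prepend, possibly issued from another cpu),
 * while gslab_alloc() below is the consumer side (an atomic swap of the
 * whole list head, only on the owning cpu).  Producers never touch next
 * pointers already published on the list and the consumer takes the entire
 * chain in one swap, so no lock is required.
 */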
/*
 * May only be called on current cpu.  Pull a free slab from the
 * pcpu cache.  If we run out, move any slabs that have built-up
 * from remote cpus.
 *
 * We are only allowed to swap the remote_free_slabs head, we cannot
 * manipulate any next pointers while structures are sitting on that list.
 */
static __inline
struct kmalloc_slab *
gslab_alloc(globaldata_t gd)
{
        struct kmalloc_slab *slab;

        slab = gd->gd_kmslab.free_slabs;
        if (slab == NULL) {
                slab = atomic_swap_ptr(
                        (volatile void **)&gd->gd_kmslab.remote_free_slabs,
                        NULL);
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
        }
        if (slab) {
                gd->gd_kmslab.free_slabs = slab->next;
                slab->next = NULL;
                atomic_add_long(&gd->gd_kmslab.free_count, -1);
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
        }
        return slab;
}

void
malloc_mgt_init(struct malloc_type *type __unused,
                struct kmalloc_mgt *mgt, size_t size)
{
        size_t offset;
        size_t count;

        bzero(mgt, sizeof(*mgt));
        spin_init(&mgt->spin, "kmmgt");

        /*
         * Allows us to avoid a conditional.  The dummy slabs are empty
         * and have no objects.
         */
        mgt->active = &kslab_dummy;
        mgt->alternate = &kslab_dummy;
        mgt->empty_tailp = &mgt->empty;

        /*
         * Figure out the count by taking into account the size of the fobjs[]
         * array by adding it to the object size.  This initial calculation
         * ignores alignment edge-cases that might require the count to be
         * reduced.
         */
        offset = offsetof(struct kmalloc_slab, fobjs[0]);
        count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));

        /*
         * Recalculate the offset of the first object, this time including
         * the required alignment.  (size) should already be aligned.  This
         * may push the last object beyond the slab so check and loop with
         * a reduced count as necessary.
         *
         * Ok, theoretically the count should not actually change since the
         * division above rounds-down (that is, any mis-alignment is already
         * not included in the count calculation).  But I'm not going to take
         * any chances and check anyway as a safety in case some programmer
         * changes the code above later.  This is not a time-critical code
         * path.
         */
        offset = offsetof(struct kmalloc_slab, fobjs[count]);
        offset = __VM_CACHELINE_ALIGN(offset);

        while (offset + size * count > KMALLOC_SLAB_SIZE) {
                --count;
                offset = offsetof(struct kmalloc_slab, fobjs[count]);
                offset = __VM_CACHELINE_ALIGN(offset);
                KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
        }

        mgt->slab_offset = offset;
        mgt->slab_count	 = count;
}
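/*
 * Worked example of the slab layout computation in malloc_mgt_init() above,
 * with purely illustrative numbers (not the real KMALLOC_SLAB_SIZE or header
 * size): for a 32768-byte slab, a 256-byte header before fobjs[], 64-byte
 * cache lines, 8-byte pointers and 512-byte objects, the first pass computes
 *
 *      count = (32768 - 256) / (512 + 8) = 62
 *
 * The second pass aligns offsetof(fobjs[62]) = 256 + 62 * 8 = 752 up to 768,
 * and since 768 + 62 * 512 = 32512 <= 32768 the count is unchanged.
 */

/*
 * Re-initialize the spin lock and recompute the empty-list tail pointer of
 * (dst) after a kmalloc_mgt has been moved.  (src) is not referenced here;
 * the contents are assumed to have already been copied into (dst) by the
 * caller.
 */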
void
malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
{
        struct kmalloc_slab **slabp;

        spin_init(&dst->spin, "kmmgt");
        slabp = &dst->empty;

        while (*slabp) {
                slabp = &(*slabp)->next;
        }
        dst->empty_tailp = slabp;
}

void
malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
{
        if (mgt->active != &kslab_dummy)
                malloc_slab_destroy(type, &mgt->active);
        mgt->active = NULL;

        if (mgt->alternate != &kslab_dummy)
                malloc_slab_destroy(type, &mgt->alternate);
        mgt->alternate = NULL;

        malloc_slab_destroy(type, &mgt->partial);
        malloc_slab_destroy(type, &mgt->full);
        malloc_slab_destroy(type, &mgt->empty);
        mgt->npartial = 0;
        mgt->nfull = 0;
        mgt->nempty = 0;
        mgt->empty_tailp = &mgt->empty;

        spin_uninit(&mgt->spin);
}

/*
 * Destroy a list of slabs.  Attempt to cache the slabs on the specified
 * (possibly remote) cpu.  This allows slabs that were operating on a
 * particular cpu to be disposed of back to that same cpu.
 */
static void
malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
{
        struct kmalloc_slab *slab;
        struct kmalloc_slab *base;
        struct kmalloc_slab **basep;
        size_t delta;

        if (*slabp == NULL)
                return;

        /*
         * Collect all slabs that can actually be destroyed, complain
         * about the rest.
         */
        base = NULL;
        basep = &base;
        while ((slab = *slabp) != NULL) {
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

                delta = slab->findex - slab->aindex;
                if (delta == slab->ncount) {
                        *slabp = slab->next;    /* unlink */
                        *basep = slab;          /* link into base list */
                        basep = &slab->next;
                } else {
                        kprintf("%s: slab %p %zd objects "
                                "were still allocated\n",
                                type->ks_shortdesc, slab,
                                slab->ncount - delta);
                        /* leave link intact and iterate */
                        slabp = &slab->next;
                }
        }

        /*
         * Terminate the base list of slabs that can be destroyed,
         * then cache as many of them as possible.
         */
        *basep = NULL;
        if (base == NULL)
                return;
        base = gslab_cache(base);

        /*
         * Destroy the remainder
         */
        while ((slab = base) != NULL) {
                base = slab->next;
                slab->next = (void *)(uintptr_t)-1;
                kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
        }
}

/*
 * Objects can be freed to an empty slab at any time, causing it to no
 * longer be empty.  To improve performance, we do not try to pro-actively
 * move such slabs to the appropriate partial or full list upon kfree_obj().
 * Instead, a poller comes along and tests the slabs on the empty list
 * periodically, and moves slabs that are no longer empty to the appropriate
 * list.
 *
 * --
 *
 * Poll a limited number of slabs on the empty list and move them
 * to the appropriate full or partial list.  Slabs left on the empty
 * list are rotated to the tail.
 *
 * If gcache is non-zero this function will try to place full slabs into
 * the globaldata cache, if it isn't already too full.
 *
 * The mgt is spin-locked
 *
 * Returns non-zero if the ggm updates possibly made slabs available for
 * allocation.
 */
static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
{
        struct kmalloc_slab *marker;
        struct kmalloc_slab *slab;
        size_t delta;
        int got_something;

        if (ggm->empty == NULL)
                return 0;

        got_something = 0;
        marker = ggm->empty;

        while (count-- && (slab = ggm->empty) != NULL) {
                /*
                 * Unlink from empty
                 */
                ggm->empty = slab->next;
                slab->next = NULL;
                --ggm->nempty;
                if (ggm->empty_tailp == &slab->next)
                        ggm->empty_tailp = &ggm->empty;

                /*
                 * Check partial, full, and empty.  We rotate
                 * empty entries to the end of the empty list.
                 *
                 * NOTE: For a fully-freeable slab we also have
                 *       to check xindex.
                 */
                delta = slab->findex - slab->aindex;
                if (delta == slab->ncount) {
                        /*
                         * Stuff into the full list.  This requires setting
                         * the exis sequence number via exis_terminate().
                         */
                        KKASSERT(slab->next == NULL);
                        exis_terminate(&slab->exis);
                        slab->next = ggm->full;
                        ggm->full = slab;
                        got_something = 1;
                        ++ggm->nfull;
                } else if (delta) {
                        /*
                         * Partially full
                         */
                        KKASSERT(slab->next == NULL);
                        slab->next = ggm->partial;
                        ggm->partial = slab;
                        got_something = 1;
                        ++ggm->npartial;
                } else {
                        /*
                         * Empty
                         */
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                        if (ggm->empty == marker)
                                break;
                }
        }
        return got_something;
}

/*
 * Called once a second with the zone interlocked against destruction.
 *
 * Returns non-zero to tell the caller to iterate to the next type,
 * else the caller should stay on the current type.
 */
int
malloc_mgt_poll(struct malloc_type *type)
{
        struct kmalloc_mgt *ggm;
        struct kmalloc_slab *slab;
        struct kmalloc_slab **slabp;
        struct kmalloc_slab *base;
        struct kmalloc_slab **basep;
        size_t delta;
        int donext;
        int count;
        int retired;

        if ((type->ks_flags & KSF_OBJSIZE) == 0)
                return 1;

        /*
         * Check the partial, full, and empty lists for full freeable slabs
         * in excess of desired caching count.
         */
        ggm = &type->ks_mgt;
        spin_lock(&ggm->spin);

        /*
         * Move empty slabs to partial or full as appropriate.  We
         * don't bother checking partial slabs to see if they are full
         * for now.
         */
        malloc_mgt_poll_empty_locked(ggm, 16);

        /*
         * Ok, clean out some of the full slabs from the full list
         */
        base = NULL;
        basep = &base;
        count = ggm->nfull;
        retired = 0;
        cpu_ccfence();

        if (count > KMALLOC_MAXFREEMAGS) {
                slabp = &ggm->full;
                count -= KMALLOC_MAXFREEMAGS;
                if (count > 16)
                        count = 16;

                while (count && (slab = *slabp) != NULL) {
                        delta = slab->findex - slab->aindex;
                        if (delta == slab->ncount &&
                            slab->xindex == slab->findex &&
                            exis_freeable(&slab->exis))
                        {
                                /*
                                 * (1) No allocated entries in the structure,
                                 *     this should always be the case from the
                                 *     full list.
                                 *
                                 * (2) kfree_obj() has fully completed.  Just
                                 *     checking findex is not sufficient since
                                 *     it is incremented to reserve the slot
                                 *     before the element is loaded into it.
                                 *
                                 * (3) The slab has been on the full list for
                                 *     a sufficient number of EXIS
                                 *     pseudo_ticks, for type-safety.
                                 */
                                *slabp = slab->next;
                                *basep = slab;
                                basep = &slab->next;
                                --ggm->nfull;
                                ++ggm->gcache_count;
                                if (++retired == kzone_bretire)
                                        break;
                        } else {
                                slabp = &slab->next;
                        }
                        --count;
                }
                *basep = NULL;  /* terminate the retirement list */
                donext = (*slabp == NULL);
        } else {
                donext = 1;
        }
        spin_unlock(&ggm->spin);

        /*
         * Clean out any slabs that we couldn't stow in the globaldata cache.
         */
        if (retired) {
                if (kzone_debug) {
                        kprintf("kmalloc_poll: %s retire %d\n",
                                type->ks_shortdesc, retired);
                }
                base = gslab_cache(base);
                while ((slab = base) != NULL) {
                        base = base->next;
                        slab->next = NULL;
                        kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
                }
        }

        return donext;
}

/*
 * Optional bitmap double-free check.  This is typically turned on by
 * default for safety (sys/_malloc.h)
 */
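/*
 * Index math used by bmap_set()/bmap_clr() below, with purely illustrative
 * numbers: for a slab with slab->offset = 768 and slab->objsize = 512, an
 * object at slab-relative offset 6400 maps to i = (6400 - 768) / 512 = 11,
 * i.e. bit 11 of bmap[0].  Each 64-bit bmap word tracks 64 objects.
 */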
#ifdef KMALLOC_CHECK_DOUBLE_FREE

static __inline void
bmap_set(struct kmalloc_slab *slab, void *obj)
{
        uint64_t *ptr;
        uint64_t mask;
        size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
                   slab->objsize;

        ptr = &slab->bmap[i >> 6];
        mask = (uint64_t)1U << (i & 63);
        KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
        atomic_set_64(ptr, mask);
}

static __inline void
bmap_clr(struct kmalloc_slab *slab, void *obj)
{
        uint64_t *ptr;
        uint64_t mask;
        size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
                   slab->objsize;

        ptr = &slab->bmap[i >> 6];
        mask = (uint64_t)1U << (i & 63);
        KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
        atomic_clear_64(ptr, mask);
}

#endif

/*
 * Cleanup a mgt structure.
 *
 * Always called from the current cpu, so we can manipulate the various
 * lists freely.
 *
 * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
 *          and 'full'
 */
#if 0
static void
mgt_cleanup(struct kmalloc_mgt *mgt)
{
#if 0
        struct kmalloc_slab **slabp;
        struct kmalloc_slab *slab;
        size_t delta;
        size_t total;
#endif
}
#endif

#ifdef SLAB_DEBUG
void *
_kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
                   const char *file, int line)
#else
void *
_kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
#endif
{
        struct kmalloc_slab *slab;
        struct kmalloc_use *use;
        struct kmalloc_mgt *mgt;
        struct kmalloc_mgt *ggm;
        globaldata_t gd;
        void *obj;
        size_t delta;

        /*
         * Check limits
         */
        while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
                long ttl;
                int n;

                for (n = ttl = 0; n < ncpus; ++n)
                        ttl += type->ks_use[n].memuse;
                type->ks_loosememuse = ttl;     /* not MP synchronized */
                if ((ssize_t)ttl < 0)           /* deal with occasional race */
                        ttl = 0;
                if (ttl >= type->ks_limit) {
                        if (flags & M_NULLOK)
                                return(NULL);
                        panic("%s: malloc limit exceeded", type->ks_shortdesc);
                }
        }

        /*
         * Setup
         */
        crit_enter();
        logmemory_quick(malloc_beg);
        KKASSERT(size == type->ks_objsize);
        gd = mycpu;
        use = &type->ks_use[gd->gd_cpuid];

retry:
        /*
         * Check active
         *
         * NOTE: obj can be NULL if racing a _kfree_obj().
         */
        mgt = &use->mgt;
        slab = mgt->active;                     /* Might be dummy */
        delta = slab->findex - slab->aindex;
        if (__predict_true(delta != 0)) {       /* Cannot be dummy */
                size_t i;

                i = slab->aindex % slab->ncount;
                obj = slab->fobjs[i];
                if (__predict_true(obj != NULL)) {
                        slab->fobjs[i] = NULL;
                        ++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
                        bmap_set(slab, obj);
#endif
                        goto found;
                }
        }

        /*
         * Check alternate.  If we find something, swap it with
         * the active.
         *
         * NOTE: It is possible for exhausted slabs to recover entries
         *       via _kfree_obj(), so we just keep swapping until both
         *       are empty.
         *
         * NOTE: obj can be NULL if racing a _kfree_obj().
         */
        slab = mgt->alternate;                  /* Might be dummy */
        delta = slab->findex - slab->aindex;
        if (__predict_true(delta != 0)) {       /* Cannot be dummy */
                size_t i;

                mgt->alternate = mgt->active;
                mgt->active = slab;
                i = slab->aindex % slab->ncount;
                obj = slab->fobjs[i];
                if (__predict_true(obj != NULL)) {
                        slab->fobjs[i] = NULL;
                        ++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
                        bmap_set(slab, obj);
#endif
                        goto found;
                }
        }

        /*
         * Rotate a slab from the global mgt into the pcpu mgt.
         *
         *      G(partial, full) -> active -> alternate -> G(empty)
         *
         * We try to exhaust partials first to reduce fragmentation, then
         * dig into the fulls.
         */
        ggm = &type->ks_mgt;
        spin_lock(&ggm->spin);

rerotate:
        if (ggm->partial) {
                slab = mgt->alternate;          /* Might be dummy */
                mgt->alternate = mgt->active;   /* Might be dummy */
                mgt->active = ggm->partial;
                ggm->partial = ggm->partial->next;
                mgt->active->next = NULL;
                --ggm->npartial;
                if (slab != &kslab_dummy) {
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                }
                spin_unlock(&ggm->spin);
                goto retry;
        }

        if (ggm->full) {
                slab = mgt->alternate;          /* Might be dummy */
                mgt->alternate = mgt->active;   /* Might be dummy */
                mgt->active = ggm->full;
                ggm->full = ggm->full->next;
                mgt->active->next = NULL;
                --ggm->nfull;
                exis_setlive(&mgt->active->exis);
                if (slab != &kslab_dummy) {
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                }
                spin_unlock(&ggm->spin);
                goto retry;
        }

        /*
         * We couldn't find anything, scan a limited number of empty entries
         * looking for something with objects.  This will also free excess
         * full lists that meet requirements.
         */
        if (malloc_mgt_poll_empty_locked(ggm, 16))
                goto rerotate;

        /*
         * Absolutely nothing is available, allocate a new slab and
         * rotate it in.
         *
         * Try to get a slab from the global pcpu slab cache (very cheap).
         * If that fails, allocate a new slab (very expensive).
         */
        spin_unlock(&ggm->spin);

        if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
                slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
                                       M_WAITOK);
        }

        bzero(slab, sizeof(*slab));
        KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
                 use->mgt.slab_offset);

        obj = (char *)slab + use->mgt.slab_offset;
        slab->type = type;
        slab->orig_cpuid = gd->gd_cpuid;
        slab->ncount = use->mgt.slab_count;
        slab->offset = use->mgt.slab_offset;
        slab->objsize = type->ks_objsize;
        slab->aindex = 0;
        slab->findex = slab->ncount;
        slab->xindex = slab->ncount;
        for (delta = 0; delta < slab->ncount; ++delta) {
                slab->fobjs[delta] = obj;
                obj = (char *)obj + type->ks_objsize;
        }

        /*
         * Sanity check, assert that the last byte of last object is still
         * in the slab.
         */
#if 0
        KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
                  ~KMALLOC_SLAB_MASK) == 0);
#endif
        KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
                 ~KMALLOC_SLAB_MASK) == 0,
                ("SLAB %p ncount %zd objsize %zd obj=%p\n",
                 slab, slab->ncount, slab->objsize, obj));
        slab->magic = KMALLOC_SLAB_MAGIC;
        spin_init(&slab->spin, "kmslb");
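
        /*
         * Index semantics for the ring just initialized above: fobjs[] is a
         * circular array of free object pointers, aindex is the allocation
         * (consumer) index and findex the free (producer) index, so
         * findex - aindex is the number of free objects.  A brand new slab
         * therefore starts with aindex = 0 and findex = xindex = ncount.
         * xindex trails findex and is only bumped in _kfree_obj() after the
         * object pointer has actually been stored, which is what the
         * retirement test in malloc_mgt_poll() relies on.
         */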

        /*
         * Rotate it in, then retry.
         *
         *      (NEW)slab -> active -> alternate -> G(empty)
         */
        spin_lock(&ggm->spin);
        if (mgt->alternate != &kslab_dummy) {
                struct kmalloc_slab *slab_tmp;

                slab_tmp = mgt->alternate;
                slab_tmp->next = NULL;
                *ggm->empty_tailp = slab_tmp;
                ggm->empty_tailp = &slab_tmp->next;
                ++ggm->nempty;
        }
        mgt->alternate = mgt->active;           /* Might be dummy */
        mgt->active = slab;
        spin_unlock(&ggm->spin);

        goto retry;

        /*
         * Found object, adjust statistics and return
         */
found:
        ++use->inuse;
        ++use->calls;
        use->memuse += size;
        use->loosememuse += size;
        if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
                /* not MP synchronized */
                type->ks_loosememuse += use->loosememuse;
                use->loosememuse = 0;
        }

        /*
         * Handle remaining flags.  M_ZERO is typically not set because
         * the inline macro deals with zeroing for constant sizes.
         */
        if (__predict_false(flags & M_ZERO))
                bzero(obj, size);

        crit_exit();
        logmemory(malloc_end, NULL, type, size, flags);

        return(obj);
}

/*
 * Free a type-stable object.  We have the base structure and can
 * calculate the slab, but from this direction we don't know which
 * mgt structure or list the slab might be on.
 */
void
_kfree_obj(void *obj, struct malloc_type *type)
{
        struct kmalloc_slab *slab;
        struct kmalloc_use *use;
        globaldata_t gd;
        size_t delta;
        size_t i;

        logmemory_quick(free_beg);
        gd = mycpu;

        /*
         * Calculate the slab from the pointer
         */
        slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
        delta = slab->findex - slab->aindex;
        KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);

        /*
         * We can only safely adjust the statistics for the current cpu.
         * Don't try to track down the original cpu.  The statistics will
         * be collected and fixed up by vmstat -m (etc).
         */
        use = &slab->type->ks_use[gd->gd_cpuid];
        --use->inuse;
        use->memuse -= slab->objsize;

        /*
         * There MUST be free space in the slab since we are returning
         * the obj to the same slab it was allocated from.
         */
        i = atomic_fetchadd_long(&slab->findex, 1);
        i = i % slab->ncount;
        if (slab->fobjs[i] != NULL) {
                kprintf("_kfree_obj failure %zd/%zd/%zd\n",
                        slab->aindex, slab->findex, slab->ncount);
        }
#ifdef KMALLOC_CHECK_DOUBLE_FREE
        bmap_clr(slab, obj);
#endif
        KKASSERT(slab->fobjs[i] == NULL);
        slab->fobjs[i] = obj;
        atomic_add_long(&slab->xindex, 1);      /* synchronizer */

        logmemory_quick(free_end);
}