/*
 * KERN_KMALLOC.C - Kernel memory allocator
 *
 * Copyright (c) 2021 The DragonFly Project, All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the kmalloc_obj allocator.  This is a type-stable
 * allocator that uses the same base structures (e.g. malloc_type) plus
 * some extensions to efficiently implement single-type zones.
 *
 * All memory management is zone based.  When a zone is destroyed, all of
 * its memory is returned to the system with no fragmentation.
 *
 * A mini-slab allocator hangs directly off the zone structure (malloc_type).
 * Since the object zones are single-size-only, the slab allocator is very
 * simple and currently utilizes just two per-zone/per-cpu slabs (active and
 * alternate) before kicking up to the per-zone cache.  Beyond that we just
 * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
 * kernel_map mappings and unmappings.
 *
 * The advantage of this is that zones don't stomp over each other and cause
 * excessive fragmentation in the slabs.  For example, when you umount a
 * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
 * is returned to the system.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/spinlock.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/spinlock2.h>
#include <sys/thread2.h>
#include <sys/exislock2.h>
#include <vm/vm_page2.h>

#define MEMORY_STRING   "ptr=%p type=%p size=%lu flags=%04x"
#define MEMORY_ARGS     void *ptr, void *type, unsigned long size, int flags

#if !defined(KTR_MEMORY)
#define KTR_MEMORY      KTR_ALL
#endif
KTR_INFO_MASTER(mem_obj);
KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
#if 0
KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
#endif
KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");

#define logmemory(name, ptr, type, size, flags)                 \
        KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
#define logmemory_quick(name)                                   \
        KTR_LOG(mem_obj_ ## name)

__read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
__read_frequently static int kzone_bretire = 4;
SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
__read_frequently static int kzone_debug;
SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");

__read_frequently struct kmalloc_slab kslab_dummy;

static void malloc_slab_destroy(struct malloc_type *type,
                        struct kmalloc_slab **slabp);

/*
 * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
 * which we cannot cache will be returned.
 *
 * free_slabs        - Current structure may only be accessed by current cpu
 * remote_free_slabs - Only atomic swap operations are allowed.
 * free_count        - Only atomic operations are allowed.
 *
 * If the count is sufficient to cache the entire list, NULL is returned.
 * Otherwise the portion that was not cached is returned.
 */
static __noinline
struct kmalloc_slab *
gslab_cache(struct kmalloc_slab *slab)
{
        struct kmalloc_slab *save;
        struct kmalloc_slab *next;
        struct kmalloc_slab *res;
        struct kmalloc_slab **resp;
        struct kmalloc_slab **slabp;
        globaldata_t rgd;
        size_t count;
        int cpuid;

        res = NULL;
        resp = &res;
        KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

        /*
         * Given the slab list, get the cpuid and clip off as many matching
         * elements as fit in the cache.
         */
        while (slab) {
                cpuid = slab->orig_cpuid;
                rgd = globaldata_find(cpuid);

                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
                /*
                 * Doesn't fit in cache, put on return list.
                 */
                if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
                        *resp = slab;
                        resp = &slab->next;
                        slab = slab->next;
                        continue;
                }

                /*
                 * Collect.  We aren't required to match-up the original cpu
                 * with the disposal cpu, but it's a good idea to retain
                 * memory locality.
                 *
                 * The slabs we collect are going into the global cache,
                 * remove the type association.
                 */
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
                slabp = &slab->next;
                count = 1;
                slab->type = NULL;

                while ((next = *slabp) != NULL &&
                       next->orig_cpuid == cpuid &&
                       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
                {
                        KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
                        next->type = NULL;
                        ++count;
                        slabp = &next->next;
                }

                /*
                 * Safety, unhook before next.  next is not included in the
                 * list starting with slab that is being prepended
                 * to remote_free_slabs.
                 */
                *slabp = NULL;

                /*
                 * Now atomically prepend slab...*slabp to remote_free_slabs.
                 * Pump the count first (it's ok if the actual chain length
                 * races the count update).
                 *
                 * NOTE: In the loop, (save) is updated by fcmpset.
                 */
                atomic_add_long(&rgd->gd_kmslab.free_count, count);
                save = rgd->gd_kmslab.remote_free_slabs;
                for (;;) {
                        KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
                        *slabp = save;  /* end of slab list chain to... */
                        cpu_ccfence();
                        if (atomic_fcmpset_ptr(
                                &rgd->gd_kmslab.remote_free_slabs,
                                &save, slab))
                        {
                                break;
                        }
                }

                /*
                 * Setup for next loop
                 */
                slab = next;
        }

        /*
         * Terminate the result list and return it
         */
        *resp = NULL;

        return res;
}
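
/*
 * Illustrative sketch (not compiled): the lock-free hand-off used by
 * gslab_cache() above and gslab_alloc() below.  Any cpu may prepend a chain
 * onto a target cpu's remote_free_slabs with an fcmpset loop, while only the
 * owning cpu drains the list, which it does by atomically swapping the head
 * to NULL.  Variable names mirror the surrounding code; this is a simplified
 * fragment, not a third entry point.
 */
#if 0
        /* producer (any cpu): prepend the chain [head ... *tailp] */
        save = rgd->gd_kmslab.remote_free_slabs;
        do {
                *tailp = save;          /* chain our tail onto the old head */
                cpu_ccfence();
        } while (atomic_fcmpset_ptr(&rgd->gd_kmslab.remote_free_slabs,
                                    &save, head) == 0);

        /* consumer (owning cpu only): take the entire list in one swap */
        list = atomic_swap_ptr(
                (volatile void **)&gd->gd_kmslab.remote_free_slabs, NULL);
#endif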

/*
 * May only be called on current cpu.  Pull a free slab from the
 * pcpu cache.  If we run out, move any slabs that have built up
 * from remote cpus.
 *
 * We are only allowed to swap the remote_free_slabs head; we cannot
 * manipulate any next pointers while structures are sitting on that list.
 */
static __inline
struct kmalloc_slab *
gslab_alloc(globaldata_t gd)
{
        struct kmalloc_slab *slab;

        slab = gd->gd_kmslab.free_slabs;
        if (slab == NULL) {
                slab = atomic_swap_ptr(
                        (volatile void **)&gd->gd_kmslab.remote_free_slabs,
                        NULL);
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
        }
        if (slab) {
                gd->gd_kmslab.free_slabs = slab->next;
                slab->next = NULL;
                atomic_add_long(&gd->gd_kmslab.free_count, -1);
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
        }
        return slab;
}

void
malloc_mgt_init(struct malloc_type *type __unused,
                struct kmalloc_mgt *mgt, size_t size)
{
        size_t offset;
        size_t count;

        bzero(mgt, sizeof(*mgt));
        spin_init(&mgt->spin, "kmmgt");

        /*
         * Allows us to avoid a conditional.  The dummy slabs are empty
         * and have no objects.
         */
        mgt->active = &kslab_dummy;
        mgt->alternate = &kslab_dummy;
        mgt->empty_tailp = &mgt->empty;

        /*
         * Figure out the count, taking the fobjs[] array into account:
         * each object also costs one fobjs[] pointer, so sizeof(void *)
         * is added to the object size.  This initial calculation ignores
         * alignment edge-cases that might require the count to be reduced.
         */
        offset = offsetof(struct kmalloc_slab, fobjs[0]);
        count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));

        /*
         * Recalculate the offset of the first object, this time including
         * the required alignment.  (size) should already be aligned.  This
         * may push the last object beyond the slab so check and loop with
         * a reduced count as necessary.
         *
         * Ok, theoretically the count should not actually change since the
         * division above rounds-down (that is, any mis-alignment is already
         * not included in the count calculation).  But I'm not going to take
         * any chances and check anyway as a safety in case some programmer
         * changes the code above later.  This is not a time-critical code
         * path.
         */
        offset = offsetof(struct kmalloc_slab, fobjs[count]);
        offset = __VM_CACHELINE_ALIGN(offset);

        while (offset + size * count > KMALLOC_SLAB_SIZE) {
                --count;
                offset = offsetof(struct kmalloc_slab, fobjs[count]);
                offset = __VM_CACHELINE_ALIGN(offset);
                KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
        }

        mgt->slab_offset = offset;
        mgt->slab_count = count;
}

void
malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
{
        struct kmalloc_slab **slabp;

        spin_init(&dst->spin, "kmmgt");
        slabp = &dst->empty;

        while (*slabp) {
                slabp = &(*slabp)->next;
        }
        dst->empty_tailp = slabp;
}

void
malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
{
        if (mgt->active != &kslab_dummy)
                malloc_slab_destroy(type, &mgt->active);
        mgt->active = NULL;

        if (mgt->alternate != &kslab_dummy)
                malloc_slab_destroy(type, &mgt->alternate);
        mgt->alternate = NULL;

        malloc_slab_destroy(type, &mgt->partial);
        malloc_slab_destroy(type, &mgt->full);
        malloc_slab_destroy(type, &mgt->empty);
        mgt->npartial = 0;
        mgt->nfull = 0;
        mgt->nempty = 0;
        mgt->empty_tailp = &mgt->empty;

        spin_uninit(&mgt->spin);
}
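
/*
 * Worked example of the slab_count / slab_offset calculation in
 * malloc_mgt_init() above.  The concrete numbers are illustrative
 * assumptions only (they are not the actual KMALLOC_SLAB_SIZE or the real
 * struct kmalloc_slab header size):
 *
 *      slab size 131072, object size 256, 8-byte fobjs[] pointers,
 *      1024 bytes of header before fobjs[0], 64-byte cache lines:
 *
 *      count  = (131072 - 1024) / (256 + 8)             = 492
 *      offset = ALIGN64(1024 + 492 * 8) = ALIGN64(4960) = 4992
 *      check:   4992 + 492 * 256 = 130944 <= 131072     (count unchanged)
 */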

/*
 * Destroy a list of slabs.  Attempt to cache the slabs on the specified
 * (possibly remote) cpu.  This allows slabs that were operating on a
 * particular cpu to be disposed of back to that same cpu.
 */
static void
malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
{
        struct kmalloc_slab *slab;
        struct kmalloc_slab *base;
        struct kmalloc_slab **basep;
        size_t delta;

        if (*slabp == NULL)
                return;

        /*
         * Collect all slabs that can actually be destroyed, complain
         * about the rest.
         */
        base = NULL;
        basep = &base;
        while ((slab = *slabp) != NULL) {
                KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

                delta = slab->findex - slab->aindex;
                if (delta == slab->ncount) {
                        *slabp = slab->next;    /* unlink */
                        *basep = slab;          /* link into base list */
                        basep = &slab->next;
                } else {
                        kprintf("%s: slab %p %zd objects "
                                "were still allocated\n",
                                type->ks_shortdesc, slab,
                                slab->ncount - delta);
                        /* leave link intact and iterate */
                        slabp = &slab->next;
                }
        }

        /*
         * Terminate the base list of slabs that can be destroyed,
         * then cache as many of them as possible.
         */
        *basep = NULL;
        if (base == NULL)
                return;
        base = gslab_cache(base);

        /*
         * Destroy the remainder
         */
        while ((slab = base) != NULL) {
                base = slab->next;
                slab->next = (void *)(uintptr_t)-1;
                kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
        }
}

/*
 * Objects can be freed to an empty slab at any time, causing it to no
 * longer be empty.  To improve performance, we do not try to pro-actively
 * move such slabs to the appropriate partial or full list upon kfree_obj().
 * Instead, a poller comes along and tests the slabs on the empty list
 * periodically, and moves slabs that are no longer empty to the appropriate
 * list.
 *
 * --
 *
 * Poll a limited number of slabs on the empty list and move them
 * to the appropriate full or partial list.  Slabs left on the empty
 * list are rotated to the tail.
 *
 * If gcache is non-zero this function will try to place full slabs into
 * the globaldata cache, if it isn't already too full.
 *
 * The mgt is spin-locked by the caller.
 *
 * Returns non-zero if the ggm updates possibly made slabs available for
 * allocation.
 */
static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
{
        struct kmalloc_slab *marker;
        struct kmalloc_slab *slab;
        size_t delta;
        int got_something;

        if (ggm->empty == NULL)
                return 0;

        got_something = 0;
        marker = ggm->empty;

        while (count-- && (slab = ggm->empty) != NULL) {
                /*
                 * Unlink from empty
                 */
                ggm->empty = slab->next;
                slab->next = NULL;
                --ggm->nempty;
                if (ggm->empty_tailp == &slab->next)
                        ggm->empty_tailp = &ggm->empty;

                /*
                 * Check partial, full, and empty.  We rotate
                 * empty entries to the end of the empty list.
                 *
                 * NOTE: For a fully-freeable slab we also have
                 *       to check xindex.
                 */
                delta = slab->findex - slab->aindex;
                if (delta == slab->ncount) {
                        /*
                         * Stuff into the full list.  This requires setting
                         * the exis sequence number via exis_terminate().
                         */
                        KKASSERT(slab->next == NULL);
                        exis_terminate(&slab->exis);
                        slab->next = ggm->full;
                        ggm->full = slab;
                        got_something = 1;
                        ++ggm->nfull;
                } else if (delta) {
                        /*
                         * Partially full
                         */
                        KKASSERT(slab->next == NULL);
                        slab->next = ggm->partial;
                        ggm->partial = slab;
                        got_something = 1;
                        ++ggm->npartial;
                } else {
                        /*
                         * Empty
                         */
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                        if (ggm->empty == marker)
                                break;
                }
        }
        return got_something;
}

/*
 * Called once a second with the zone interlocked against destruction.
 *
 * Returns non-zero to tell the caller to iterate to the next type,
 * else the caller should stay on the current type.
 */
int
malloc_mgt_poll(struct malloc_type *type)
{
        struct kmalloc_mgt *ggm;
        struct kmalloc_slab *slab;
        struct kmalloc_slab **slabp;
        struct kmalloc_slab *base;
        struct kmalloc_slab **basep;
        size_t delta;
        int donext;
        int count;
        int retired;

        if ((type->ks_flags & KSF_OBJSIZE) == 0)
                return 1;

        /*
         * Check the partial, full, and empty lists for full freeable slabs
         * in excess of desired caching count.
         */
        ggm = &type->ks_mgt;
        spin_lock(&ggm->spin);

        /*
         * Move empty slabs to partial or full as appropriate.  We
         * don't bother checking partial slabs to see if they are full
         * for now.
         */
        malloc_mgt_poll_empty_locked(ggm, 16);

        /*
         * Ok, clean out some of the full slabs from the full list
         */
        base = NULL;
        basep = &base;
        count = ggm->nfull;
        retired = 0;
        cpu_ccfence();

        if (count > KMALLOC_MAXFREEMAGS) {
                slabp = &ggm->full;
                count -= KMALLOC_MAXFREEMAGS;
                if (count > 16)
                        count = 16;

                while (count && (slab = *slabp) != NULL) {
                        delta = slab->findex - slab->aindex;
                        if (delta == slab->ncount &&
                            slab->xindex == slab->findex &&
                            exis_freeable(&slab->exis))
                        {
                                /*
                                 * (1) No allocated entries in the structure,
                                 *     this should always be the case from the
                                 *     full list.
                                 *
                                 * (2) kfree_obj() has fully completed.  Just
                                 *     checking findex is not sufficient since
                                 *     it is incremented to reserve the slot
                                 *     before the element is loaded into it.
                                 *
                                 * (3) The slab has been on the full list for
                                 *     a sufficient number of EXIS
                                 *     pseudo_ticks, for type-safety.
                                 */
                                *slabp = slab->next;
                                *basep = slab;
                                basep = &slab->next;
                                --ggm->nfull;
                                ++ggm->gcache_count;
                                if (++retired == kzone_bretire)
                                        break;
                        } else {
                                slabp = &slab->next;
                        }
                        --count;
                }
                *basep = NULL;  /* terminate the retirement list */
                donext = (*slabp == NULL);
        } else {
                donext = 1;
        }
        spin_unlock(&ggm->spin);

        /*
         * Clean out any slabs that we couldn't stow in the globaldata cache.
         */
        if (retired) {
                if (kzone_debug) {
                        kprintf("kmalloc_poll: %s retire %d\n",
                                type->ks_shortdesc, retired);
                }
                base = gslab_cache(base);
                while ((slab = base) != NULL) {
                        base = base->next;
                        slab->next = NULL;
                        kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
                }
        }

        return donext;
}

/*
 * Optional bitmap double-free check.  This is typically turned on by
 * default for safety (sys/_malloc.h).
 */
#ifdef KMALLOC_CHECK_DOUBLE_FREE

static __inline void
bmap_set(struct kmalloc_slab *slab, void *obj)
{
        uint64_t *ptr;
        uint64_t mask;
        size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
                   slab->objsize;

        ptr = &slab->bmap[i >> 6];
        mask = (uint64_t)1U << (i & 63);
        KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
        atomic_set_64(ptr, mask);
}

static __inline void
bmap_clr(struct kmalloc_slab *slab, void *obj)
{
        uint64_t *ptr;
        uint64_t mask;
        size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
                   slab->objsize;

        ptr = &slab->bmap[i >> 6];
        mask = (uint64_t)1U << (i & 63);
        KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
        atomic_clear_64(ptr, mask);
}

#endif

/*
 * Cleanup a mgt structure.
 *
 * Always called from the current cpu, so we can manipulate the various
 * lists freely.
 *
 * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
 *          and 'full'
 */
#if 0
static void
mgt_cleanup(struct kmalloc_mgt *mgt)
{
#if 0
        struct kmalloc_slab **slabp;
        struct kmalloc_slab *slab;
        size_t delta;
        size_t total;
#endif
}
#endif

#ifdef SLAB_DEBUG
void *
_kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
                   const char *file, int line)
#else
void *
_kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
#endif
{
        struct kmalloc_slab *slab;
        struct kmalloc_use *use;
        struct kmalloc_mgt *mgt;
        struct kmalloc_mgt *ggm;
        globaldata_t gd;
        void *obj;
        size_t delta;

        /*
         * Check limits
         */
        while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
                long ttl;
                int n;

                for (n = ttl = 0; n < ncpus; ++n)
                        ttl += type->ks_use[n].memuse;
                type->ks_loosememuse = ttl;     /* not MP synchronized */
                if ((ssize_t)ttl < 0)           /* deal with occasional race */
                        ttl = 0;
                if (ttl >= type->ks_limit) {
                        if (flags & M_NULLOK)
                                return(NULL);
                        panic("%s: malloc limit exceeded",
                              type->ks_shortdesc);
                }
        }

        /*
         * Setup
         */
        crit_enter();
        logmemory_quick(malloc_beg);
        KKASSERT(size == type->ks_objsize);
        gd = mycpu;
        use = &type->ks_use[gd->gd_cpuid];

retry:
        /*
         * Check active
         *
         * NOTE: obj can be NULL if racing a _kfree_obj().
         */
        mgt = &use->mgt;
        slab = mgt->active;                     /* Might be dummy */
        delta = slab->findex - slab->aindex;
        if (__predict_true(delta != 0)) {       /* Cannot be dummy */
                size_t i;

                i = slab->aindex % slab->ncount;
                obj = slab->fobjs[i];
                if (__predict_true(obj != NULL)) {
                        slab->fobjs[i] = NULL;
                        ++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
                        bmap_set(slab, obj);
#endif
                        goto found;
                }
        }

        /*
         * Check alternate.  If we find something, swap it with
         * the active.
         *
         * NOTE: It is possible for exhausted slabs to recover entries
         *       via _kfree_obj(), so we just keep swapping until both
         *       are empty.
         *
         * NOTE: obj can be NULL if racing a _kfree_obj().
         */
        slab = mgt->alternate;                  /* Might be dummy */
        delta = slab->findex - slab->aindex;
        if (__predict_true(delta != 0)) {       /* Cannot be dummy */
                size_t i;

                mgt->alternate = mgt->active;
                mgt->active = slab;
                i = slab->aindex % slab->ncount;
                obj = slab->fobjs[i];
                if (__predict_true(obj != NULL)) {
                        slab->fobjs[i] = NULL;
                        ++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
                        bmap_set(slab, obj);
#endif
                        goto found;
                }
        }

        /*
         * Rotate a slab from the global mgt into the pcpu mgt.
         *
         *      G(partial, full) -> active -> alternate -> G(empty)
         *
         * We try to exhaust partials first to reduce fragmentation, then
         * dig into the fulls.
         */
        ggm = &type->ks_mgt;
        spin_lock(&ggm->spin);

rerotate:
        if (ggm->partial) {
                slab = mgt->alternate;          /* Might be dummy */
                mgt->alternate = mgt->active;   /* Might be dummy */
                mgt->active = ggm->partial;
                ggm->partial = ggm->partial->next;
                mgt->active->next = NULL;
                --ggm->npartial;
                if (slab != &kslab_dummy) {
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                }
                spin_unlock(&ggm->spin);
                goto retry;
        }

        if (ggm->full) {
                slab = mgt->alternate;          /* Might be dummy */
                mgt->alternate = mgt->active;   /* Might be dummy */
                mgt->active = ggm->full;
                ggm->full = ggm->full->next;
                mgt->active->next = NULL;
                --ggm->nfull;
                exis_setlive(&mgt->active->exis);
                if (slab != &kslab_dummy) {
                        KKASSERT(slab->next == NULL);
                        *ggm->empty_tailp = slab;
                        ggm->empty_tailp = &slab->next;
                        ++ggm->nempty;
                }
                spin_unlock(&ggm->spin);
                goto retry;
        }

        /*
         * We couldn't find anything, scan a limited number of empty entries
         * looking for something with objects.  This will also free excess
         * full lists that meet requirements.
         */
        if (malloc_mgt_poll_empty_locked(ggm, 16))
                goto rerotate;

        /*
         * Absolutely nothing is available, allocate a new slab and
         * rotate it in.
         *
         * Try to get a slab from the global pcpu slab cache (very cheap).
         * If that fails, allocate a new slab (very expensive).
         */
        spin_unlock(&ggm->spin);

        if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
                slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
                                       M_WAITOK);
        }

        bzero(slab, sizeof(*slab));
        KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
                 use->mgt.slab_offset);

        obj = (char *)slab + use->mgt.slab_offset;
        slab->type = type;
        slab->orig_cpuid = gd->gd_cpuid;
        slab->ncount = use->mgt.slab_count;
        slab->offset = use->mgt.slab_offset;
        slab->objsize = type->ks_objsize;
        slab->aindex = 0;
        slab->findex = slab->ncount;
        slab->xindex = slab->ncount;
        for (delta = 0; delta < slab->ncount; ++delta) {
                slab->fobjs[delta] = obj;
                obj = (char *)obj + type->ks_objsize;
        }

        /*
         * Sanity check, assert that the last byte of last object is still
         * in the slab.
         */
#if 0
        KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
                  ~KMALLOC_SLAB_MASK) == 0);
#endif
        KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
                 ~KMALLOC_SLAB_MASK) == 0,
                ("SLAB %p ncount %zd objsize %zd obj=%p\n",
                 slab, slab->ncount, slab->objsize, obj));
        slab->magic = KMALLOC_SLAB_MAGIC;
        spin_init(&slab->spin, "kmslb");

        /*
         * Rotate it in, then retry.
         *
         *      (NEW)slab -> active -> alternate -> G(empty)
         */
        spin_lock(&ggm->spin);
        if (mgt->alternate != &kslab_dummy) {
                struct kmalloc_slab *slab_tmp;

                slab_tmp = mgt->alternate;
                slab_tmp->next = NULL;
                *ggm->empty_tailp = slab_tmp;
                ggm->empty_tailp = &slab_tmp->next;
                ++ggm->nempty;
        }
        mgt->alternate = mgt->active;           /* Might be dummy */
        mgt->active = slab;
        spin_unlock(&ggm->spin);

        goto retry;

        /*
         * Found object, adjust statistics and return
         */
found:
        ++use->inuse;
        ++use->calls;
        use->memuse += size;
        use->loosememuse += size;
        if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
                /* not MP synchronized */
                type->ks_loosememuse += use->loosememuse;
                use->loosememuse = 0;
        }

        /*
         * Handle remaining flags.  M_ZERO is typically not set because
         * the inline macro deals with zeroing for constant sizes.
         */
        if (__predict_false(flags & M_ZERO))
                bzero(obj, size);

        crit_exit();
        logmemory(malloc_end, NULL, type, size, flags);

        return(obj);
}

/*
 * Free a type-stable object.  We have the base structure and can
 * calculate the slab, but from this direction we don't know which
 * mgt structure or list the slab might be on.
 */
void
_kfree_obj(void *obj, struct malloc_type *type)
{
        struct kmalloc_slab *slab;
        struct kmalloc_use *use;
        globaldata_t gd;
        size_t delta;
        size_t i;

        logmemory_quick(free_beg);
        gd = mycpu;

        /*
         * Calculate the slab from the pointer
         */
        slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
        delta = slab->findex - slab->aindex;
        KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);

        /*
         * We can only safely adjust the statistics for the current cpu.
         * Don't try to track down the original cpu.  The statistics will
         * be collected and fixed up by vmstat -m (etc).
         */
        use = &slab->type->ks_use[gd->gd_cpuid];
        --use->inuse;
        use->memuse -= slab->objsize;

        /*
         * There MUST be free space in the slab since we are returning
         * the obj to the same slab it was allocated from.
         */
        i = atomic_fetchadd_long(&slab->findex, 1);
        i = i % slab->ncount;
        if (slab->fobjs[i] != NULL) {
                kprintf("_kfree_obj failure %zd/%zd/%zd\n",
                        slab->aindex, slab->findex, slab->ncount);
        }
#ifdef KMALLOC_CHECK_DOUBLE_FREE
        bmap_clr(slab, obj);
#endif
        KKASSERT(slab->fobjs[i] == NULL);
        slab->fobjs[i] = obj;
        atomic_add_long(&slab->xindex, 1);      /* synchronizer */

        logmemory_quick(free_end);
}
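
/*
 * Illustrative sketch (not compiled) of consumer usage of the entry points
 * above.  The requirement that the requested size equal the zone's fixed
 * object size comes from the KKASSERT in _kmalloc_obj(); the function name
 * and flag choices below are examples only, not a prescribed API.
 */
#if 0
static void
example_obj_usage(struct malloc_type *type)
{
        void *obj;

        /* the size must match the zone's single object size */
        obj = _kmalloc_obj(type->ks_objsize, type, M_WAITOK | M_ZERO);

        /* ... use the type-stable object ... */

        _kfree_obj(obj, type);
}
#endif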