/*
 * KERN_KMALLOC.C	- Kernel memory allocator
 *
 * Copyright (c) 2021 The DragonFly Project, All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module implements the kmalloc_obj allocator.  This is a type-stable
 * allocator that uses the same base structures (e.g. malloc_type) plus
 * some extensions to efficiently implement single-type zones.
 *
 * All memory management is zone based.  When a zone is destroyed, all of
 * its memory is returned to the system with no fragmentation.
 *
 * A mini-slab allocator hangs directly off the zone structure (malloc_type).
 * Since the object zones are single-size-only, the slab allocator is very
 * simple and currently utilizes just two per-zone/per-cpu slabs (active and
 * alternate) before kicking up to the per-zone cache.  Beyond that we just
 * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
 * kernel_map mappings and unmappings.
 *
 * The advantage of this is that zones don't stomp over each other and cause
 * excessive fragmentation in the slabs.  For example, when you umount a
 * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
 * is returned to the system.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/slaballoc.h>
#include <sys/mbuf.h>
#include <sys/vmmeter.h>
#include <sys/spinlock.h>
#include <sys/lock.h>
#include <sys/thread.h>
#include <sys/globaldata.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/malloc.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#include <machine/cpu.h>

#include <sys/spinlock2.h>
#include <sys/thread2.h>
#include <sys/exislock2.h>
#include <vm/vm_page2.h>

#define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
#define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags

#if !defined(KTR_MEMORY)
#define KTR_MEMORY	KTR_ALL
#endif
KTR_INFO_MASTER(mem_obj);
KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
#if 0
KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
#endif
KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");

#define logmemory(name, ptr, type, size, flags)				\
	KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
#define logmemory_quick(name)						\
	KTR_LOG(mem_obj_ ## name)

__read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
__read_frequently static int kzone_bretire = 4;
SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
__read_frequently static int kzone_debug;
SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");

__read_frequently struct kmalloc_slab kslab_dummy;

static void malloc_slab_destroy(struct malloc_type *type,
			struct kmalloc_slab **slabp);

/*
 * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
 * which we cannot cache will be returned.
 *
 * free_slabs	     - Current structure may only be accessed by current cpu
 * remote_free_slabs - Only atomic swap operations are allowed.
 * free_count	     - Only atomic operations are allowed.
 *
 * If the count is sufficient to cache the entire list, NULL is returned.
 * Otherwise the portion that was not cached is returned.
 */
static __noinline
struct kmalloc_slab *
gslab_cache(struct kmalloc_slab *slab)
{
	struct kmalloc_slab *save;
	struct kmalloc_slab *next;
	struct kmalloc_slab *res;
	struct kmalloc_slab **resp;
	struct kmalloc_slab **slabp;
	globaldata_t rgd;
	size_t count;
	int cpuid;

	res = NULL;
	resp = &res;
	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

	/*
	 * Given the slab list, get the cpuid and clip off as many matching
	 * elements as fit in the cache.
	 */
	while (slab) {
		cpuid = slab->orig_cpuid;
		rgd = globaldata_find(cpuid);

		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		/*
		 * Doesn't fit in cache, put on return list.
		 */
		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
			*resp = slab;
			resp = &slab->next;
			slab = slab->next;
			continue;
		}

		/*
		 * Collect.  We aren't required to match up the original cpu
		 * with the disposal cpu, but it's a good idea to retain
		 * memory locality.
		 *
		 * The slabs we collect are going into the global cache,
		 * so remove the type association.
		 */
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
		slabp = &slab->next;
		count = 1;
		slab->type = NULL;

		while ((next = *slabp) != NULL &&
		       next->orig_cpuid == cpuid &&
		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
		{
			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
			next->type = NULL;
			++count;
			slabp = &next->next;
		}

		/*
		 * Safety: unhook the chain before (next).  (next) is not
		 * included in the list starting at (slab) that is being
		 * prepended to remote_free_slabs.
		 */
		*slabp = NULL;

		/*
		 * Now atomically prepend slab...*slabp to remote_free_slabs.
		 * Pump the count first (it's ok if the actual chain length
		 * races the count update).
		 *
		 * NOTE: In the loop, (save) is updated by fcmpset.
		 */
		atomic_add_long(&rgd->gd_kmslab.free_count, count);
		save = rgd->gd_kmslab.remote_free_slabs;
		for (;;) {
			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
			*slabp = save;	/* end of slab list chain to... */
			cpu_ccfence();
			if (atomic_fcmpset_ptr(
				&rgd->gd_kmslab.remote_free_slabs,
				&save, slab))
			{
				break;
			}
		}

		/*
		 * Setup for next loop
		 */
		slab = next;
	}

	/*
	 * Terminate the result list and return it
	 */
	*resp = NULL;

	return res;
}

/*
 * May only be called on the current cpu.  Pull a free slab from the
 * pcpu cache.  If we run out, move any slabs that have built up
 * from remote cpus.
 *
 * We are only allowed to swap the remote_free_slabs head; we cannot
 * manipulate any next pointers while structures are sitting on that list.
 */
static __inline
struct kmalloc_slab *
gslab_alloc(globaldata_t gd)
{
	struct kmalloc_slab *slab;

	slab = gd->gd_kmslab.free_slabs;
	if (slab == NULL) {
		slab = atomic_swap_ptr(
			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
			NULL);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	if (slab) {
		gd->gd_kmslab.free_slabs = slab->next;
		slab->next = NULL;
		atomic_add_long(&gd->gd_kmslab.free_count, -1);
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
	}
	return slab;
}

void
malloc_mgt_init(struct malloc_type *type __unused,
		struct kmalloc_mgt *mgt, size_t size)
{
	size_t offset;
	size_t count;

	bzero(mgt, sizeof(*mgt));
	spin_init(&mgt->spin, "kmmgt");

	/*
	 * Using the dummy slab allows us to avoid a conditional.  The dummy
	 * slabs are empty and have no objects.
	 */
	mgt->active = &kslab_dummy;
	mgt->alternate = &kslab_dummy;
	mgt->empty_tailp = &mgt->empty;

	/*
	 * Figure out the count, accounting for the fobjs[] array by adding
	 * the size of one pointer to the object size.  This initial
	 * calculation ignores alignment edge-cases that might require the
	 * count to be reduced.
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[0]);
	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));

	/*
	 * Recalculate the offset of the first object, this time including
	 * the required alignment.  (size) should already be aligned.  This
	 * may push the last object beyond the slab so check and loop with
	 * a reduced count as necessary.
	 *
	 * Ok, theoretically the count should not actually change since the
	 * division above rounds-down (that is, any mis-alignment is already
	 * not included in the count calculation).  But I'm not going to take
	 * any chances and check anyway as a safety in case some programmer
	 * changes the code above later.  This is not a time-critical code
	 * path.
	 */
	offset = offsetof(struct kmalloc_slab, fobjs[count]);
	offset = __VM_CACHELINE_ALIGN(offset);

	while (offset + size * count > KMALLOC_SLAB_SIZE) {
		--count;
		offset = offsetof(struct kmalloc_slab, fobjs[count]);
		offset = __VM_CACHELINE_ALIGN(offset);
		KKASSERT(offset + size * count <= KMALLOC_SLAB_SIZE);
	}

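	/*
	 * Resulting slab layout (illustrative): the kmalloc_slab header and
	 * the fobjs[count] pointer array, padded up to a cacheline boundary
	 * (slab_offset), followed by count objects of 'size' bytes each,
	 * all within a single KMALLOC_SLAB_SIZE slab.
	 */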
	mgt->slab_offset = offset;
	mgt->slab_count	 = count;
}

void
malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
{
	struct kmalloc_slab **slabp;

	spin_init(&dst->spin, "kmmgt");
	slabp = &dst->empty;

	while (*slabp) {
		slabp = &(*slabp)->next;
	}
	dst->empty_tailp = slabp;
}

void
malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
{
	if (mgt->active != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->active);
	mgt->active = NULL;

	if (mgt->alternate != &kslab_dummy)
		malloc_slab_destroy(type, &mgt->alternate);
	mgt->alternate = NULL;

	malloc_slab_destroy(type, &mgt->partial);
	malloc_slab_destroy(type, &mgt->full);
	malloc_slab_destroy(type, &mgt->empty);
	mgt->npartial = 0;
	mgt->nfull = 0;
	mgt->nempty = 0;
	mgt->empty_tailp = &mgt->empty;

	spin_uninit(&mgt->spin);
}

/*
 * Destroy a list of slabs.  Attempt to cache the slabs on their originating
 * (possibly remote) cpus.  This allows slabs that were operating on a
 * particular cpu to be disposed of back to that same cpu.
 */
static void
malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
{
	struct kmalloc_slab *slab;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;

	if (*slabp == NULL)
		return;

	/*
	 * Collect all slabs that can actually be destroyed, complain
	 * about the rest.
	 */
	base = NULL;
	basep = &base;
	while ((slab = *slabp) != NULL) {
		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);

		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			*slabp = slab->next;	/* unlink */
			*basep = slab;		/* link into base list */
			basep = &slab->next;
		} else {
			kprintf("%s: slab %p %zd objects "
				"were still allocated\n",
				type->ks_shortdesc, slab,
				slab->ncount - delta);
			/* leave link intact and iterate */
			slabp = &slab->next;
		}
	}

	/*
	 * Terminate the base list of slabs that can be destroyed,
	 * then cache as many of them as possible.
	 */
	*basep = NULL;
	if (base == NULL)
		return;
	base = gslab_cache(base);

	/*
	 * Destroy the remainder
	 */
	while ((slab = base) != NULL) {
		base = slab->next;
		slab->next = (void *)(uintptr_t)-1;
		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
	}
}

/*
 * Objects can be freed to an empty slab at any time, causing it to no
 * longer be empty.  To improve performance, we do not try to pro-actively
 * move such slabs to the appropriate partial or full list upon kfree_obj().
 * Instead, a poller comes along and tests the slabs on the empty list
 * periodically, and moves slabs that are no longer empty to the appropriate
 * list.
 *
 * --
 *
 * Poll a limited number of slabs on the empty list and move them
 * to the appropriate full or partial list.  Slabs left on the empty
 * list are rotated to the tail.
 *
 * The caller must hold the mgt spin-lock.
 *
 * Returns non-zero if the ggm updates possibly made slabs available for
 * allocation.
 */
static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
{
	struct kmalloc_slab *marker;
	struct kmalloc_slab *slab;
	size_t delta;
	int got_something;

	if (ggm->empty == NULL)
		return 0;

	got_something = 0;
	marker = ggm->empty;

	while (count-- && (slab = ggm->empty) != NULL) {
		/*
		 * Unlink from empty
		 */
		ggm->empty = slab->next;
		slab->next = NULL;
		--ggm->nempty;
		if (ggm->empty_tailp == &slab->next)
			ggm->empty_tailp = &ggm->empty;

		/*
		 * Check partial, full, and empty.  We rotate
		 * empty entries to the end of the empty list.
		 *
		 * NOTE: For a fully-freeable slab we also have
		 *	 to check xindex.
		 */
		delta = slab->findex - slab->aindex;
		if (delta == slab->ncount) {
			/*
			 * Stuff into the full list.  This requires setting
			 * the exis sequence number via exis_terminate().
			 */
			KKASSERT(slab->next == NULL);
			exis_terminate(&slab->exis);
			slab->next = ggm->full;
			ggm->full = slab;
			got_something = 1;
			++ggm->nfull;
		} else if (delta) {
			/*
			 * Partially full
			 */
			KKASSERT(slab->next == NULL);
			slab->next = ggm->partial;
			ggm->partial = slab;
			got_something = 1;
			++ggm->npartial;
		} else {
			/*
			 * Empty
			 */
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
			if (ggm->empty == marker)
				break;
		}
	}
	return got_something;
}

/*
 * Called once a second with the zone interlocked against destruction.
 *
 * Returns non-zero to tell the caller to iterate to the next type,
 * else the caller should stay on the current type.
 */
int
malloc_mgt_poll(struct malloc_type *type)
{
	struct kmalloc_mgt *ggm;
	struct kmalloc_slab *slab;
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *base;
	struct kmalloc_slab **basep;
	size_t delta;
	int donext;
	int count;
	int retired;

	if ((type->ks_flags & KSF_OBJSIZE) == 0)
		return 1;

	/*
	 * Check the partial, full, and empty lists for full freeable slabs
	 * in excess of desired caching count.
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

	/*
	 * Move empty slabs to partial or full as appropriate.  We
	 * don't bother checking partial slabs to see if they are full
	 * for now.
	 */
	malloc_mgt_poll_empty_locked(ggm, 16);

	/*
	 * Ok, clean out some of the full slabs from the full list
	 */
	base = NULL;
	basep = &base;
	count = ggm->nfull;
	retired = 0;
	cpu_ccfence();

	if (count > KMALLOC_MAXFREEMAGS) {
		slabp = &ggm->full;
		count -= KMALLOC_MAXFREEMAGS;
		if (count > 16)
			count = 16;

		while (count && (slab = *slabp) != NULL) {
			delta = slab->findex - slab->aindex;
			if (delta == slab->ncount &&
			    slab->xindex == slab->findex &&
			    exis_freeable(&slab->exis))
			{
				/*
				 * (1) No allocated entries in the structure,
				 *     this should always be the case from the
				 *     full list.
				 *
				 * (2) kfree_obj() has fully completed.  Just
				 *     checking findex is not sufficient since
				 *     it is incremented to reserve the slot
				 *     before the element is loaded into it.
				 *
				 * (3) The slab has been on the full list for
				 *     a sufficient number of EXIS
				 *     pseudo_ticks, for type-safety.
				 */
				*slabp = slab->next;
				*basep = slab;
				basep = &slab->next;
				--ggm->nfull;
				++ggm->gcache_count;
				if (++retired == kzone_bretire)
					break;
			} else {
				slabp = &slab->next;
			}
			--count;
		}
		*basep = NULL;	/* terminate the retirement list */
		donext = (*slabp == NULL);
	} else {
		donext = 1;
	}
	spin_unlock(&ggm->spin);

	/*
	 * Clean out any slabs that we couldn't stow in the globaldata cache.
	 */
	if (retired) {
		if (kzone_debug) {
			kprintf("kmalloc_poll: %s retire %d\n",
				type->ks_shortdesc, retired);
		}
		base = gslab_cache(base);
		while ((slab = base) != NULL) {
			base = base->next;
			slab->next = NULL;
			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
		}
	}

	return donext;
}

/*
 * Optional bitmap double-free check.  This is typically turned on by
 * default for safety (sys/_malloc.h)
 */
#ifdef KMALLOC_CHECK_DOUBLE_FREE

static __inline void
bmap_set(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
	atomic_set_64(ptr, mask);
}

static __inline void
bmap_clr(struct kmalloc_slab *slab, void *obj)
{
	uint64_t *ptr;
	uint64_t mask;
	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
		   slab->objsize;

	ptr = &slab->bmap[i >> 6];
	mask = (uint64_t)1U << (i & 63);
	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
	atomic_clear_64(ptr, mask);
}

#endif

/*
 * Clean up a mgt structure.
 *
 * Always called from the current cpu, so we can manipulate the various
 * lists freely.
 *
 * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
 *	    and 'full'
 */
#if 0
static void
mgt_cleanup(struct kmalloc_mgt *mgt)
{
#if 0
	struct kmalloc_slab **slabp;
	struct kmalloc_slab *slab;
	size_t delta;
	size_t total;
#endif
}
#endif

#ifdef SLAB_DEBUG
void *
_kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
	      const char *file, int line)
#else
void *
_kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
#endif
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	struct kmalloc_mgt *mgt;
	struct kmalloc_mgt *ggm;
	globaldata_t gd;
	void *obj;
	size_t delta;

	/*
	 * Check limits
	 */
	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
		long ttl;
		int n;

		for (n = ttl = 0; n < ncpus; ++n)
			ttl += type->ks_use[n].memuse;
		type->ks_loosememuse = ttl;	/* not MP synchronized */
		if ((ssize_t)ttl < 0)		/* deal with occasional race */
			ttl = 0;
		if (ttl >= type->ks_limit) {
			if (flags & M_NULLOK)
				return(NULL);
			panic("%s: malloc limit exceeded", type->ks_shortdesc);
		}
	}

	/*
	 * Setup
	 */
	crit_enter();
	logmemory_quick(malloc_beg);
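	/* kmalloc_obj zones are single-size-only; the request must match */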
	KKASSERT(size == type->ks_objsize);
	gd = mycpu;
	use = &type->ks_use[gd->gd_cpuid];

retry:
	/*
	 * Check active
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	mgt = &use->mgt;
	slab = mgt->active;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Check alternate.  If we find something, swap it with
	 * the active.
	 *
	 * NOTE: It is possible for exhausted slabs to recover entries
	 *	 via _kfree_obj(), so we just keep swapping until both
	 *	 are empty.
	 *
	 * NOTE: obj can be NULL if racing a _kfree_obj().
	 */
	slab = mgt->alternate;			/* Might be dummy */
	delta = slab->findex - slab->aindex;
	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
		size_t i;

		mgt->alternate = mgt->active;
		mgt->active = slab;
		i = slab->aindex % slab->ncount;
		obj = slab->fobjs[i];
		if (__predict_true(obj != NULL)) {
			slab->fobjs[i] = NULL;
			++slab->aindex;
#ifdef KMALLOC_CHECK_DOUBLE_FREE
			bmap_set(slab, obj);
#endif
			goto found;
		}
	}

	/*
	 * Rotate a slab from the global mgt into the pcpu mgt.
	 *
	 *	G(partial, full) -> active -> alternate -> G(empty)
	 *
	 * We try to exhaust partials first to reduce fragmentation, then
	 * dig into the fulls.
	 */
	ggm = &type->ks_mgt;
	spin_lock(&ggm->spin);

rerotate:
	if (ggm->partial) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->partial;
		ggm->partial = ggm->partial->next;
		mgt->active->next = NULL;
		--ggm->npartial;
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	if (ggm->full) {
		slab = mgt->alternate;		/* Might be dummy */
		mgt->alternate = mgt->active;	/* Might be dummy */
		mgt->active = ggm->full;
		ggm->full = ggm->full->next;
		mgt->active->next = NULL;
		--ggm->nfull;
		exis_setlive(&mgt->active->exis);
		if (slab != &kslab_dummy) {
			KKASSERT(slab->next == NULL);
			*ggm->empty_tailp = slab;
			ggm->empty_tailp = &slab->next;
			++ggm->nempty;
		}
		spin_unlock(&ggm->spin);
		goto retry;
	}

	/*
	 * We couldn't find anything; scan a limited number of empty entries
	 * looking for something with objects.  This also moves slabs that
	 * are no longer empty to the appropriate full or partial list.
	 */
	if (malloc_mgt_poll_empty_locked(ggm, 16))
		goto rerotate;

	/*
	 * Absolutely nothing is available; allocate a new slab and
	 * rotate it in.
	 *
	 * Try to get a slab from the global pcpu slab cache (very cheap).
	 * If that fails, allocate a new slab (very expensive).
	 */
	spin_unlock(&ggm->spin);

	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
				       M_WAITOK);
	}

	bzero(slab, sizeof(*slab));
	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
		 use->mgt.slab_offset);

	obj = (char *)slab + use->mgt.slab_offset;
	slab->type = type;
	slab->orig_cpuid = gd->gd_cpuid;
	slab->ncount = use->mgt.slab_count;
	slab->offset = use->mgt.slab_offset;
	slab->objsize = type->ks_objsize;
	slab->aindex = 0;
	slab->findex = slab->ncount;
	slab->xindex = slab->ncount;
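	/*
	 * With aindex == 0 and findex == xindex == ncount, all ncount
	 * slots in fobjs[] are considered free; populate them below.
	 */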
	for (delta = 0; delta < slab->ncount; ++delta) {
		slab->fobjs[delta] = obj;
		obj = (char *)obj + type->ks_objsize;
	}

	/*
	 * Sanity check, assert that the last byte of the last object is
	 * still in the slab.
	 */
#if 0
	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		  ~KMALLOC_SLAB_MASK) == 0);
#endif
	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
		 ~KMALLOC_SLAB_MASK) == 0,
		("SLAB %p ncount %zd objsize %zd obj=%p\n",
		 slab, slab->ncount, slab->objsize, obj));
	slab->magic = KMALLOC_SLAB_MAGIC;
	spin_init(&slab->spin, "kmslb");

	/*
	 * Rotate it in, then retry.
	 *
	 *	(NEW)slab -> active -> alternate -> G(empty)
	 */
	spin_lock(&ggm->spin);
	if (mgt->alternate != &kslab_dummy) {
		struct kmalloc_slab *slab_tmp;

		slab_tmp = mgt->alternate;
		slab_tmp->next = NULL;
		*ggm->empty_tailp = slab_tmp;
		ggm->empty_tailp = &slab_tmp->next;
		++ggm->nempty;
	}
	mgt->alternate = mgt->active;		/* Might be dummy */
	mgt->active = slab;
	spin_unlock(&ggm->spin);

	goto retry;

	/*
	 * Found object, adjust statistics and return
	 */
found:
	++use->inuse;
	++use->calls;
	use->memuse += size;
	use->loosememuse += size;
	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
	    /* not MP synchronized */
	    type->ks_loosememuse += use->loosememuse;
	    use->loosememuse = 0;
	}

	/*
	 * Handle remaining flags.  M_ZERO is typically not set because
	 * the inline macro deals with zeroing for constant sizes.
	 */
	if (__predict_false(flags & M_ZERO))
	    bzero(obj, size);

	crit_exit();
	logmemory(malloc_end, NULL, type, size, flags);

	return(obj);
}

/*
 * Free a type-stable object.  We have the base structure and can
 * calculate the slab, but from this direction we don't know which
 * mgt structure or list the slab might be on.
 */
void
_kfree_obj(void *obj, struct malloc_type *type)
{
	struct kmalloc_slab *slab;
	struct kmalloc_use *use;
	globaldata_t gd;
	size_t	delta;
	size_t	i;

	logmemory_quick(free_beg);
	gd = mycpu;

	/*
	 * Calculate the slab from the pointer
	 */
	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
	delta = slab->findex - slab->aindex;
	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);

	/*
	 * We can only safely adjust the statistics for the current cpu.
	 * Don't try to track down the original cpu.  The statistics will
	 * be collected and fixed up by vmstat -m  (etc).
	 */
	use = &slab->type->ks_use[gd->gd_cpuid];
	--use->inuse;
	use->memuse -= slab->objsize;

	/*
	 * There MUST be free space in the slab since we are returning
	 * the obj to the same slab it was allocated from.
	 */
	i = atomic_fetchadd_long(&slab->findex, 1);
	i = i % slab->ncount;
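	/*
	 * fobjs[] is used as a ring; findex and aindex only increase, so
	 * (findex % ncount) is the slot this free reserves.
	 */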
	if (slab->fobjs[i] != NULL) {
		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
			slab->aindex, slab->findex, slab->ncount);
	}
#ifdef KMALLOC_CHECK_DOUBLE_FREE
	bmap_clr(slab, obj);
#endif
	KKASSERT(slab->fobjs[i] == NULL);
	slab->fobjs[i] = obj;
	atomic_add_long(&slab->xindex, 1);	/* synchronizer */

	logmemory_quick(free_end);
}
972