xref: /dragonfly/sys/kern/kern_kmalloc.c (revision 0b2c5ee3)
1 /*
2  * KERN_KMALLOC.C	- Kernel memory allocator
3  *
4  * Copyright (c) 2021 The DragonFly Project, All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * This module implements the kmalloc_obj allocator.  This is a type-stable
39  * allocator that uses the same base structures (e.g. malloc_type) plus
40  * some extensions to efficiently implement single-type zones.
41  *
42  * All memory management is zone based.  When a zone is destroyed, all of
43  * its memory is returned to the system with no fragmentation.
44  *
45  * A mini-slab allocator hangs directly off the zone structure (malloc_type).
46  * Since the object zones are single-size-only, the slab allocator is very
47  * simple and currently utilizes just two per-zone/per-cpu slabs (active and
48  * alternate) before kicking up to the per-zone cache.  Beyond that we just
49  * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
50  * kernel_map mappings and unmappings.
51  *
52  * The advantage of this is that zones don't stomp over each other and cause
53  * excessive fragmentation in the slabs.  For example, when you umount a
54  * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
55  * is returned to the system.
56  */
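
/*
 * Illustrative usage sketch (compiled out).  "struct foo" and M_FOOZONE are
 * hypothetical; a real consumer declares its object zone through the usual
 * malloc_type machinery in <sys/malloc.h> and normally goes through the
 * kmalloc_obj()/kfree_obj() wrappers rather than these internal functions.
 */
#if 0
struct foo {
	int	f_refs;
	char	f_name[32];
};

extern struct malloc_type *M_FOOZONE;	/* assumed to be declared elsewhere */

static struct foo *
foo_alloc(void)
{
	/* the request size must equal the zone's fixed ks_objsize */
	return (_kmalloc_obj(sizeof(struct foo), M_FOOZONE, M_WAITOK | M_ZERO));
}

static void
foo_free(struct foo *fp)
{
	_kfree_obj(fp, M_FOOZONE);
}
#endif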
57 
58 #include "opt_vm.h"
59 
60 #include <sys/param.h>
61 #include <sys/systm.h>
62 #include <sys/kernel.h>
63 #include <sys/slaballoc.h>
64 #include <sys/mbuf.h>
65 #include <sys/vmmeter.h>
66 #include <sys/spinlock.h>
67 #include <sys/lock.h>
68 #include <sys/thread.h>
69 #include <sys/globaldata.h>
70 #include <sys/sysctl.h>
71 #include <sys/ktr.h>
72 #include <sys/malloc.h>
73 
74 #include <vm/vm.h>
75 #include <vm/vm_param.h>
76 #include <vm/vm_kern.h>
77 #include <vm/vm_extern.h>
78 #include <vm/vm_object.h>
79 #include <vm/pmap.h>
80 #include <vm/vm_map.h>
81 #include <vm/vm_page.h>
82 #include <vm/vm_pageout.h>
83 
84 #include <machine/cpu.h>
85 
86 #include <sys/spinlock2.h>
87 #include <sys/thread2.h>
88 #include <sys/exislock2.h>
89 #include <vm/vm_page2.h>
90 
91 #define MEMORY_STRING	"ptr=%p type=%p size=%lu flags=%04x"
92 #define MEMORY_ARGS	void *ptr, void *type, unsigned long size, int flags
93 
94 #if !defined(KTR_MEMORY)
95 #define KTR_MEMORY	KTR_ALL
96 #endif
97 KTR_INFO_MASTER(mem_obj);
98 KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
99 KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
100 #if 0
101 KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
102 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
103 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
104 KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
105 KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
106 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
107 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
108 #endif
109 KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
110 KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");
111 
112 #define logmemory(name, ptr, type, size, flags)				\
113 	KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
114 #define logmemory_quick(name)						\
115 	KTR_LOG(mem_obj_ ## name)
116 
117 __read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
118 SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
119 __read_frequently static int kzone_bretire = 4;
120 SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
121 __read_frequently static int kzone_debug;
122 SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");
123 
124 __read_frequently struct kmalloc_slab kslab_dummy;
125 
126 static void malloc_slab_destroy(struct malloc_type *type,
127 			struct kmalloc_slab **slabp);
128 
129 /*
130  * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
131  * which we cannot cache will be returned.
132  *
133  * free_slabs	     - Current structure may only be accessed by current cpu
134  * remote_free_slabs - Only atomic swap operations are allowed.
135  * free_count	     - Only atomic operations are allowed.
136  *
137  * If the count is sufficient to cache the entire list, NULL is returned.
138  * Otherwise the portion that was not cached is returned.
139  */
140 static __noinline
141 struct kmalloc_slab *
142 gslab_cache(struct kmalloc_slab *slab)
143 {
144 	struct kmalloc_slab *save;
145 	struct kmalloc_slab *next;
146 	struct kmalloc_slab *res;
147 	struct kmalloc_slab **resp;
148 	struct kmalloc_slab **slabp;
149 	globaldata_t rgd;
150 	size_t count;
151 	int cpuid;
152 
153 	res = NULL;
154 	resp = &res;
155 	KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
156 
157 	/*
158 	 * Given the slab list, get the cpuid and clip off as many matching
159 	 * elements as fit in the cache.
160 	 */
161 	while (slab) {
162 		cpuid = slab->orig_cpuid;
163 		rgd = globaldata_find(cpuid);
164 
165 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
166 		/*
167 		 * Doesn't fit in cache, put on return list.
168 		 */
169 		if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
170 			*resp = slab;
171 			resp = &slab->next;
172 			slab = slab->next;
173 			continue;
174 		}
175 
176 		/*
177 		 * Collect.  We aren't required to match up the original cpu
178 		 * with the disposal cpu, but it's a good idea to retain
179 		 * memory locality.
180 		 *
181 		 * The slabs we collect are going into the global cache,
182 		 * remove the type association.
183 		 */
184 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
185 		slabp = &slab->next;
186 		count = 1;
187 		slab->type = NULL;
188 
189 		while ((next = *slabp) != NULL &&
190 		       next->orig_cpuid == cpuid &&
191 		       rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
192 	        {
193 			KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
194 			next->type = NULL;
195 			++count;
196 			slabp = &next->next;
197 		}
198 
199 		/*
200 		 * Safety: unhook the chain before (next).  (next) is not part
201 		 * of the list starting at (slab) that is being prepended
202 		 * to remote_free_slabs.
203 		 */
204 		*slabp = NULL;
205 
206 		/*
207 		 * Now atomically prepend slab...*slabp to remote_free_slabs.
208 		 * Pump the count first (it's ok if the actual chain length
209 		 * races the count update).
210 		 *
211 		 * NOTE: In the loop, (save) is updated by fcmpset.
212 		 */
213 		atomic_add_long(&rgd->gd_kmslab.free_count, count);
214 		save = rgd->gd_kmslab.remote_free_slabs;
215 		for (;;) {
216 			KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
217 			*slabp = save;	/* end of slab list chain to... */
218 			cpu_ccfence();
219 			if (atomic_fcmpset_ptr(
220 				&rgd->gd_kmslab.remote_free_slabs,
221 				&save, slab))
222 			{
223 				break;
224 			}
225 		}
226 
227 		/*
228 		 * Setup for next loop
229 		 */
230 		slab = next;
231 	}
232 
233 	/*
234 	 * Terminate the result list and return it
235 	 */
236 	*resp = NULL;
237 
238 	return res;
239 }
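
/*
 * Compiled-out sketch of the lock-free prepend idiom used by gslab_cache()
 * above: push an already-linked chain [head..*tailp] onto a list head that
 * remote cpus may also be swapping.  The function and parameter names here
 * are illustrative only.
 */
#if 0
static void
gslab_prepend_chain(struct kmalloc_slab **listp, struct kmalloc_slab *head,
		    struct kmalloc_slab **tailp)
{
	struct kmalloc_slab *ohead;

	ohead = *listp;
	for (;;) {
		*tailp = ohead;		/* chain tail points at the old head */
		cpu_ccfence();		/* ensure *tailp is written first */
		if (atomic_fcmpset_ptr(listp, &ohead, head))
			break;		/* new head published atomically */
		/* ohead was reloaded by the failed fcmpset, retry */
	}
}
#endif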
240 
241 /*
242  * May only be called on current cpu.  Pull a free slab from the
243  * pcpu cache.  If we run out, move any slabs that have built up
244  * from remote cpus.
245  *
246  * We are only allowed to swap the remote_free_slabs head, we cannot
247  * manipulate any next pointers while structures are sitting on that list.
248  */
249 static __inline
250 struct kmalloc_slab *
251 gslab_alloc(globaldata_t gd)
252 {
253 	struct kmalloc_slab *slab;
254 
255 	slab = gd->gd_kmslab.free_slabs;
256 	if (slab == NULL) {
257 		slab = atomic_swap_ptr(
258 			(volatile void **)&gd->gd_kmslab.remote_free_slabs,
259 			NULL);
260 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
261 	}
262 	if (slab) {
263 		gd->gd_kmslab.free_slabs = slab->next;
264 		slab->next = NULL;
265 		atomic_add_long(&gd->gd_kmslab.free_count, -1);
266 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
267 	}
268 	return slab;
269 }
270 
271 void
272 malloc_mgt_init(struct malloc_type *type __unused,
273 		struct kmalloc_mgt *mgt, size_t size)
274 {
275 	size_t offset;
276 	size_t count;
277 
278 	bzero(mgt, sizeof(*mgt));
279 	spin_init(&mgt->spin, "kmmgt");
280 
281 	/*
282 	 * Allows us to avoid a conditional.  The dummy slabs are empty
283 	 * and have no objects.
284 	 */
285 	mgt->active = &kslab_dummy;
286 	mgt->alternate = &kslab_dummy;
287 	mgt->empty_tailp = &mgt->empty;
288 
289 	/*
290 	 * Figure out the count, accounting for each object's fobjs[] pointer
291 	 * by adding sizeof(void *) to the object size.
292 	 */
293 	offset = offsetof(struct kmalloc_slab, fobjs[0]);
294 	offset = __VM_CACHELINE_ALIGN(offset);
295 	count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));
296 
297 	/*
298 	 * However, the fobjs[] array itself must be aligned, so we might
299 	 * have to reduce the count by 1.  (We can do this because 'size'
300 	 * is already aligned as well).
301 	 */
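	/*
	 * Worked example using purely hypothetical numbers: for a 128KB slab
	 * whose aligned fobjs[0] offset is 320 and whose objects are 512
	 * bytes, count = (131072 - 320) / (512 + 8) = 251.  Re-aligning
	 * offsetof(fobjs[251]) = 320 + 251 * 8 = 2328 up to 2368 still
	 * leaves 2368 + 251 * 512 = 130880 <= 131072, so no reduction is
	 * needed in that case.
	 */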
302 	offset = offsetof(struct kmalloc_slab, fobjs[count]);
303 	offset = __VM_CACHELINE_ALIGN(offset);
304 
305 	if (offset + size * count > KMALLOC_SLAB_SIZE) {
306 		--count;
307 		offset = offsetof(struct kmalloc_slab, fobjs[count]);
308 		offset = __VM_CACHELINE_ALIGN(offset);
309 		KKASSERT (offset + size * count <= KMALLOC_SLAB_SIZE);
310 	}
311 
312 	mgt->slab_offset = offset;
313 	mgt->slab_count	 = count;
314 }
315 
316 void
317 malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
318 {
319 	struct kmalloc_slab **slabp;
320 
321 	spin_init(&dst->spin, "kmmgt");
322 	slabp = &dst->empty;
323 
324 	while (*slabp) {
325 		slabp = &(*slabp)->next;
326 	}
327 	dst->empty_tailp = slabp;
328 }
329 
330 void
331 malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
332 {
333 	if (mgt->active != &kslab_dummy)
334 		malloc_slab_destroy(type, &mgt->active);
335 	mgt->active = NULL;
336 
337 	if (mgt->alternate != &kslab_dummy)
338 		malloc_slab_destroy(type, &mgt->alternate);
339 	mgt->alternate = NULL;
340 
341 	malloc_slab_destroy(type, &mgt->partial);
342 	malloc_slab_destroy(type, &mgt->full);
343 	malloc_slab_destroy(type, &mgt->empty);
344 	mgt->npartial = 0;
345 	mgt->nfull = 0;
346 	mgt->nempty = 0;
347 	mgt->empty_tailp = &mgt->empty;
348 
349 	spin_uninit(&mgt->spin);
350 }
351 
352 /*
353  * Destroy a list of slabs.  Attempt to cache the slabs on the specified
354  * (possibly remote) cpu.  This allows slabs that were operating on a
355  * particular cpu to be disposed of back to that same cpu.
356  */
357 static void
358 malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
359 {
360 	struct kmalloc_slab *slab;
361 	struct kmalloc_slab *base;
362 	struct kmalloc_slab **basep;
363 	size_t delta;
364 
365 	if (*slabp == NULL)
366 		return;
367 
368 	/*
369 	 * Collect all slabs that can actually be destroyed, complain
370 	 * about the rest.
371 	 */
372 	base = NULL;
373 	basep = &base;
374 	while ((slab = *slabp) != NULL) {
375 		KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
376 
377 		delta = slab->findex - slab->aindex;
378 		if (delta == slab->ncount) {
379 			*slabp = slab->next;	/* unlink */
380 			*basep = slab;		/* link into base list */
381 			basep = &slab->next;
382 		} else {
383 			kprintf("%s: slab %p %zd objects "
384 				"were still allocated\n",
385 				type->ks_shortdesc, slab,
386 				slab->ncount - delta);
387 			/* leave link intact and iterate */
388 			slabp = &slab->next;
389 		}
390 	}
391 
392 	/*
393 	 * Terminate the base list of slabs that can be destroyed,
394 	 * then cache as many of them as possible.
395 	 */
396 	*basep = NULL;
397 	if (base == NULL)
398 		return;
399 	base = gslab_cache(base);
400 
401 	/*
402 	 * Destroy the remainder
403 	 */
404 	while ((slab = base) != NULL) {
405 		base = slab->next;
406 		slab->next = (void *)(uintptr_t)-1;
407 		kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
408 	}
409 }
410 
411 /*
412  * Poll a limited number of slabs on the empty list and move them
413  * to the appropriate full or partial list.  Slabs left on the empty
414  * list are rotated to the tail.
415  *
416  * If gcache is non-zero this function will try to place full slabs into
417  * the globaldata cache, if it isn't already too full.
418  *
419  * The mgt is spin-locked
420  *
421  * Returns non-zero if the ggm updates possibly made slabs available for
422  * allocation.
423  */
424 static int
425 malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
426 {
427 	struct kmalloc_slab *marker;
428 	struct kmalloc_slab *slab;
429 	size_t delta;
430 	int got_something;
431 
432 	if (ggm->empty == NULL)
433 		return 0;
434 
435 	got_something = 0;
436 	marker = ggm->empty;
437 
438 	while (count-- && (slab = ggm->empty) != NULL) {
439 		/*
440 		 * Unlink from empty
441 		 */
442 		ggm->empty = slab->next;
443 		slab->next = NULL;
444 		--ggm->nempty;
445 		if (ggm->empty_tailp == &slab->next)
446 			ggm->empty_tailp = &ggm->empty;
447 
448 		/*
449 		 * Check partial, full, and empty.  We rotate
450 		 * empty entries to the end of the empty list.
451 		 *
452 		 * NOTE: For a fully-freeable slab we also have
453 		 *	 to check xindex.
454 		 */
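		/*
		 * Illustrative example of the index arithmetic: with
		 * ncount == 4, aindex == 10 and findex == 14 every slot
		 * holds a free object (fully freeable); findex == 12 would
		 * mean two free objects (partial); findex == aindex means
		 * no free objects at all (empty).
		 */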
455 		delta = slab->findex - slab->aindex;
456 		if (delta == slab->ncount) {
457 			/*
458 			 * Stuff into the full list.  This requires setting
459 			 * the exis sequence number via exis_terminate().
460 			 */
461 			KKASSERT(slab->next == NULL);
462 			exis_terminate(&slab->exis);
463 			slab->next = ggm->full;
464 			ggm->full = slab;
465 			got_something = 1;
466 			++ggm->nfull;
467 		} else if (delta) {
468 			/*
469 			 * Partially full
470 			 */
471 			KKASSERT(slab->next == NULL);
472 			slab->next = ggm->partial;
473 			ggm->partial = slab;
474 			got_something = 1;
475 			++ggm->npartial;
476 		} else {
477 			/*
478 			 * Empty
479 			 */
480 			KKASSERT(slab->next == NULL);
481 			*ggm->empty_tailp = slab;
482 			ggm->empty_tailp = &slab->next;
483 			++ggm->nempty;
484 			if (ggm->empty == marker)
485 				break;
486 		}
487 	}
488 	return got_something;
489 }
490 
491 /*
492  * Called once a second with the zone interlocked against destruction.
493  *
494  * Returns non-zero to tell the caller to iterate to the next type,
495  * else the caller should stay on the current type.
496  */
497 int
498 malloc_mgt_poll(struct malloc_type *type)
499 {
500 	struct kmalloc_mgt *ggm;
501 	struct kmalloc_slab *slab;
502 	struct kmalloc_slab **slabp;
503 	struct kmalloc_slab *base;
504 	struct kmalloc_slab **basep;
505 	size_t delta;
506 	int donext;
507 	int count;
508 	int retired;
509 
510 	if ((type->ks_flags & KSF_OBJSIZE) == 0)
511 		return 1;
512 
513 	/*
514 	 * Check the partial, full, and empty lists for fully-freeable slabs
515 	 * in excess of the desired caching count.
516 	 */
517 	ggm = &type->ks_mgt;
518 	spin_lock(&ggm->spin);
519 
520 	/*
521 	 * Move empty slabs to partial or full as appropriate.  We
522 	 * don't bother checking partial slabs to see if they are full
523 	 * for now.
524 	 */
525 	malloc_mgt_poll_empty_locked(ggm, 16);
526 
527 	/*
528 	 * Ok, clean out some of the full slabs from the full list
529 	 */
530 	base = NULL;
531 	basep = &base;
532 	count = ggm->nfull;
533 	retired = 0;
534 	cpu_ccfence();
535 
536 	if (count > KMALLOC_MAXFREEMAGS) {
537 		slabp = &ggm->full;
538 		count -= KMALLOC_MAXFREEMAGS;
539 		if (count > 16)
540 			count = 16;
541 
542 		while (count && (slab = *slabp) != NULL) {
543 			delta = slab->findex - slab->aindex;
544 			if (delta == slab->ncount &&
545 			    slab->xindex == slab->findex &&
546 			    exis_freeable(&slab->exis))
547 			{
548 				/*
549 				 * (1) No allocated entries in the structure,
550 				 *     this should always be the case from the
551 				 *     full list.
552 				 *
553 				 * (2) kfree_obj() has fully completed.  Just
554 				 *     checking findex is not sufficient since
555 				 *     it is incremented to reserve the slot
556 				 *     before the element is loaded into it.
557 				 *
558 				 * (3) The slab has been on the full list for
559 				 *     a sufficient number of EXIS
560 				 *     pseudo_ticks, for type-safety.
561 				 */
562 				*slabp = slab->next;
563 				*basep = slab;
564 				basep = &slab->next;
565 				--ggm->nfull;
566 				++ggm->gcache_count;
567 				if (++retired == kzone_bretire)
568 					break;
569 			} else {
570 				slabp = &slab->next;
571 			}
572 			--count;
573 		}
574 		*basep = NULL;	/* terminate the retirement list */
575 		donext = (*slabp == NULL);
576 	} else {
577 		donext = 1;
578 	}
579 	spin_unlock(&ggm->spin);
580 
581 	/*
582 	 * Clean out any slabs that we couldn't stow in the globaldata cache.
583 	 */
584 	if (retired) {
585 		if (kzone_debug) {
586 			kprintf("kmalloc_poll: %s retire %d\n",
587 				type->ks_shortdesc, retired);
588 		}
589 		base = gslab_cache(base);
590 		while ((slab = base) != NULL) {
591 			base = base->next;
592 			slab->next = NULL;
593 			kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
594 		}
595 	}
596 
597 	return donext;
598 }
599 
600 /*
601  * Optional bitmap double-free check.  This is typically turned on by
602  * default for safety (sys/_malloc.h)
603  */
604 #ifdef KMALLOC_CHECK_DOUBLE_FREE
605 
606 static __inline void
607 bmap_set(struct kmalloc_slab *slab, void *obj)
608 {
609 	uint64_t *ptr;
610 	uint64_t mask;
611 	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
612 		   slab->objsize;
613 
614 	ptr = &slab->bmap[i >> 6];
615 	mask = (uint64_t)1U << (i & 63);
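	/* e.g. (hypothetically) i == 70 selects bmap[1], bit 6 (mask 0x40) */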
616 	KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
617 	atomic_set_64(ptr, mask);
618 }
619 
620 static __inline void
621 bmap_clr(struct kmalloc_slab *slab, void *obj)
622 {
623 	uint64_t *ptr;
624 	uint64_t mask;
625 	size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
626 		   slab->objsize;
627 
628 	ptr = &slab->bmap[i >> 6];
629 	mask = (uint64_t)1U << (i & 63);
630 	KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
631 	atomic_clear_64(ptr, mask);
632 }
633 
634 #endif
635 
636 /*
637  * Cleanup a mgt structure.
638  *
639  * Always called from the current cpu, so we can manipulate the various
640  * lists freely.
641  *
642  * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
643  *	    and 'full'
644  */
645 #if 0
646 static void
647 mgt_cleanup(struct kmalloc_mgt *mgt)
648 {
649 #if 0
650 	struct kmalloc_slab **slabp;
651 	struct kmalloc_slab *slab;
652 	size_t delta;
653 	size_t total;
654 #endif
655 }
656 #endif
657 
658 #ifdef SLAB_DEBUG
659 void *
660 _kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
661 	      const char *file, int line)
662 #else
663 void *
664 _kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
665 #endif
666 {
667 	struct kmalloc_slab *slab;
668 	struct kmalloc_use *use;
669 	struct kmalloc_mgt *mgt;
670 	struct kmalloc_mgt *ggm;
671 	globaldata_t gd;
672 	void *obj;
673 	size_t delta;
674 
675 	/*
676 	 * Check limits
677 	 */
678 	while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
679 		long ttl;
680 		int n;
681 
682 		for (n = ttl = 0; n < ncpus; ++n)
683 			ttl += type->ks_use[n].memuse;
684 		type->ks_loosememuse = ttl;	/* not MP synchronized */
685 		if ((ssize_t)ttl < 0)		/* deal with occasional race */
686 			ttl = 0;
687 		if (ttl >= type->ks_limit) {
688 			if (flags & M_NULLOK)
689 				return(NULL);
690 			panic("%s: malloc limit exceeded", type->ks_shortdesc);
691 		}
692 	}
693 
694 	/*
695 	 * Setup
696 	 */
697 	crit_enter();
698 	logmemory_quick(malloc_beg);
699 	KKASSERT(size == type->ks_objsize);
700 	gd = mycpu;
701 	use = &type->ks_use[gd->gd_cpuid];
702 
703 retry:
704 	/*
705 	 * Check active
706 	 *
707 	 * NOTE: obj can be NULL if racing a _kfree_obj().
708 	 */
709 	mgt = &use->mgt;
710 	slab = mgt->active;			/* Might be dummy */
711 	delta = slab->findex - slab->aindex;
712 	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
713 		size_t i;
714 
715 		i = slab->aindex % slab->ncount;
716 		obj = slab->fobjs[i];
717 		if (__predict_true(obj != NULL)) {
718 			slab->fobjs[i] = NULL;
719 			++slab->aindex;
720 #ifdef KMALLOC_CHECK_DOUBLE_FREE
721 			bmap_set(slab, obj);
722 #endif
723 			goto found;
724 		}
725 	}
726 
727 	/*
728 	 * Check alternate.  If we find something, swap it with
729 	 * the active.
730 	 *
731 	 * NOTE: It is possible for exhausted slabs to recover entries
732 	 *	 via _kfree_obj(), so we just keep swapping until both
733 	 *	 are empty.
734 	 *
735 	 * NOTE: obj can be NULL if racing a _kfree_obj().
736 	 */
737 	slab = mgt->alternate;			/* Might be dummy */
738 	delta = slab->findex - slab->aindex;
739 	if (__predict_true(delta != 0)) {	/* Cannot be dummy */
740 		size_t i;
741 
742 		mgt->alternate = mgt->active;
743 		mgt->active = slab;
744 		i = slab->aindex % slab->ncount;
745 		obj = slab->fobjs[i];
746 		if (__predict_true(obj != NULL)) {
747 			slab->fobjs[i] = NULL;
748 			++slab->aindex;
749 #ifdef KMALLOC_CHECK_DOUBLE_FREE
750 			bmap_set(slab, obj);
751 #endif
752 			goto found;
753 		}
754 	}
755 
756 	/*
757 	 * Rotate a slab from the global mgt into the pcpu mgt.
758 	 *
759 	 *	G(partial, full) -> active -> alternate -> G(empty)
760 	 *
761 	 * We try to exhaust partials first to reduce fragmentation, then
762 	 * dig into the fulls.
763 	 */
764 	ggm = &type->ks_mgt;
765 	spin_lock(&ggm->spin);
766 
767 rerotate:
768 	if (ggm->partial) {
769 		slab = mgt->alternate;		/* Might be dummy */
770 		mgt->alternate = mgt->active;	/* Might be dummy */
771 		mgt->active = ggm->partial;
772 		ggm->partial = ggm->partial->next;
773 		mgt->active->next = NULL;
774 		--ggm->npartial;
775 		if (slab != &kslab_dummy) {
776 			KKASSERT(slab->next == NULL);
777 			*ggm->empty_tailp = slab;
778 			ggm->empty_tailp = &slab->next;
779 			++ggm->nempty;
780 		}
781 		spin_unlock(&ggm->spin);
782 		goto retry;
783 	}
784 
785 	if (ggm->full) {
786 		slab = mgt->alternate;		/* Might be dummy */
787 		mgt->alternate = mgt->active;	/* Might be dummy */
788 		mgt->active = ggm->full;
789 		ggm->full = ggm->full->next;
790 		mgt->active->next = NULL;
791 		--ggm->nfull;
792 		exis_setlive(&mgt->active->exis);
793 		if (slab != &kslab_dummy) {
794 			KKASSERT(slab->next == NULL);
795 			*ggm->empty_tailp = slab;
796 			ggm->empty_tailp = &slab->next;
797 			++ggm->nempty;
798 		}
799 		spin_unlock(&ggm->spin);
800 		goto retry;
801 	}
802 
803 	/*
804 	 * We couldn't find anything, scan a limited number of empty entries
805 	 * looking for something with objects.  This will also free excess
806 	 * full lists that meet requirements.
807 	 */
808 	if (malloc_mgt_poll_empty_locked(ggm, 16))
809 		goto rerotate;
810 
811 	/*
812 	 * Absolutely nothing is available, allocate a new slab and
813 	 * rotate it in.
814 	 *
815 	 * Try to get a slab from the global pcpu slab cache (very cheap).
816 	 * If that fails, allocate a new slab (very expensive).
817 	 */
818 	spin_unlock(&ggm->spin);
819 
820 	if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
821 		slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
822 				       M_WAITOK);
823 	}
824 
825 	bzero(slab, sizeof(*slab));
826 	KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
827 		 use->mgt.slab_offset);
828 
829 	obj = (char *)slab + use->mgt.slab_offset;
830 	slab->type = type;
831 	slab->orig_cpuid = gd->gd_cpuid;
832 	slab->ncount = use->mgt.slab_count;
833 	slab->offset = use->mgt.slab_offset;
834 	slab->objsize = type->ks_objsize;
835 	slab->aindex = 0;
836 	slab->findex = slab->ncount;
837 	slab->xindex = slab->ncount;
838 	for (delta = 0; delta < slab->ncount; ++delta) {
839 		slab->fobjs[delta] = obj;
840 		obj = (char *)obj + type->ks_objsize;
841 	}
842 
843 	/*
844 	 * Sanity check, assert that the last byte of the last object is still
845 	 * in the slab.
846 	 */
847 #if 0
848 	KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
849 		  ~KMALLOC_SLAB_MASK) == 0);
850 #endif
851 	KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
852 		  ~KMALLOC_SLAB_MASK) == 0,
		("SLAB %p ncount %zd objsize %zd obj=%p\n",
		 slab, slab->ncount, slab->objsize, obj));
853 	slab->magic = KMALLOC_SLAB_MAGIC;
854 	spin_init(&slab->spin, "kmslb");
855 
856 	/*
857 	 * Rotate it in, then retry.
858 	 *
859 	 *	(NEW)slab -> active -> alternate -> G(empty)
860 	 */
861 	spin_lock(&ggm->spin);
862 	if (mgt->alternate != &kslab_dummy) {
863 		struct kmalloc_slab *slab_tmp;
864 
865 		slab_tmp = mgt->alternate;
866 		slab_tmp->next = NULL;
867 		*ggm->empty_tailp = slab_tmp;
868 		ggm->empty_tailp = &slab_tmp->next;
869 		++ggm->nempty;
870 	}
871 	mgt->alternate = mgt->active;		/* Might be dummy */
872 	mgt->active = slab;
873 	spin_unlock(&ggm->spin);
874 
875 	goto retry;
876 
877 	/*
878 	 * Found object, adjust statistics and return
879 	 */
880 found:
881 	++use->inuse;
882 	++use->calls;
883 	use->memuse += size;
884 	use->loosememuse += size;
885 	if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
886 	    /* not MP synchronized */
887 	    type->ks_loosememuse += use->loosememuse;
888 	    use->loosememuse = 0;
889 	}
890 
891 	/*
892 	 * Handle remaining flags.  M_ZERO is typically not set because
893 	 * the inline macro deals with zeroing for constant sizes.
894 	 */
895 	if (__predict_false(flags & M_ZERO))
896 	    bzero(obj, size);
897 
898 	crit_exit();
899 	logmemory(malloc_end, NULL, type, size, flags);
900 
901 	return(obj);
902 }
903 
904 /*
905  * Free a type-stable object.  We have the base structure and can
906  * calculate the slab, but from this direction we don't know which
907  * mgt structure or list the slab might be on.
908  */
909 void
910 _kfree_obj(void *obj, struct malloc_type *type)
911 {
912 	struct kmalloc_slab *slab;
913 	struct kmalloc_use *use;
914 	globaldata_t gd;
915 	size_t	delta;
916 	size_t	i;
917 
918 	logmemory_quick(free_beg);
919 	gd = mycpu;
920 
921 	/*
922 	 * Calculate the slab from the pointer
923 	 */
924 	slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
925 	delta = slab->findex - slab->aindex;
926 	KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);
927 
928 	/*
929 	 * We can only safely adjust the statistics for the current cpu.
930 	 * Don't try to track down the original cpu.  The statistics will
931 	 * be collected and fixed up by vmstat -m  (etc).
932 	 */
933 	use = &slab->type->ks_use[gd->gd_cpuid];
934 	--use->inuse;
935 	use->memuse -= slab->objsize;
936 
937 	/*
938 	 * There MUST be free space in the slab since we are returning
939 	 * the obj to the same slab it was allocated from.
940 	 */
941 	i = atomic_fetchadd_long(&slab->findex, 1);
942 	i = i % slab->ncount;
943 	if (slab->fobjs[i] != NULL) {
944 		kprintf("_kfree_obj failure %zd/%zd/%zd\n",
945 			slab->aindex, slab->findex, slab->ncount);
946 	}
947 #ifdef KMALLOC_CHECK_DOUBLE_FREE
948 	bmap_clr(slab, obj);
949 #endif
950 	KKASSERT(slab->fobjs[i] == NULL);
951 	slab->fobjs[i] = obj;
952 	atomic_add_long(&slab->xindex, 1);	/* synchronizer */
953 
954 	logmemory_quick(free_end);
955 }
956