/*
 * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/globaldata.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/objcache.h>
#include <sys/spinlock.h>
#include <sys/thread.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/sysctl.h>

static MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
static MALLOC_DEFINE(M_OBJMAG, "objcache mag", "Object Cache Magazine");

#define	INITIAL_MAG_CAPACITY	64

struct magazine {
	int			 rounds;
	int			 capacity;
	SLIST_ENTRY(magazine)	 nextmagazine;
	void			*objects[];
};

SLIST_HEAD(magazinelist, magazine);

#define MAGAZINE_HDRSIZE	__offsetof(struct magazine, objects[0])
#define MAGAZINE_CAPACITY_MAX	4096
#define MAGAZINE_CAPACITY_MIN	4

/*
 * per-cluster cache of magazines
 *
 * All fields in this structure are protected by the spinlock.
 */
struct magazinedepot {
	/*
	 * The per-cpu object caches only exchange completely full or
	 * completely empty magazines with the depot layer, so we only
	 * have to cache these two types of magazines.
	 */
	struct magazinelist	fullmagazines;
	struct magazinelist	emptymagazines;
	int			magcapacity;

	/* protect this structure */
	struct spinlock		spin;

	/* magazines not yet allocated towards limit */
	int			unallocated_objects;
	int			cluster_limit;	/* ref for adjustments */

	/* infrequently used fields */
	int			waiting;	/* waiting for another cpu to
						 * return a full magazine to
						 * the depot */
	int			contested;	/* depot contention count */
} __cachealign;

/*
 * per-cpu object cache
 * All fields in this structure are protected by crit_enter().
 */
struct percpu_objcache {
	struct magazine	*loaded_magazine;	/* active magazine */
	struct magazine	*previous_magazine;	/* backup magazine */

	/* statistics */
	u_long		gets_cumulative;	/* total calls to get */
	u_long		gets_null;		/* objcache_get returned NULL */
	u_long		allocs_cumulative;	/* total calls to alloc */
	u_long		puts_cumulative;	/* total calls to put */
	u_long		gets_exhausted;		/* # of gets hit exhaustion */
#ifdef notyet
	u_long		puts_othercluster;	/* returned to other cluster */
#endif

	/* infrequently used fields */
	int		waiting;		/* waiting for a thread on this
						 * cpu to return an obj to the
						 * per-cpu cache */
} __cachealign;

/* only until we have NUMA cluster topology information XXX */
#define MAXCLUSTERS 1
#define myclusterid 0
#define CLUSTER_OF(obj) 0

/*
 * Rarely accessed but useful bits of objcache.
 */
struct objcache_desc {
	LIST_ENTRY(objcache_desc)	next;
	struct objcache			*objcache;
	int				total_objects;
	int				reserved;
	char				name[OBJCACHE_NAMELEN];
};

/*
 * Two-level object cache consisting of NUMA cluster-level depots of
 * fully loaded or completely empty magazines and cpu-level caches of
 * individual objects.
 */
struct objcache {
	/* object constructor and destructor from blank storage */
	objcache_ctor_fn	*ctor;
	objcache_dtor_fn	*dtor;
	void			*privdata;

	/* interface to underlying allocator */
	objcache_alloc_fn	*alloc;
	objcache_free_fn	*free;
	void			*allocator_args;

	struct objcache_desc	*desc;

	/* NUMA-cluster level caches */
	struct magazinedepot	depot[MAXCLUSTERS];

	struct percpu_objcache	cache_percpu[];	/* per-cpu caches */
};

SYSCTL_NODE(_kern, OID_AUTO, objcache, CTLFLAG_RW, 0, "objcache");

static struct spinlock objcachelist_spin;
static LIST_HEAD(objcachelist, objcache_desc) allobjcaches;
static int magazine_capmin;
static int magazine_capmax;

static struct magazine *
mag_alloc(int capacity)
{
	struct magazine *mag;
	int size;

	size = __offsetof(struct magazine, objects[capacity]);
	KASSERT(size > 0 && (size & __VM_CACHELINE_MASK) == 0,
	    ("magazine size is not a multiple of the cache line size"));

	mag = kmalloc(size, M_OBJMAG, M_INTWAIT | M_ZERO | M_CACHEALIGN);
	mag->capacity = capacity;
	mag->rounds = 0;
	return (mag);
}

static int
mag_capacity_align(int mag_capacity)
{
	int mag_size;

	mag_size = __VM_CACHELINE_ALIGN(
	    __offsetof(struct magazine, objects[mag_capacity]));
	mag_capacity = (mag_size - MAGAZINE_HDRSIZE) / sizeof(void *);

	return mag_capacity;
}
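
/*
 * Worked sizing example (a sketch assuming 64-byte cache lines and
 * 8-byte pointers): MAGAZINE_HDRSIZE is then 16 bytes, so a requested
 * capacity of 4 yields a raw size of 16 + 4*8 = 48 bytes, which
 * mag_capacity_align() rounds up to one full 64-byte line for an
 * effective capacity of (64 - 16) / 8 = 6 objects.
 */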

/*
 * Utility routines for objects that don't require any construction
 * or de-construction.
 */

static void
null_dtor(void *obj, void *privdata)
{
	/* do nothing */
}

static boolean_t
null_ctor(void *obj, void *privdata, int ocflags)
{
	return TRUE;
}

/*
 * Create an object cache.
 */
struct objcache *
objcache_create(const char *name, int cluster_limit, int nom_cache,
		objcache_ctor_fn *ctor, objcache_dtor_fn *dtor, void *privdata,
		objcache_alloc_fn *alloc, objcache_free_fn *free,
		void *allocator_args)
{
	struct objcache_desc *desc;
	struct objcache *oc;
	struct magazinedepot *depot;
	int cpuid;
	int nmagdepot;
	int mag_capacity;
	int i;

	/*
	 * Allocate the objcache descriptor.
	 */
	desc = kmalloc(sizeof(*desc), M_OBJCACHE, M_WAITOK | M_ZERO);

	/*
	 * Allocate the object cache structure.
	 */
	oc = kmalloc(__offsetof(struct objcache, cache_percpu[ncpus]),
		     M_OBJCACHE,
		     M_WAITOK | M_ZERO | M_CACHEALIGN);
	oc->ctor = ctor ? ctor : null_ctor;
	oc->dtor = dtor ? dtor : null_dtor;
	oc->privdata = privdata;
	oc->alloc = alloc;
	oc->free = free;
	oc->allocator_args = allocator_args;

	/*
	 * Link objcache and its descriptor.
	 */
	oc->desc = desc;
	desc->objcache = oc;
	strlcpy(desc->name, name, sizeof(desc->name));

	/*
	 * Initialize depot list(s).
	 */
	depot = &oc->depot[0];

	spin_init(&depot->spin, "objcachedepot");
	SLIST_INIT(&depot->fullmagazines);
	SLIST_INIT(&depot->emptymagazines);

	/*
	 * Figure out the nominal number of free objects to cache and
	 * the magazine capacity.  By default we want to cache up to
	 * half the cluster_limit.  If there is no cluster_limit then
	 * we want to cache up to 128 objects.
	 */
	if (nom_cache == 0)
		nom_cache = cluster_limit / 2;
	if (cluster_limit && nom_cache > cluster_limit)
		nom_cache = cluster_limit;
	if (nom_cache == 0)
		nom_cache = INITIAL_MAG_CAPACITY * 2;

	/*
	 * Magazine capacity for 2 active magazines per cpu plus 2
	 * magazines in the depot.
	 */
	mag_capacity = mag_capacity_align(nom_cache / (ncpus + 1) / 2 + 1);
	if (mag_capacity > magazine_capmax)
		mag_capacity = magazine_capmax;
	else if (mag_capacity < magazine_capmin)
		mag_capacity = magazine_capmin;
	depot->magcapacity = mag_capacity;

	/*
	 * The cluster_limit must be sufficient to have two magazines per
	 * cpu plus at least two magazines in the depot.  However, because
	 * partial magazines can stay on the cpus, what we really need here
	 * is to specify the number of extra magazines we allocate for the
	 * depot.
	 *
	 * Use ~1B objects to mean 'unlimited'.  A negative unallocated
	 * object count is possible due to dynamic adjustments, so we can't
	 * use a negative number to mean 'unlimited'.  We need some overflow
	 * capacity too due to the preallocated mags.
	 */
	if (cluster_limit == 0) {
		depot->unallocated_objects = OBJCACHE_UNLIMITED;
	} else {
		depot->unallocated_objects = ncpus * mag_capacity * 2 +
					     cluster_limit;
	}

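	/*
	 * Worked example: with 4 cpus, a magazine capacity of 6 and a
	 * cluster_limit of 100, unallocated_objects starts out as
	 * 4*6*2 + 100 = 148.
	 */
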
	/* Save # of total objects. */
	desc->total_objects = depot->unallocated_objects;

	/*
	 * This is a dynamic adjustment aid initialized to the caller's
	 * expectations of the current limit.
	 */
	depot->cluster_limit = cluster_limit;

	/*
	 * Initialize per-cpu caches.
	 */
	for (cpuid = 0; cpuid < ncpus; cpuid++) {
		struct percpu_objcache *cache_percpu = &oc->cache_percpu[cpuid];

		cache_percpu->loaded_magazine = mag_alloc(mag_capacity);
		cache_percpu->previous_magazine = mag_alloc(mag_capacity);
	}

	/*
	 * Compute how many empty magazines to place in the depot.  This
	 * determines the retained cache size and is based on nom_cache.
	 *
	 * The actual cache size is larger because each cpu also holds two
	 * magazines, but those can be in any fill state so we just can't
	 * count them.
	 *
	 * There is a minimum of two magazines in the depot.
	 */
	nmagdepot = nom_cache / mag_capacity + 1;
	if (nmagdepot < 2)
		nmagdepot = 2;

	/*
	 * Put empty magazines in the depot.
	 */
	for (i = 0; i < nmagdepot; i++) {
		struct magazine *mag = mag_alloc(mag_capacity);
		SLIST_INSERT_HEAD(&depot->emptymagazines, mag, nextmagazine);
	}

	spin_lock(&objcachelist_spin);
	LIST_INSERT_HEAD(&allobjcaches, desc, next);
	spin_unlock(&objcachelist_spin);

	return (oc);
}

/*
 * Adjust the cluster limit.  This is allowed to cause unallocated_objects
 * to go negative.  Note that due to the magazine hysteresis there is a
 * limit to how much of the objcache can be reclaimed using this API to
 * reduce its size.
 */
void
objcache_set_cluster_limit(struct objcache *oc, int cluster_limit)
{
	struct magazinedepot *depot;

	depot = &oc->depot[myclusterid];
	if (depot->cluster_limit != cluster_limit) {
		int delta;

		spin_lock(&depot->spin);
		delta = cluster_limit - depot->cluster_limit;
		depot->unallocated_objects += delta;
		depot->cluster_limit = cluster_limit;
		spin_unlock(&depot->spin);
		wakeup(depot);

		oc->desc->total_objects += delta;
	}
}

struct objcache *
objcache_create_simple(malloc_type_t mtype, size_t objsize)
{
	struct objcache_malloc_args *margs;
	struct objcache *oc;

	margs = kmalloc(sizeof(*margs), M_OBJCACHE, M_WAITOK|M_ZERO);
	margs->objsize = objsize;
	margs->mtype = mtype;
	oc = objcache_create(mtype->ks_shortdesc, 0, 0,
			     NULL, NULL, NULL,
			     objcache_malloc_alloc, objcache_malloc_free,
			     margs);
	return (oc);
}

struct objcache *
objcache_create_mbacked(malloc_type_t mtype, size_t objsize,
			int cluster_limit, int nom_cache,
			objcache_ctor_fn *ctor, objcache_dtor_fn *dtor,
			void *privdata)
{
	struct objcache_malloc_args *margs;
	struct objcache *oc;

	margs = kmalloc(sizeof(*margs), M_OBJCACHE, M_WAITOK|M_ZERO);
	margs->objsize = objsize;
	margs->mtype = mtype;
	oc = objcache_create(mtype->ks_shortdesc,
			     cluster_limit, nom_cache,
			     ctor, dtor, privdata,
			     objcache_malloc_alloc, objcache_malloc_free,
			     margs);
	return (oc);
}

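/*
 * Example usage (a sketch, excluded from compilation; "struct foo",
 * M_FOO and the foo_* names are hypothetical):
 */
#if 0
static MALLOC_DEFINE(M_FOO, "foo", "example foo objects");
static struct objcache *foo_cache;

static void
foo_cache_init(void)
{
	/* Malloc-backed cache, no object limit, default ctor/dtor. */
	foo_cache = objcache_create_mbacked(M_FOO, sizeof(struct foo),
					    0, 0, NULL, NULL, NULL);
}
#endif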

#define MAGAZINE_EMPTY(mag)	(mag->rounds == 0)
#define MAGAZINE_NOTEMPTY(mag)	(mag->rounds != 0)
#define MAGAZINE_FULL(mag)	(mag->rounds == mag->capacity)

#define	swap(x, y)	({ struct magazine *t = x; x = y; y = t; })

/*
 * Get an object from the object cache.
 *
 * WARNING!  ocflags are only used when we have to go to the underlying
 * allocator, so we cannot depend on flags such as M_ZERO.
 */
void *
objcache_get(struct objcache *oc, int ocflags)
{
	struct percpu_objcache *cpucache = &oc->cache_percpu[mycpuid];
	struct magazine *loadedmag;
	struct magazine *emptymag;
	void *obj;
	struct magazinedepot *depot;

	KKASSERT((ocflags & M_ZERO) == 0);
	crit_enter();
	++cpucache->gets_cumulative;

retry:
	/*
	 * Loaded magazine has an object.  This is the hot path.
	 * It is lock-free and uses a critical section to block
	 * out interrupt handlers on the same processor.
	 */
	loadedmag = cpucache->loaded_magazine;
	if (MAGAZINE_NOTEMPTY(loadedmag)) {
		obj = loadedmag->objects[--loadedmag->rounds];
		crit_exit();
		return (obj);
	}

	/* Previous magazine has an object. */
	if (MAGAZINE_NOTEMPTY(cpucache->previous_magazine)) {
		swap(cpucache->loaded_magazine, cpucache->previous_magazine);
		loadedmag = cpucache->loaded_magazine;
		obj = loadedmag->objects[--loadedmag->rounds];
		crit_exit();
		return (obj);
	}

	/*
	 * Both magazines empty.  Get a full magazine from the depot and
	 * move one of the empty ones to the depot.
	 *
	 * Obtain the depot spinlock.
	 *
	 * NOTE: Beyond this point, M_* flags are handled via oc->alloc().
	 */
	depot = &oc->depot[myclusterid];
	spin_lock(&depot->spin);

	/*
	 * Recheck the cpucache after obtaining the depot spinlock.  This
	 * shouldn't be necessary now but don't take any chances.
	 */
	if (MAGAZINE_NOTEMPTY(cpucache->loaded_magazine) ||
	    MAGAZINE_NOTEMPTY(cpucache->previous_magazine)) {
		spin_unlock(&depot->spin);
		goto retry;
	}

	/* Check if depot has a full magazine. */
	if (!SLIST_EMPTY(&depot->fullmagazines)) {
		emptymag = cpucache->previous_magazine;
		cpucache->previous_magazine = cpucache->loaded_magazine;
		cpucache->loaded_magazine = SLIST_FIRST(&depot->fullmagazines);
		SLIST_REMOVE_HEAD(&depot->fullmagazines, nextmagazine);

		/*
		 * Return emptymag to the depot.
		 */
		KKASSERT(MAGAZINE_EMPTY(emptymag));
		SLIST_INSERT_HEAD(&depot->emptymagazines,
				  emptymag, nextmagazine);
		spin_unlock(&depot->spin);
		goto retry;
	}

	/*
	 * The depot does not have any non-empty magazines.  If we have
	 * not hit our object limit we can allocate a new object using
	 * the back-end allocator.
	 *
	 * NOTE: unallocated_objects can wind up being negative due to
	 *	 objcache_set_cluster_limit() calls.
	 */
	if (__predict_true(depot->unallocated_objects > 0)) {
		--depot->unallocated_objects;
		spin_unlock(&depot->spin);
		++cpucache->allocs_cumulative;
		crit_exit();

		obj = oc->alloc(oc->allocator_args, ocflags);
		if (obj) {
			if (oc->ctor(obj, oc->privdata, ocflags))
				return (obj);
			oc->free(obj, oc->allocator_args);
			obj = NULL;
		}
		if (obj == NULL) {
			spin_lock(&depot->spin);
			++depot->unallocated_objects;
			spin_unlock(&depot->spin);
			if (depot->waiting)
				wakeup(depot);

			crit_enter();
			/*
			 * Debugging is easier when gets_cumulative does
			 * not include gets_null.
			 */
			++cpucache->gets_null;
			--cpucache->gets_cumulative;
			crit_exit();
		}
		return (obj);
	}
	if (__predict_false(cpucache->gets_exhausted++ == 0)) {
		kprintf("Warning: objcache(%s) exhausted on cpu%d!\n",
		    oc->desc->name, mycpuid);
	}

	/*
	 * Otherwise block if allowed to.
	 */
	if ((ocflags & (M_WAITOK|M_NULLOK)) == M_WAITOK) {
		++cpucache->waiting;
		++depot->waiting;
		ssleep(depot, &depot->spin, 0, "objcache_get", 0);
		--cpucache->waiting;
		--depot->waiting;
		spin_unlock(&depot->spin);
		goto retry;
	}

	/*
	 * Otherwise fail.
	 */
	++cpucache->gets_null;
	--cpucache->gets_cumulative;
	crit_exit();
	spin_unlock(&depot->spin);
	return (NULL);
}

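/*
 * Example get/put cycle (a sketch; foo_cache is the hypothetical cache
 * from the example above).  M_ZERO cannot be passed to objcache_get();
 * use a ctor or objcache_malloc_alloc_zero() below if zeroed objects
 * are required.
 */
#if 0
static int
foo_do_work(void)
{
	struct foo *fp;

	fp = objcache_get(foo_cache, M_WAITOK | M_NULLOK);
	if (fp == NULL)
		return (ENOMEM);
	/* ... use fp ... */
	objcache_put(foo_cache, fp);
	return (0);
}
#endif
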
/*
 * Wrapper for malloc allocation routines.
 */
void *
objcache_malloc_alloc(void *allocator_args, int ocflags)
{
	struct objcache_malloc_args *alloc_args = allocator_args;

	return (kmalloc(alloc_args->objsize, alloc_args->mtype,
		       ocflags & OC_MFLAGS));
}

/*
 * Wrapper for malloc allocation routines, with initial zeroing
 * (but objects are not zeroed on reuse from cache).
 */
void *
objcache_malloc_alloc_zero(void *allocator_args, int ocflags)
{
	struct objcache_malloc_args *alloc_args = allocator_args;

	return (kmalloc(alloc_args->objsize, alloc_args->mtype,
		       (ocflags & OC_MFLAGS) | M_ZERO));
}

void
objcache_malloc_free(void *obj, void *allocator_args)
{
	struct objcache_malloc_args *alloc_args = allocator_args;

	kfree(obj, alloc_args->mtype);
}

/*
 * Wrapper for allocation policies that pre-allocate at initialization time
 * and don't do run-time allocation.
 */
void *
objcache_nop_alloc(void *allocator_args, int ocflags)
{
	return (NULL);
}

void
objcache_nop_free(void *obj, void *allocator_args)
{
}

/*
 * Return an object to the object cache.
 */
void
objcache_put(struct objcache *oc, void *obj)
{
	struct percpu_objcache *cpucache = &oc->cache_percpu[mycpuid];
	struct magazine *loadedmag;
	struct magazinedepot *depot;

	crit_enter();
	++cpucache->puts_cumulative;

	if (CLUSTER_OF(obj) != myclusterid) {
#ifdef notyet
		/* use lazy IPI to send object to owning cluster XXX todo */
		++cpucache->puts_othercluster;
		crit_exit();
		return;
#endif
	}

retry:
	/*
	 * Free slot available in loaded magazine.  This is the hot path.
	 * It is lock-free and uses a critical section to block out interrupt
	 * handlers on the same processor.
	 */
	loadedmag = cpucache->loaded_magazine;
	if (!MAGAZINE_FULL(loadedmag)) {
		loadedmag->objects[loadedmag->rounds++] = obj;
		if (cpucache->waiting)
			wakeup_mycpu(&oc->depot[myclusterid]);
		crit_exit();
		return;
	}

	/*
	 * Current magazine full, but previous magazine has room.  XXX
	 */
	if (!MAGAZINE_FULL(cpucache->previous_magazine)) {
		swap(cpucache->loaded_magazine, cpucache->previous_magazine);
		loadedmag = cpucache->loaded_magazine;
		loadedmag->objects[loadedmag->rounds++] = obj;
		if (cpucache->waiting)
			wakeup_mycpu(&oc->depot[myclusterid]);
		crit_exit();
		return;
	}

	/*
	 * Both magazines full.  Get an empty magazine from the depot and
	 * move a full loaded magazine to the depot.  Even though the
	 * magazine may wind up with space available after we block on
	 * the spinlock, we still cycle it through to avoid the non-optimal
	 * corner-case.
	 *
	 * Obtain the depot spinlock.
	 */
	depot = &oc->depot[myclusterid];
	spin_lock(&depot->spin);

	/*
	 * If an empty magazine is available in the depot, cycle it
	 * through and retry.
	 */
	if (!SLIST_EMPTY(&depot->emptymagazines)) {
		loadedmag = cpucache->previous_magazine;
		cpucache->previous_magazine = cpucache->loaded_magazine;
		cpucache->loaded_magazine = SLIST_FIRST(&depot->emptymagazines);
		SLIST_REMOVE_HEAD(&depot->emptymagazines, nextmagazine);

		/*
		 * Return loadedmag to the depot.  Due to blocking it may
		 * not be entirely full and could even be empty.
		 */
		if (MAGAZINE_EMPTY(loadedmag)) {
			SLIST_INSERT_HEAD(&depot->emptymagazines,
					  loadedmag, nextmagazine);
			spin_unlock(&depot->spin);
		} else {
			SLIST_INSERT_HEAD(&depot->fullmagazines,
					  loadedmag, nextmagazine);
			spin_unlock(&depot->spin);
			if (depot->waiting)
				wakeup(depot);
		}
		goto retry;
	}

	/*
	 * An empty mag is not available.  This is a corner case which can
	 * occur due to cpus holding partially full magazines.  Do not try
	 * to allocate a mag, just free the object.
	 */
	++depot->unallocated_objects;
	spin_unlock(&depot->spin);
	if (depot->waiting)
		wakeup(depot);
	crit_exit();
	oc->dtor(obj, oc->privdata);
	oc->free(obj, oc->allocator_args);
}

/*
 * The object is being put back into the cache, but the caller has
 * indicated that the object is not in any shape to be reused and should
 * be dtor'd immediately.
 */
void
objcache_dtor(struct objcache *oc, void *obj)
{
	struct magazinedepot *depot;

	depot = &oc->depot[myclusterid];
	spin_lock(&depot->spin);
	++depot->unallocated_objects;
	spin_unlock(&depot->spin);
	if (depot->waiting)
		wakeup(depot);
	oc->dtor(obj, oc->privdata);
	oc->free(obj, oc->allocator_args);
}

/*
 * Deallocate all objects in a magazine and free the magazine if requested.
 * When freeit is TRUE the magazine must already be disassociated from the
 * depot.
 *
 * Must be called with a critical section held when called with a per-cpu
 * magazine.  The magazine may be indirectly modified during the loop.
 *
 * If the magazine moves during a dtor the operation is aborted.  This is
 * only allowed when freeit is FALSE.
 *
 * The number of objects freed is returned.
 */
static int
mag_purge(struct objcache *oc, struct magazine **magp, int freeit)
{
	struct magazine *mag = *magp;
	int count;
	void *obj;

	count = 0;
	while (mag->rounds) {
		obj = mag->objects[--mag->rounds];
		oc->dtor(obj, oc->privdata);		/* MAY BLOCK */
		oc->free(obj, oc->allocator_args);	/* MAY BLOCK */
		++count;

		/*
		 * Cycle for interrupts.
		 */
		if ((count & 15) == 0) {
			crit_exit();
			crit_enter();
		}

		/*
		 * mag may have become invalid either due to dtor/free
		 * blocking or interrupt cycling; do not dereference it
		 * until we check.
		 */
		if (*magp != mag) {
			kprintf("mag_purge: mag ripped out\n");
			break;
		}
	}
	if (freeit) {
		KKASSERT(*magp == mag);
		*magp = NULL;
		kfree(mag, M_OBJMAG);
	}
	return (count);
}

/*
 * Disassociate zero or more magazines from a magazine list associated with
 * the depot, update the depot, and move the magazines to a temporary
 * list.
 *
 * The caller must check the depot for waiters and wake it up, typically
 * after disposing of the magazines this function loads onto the temporary
 * list.
 */
static void
maglist_disassociate(struct magazinedepot *depot, struct magazinelist *maglist,
		     struct magazinelist *tmplist, boolean_t purgeall)
{
	struct magazine *mag;

	while ((mag = SLIST_FIRST(maglist)) != NULL) {
		SLIST_REMOVE_HEAD(maglist, nextmagazine);
		SLIST_INSERT_HEAD(tmplist, mag, nextmagazine);
		depot->unallocated_objects += mag->rounds;
	}
}

/*
 * Deallocate all magazines and their contents from the passed temporary
 * list.  The magazines have already been accounted for by their depots.
 *
 * The total number of rounds freed is returned.  This number is typically
 * only used to determine whether a wakeup on the depot is needed or not.
 */
static int
maglist_purge(struct objcache *oc, struct magazinelist *maglist)
{
	struct magazine *mag;
	int count = 0;

	/*
	 * Can't use SLIST_FOREACH because blocking releases the depot
	 * spinlock.
	 */
	crit_enter();
	while ((mag = SLIST_FIRST(maglist)) != NULL) {
		SLIST_REMOVE_HEAD(maglist, nextmagazine);
		count += mag_purge(oc, &mag, TRUE);
	}
	crit_exit();
	return (count);
}

/*
 * De-allocates all magazines on the full and empty magazine lists.
 *
 * Because this routine is called with a spinlock held, the magazines
 * can only be disassociated and moved to a temporary list, not freed.
 *
 * The caller is responsible for freeing the magazines.
 */
static void
depot_disassociate(struct magazinedepot *depot, struct magazinelist *tmplist)
{
	maglist_disassociate(depot, &depot->fullmagazines, tmplist, TRUE);
	maglist_disassociate(depot, &depot->emptymagazines, tmplist, TRUE);
}

/*
 * Try to free up some memory.  Return as soon as some free memory is found.
 * For each object cache on the reclaim list, first try the current per-cpu
 * cache, then the full magazine depot.
 */
boolean_t
objcache_reclaimlist(struct objcache *oclist[], int nlist)
{
	struct objcache *oc;
	struct percpu_objcache *cpucache;
	struct magazinedepot *depot;
	struct magazinelist tmplist;
	int i, count;

	SLIST_INIT(&tmplist);

	for (i = 0; i < nlist; i++) {
		oc = oclist[i];
		cpucache = &oc->cache_percpu[mycpuid];
		depot = &oc->depot[myclusterid];

		crit_enter();
		count = mag_purge(oc, &cpucache->loaded_magazine, FALSE);
		if (count == 0)
			count += mag_purge(oc, &cpucache->previous_magazine, FALSE);
		crit_exit();
		if (count > 0) {
			spin_lock(&depot->spin);
			depot->unallocated_objects += count;
			spin_unlock(&depot->spin);
			if (depot->waiting)
				wakeup(depot);
			return (TRUE);
		}
		spin_lock(&depot->spin);
		maglist_disassociate(depot, &depot->fullmagazines,
				     &tmplist, FALSE);
		spin_unlock(&depot->spin);
		count = maglist_purge(oc, &tmplist);
		if (count > 0) {
			if (depot->waiting)
				wakeup(depot);
			return (TRUE);
		}
	}
	return (FALSE);
}

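/*
 * Example (a sketch; cache_a and cache_b are hypothetical caches):
 *
 *	struct objcache *list[2] = { cache_a, cache_b };
 *
 *	if (objcache_reclaimlist(list, 2))
 *		kprintf("reclaimed some objcache memory\n");
 */
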
/*
 * Destroy an object cache.  Must have no existing references.
 */
void
objcache_destroy(struct objcache *oc)
{
	struct objcache_desc *desc = oc->desc;
	struct percpu_objcache *cache_percpu;
	struct magazinedepot *depot;
	int clusterid, cpuid;
	struct magazinelist tmplist;

	spin_lock(&objcachelist_spin);
	LIST_REMOVE(desc, next);
	spin_unlock(&objcachelist_spin);

	SLIST_INIT(&tmplist);
	for (clusterid = 0; clusterid < MAXCLUSTERS; clusterid++) {
		depot = &oc->depot[clusterid];
		spin_lock(&depot->spin);
		depot_disassociate(depot, &tmplist);
		spin_unlock(&depot->spin);
	}
	maglist_purge(oc, &tmplist);

	for (cpuid = 0; cpuid < ncpus; cpuid++) {
		cache_percpu = &oc->cache_percpu[cpuid];

		crit_enter();
		mag_purge(oc, &cache_percpu->loaded_magazine, TRUE);
		mag_purge(oc, &cache_percpu->previous_magazine, TRUE);
		crit_exit();
		cache_percpu->loaded_magazine = NULL;
		cache_percpu->previous_magazine = NULL;
		/* don't bother adjusting depot->unallocated_objects */
	}

	kfree(desc, M_OBJCACHE);
	kfree(oc, M_OBJCACHE);
}

static int
sysctl_ocstats(SYSCTL_HANDLER_ARGS)
{
	struct objcache_stats stat;
	struct objcache_desc marker, *desc;
	int error;

	memset(&marker, 0, sizeof(marker));

	spin_lock(&objcachelist_spin);

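	/*
	 * The zeroed marker keeps our place in allobjcaches so the list
	 * lock can be dropped across the SYSCTL_OUT() copyout below.
	 * Markers are recognized by a total_objects count of zero.
	 */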
	LIST_INSERT_HEAD(&allobjcaches, &marker, next);
	while ((desc = LIST_NEXT(&marker, next)) != NULL) {
		u_long puts, unalloc;
		int cpu;

		LIST_REMOVE(&marker, next);
		LIST_INSERT_AFTER(desc, &marker, next);

		if (desc->total_objects == 0) {
			/* Marker inserted by another thread. */
			continue;
		}

		memset(&stat, 0, sizeof(stat));
		strlcpy(stat.oc_name, desc->name, sizeof(stat.oc_name));
		stat.oc_limit = desc->total_objects;
		/* XXX domain aware */
		unalloc = desc->objcache->depot[0].unallocated_objects;

		puts = 0;
		for (cpu = 0; cpu < ncpus; ++cpu) {
			const struct percpu_objcache *cache;

			cache = &desc->objcache->cache_percpu[cpu];
			puts += cache->puts_cumulative;

			stat.oc_requested += cache->gets_cumulative;
			stat.oc_exhausted += cache->gets_exhausted;
			stat.oc_failed += cache->gets_null;
			stat.oc_allocated += cache->allocs_cumulative;
		}
		spin_unlock(&objcachelist_spin);

		/*
		 * Apply fixups: derive the in-use and cached counts.
		 */
		if (stat.oc_requested > puts)
			stat.oc_used = stat.oc_requested - puts;
		if (stat.oc_limit > unalloc + stat.oc_used) {
			stat.oc_cached = stat.oc_limit -
			    (unalloc + stat.oc_used);
		}
		stat.oc_requested += stat.oc_failed;

		/* Send out. */
		error = SYSCTL_OUT(req, &stat, sizeof(stat));

		/* Reacquire the lock before continuing. */
		spin_lock(&objcachelist_spin);

		if (error)
			break;
	}
	LIST_REMOVE(&marker, next);

	spin_unlock(&objcachelist_spin);

	return error;
}
SYSCTL_PROC(_kern_objcache, OID_AUTO, stats, (CTLTYPE_OPAQUE | CTLFLAG_RD),
    0, 0, sysctl_ocstats, "S,objcache_stats", "objcache statistics");

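/*
 * The statistics are exported as a packed array of struct
 * objcache_stats.  A userland reader sketch using only the standard
 * sysctlbyname(3) API (buffer handling is illustrative):
 */
#if 0
size_t len = 0;

if (sysctlbyname("kern.objcache.stats", NULL, &len, NULL, 0) == 0) {
	struct objcache_stats *stats = malloc(len);

	if (stats != NULL &&
	    sysctlbyname("kern.objcache.stats", stats, &len, NULL, 0) == 0) {
		size_t n = len / sizeof(*stats);
		/* ... inspect stats[0 .. n - 1] ... */
	}
	free(stats);
}
#endif
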
static void
objcache_init(void)
{
	spin_init(&objcachelist_spin, "objcachelist");

	magazine_capmin = mag_capacity_align(MAGAZINE_CAPACITY_MIN);
	magazine_capmax = mag_capacity_align(MAGAZINE_CAPACITY_MAX);
	if (bootverbose) {
		kprintf("objcache: magazine cap [%d, %d]\n",
		    magazine_capmin, magazine_capmax);
	}
#if 0
	callout_init_mp(&objcache_callout);
	objcache_rebalance_period = 60 * hz;
	callout_reset(&objcache_callout, objcache_rebalance_period,
		      objcache_timer, NULL);
#endif
}
SYSINIT(objcache, SI_BOOT2_OBJCACHE, SI_ORDER_FIRST, objcache_init, 0);
1037