xref: /dragonfly/sys/kern/kern_dsched.c (revision e8c03636)
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
50 #include <sys/buf2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
54 
55 TAILQ_HEAD(tdio_list_head, dsched_thread_io);
56 
57 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
58 
59 static dsched_prepare_t		noop_prepare;
60 static dsched_teardown_t	noop_teardown;
61 static dsched_cancel_t		noop_cancel;
62 static dsched_queue_t		noop_queue;
63 
64 static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
65 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
66 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
67 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
68 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
69 
70 static int	dsched_inited = 0;
71 static int	default_set = 0;
72 
73 struct lock	dsched_lock;
74 static int	dsched_debug_enable = 0;
75 
76 struct dsched_stats	dsched_stats;
77 
78 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
79 	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
80 struct objcache_malloc_args dsched_thread_io_malloc_args = {
81 	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
82 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
83 	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
84 
85 static struct objcache	*dsched_diskctx_cache;
86 static struct objcache	*dsched_tdctx_cache;
87 static struct objcache	*dsched_tdio_cache;
88 
89 TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
90 		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
91 
92 struct lock	dsched_tdctx_lock;
93 
94 static struct dsched_policy_head dsched_policy_list =
95 		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
96 
97 static struct dsched_policy dsched_noop_policy = {
98 	.name = "noop",
99 
100 	.prepare = noop_prepare,
101 	.teardown = noop_teardown,
102 	.cancel_all = noop_cancel,
103 	.bio_queue = noop_queue
104 };
105 
106 static struct dsched_policy *default_policy = &dsched_noop_policy;
107 
108 /*
109  * dsched_debug() is a debug output function controlled by the dsched.debug
110  * SYSCTL/TUNABLE; messages at or below that level are printed via kvprintf.
111  */
112 int
113 dsched_debug(int level, char *fmt, ...)
114 {
115 	__va_list ap;
116 
117 	__va_start(ap, fmt);
118 	if (level <= dsched_debug_enable)
119 		kvprintf(fmt, ap);
120 	__va_end(ap);
121 
122 	return 0;
123 }
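
/*
 * Illustrative example (not part of the original sources): with the
 * dsched.debug sysctl set to 4, a call such as
 *
 *	dsched_debug(LOG_ERR, "policy %s not found\n", name);
 *
 * is printed (LOG_ERR == 3 <= 4), while LOG_INFO level messages are
 * suppressed.
 */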
124 
125 /*
126  * Called on disk_create().
127  * Tries to read which policy to use from loader.conf; if none is
128  * specified, the default policy is used.
129  */
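/*
 * Illustrative loader.conf lines (the "fq" policy name is just an example
 * of a registered policy, not something this file provides):
 *
 *	dsched.policy.ad0="fq"		# policy for one specific disk
 *	dsched.policy.default="fq"	# fallback for all other disks
 */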
130 void
131 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
132 {
133 	char tunable_key[SPECNAMELEN + 48];
134 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
135 	char *ptr;
136 	struct dsched_policy *policy = NULL;
137 
138 	/* Also look for serno stuff? */
139 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
140 
141 	ksnprintf(tunable_key, sizeof(tunable_key),
142 		  "dsched.policy.%s%d", head_name, unit);
143 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
144 	    sizeof(sched_policy)) != 0) {
145 		policy = dsched_find_policy(sched_policy);
146 	}
147 
148 	ksnprintf(tunable_key, sizeof(tunable_key),
149 		  "dsched.policy.%s", head_name);
150 
151 	for (ptr = tunable_key; *ptr; ptr++) {
152 		if (*ptr == '/')
153 			*ptr = '-';
154 	}
155 	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
156 	    sizeof(sched_policy)) != 0)) {
157 		policy = dsched_find_policy(sched_policy);
158 	}
159 
160 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
161 	if (!policy && !default_set &&
162 	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
163 			       sizeof(sched_policy)) != 0)) {
164 		policy = dsched_find_policy(sched_policy);
165 	}
166 
167 	if (!policy) {
168 		if (!default_set && bootverbose) {
169 			dsched_debug(0,
170 				     "No policy for %s%d specified, "
171 				     "or policy not found\n",
172 				     head_name, unit);
173 		}
174 		dsched_set_policy(dp, default_policy);
175 	} else {
176 		dsched_set_policy(dp, policy);
177 	}
178 
179 	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
180 		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
181 	else
182 		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
183 	for (ptr = tunable_key; *ptr; ptr++) {
184 		if (*ptr == '/')
185 			*ptr = '-';
186 	}
187 	dsched_sysctl_add_disk(
188 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
189 	    tunable_key);
190 
191 	lockmgr(&dsched_lock, LK_RELEASE);
192 }
193 
194 /*
195  * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
196  * there's any policy associated with the serial number of the device.
197  */
198 void
199 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
200 {
201 	char tunable_key[SPECNAMELEN + 48];
202 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
203 	struct dsched_policy *policy = NULL;
204 
205 	if (info->d_serialno == NULL)
206 		return;
207 
208 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
209 
210 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
211 	    info->d_serialno);
212 
213 	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
214 	    sizeof(sched_policy)) != 0)) {
215 		policy = dsched_find_policy(sched_policy);
216 	}
217 
218 	if (policy) {
219 		dsched_switch(dp, policy);
220 	}
221 
222 	dsched_sysctl_add_disk(
223 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
224 	    info->d_serialno);
225 
226 	lockmgr(&dsched_lock, LK_RELEASE);
227 }
228 
229 /*
230  * Called on disk_destroy().
231  * Shuts down the scheduler core and cancels all remaining bios.
232  */
233 void
234 dsched_disk_destroy_callback(struct disk *dp)
235 {
236 	struct dsched_policy *old_policy;
237 	struct dsched_disk_ctx *diskctx;
238 
239 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
240 
241 	diskctx = dsched_get_disk_priv(dp);
242 
243 	old_policy = dp->d_sched_policy;
244 	dp->d_sched_policy = &dsched_noop_policy;
245 	old_policy->cancel_all(dsched_get_disk_priv(dp));
246 	old_policy->teardown(dsched_get_disk_priv(dp));
247 
248 	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
249 		sysctl_ctx_free(&diskctx->sysctl_ctx);
250 
251 	policy_destroy(dp);
252 	atomic_subtract_int(&old_policy->ref_count, 1);
253 	KKASSERT(old_policy->ref_count >= 0);
254 
255 	lockmgr(&dsched_lock, LK_RELEASE);
256 }
257 
258 
259 void
260 dsched_queue(struct disk *dp, struct bio *bio)
261 {
262 	struct dsched_thread_ctx	*tdctx;
263 	struct dsched_thread_io		*tdio;
264 	struct dsched_disk_ctx		*diskctx;
265 
266 	int found = 0, error = 0;
267 
268 	tdctx = dsched_get_buf_priv(bio->bio_buf);
269 	if (tdctx == NULL) {
270 		/* No thread context; dispatch the bio directly without scheduling */
271 		atomic_add_int(&dsched_stats.no_tdctx, 1);
272 		dsched_strategy_raw(dp, bio);
273 		return;
274 	}
275 
276 	DSCHED_THREAD_CTX_LOCK(tdctx);
277 
278 	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
279 	/*
280 	 * XXX:
281 	 * iterate in reverse to make sure we find the most up-to-date
282 	 * tdio for a given disk. After a switch it may take some time
283 	 * for everything to clean up.
284 	 */
285 	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
286 		if (tdio->dp == dp) {
287 			dsched_thread_io_ref(tdio);
288 			found = 1;
289 			break;
290 		}
291 	}
292 
293 	DSCHED_THREAD_CTX_UNLOCK(tdctx);
294 	dsched_clr_buf_priv(bio->bio_buf);
295 	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
296 
297 	KKASSERT(found == 1);
298 	diskctx = dsched_get_disk_priv(dp);
299 	dsched_disk_ctx_ref(diskctx);
300 
301 	if (dp->d_sched_policy != &dsched_noop_policy)
302 		KKASSERT(tdio->debug_policy == dp->d_sched_policy);
303 
304 	KKASSERT(tdio->debug_inited == 0xF00F1234);
305 
306 	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
307 
308 	if (error) {
309 		dsched_strategy_raw(dp, bio);
310 	}
311 	dsched_disk_ctx_unref(diskctx);
312 	dsched_thread_io_unref(tdio);
313 }
314 
315 
316 /*
317  * Called from each policy's module_init or module_attach.
318  * Registers the policy in the local policy list.
319  */
320 int
321 dsched_register(struct dsched_policy *d_policy)
322 {
323 	struct dsched_policy *policy;
324 	int error = 0;
325 
326 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
327 
328 	policy = dsched_find_policy(d_policy->name);
329 
330 	if (!policy) {
331 		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
332 		atomic_add_int(&d_policy->ref_count, 1);
333 	} else {
334 		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
335 		    d_policy->name);
336 		error = EEXIST;
337 	}
338 
339 	lockmgr(&dsched_lock, LK_RELEASE);
340 	return error;
341 }
342 
343 /*
344  * Called from each policy's module_detach.
345  * Unregisters the policy.
346  */
347 int
348 dsched_unregister(struct dsched_policy *d_policy)
349 {
350 	struct dsched_policy *policy;
351 
352 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
353 	policy = dsched_find_policy(d_policy->name);
354 
355 	if (policy) {
356 		if (policy->ref_count > 1) {
357 			lockmgr(&dsched_lock, LK_RELEASE);
358 			return EBUSY;
359 		}
360 		TAILQ_REMOVE(&dsched_policy_list, policy, link);
361 		atomic_subtract_int(&policy->ref_count, 1);
362 		KKASSERT(policy->ref_count == 0);
363 	}
364 	lockmgr(&dsched_lock, LK_RELEASE);
365 
366 	return 0;
367 }
368 
369 
370 /*
371  * Switches the policy by first removing the old one and then
372  * enabling the new one.
373  */
374 int
375 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
376 {
377 	struct dsched_policy *old_policy;
378 
379 	/* If we are asked to set the same policy, do nothing */
380 	if (dp->d_sched_policy == new_policy)
381 		return 0;
382 
383 	/* lock everything down, diskwise */
384 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
385 	old_policy = dp->d_sched_policy;
386 
387 	atomic_subtract_int(&old_policy->ref_count, 1);
388 	KKASSERT(old_policy->ref_count >= 0);
389 
390 	dp->d_sched_policy = &dsched_noop_policy;
391 	old_policy->teardown(dsched_get_disk_priv(dp));
392 	policy_destroy(dp);
393 
394 	/* Bring everything back to life */
395 	dsched_set_policy(dp, new_policy);
396 	lockmgr(&dsched_lock, LK_RELEASE);
397 
398 	return 0;
399 }
400 
401 
402 /*
403  * Loads a given policy and attaches it to the specified disk.
404  * Also initializes the core for the policy
405  */
406 void
407 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
408 {
409 	int locked = 0;
410 
411 	/* Check if it is locked already. If not, we acquire the dsched lock */
412 	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
413 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
414 		locked = 1;
415 	}
416 
417 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
418 
419 	policy_new(dp, new_policy);
420 	new_policy->prepare(dsched_get_disk_priv(dp));
421 	dp->d_sched_policy = new_policy;
422 
423 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
424 
425 	atomic_add_int(&new_policy->ref_count, 1);
426 	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
427 	    new_policy->name);
428 
429 	/* If we acquired the lock here, release it again */
430 	if (locked)
431 		lockmgr(&dsched_lock, LK_RELEASE);
432 }
433 
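/*
 * Look up a registered policy by name.  Returns the policy, or NULL if no
 * policy with that name has been registered.
 */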
434 struct dsched_policy*
435 dsched_find_policy(char *search)
436 {
437 	struct dsched_policy *policy;
438 	struct dsched_policy *policy_found = NULL;
439 	int locked = 0;
440 
441 	/* Check if it is locked already. If not, we acquire the dsched lock */
442 	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
443 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
444 		locked = 1;
445 	}
446 
447 	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
448 		if (!strcmp(policy->name, search)) {
449 			policy_found = policy;
450 			break;
451 		}
452 	}
453 
454 	/* If we acquired the lock here, release it again */
455 	if (locked)
456 		lockmgr(&dsched_lock, LK_RELEASE);
457 
458 	return policy_found;
459 }
460 
461 /*
462  * Returns the ref'd disk matching the given device name, or NULL.
463  */
464 struct disk *
465 dsched_find_disk(char *search)
466 {
467 	struct disk marker;
468 	struct disk *dp = NULL;
469 
470 	while ((dp = disk_enumerate(&marker, dp)) != NULL) {
471 		if (strcmp(dp->d_cdev->si_name, search) == 0) {
472 			disk_enumerate_stop(&marker, NULL);
473 			/* leave ref on dp */
474 			break;
475 		}
476 	}
477 	return dp;
478 }
479 
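/*
 * Enumerate, disk_enumerate() style, the disks currently attached to the
 * given policy.  Returns the next matching disk or NULL when the
 * enumeration is exhausted.
 */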
480 struct disk *
481 dsched_disk_enumerate(struct disk *marker, struct disk *dp,
482 		      struct dsched_policy *policy)
483 {
484 	while ((dp = disk_enumerate(marker, dp)) != NULL) {
485 		if (dp->d_sched_policy == policy)
486 			break;
487 	}
488 	return dp;
489 }
490 
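/*
 * Walk the list of registered policies: pass NULL to get the first entry,
 * or a previously returned policy to get the next one.
 */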
491 struct dsched_policy *
492 dsched_policy_enumerate(struct dsched_policy *pol)
493 {
494 	if (!pol)
495 		return (TAILQ_FIRST(&dsched_policy_list));
496 	else
497 		return (TAILQ_NEXT(pol, link));
498 }
499 
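/*
 * Complete a bio without dispatching it: mark it failed with ENXIO and
 * report the whole request as not transferred.
 */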
500 void
501 dsched_cancel_bio(struct bio *bp)
502 {
503 	bp->bio_buf->b_error = ENXIO;
504 	bp->bio_buf->b_flags |= B_ERROR;
505 	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
506 
507 	biodone(bp);
508 }
509 
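/*
 * Dispatch a bio directly to the disk's raw device, bypassing any policy
 * queueing.
 */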
510 void
511 dsched_strategy_raw(struct disk *dp, struct bio *bp)
512 {
513 	/*
514 	 * Ideally, this stuff shouldn't be needed... but just in case,
515 	 * we leave it in to avoid panics.
516 	 */
517 	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
518 	if (bp->bio_track != NULL) {
519 		dsched_debug(LOG_INFO,
520 		    "dsched_strategy_raw sees non-NULL bio_track!! "
521 		    "bio: %p\n", bp);
522 		bp->bio_track = NULL;
523 	}
524 	dev_dstrategy(dp->d_rawdev, bp);
525 }
526 
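/*
 * Dispatch a bio synchronously: clone it onto a temporary pbuf, wait for
 * the device to complete it, then copy back the result and finish the
 * original bio.
 */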
527 void
528 dsched_strategy_sync(struct disk *dp, struct bio *bio)
529 {
530 	struct buf *bp, *nbp;
531 	struct bio *nbio;
532 
533 	bp = bio->bio_buf;
534 
535 	nbp = getpbuf(NULL);
536 	nbio = &nbp->b_bio1;
537 
538 	nbp->b_cmd = bp->b_cmd;
539 	nbp->b_bufsize = bp->b_bufsize;
540 	nbp->b_runningbufspace = bp->b_runningbufspace;
541 	nbp->b_bcount = bp->b_bcount;
542 	nbp->b_resid = bp->b_resid;
543 	nbp->b_data = bp->b_data;
544 #if 0
545 	/*
546 	 * Buffers undergoing device I/O do not need a kvabase/size.
547 	 */
548 	nbp->b_kvabase = bp->b_kvabase;
549 	nbp->b_kvasize = bp->b_kvasize;
550 #endif
551 	nbp->b_dirtyend = bp->b_dirtyend;
552 
553 	nbio->bio_done = biodone_sync;
554 	nbio->bio_flags |= BIO_SYNC;
555 	nbio->bio_track = NULL;
556 
557 	nbio->bio_caller_info1.ptr = dp;
558 	nbio->bio_offset = bio->bio_offset;
559 
560 	dev_dstrategy(dp->d_rawdev, nbio);
561 	biowait(nbio, "dschedsync");
562 	bp->b_resid = nbp->b_resid;
563 	bp->b_error = nbp->b_error;
564 	biodone(bio);
565 #if 0
566 	nbp->b_kvabase = NULL;
567 	nbp->b_kvasize = 0;
568 #endif
569 	relpbuf(nbp, NULL);
570 }
571 
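/*
 * Dispatch a bio asynchronously with a caller supplied biodone callback.
 * The disk and the caller's private data are stashed in the pushed bio for
 * the callback to retrieve, and the submission time is recorded.
 */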
572 void
573 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
574 {
575 	struct bio *nbio;
576 
577 	nbio = push_bio(bio);
578 	nbio->bio_done = done;
579 	nbio->bio_offset = bio->bio_offset;
580 
581 	dsched_set_bio_dp(nbio, dp);
582 	dsched_set_bio_priv(nbio, priv);
583 
584 	getmicrotime(&nbio->bio_caller_info3.tv);
585 	dev_dstrategy(dp->d_rawdev, nbio);
586 }
587 
588 /*
589  * A special biodone callback function
590  * used by policies that implement request polling.
591  */
592 static void
593 request_polling_biodone(struct bio *bp)
594 {
595 	struct dsched_disk_ctx *diskctx = NULL;
596 	struct disk *dp = NULL;
597 	struct bio *obio;
598 	struct dsched_policy *policy;
599 
600 	dp = dsched_get_bio_dp(bp);
601 	policy = dp->d_sched_policy;
602 	diskctx = dsched_get_disk_priv(dp);
603 	KKASSERT(diskctx && policy);
604 	dsched_disk_ctx_ref(diskctx);
605 
606 	/*
607 	 * XXX:
608 	 * the bio_done function must not block!
609 	 */
610 	if (diskctx->dp->d_sched_policy->bio_done)
611 		diskctx->dp->d_sched_policy->bio_done(bp);
612 
613 	obio = pop_bio(bp);
614 	biodone(obio);
615 
616 	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
617 
618 	/*
619 	 * Call the polling function.
620 	 * XXX: the polling function must not block!
621 	 */
622 	if (policy->polling_func)
623 		policy->polling_func(diskctx);
624 	else
625 		dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
626 	dsched_disk_ctx_unref(diskctx);
627 }
628 
629 /*
630  * A special dsched strategy used by policies that implement request
631  * polling (i.e. provide a polling function).
632  *
633  * The strategy is just like dsched_strategy_async(), but
634  * the biodone callback is set to a preset one.
635  *
636  * If the policy needs its own biodone callback, it should
637  * register it in the policy structure. (bio_done field)
638  *
639  * The current_tag_queue_depth is maintained by this function
640  * and the request_polling_biodone() function
641  */
642 
643 void
644 dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
645 {
646 	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
647 	dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
648 }
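
/*
 * Illustrative flow (an assumption about how a policy would use this, not
 * something defined here): the policy's bio_queue method calls
 * dsched_strategy_request_polling() while current_tag_queue_depth is below
 * max_tag_queue_depth, and its polling_func, invoked from
 * request_polling_biodone(), submits the next queued bio once a tag frees up.
 */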
649 
650 /*
651  * Ref and deref various structures.  The 1->0 transition of the reference
652  * count actually transitions 1->0x80000000 and causes the object to be
653  * destroyed.  It is possible for transitory references to occur on the
654  * object while it is being destroyed.  We use bit 31 to indicate that
655  * destruction is in progress and to prevent nested destructions.
656  */
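/*
 * Example of the scheme described above: the final unref moves the count
 * 1 -> 0x80000000 and runs the destructor; a transitory ref taken during
 * destruction makes it 0x80000001, and dropping it (0x80000001 ->
 * 0x80000000) is handled as a normal decrement rather than a nested
 * destruction.
 */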
657 void
658 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
659 {
660 	int refcount;
661 
662 	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
663 }
664 
665 void
666 dsched_thread_io_ref(struct dsched_thread_io *tdio)
667 {
668 	int refcount;
669 
670 	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
671 }
672 
673 void
674 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
675 {
676 	int refcount;
677 
678 	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
679 }
680 
681 void
682 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
683 {
684 	int refs;
685 	int nrefs;
686 
687 	/*
688 	 * Handle 1->0 transitions for diskctx and nested destruction
689 	 * recursions.  If the refs are already in destruction mode (bit 31
690 	 * set) on the 1->0 transition we don't try to destruct it again.
691 	 *
692 	 * 0x80000001->0x80000000 transitions are handled normally and
693 	 * thus avoid nested destruction.
694 	 */
695 	for (;;) {
696 		refs = diskctx->refcount;
697 		cpu_ccfence();
698 		nrefs = refs - 1;
699 
700 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
701 		if (nrefs) {
702 			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
703 				break;
704 			continue;
705 		}
706 		nrefs = 0x80000000;
707 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
708 			dsched_disk_ctx_destroy(diskctx);
709 			break;
710 		}
711 	}
712 }
713 
714 static
715 void
716 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
717 {
718 	struct dsched_thread_io	*tdio;
719 	int refs;
720 	int nrefs;
721 
722 #if 0
723 	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
724 	print_backtrace(4);
725 #endif
726 	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
727 	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
728 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
729 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
730 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
731 		tdio->diskctx = NULL;
732 		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
733 		lockmgr(&diskctx->lock, LK_RELEASE);
734 		dsched_thread_io_unref_destroy(tdio);
735 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
736 	}
737 	lockmgr(&diskctx->lock, LK_RELEASE);
738 
739 	/*
740 	 * Expect diskctx->refcount to be 0x80000000.  If it isn't, someone
741 	 * else still has a temporary ref on the diskctx and we have to
742 	 * transition it back to an undestroyed-state (albeit without any
743 	 * associations), so the other user destroys it properly when the
744 	 * ref is released.
745 	 */
746 	while ((refs = diskctx->refcount) != 0x80000000) {
747 		kprintf("dsched_thread_io: destroy race diskctx=%p\n", diskctx);
748 		cpu_ccfence();
749 		KKASSERT(refs & 0x80000000);
750 		nrefs = refs & 0x7FFFFFFF;
751 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
752 			return;
753 	}
754 
755 	/*
756 	 * Really for sure now.
757 	 */
758 	if (diskctx->dp->d_sched_policy->destroy_diskctx)
759 		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
760 	objcache_put(dsched_diskctx_cache, diskctx);
761 	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
762 }
763 
764 void
765 dsched_thread_io_unref(struct dsched_thread_io *tdio)
766 {
767 	int refs;
768 	int nrefs;
769 
770 	/*
771 	 * Handle 1->0 transitions for tdio and nested destruction
772 	 * recursions.  If the refs are already in destruction mode (bit 31
773 	 * set) on the 1->0 transition we don't try to destruct it again.
774 	 *
775 	 * 0x80000001->0x80000000 transitions are handled normally and
776 	 * thus avoid nested destruction.
777 	 */
778 	for (;;) {
779 		refs = tdio->refcount;
780 		cpu_ccfence();
781 		nrefs = refs - 1;
782 
783 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
784 		if (nrefs) {
785 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
786 				break;
787 			continue;
788 		}
789 		nrefs = 0x80000000;
790 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
791 			dsched_thread_io_destroy(tdio);
792 			break;
793 		}
794 	}
795 }
796 
797 /*
798  * Unref and destroy the tdio even if additional refs are present.
799  */
800 static
801 void
802 dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
803 {
804 	int refs;
805 	int nrefs;
806 
807 	/*
808 	 * If not already transitioned to destroy-in-progress we transition
809 	 * to destroy-in-progress, cleanup our ref, and destroy the tdio.
810 	 */
811 	for (;;) {
812 		refs = tdio->refcount;
813 		cpu_ccfence();
814 		nrefs = refs - 1;
815 
816 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
817 		if (nrefs & 0x80000000) {
818 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
819 				break;
820 			continue;
821 		}
822 		nrefs |= 0x80000000;
823 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
824 			dsched_thread_io_destroy(tdio);
825 			break;
826 		}
827 	}
828 }
829 
830 static void
831 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
832 {
833 	struct dsched_thread_ctx *tdctx;
834 	struct dsched_disk_ctx	*diskctx;
835 	int refs;
836 	int nrefs;
837 
838 #if 0
839 	kprintf("tdio (%p) destruction started, trace:\n", tdio);
840 	print_backtrace(8);
841 #endif
842 	KKASSERT(tdio->qlength == 0);
843 
844 	while ((diskctx = tdio->diskctx) != NULL) {
845 		dsched_disk_ctx_ref(diskctx);
846 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
847 		if (diskctx != tdio->diskctx) {
848 			lockmgr(&diskctx->lock, LK_RELEASE);
849 			dsched_disk_ctx_unref(diskctx);
850 			continue;
851 		}
852 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
853 		if (diskctx->dp->d_sched_policy->destroy_tdio)
854 			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
855 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
856 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
857 		tdio->diskctx = NULL;
858 		dsched_thread_io_unref(tdio);
859 		lockmgr(&diskctx->lock, LK_RELEASE);
860 		dsched_disk_ctx_unref(diskctx);
861 	}
862 	while ((tdctx = tdio->tdctx) != NULL) {
863 		dsched_thread_ctx_ref(tdctx);
864 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
865 		if (tdctx != tdio->tdctx) {
866 			lockmgr(&tdctx->lock, LK_RELEASE);
867 			dsched_thread_ctx_unref(tdctx);
868 			continue;
869 		}
870 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
871 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
872 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
873 		tdio->tdctx = NULL;
874 		dsched_thread_io_unref(tdio);
875 		lockmgr(&tdctx->lock, LK_RELEASE);
876 		dsched_thread_ctx_unref(tdctx);
877 	}
878 
879 	/*
880 	 * Expect tdio->refcount to be 0x80000000.  If it isn't, someone else
881 	 * still has a temporary ref on the tdio and we have to transition
882 	 * it back to an undestroyed-state (albeit without any associations)
883 	 * so the other user destroys it properly when the ref is released.
884 	 */
885 	while ((refs = tdio->refcount) != 0x80000000) {
886 		kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
887 		cpu_ccfence();
888 		KKASSERT(refs & 0x80000000);
889 		nrefs = refs & 0x7FFFFFFF;
890 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
891 			return;
892 	}
893 
894 	/*
895 	 * Really for sure now.
896 	 */
897 	objcache_put(dsched_tdio_cache, tdio);
898 	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
899 }
900 
901 void
902 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
903 {
904 	int refs;
905 	int nrefs;
906 
907 	/*
908 	 * Handle 1->0 transitions for tdctx and nested destruction
909 	 * recursions.  If the refs are already in destruction mode (bit 31
910 	 * set) on the 1->0 transition we don't try to destruct it again.
911 	 *
912 	 * 0x80000001->0x80000000 transitions are handled normally and
913 	 * thus avoid nested destruction.
914 	 */
915 	for (;;) {
916 		refs = tdctx->refcount;
917 		cpu_ccfence();
918 		nrefs = refs - 1;
919 
920 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
921 		if (nrefs) {
922 			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
923 				break;
924 			continue;
925 		}
926 		nrefs = 0x80000000;
927 		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
928 			dsched_thread_ctx_destroy(tdctx);
929 			break;
930 		}
931 	}
932 }
933 
934 static void
935 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
936 {
937 	struct dsched_thread_io	*tdio;
938 
939 #if 0
940 	kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
941 	print_backtrace(8);
942 #endif
943 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
944 
945 	lockmgr(&tdctx->lock, LK_EXCLUSIVE);
946 
947 	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
948 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
949 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
950 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
951 		tdio->tdctx = NULL;
952 		lockmgr(&tdctx->lock, LK_RELEASE);	/* avoid deadlock */
953 		dsched_thread_io_unref_destroy(tdio);
954 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
955 	}
956 	KKASSERT(tdctx->refcount == 0x80000000);
957 	TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
958 
959 	lockmgr(&tdctx->lock, LK_RELEASE);
960 
961 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
962 
963 	objcache_put(dsched_tdctx_cache, tdctx);
964 	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
965 }
966 
967 /*
968  * Ensures that a tdio is assigned to tdctx and disk.
969  */
970 void
971 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
972 		       struct dsched_policy *pol)
973 {
974 	struct dsched_thread_io	*tdio;
975 #if 0
976 	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
977 #endif
978 	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
979 	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
980 
981 	dsched_thread_io_ref(tdio);	/* prevent ripout */
982 	dsched_thread_io_ref(tdio);	/* for diskctx ref */
983 
984 	DSCHED_THREAD_IO_LOCKINIT(tdio);
985 	tdio->dp = dp;
986 
987 	tdio->diskctx = dsched_get_disk_priv(dp);
988 	TAILQ_INIT(&tdio->queue);
989 
990 	if (pol->new_tdio)
991 		pol->new_tdio(tdio);
992 
993 	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
994 	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
995 	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
996 	lockmgr(&tdio->diskctx->lock, LK_RELEASE);
997 
998 	if (tdctx) {
999 		/*
1000 		 * Put the tdio in the tdctx list.  Inherit the temporary
1001 		 * ref (one ref for each list).
1002 		 */
1003 		DSCHED_THREAD_CTX_LOCK(tdctx);
1004 		tdio->tdctx = tdctx;
1005 		tdio->p = tdctx->p;
1006 		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
1007 		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
1008 		DSCHED_THREAD_CTX_UNLOCK(tdctx);
1009 	} else {
1010 		dsched_thread_io_unref(tdio);
1011 	}
1012 
1013 	tdio->debug_policy = pol;
1014 	tdio->debug_inited = 0xF00F1234;
1015 
1016 	atomic_add_int(&dsched_stats.tdio_allocations, 1);
1017 }
1018 
1019 
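/*
 * Allocate and initialize a diskctx for the given disk and policy and
 * return it with one reference held.
 */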
1020 struct dsched_disk_ctx *
1021 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
1022 {
1023 	struct dsched_disk_ctx *diskctx;
1024 
1025 	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
1026 	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
1027 	dsched_disk_ctx_ref(diskctx);
1028 	diskctx->dp = dp;
1029 	DSCHED_DISK_CTX_LOCKINIT(diskctx);
1030 	TAILQ_INIT(&diskctx->tdio_list);
1031 	/*
1032 	 * XXX: magic number 32: most devices have a tag queue
1033 	 * of depth 32.
1034 	 * Better to retrieve a more precise value from the driver.
1035 	 */
1036 	diskctx->max_tag_queue_depth = 32;
1037 	diskctx->current_tag_queue_depth = 0;
1038 
1039 	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
1040 	if (pol->new_diskctx)
1041 		pol->new_diskctx(diskctx);
1042 	return diskctx;
1043 }
1044 
1045 
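/*
 * Allocate a ref'd thread context for proc p (NULL for a pure kernel
 * thread), create a tdio for it on every disk and put it on the global
 * tdctx list.
 */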
1046 struct dsched_thread_ctx *
1047 dsched_thread_ctx_alloc(struct proc *p)
1048 {
1049 	struct dsched_thread_ctx	*tdctx;
1050 	struct disk marker;
1051 	struct disk *dp;
1052 
1053 	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
1054 	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
1055 	dsched_thread_ctx_ref(tdctx);
1056 #if 0
1057 	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
1058 #endif
1059 	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
1060 	TAILQ_INIT(&tdctx->tdio_list);
1061 	tdctx->p = p;
1062 
1063 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
1064 	dp = NULL;
1065 	while ((dp = disk_enumerate(&marker, dp)) != NULL)
1066 		dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
1067 
1068 	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
1069 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1070 
1071 	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
1072 	/* XXX: no callback here */
1073 	return tdctx;
1074 }
1075 
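/*
 * Attach a freshly allocated diskctx for the policy to the disk and create
 * a tdio on it for every known thread context.
 */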
1076 void
1077 policy_new(struct disk *dp, struct dsched_policy *pol) {
1078 	struct dsched_thread_ctx *tdctx;
1079 	struct dsched_disk_ctx *diskctx;
1080 
1081 	diskctx = dsched_disk_ctx_alloc(dp, pol);
1082 	dsched_disk_ctx_ref(diskctx);
1083 	dsched_set_disk_priv(dp, diskctx);
1084 
1085 	/*
1086 	 * XXX this is really really expensive!
1087 	 */
1088 	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link)
1089 		dsched_thread_io_alloc(dp, tdctx, pol);
1090 }
1091 
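/*
 * Drop the two diskctx references taken when the policy was attached and
 * clear the disk's scheduler private data.
 */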
1092 void
1093 policy_destroy(struct disk *dp) {
1094 	struct dsched_disk_ctx *diskctx;
1095 
1096 	diskctx = dsched_get_disk_priv(dp);
1097 	KKASSERT(diskctx != NULL);
1098 
1099 	dsched_disk_ctx_unref(diskctx); /* from prepare */
1100 	dsched_disk_ctx_unref(diskctx); /* from alloc */
1101 
1102 	dsched_set_disk_priv(dp, NULL);
1103 }
1104 
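/*
 * Called whenever a buf is allocated: associate the current thread's (or
 * process's) thread context with the buf so dsched_queue() can find it
 * later.
 */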
1105 void
1106 dsched_new_buf(struct buf *bp)
1107 {
1108 	struct dsched_thread_ctx	*tdctx = NULL;
1109 
1110 	if (dsched_inited == 0)
1111 		return;
1112 
1113 	if (curproc != NULL) {
1114 		tdctx = dsched_get_proc_priv(curproc);
1115 	} else {
1116 		/* This is a kernel thread, so no proc info is available */
1117 		tdctx = dsched_get_thread_priv(curthread);
1118 	}
1119 
1120 #if 0
1121 	/*
1122 	 * XXX: hack. We don't want this assert because we aren't catching all
1123 	 *	threads. mi_startup() is still getting away without a tdctx.
1124 	 */
1125 
1126 	/* By now we should have a tdctx. If not, something bad is going on */
1127 	KKASSERT(tdctx != NULL);
1128 #endif
1129 
1130 	if (tdctx) {
1131 		dsched_thread_ctx_ref(tdctx);
1132 	}
1133 	dsched_set_buf_priv(bp, tdctx);
1134 }
1135 
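/*
 * Called when a buf is released: drop the thread context reference taken
 * in dsched_new_buf(), if any.
 */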
1136 void
1137 dsched_exit_buf(struct buf *bp)
1138 {
1139 	struct dsched_thread_ctx	*tdctx;
1140 
1141 	tdctx = dsched_get_buf_priv(bp);
1142 	if (tdctx != NULL) {
1143 		dsched_clr_buf_priv(bp);
1144 		dsched_thread_ctx_unref(tdctx);
1145 	}
1146 }
1147 
1148 void
1149 dsched_new_proc(struct proc *p)
1150 {
1151 	struct dsched_thread_ctx	*tdctx;
1152 
1153 	if (dsched_inited == 0)
1154 		return;
1155 
1156 	KKASSERT(p != NULL);
1157 
1158 	tdctx = dsched_thread_ctx_alloc(p);
1159 	tdctx->p = p;
1160 	dsched_thread_ctx_ref(tdctx);
1161 
1162 	dsched_set_proc_priv(p, tdctx);
1163 	atomic_add_int(&dsched_stats.nprocs, 1);
1164 }
1165 
1166 
1167 void
1168 dsched_new_thread(struct thread *td)
1169 {
1170 	struct dsched_thread_ctx	*tdctx;
1171 
1172 	if (dsched_inited == 0)
1173 		return;
1174 
1175 	KKASSERT(td != NULL);
1176 
1177 	tdctx = dsched_thread_ctx_alloc(NULL);
1178 	tdctx->td = td;
1179 	dsched_thread_ctx_ref(tdctx);
1180 
1181 	dsched_set_thread_priv(td, tdctx);
1182 	atomic_add_int(&dsched_stats.nthreads, 1);
1183 }
1184 
1185 void
1186 dsched_exit_proc(struct proc *p)
1187 {
1188 	struct dsched_thread_ctx	*tdctx;
1189 
1190 	if (dsched_inited == 0)
1191 		return;
1192 
1193 	KKASSERT(p != NULL);
1194 
1195 	tdctx = dsched_get_proc_priv(p);
1196 	KKASSERT(tdctx != NULL);
1197 
1198 	tdctx->dead = 0xDEAD;
1199 	dsched_set_proc_priv(p, NULL);
1200 
1201 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1202 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1203 	atomic_subtract_int(&dsched_stats.nprocs, 1);
1204 }
1205 
1206 
1207 void
1208 dsched_exit_thread(struct thread *td)
1209 {
1210 	struct dsched_thread_ctx	*tdctx;
1211 
1212 	if (dsched_inited == 0)
1213 		return;
1214 
1215 	KKASSERT(td != NULL);
1216 
1217 	tdctx = dsched_get_thread_priv(td);
1218 	KKASSERT(tdctx != NULL);
1219 
1220 	tdctx->dead = 0xDEAD;
1221 	dsched_set_thread_priv(td, 0);
1222 
1223 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1224 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1225 	atomic_subtract_int(&dsched_stats.nthreads, 1);
1226 }
1227 
1228 /*
1229  * Allocates a ref'd tdio for the calling thread's tdctx on the policy's disk.
1230  *
1231  * The tdio may have additional refs for the diskctx and tdctx it resides on.
1232  */
1233 void
1234 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1235 			      struct dsched_policy *pol)
1236 {
1237 	struct dsched_thread_ctx *tdctx;
1238 
1239 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
1240 
1241 	tdctx = dsched_get_thread_priv(curthread);
1242 	KKASSERT(tdctx != NULL);
1243 	dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1244 
1245 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1246 }
1247 
1248 /* DEFAULT NOOP POLICY */
1249 
1250 static int
1251 noop_prepare(struct dsched_disk_ctx *diskctx)
1252 {
1253 	return 0;
1254 }
1255 
1256 static void
1257 noop_teardown(struct dsched_disk_ctx *diskctx)
1258 {
1259 
1260 }
1261 
1262 static void
1263 noop_cancel(struct dsched_disk_ctx *diskctx)
1264 {
1265 
1266 }
1267 
1268 static int
1269 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1270     struct bio *bio)
1271 {
1272 	dsched_strategy_raw(diskctx->dp, bio);
1273 #if 0
1274 	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
1275 #endif
1276 	return 0;
1277 }
1278 
1279 /*
1280  * SYSINIT stuff
1281  */
1282 static void
1283 dsched_init(void)
1284 {
1285 	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1286 					   NULL, NULL, NULL,
1287 					   objcache_malloc_alloc,
1288 					   objcache_malloc_free,
1289 					   &dsched_thread_io_malloc_args );
1290 
1291 	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1292 					   NULL, NULL, NULL,
1293 					   objcache_malloc_alloc,
1294 					   objcache_malloc_free,
1295 					   &dsched_thread_ctx_malloc_args );
1296 
1297 	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1298 					   NULL, NULL, NULL,
1299 					   objcache_malloc_alloc,
1300 					   objcache_malloc_free,
1301 					   &dsched_disk_ctx_malloc_args );
1302 
1303 	bzero(&dsched_stats, sizeof(struct dsched_stats));
1304 
1305 	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1306 	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1307 
1308 	dsched_register(&dsched_noop_policy);
1309 
1310 	dsched_inited = 1;
1311 }
1312 
1313 static void
1314 dsched_uninit(void)
1315 {
1316 }
1317 
1318 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1319 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
1320 
1321 /*
1322  * SYSCTL stuff
1323  */
1324 static int
1325 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1326 {
1327 	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1328 }
1329 
1330 static int
1331 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1332 {
1333 	struct dsched_policy *pol = NULL;
1334 	int error, first = 1;
1335 
1336 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1337 
1338 	while ((pol = dsched_policy_enumerate(pol))) {
1339 		if (!first) {
1340 			error = SYSCTL_OUT(req, " ", 1);
1341 			if (error)
1342 				break;
1343 		} else {
1344 			first = 0;
1345 		}
1346 		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1347 		if (error)
1348 			break;
1349 
1350 	}
1351 
1352 	lockmgr(&dsched_lock, LK_RELEASE);
1353 
1354 	error = SYSCTL_OUT(req, "", 1);
1355 
1356 	return error;
1357 }
1358 
1359 static int
1360 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1361 {
1362 	char buf[DSCHED_POLICY_NAME_LENGTH];
1363 	struct dsched_disk_ctx *diskctx = arg1;
1364 	struct dsched_policy *pol = NULL;
1365 	int error;
1366 
1367 	if (diskctx == NULL) {
1368 		return 0;
1369 	}
1370 
1371 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1372 
1373 	pol = diskctx->dp->d_sched_policy;
1374 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1375 
1376 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1377 	if (error || req->newptr == NULL) {
1378 		lockmgr(&dsched_lock, LK_RELEASE);
1379 		return (error);
1380 	}
1381 
1382 	pol = dsched_find_policy(buf);
1383 	if (pol == NULL) {
1384 		lockmgr(&dsched_lock, LK_RELEASE);
1385 		return 0;
1386 	}
1387 
1388 	dsched_switch(diskctx->dp, pol);
1389 
1390 	lockmgr(&dsched_lock, LK_RELEASE);
1391 
1392 	return error;
1393 }
1394 
1395 static int
1396 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1397 {
1398 	char buf[DSCHED_POLICY_NAME_LENGTH];
1399 	struct dsched_policy *pol = NULL;
1400 	int error;
1401 
1402 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1403 
1404 	pol = default_policy;
1405 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1406 
1407 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1408 	if (error || req->newptr == NULL) {
1409 		lockmgr(&dsched_lock, LK_RELEASE);
1410 		return (error);
1411 	}
1412 
1413 	pol = dsched_find_policy(buf);
1414 	if (pol == NULL) {
1415 		lockmgr(&dsched_lock, LK_RELEASE);
1416 		return 0;
1417 	}
1418 
1419 	default_set = 1;
1420 	default_policy = pol;
1421 
1422 	lockmgr(&dsched_lock, LK_RELEASE);
1423 
1424 	return error;
1425 }
1426 
1427 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1428     "Disk Scheduler Framework (dsched) magic");
1429 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1430     "List of disks and their policies");
1431 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1432     0, "Enable dsched debugging");
1433 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1434     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1435     "dsched statistics");
1436 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1437     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1438 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1439     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
1440 
1441 static void
1442 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1443 {
1444 	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1445 		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1446 		sysctl_ctx_init(&diskctx->sysctl_ctx);
1447 	}
1448 
1449 	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1450 	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1451 	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
1452 }
1453