xref: /dragonfly/sys/kern/kern_dsched.c (revision 38c2ea22)
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
50 #include <sys/buf2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
54 
55 TAILQ_HEAD(tdio_list_head, dsched_thread_io);
56 
57 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
58 
59 static dsched_prepare_t		noop_prepare;
60 static dsched_teardown_t	noop_teardown;
61 static dsched_cancel_t		noop_cancel;
62 static dsched_queue_t		noop_queue;
63 
64 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
65 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
66 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
67 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
68 
69 static int	dsched_inited = 0;
70 static int	default_set = 0;
71 
72 struct lock	dsched_lock;
73 static int	dsched_debug_enable = 0;
74 
75 struct dsched_stats	dsched_stats;
76 
77 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
78 	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
79 struct objcache_malloc_args dsched_thread_io_malloc_args = {
80 	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
81 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
82 	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
83 
84 static struct objcache	*dsched_diskctx_cache;
85 static struct objcache	*dsched_tdctx_cache;
86 static struct objcache	*dsched_tdio_cache;
87 
88 TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
89 		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
90 
91 struct lock	dsched_tdctx_lock;
92 
93 static struct dsched_policy_head dsched_policy_list =
94 		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
95 
96 static struct dsched_policy dsched_noop_policy = {
97 	.name = "noop",
98 
99 	.prepare = noop_prepare,
100 	.teardown = noop_teardown,
101 	.cancel_all = noop_cancel,
102 	.bio_queue = noop_queue
103 };
104 
105 static struct dsched_policy *default_policy = &dsched_noop_policy;
106 
107 /*
108  * dsched_debug() is a debug output function controlled by a SYSCTL and
109  * TUNABLE; it prints via kvprintf() when level <= dsched_debug_enable.
110  */
111 int
112 dsched_debug(int level, char *fmt, ...)
113 {
114 	__va_list ap;
115 
116 	__va_start(ap, fmt);
117 	if (level <= dsched_debug_enable)
118 		kvprintf(fmt, ap);
119 	__va_end(ap);
120 
121 	return 0;
122 }
123 
124 /*
125  * Called on disk_create().  Tries to read which policy to use from
126  * loader.conf; if none is specified, or the named policy cannot be found,
127  * the default policy is used.  (See the example loader.conf lines below.)
128  */
129 void
130 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
131 {
132 	char tunable_key[SPECNAMELEN + 48];
133 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
134 	char *ptr;
135 	struct dsched_policy *policy = NULL;
136 
137 	/* Also look for serno stuff? */
138 	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
139 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
140 
141 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
142 	    head_name, unit);
143 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
144 	    sizeof(sched_policy)) != 0) {
145 		policy = dsched_find_policy(sched_policy);
146 	}
147 
148 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
149 	    head_name);
150 	for (ptr = tunable_key; *ptr; ptr++) {
151 		if (*ptr == '/')
152 			*ptr = '-';
153 	}
154 	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
155 	    sizeof(sched_policy)) != 0)) {
156 		policy = dsched_find_policy(sched_policy);
157 	}
158 
159 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
160 	if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
161 	    sizeof(sched_policy)) != 0)) {
162 		policy = dsched_find_policy(sched_policy);
163 	}
164 
165 	if (!policy) {
166 		if (!default_set && bootverbose) {
167 			dsched_debug(0,
168 				     "No policy for %s%d specified, "
169 				     "or policy not found\n",
170 				     head_name, unit);
171 		}
172 		dsched_set_policy(dp, default_policy);
173 	} else {
174 		dsched_set_policy(dp, policy);
175 	}
176 
177 	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
178 		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
179 	else
180 		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
181 	for (ptr = tunable_key; *ptr; ptr++) {
182 		if (*ptr == '/')
183 			*ptr = '-';
184 	}
185 	dsched_sysctl_add_disk(
186 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
187 	    tunable_key);
188 
189 	lockmgr(&dsched_lock, LK_RELEASE);
190 }
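
/*
 * Illustrative loader.conf usage for the tunables probed above.  The disk
 * names are placeholders; "noop" is the built-in policy, other names only
 * work if the corresponding policy module is loaded at boot:
 *
 *	dsched.policy.default="noop"	# default for all disks
 *	dsched.policy.ad0="noop"	# policy for one specific disk
 *	dsched.policy.ad="noop"		# policy for a whole head name/driver
 */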
191 
192 /*
193  * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
194  * there's any policy associated with the serial number of the device.
195  */
196 void
197 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
198 {
199 	char tunable_key[SPECNAMELEN + 48];
200 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
201 	struct dsched_policy *policy = NULL;
202 
203 	if (info->d_serialno == NULL)
204 		return;
205 
206 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
207 
208 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
209 	    info->d_serialno);
210 
211 	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
212 	    sizeof(sched_policy)) != 0)) {
213 		policy = dsched_find_policy(sched_policy);
214 	}
215 
216 	if (policy) {
217 		dsched_switch(dp, policy);
218 	}
219 
220 	dsched_sysctl_add_disk(
221 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
222 	    info->d_serialno);
223 
224 	lockmgr(&dsched_lock, LK_RELEASE);
225 }
226 
227 /*
228  * Called on disk_destroy().
229  * Shuts down the scheduler core and cancels all remaining bios.
230  */
231 void
232 dsched_disk_destroy_callback(struct disk *dp)
233 {
234 	struct dsched_policy *old_policy;
235 	struct dsched_disk_ctx *diskctx;
236 
237 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
238 
239 	diskctx = dsched_get_disk_priv(dp);
240 
241 	old_policy = dp->d_sched_policy;
242 	dp->d_sched_policy = &dsched_noop_policy;
243 	old_policy->cancel_all(dsched_get_disk_priv(dp));
244 	old_policy->teardown(dsched_get_disk_priv(dp));
245 
246 	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
247 		sysctl_ctx_free(&diskctx->sysctl_ctx);
248 
249 	policy_destroy(dp);
250 	atomic_subtract_int(&old_policy->ref_count, 1);
251 	KKASSERT(old_policy->ref_count >= 0);
252 
253 	lockmgr(&dsched_lock, LK_RELEASE);
254 }
255 
256 
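/*
 * Main entry point for scheduling a bio on a disk.  Looks up the issuing
 * thread's per-disk tdio and hands the bio to the disk's current policy;
 * if no thread context is attached to the buf, or the policy refuses the
 * bio, it is dispatched directly via dsched_strategy_raw().
 */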
257 void
258 dsched_queue(struct disk *dp, struct bio *bio)
259 {
260 	struct dsched_thread_ctx	*tdctx;
261 	struct dsched_thread_io		*tdio;
262 	struct dsched_disk_ctx		*diskctx;
263 
264 	int found = 0, error = 0;
265 
266 	tdctx = dsched_get_buf_priv(bio->bio_buf);
267 	if (tdctx == NULL) {
268 		/* No thread context; bypass the scheduler and dispatch directly */
269 		atomic_add_int(&dsched_stats.no_tdctx, 1);
270 		dsched_strategy_raw(dp, bio);
271 		return;
272 	}
273 
274 	DSCHED_THREAD_CTX_LOCK(tdctx);
275 
276 	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
277 	/*
278 	 * XXX:
279 	 * iterate in reverse to make sure we find the most up-to-date
280 	 * tdio for a given disk. After a switch it may take some time
281 	 * for everything to clean up.
282 	 */
283 	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
284 		if (tdio->dp == dp) {
285 			dsched_thread_io_ref(tdio);
286 			found = 1;
287 			break;
288 		}
289 	}
290 
291 	DSCHED_THREAD_CTX_UNLOCK(tdctx);
292 	dsched_clr_buf_priv(bio->bio_buf);
293 	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
294 
295 	KKASSERT(found == 1);
296 	diskctx = dsched_get_disk_priv(dp);
297 	dsched_disk_ctx_ref(diskctx);
298 
299 	if (dp->d_sched_policy != &dsched_noop_policy)
300 		KKASSERT(tdio->debug_policy == dp->d_sched_policy);
301 
302 	KKASSERT(tdio->debug_inited == 0xF00F1234);
303 
304 	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
305 
306 	if (error) {
307 		dsched_strategy_raw(dp, bio);
308 	}
309 	dsched_disk_ctx_unref(diskctx);
310 	dsched_thread_io_unref(tdio);
311 }
312 
313 
314 /*
315  * Called from each policy's module_init or module_attach;
316  * registers the policy in the local policy list.
317  */
318 int
319 dsched_register(struct dsched_policy *d_policy)
320 {
321 	struct dsched_policy *policy;
322 	int error = 0;
323 
324 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
325 
326 	policy = dsched_find_policy(d_policy->name);
327 
328 	if (!policy) {
329 		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
330 		atomic_add_int(&d_policy->ref_count, 1);
331 	} else {
332 		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
333 		    d_policy->name);
334 		error = EEXIST;
335 	}
336 
337 	lockmgr(&dsched_lock, LK_RELEASE);
338 	return error;
339 }
340 
341 /*
342  * Called from each policy's module_detach;
343  * unregisters the policy.
344  */
345 int
346 dsched_unregister(struct dsched_policy *d_policy)
347 {
348 	struct dsched_policy *policy;
349 
350 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
351 	policy = dsched_find_policy(d_policy->name);
352 
353 	if (policy) {
354 		if (policy->ref_count > 1) {
355 			lockmgr(&dsched_lock, LK_RELEASE);
356 			return EBUSY;
357 		}
358 		TAILQ_REMOVE(&dsched_policy_list, policy, link);
359 		atomic_subtract_int(&policy->ref_count, 1);
360 		KKASSERT(policy->ref_count == 0);
361 	}
362 	lockmgr(&dsched_lock, LK_RELEASE);
363 
364 	return 0;
365 }
366 
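/*
 * A minimal sketch of how an external policy module might register and
 * unregister itself.  The "example" policy and its callbacks are
 * hypothetical; the dsched_policy fields shown are the same ones the
 * built-in noop policy above fills in:
 *
 *	static struct dsched_policy example_policy = {
 *		.name = "example",
 *		.prepare = example_prepare,
 *		.teardown = example_teardown,
 *		.cancel_all = example_cancel,
 *		.bio_queue = example_queue
 *	};
 *
 *	error = dsched_register(&example_policy);	// on module attach
 *	...
 *	error = dsched_unregister(&example_policy);	// on module detach
 */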
367 
368 /*
369  * Switches the disk's policy by first tearing down the old one and then
370  * enabling the new one.
371  */
372 int
373 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
374 {
375 	struct dsched_policy *old_policy;
376 
377 	/* If we are asked to set the same policy, do nothing */
378 	if (dp->d_sched_policy == new_policy)
379 		return 0;
380 
381 	/* lock everything down, diskwise */
382 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
383 	old_policy = dp->d_sched_policy;
384 
385 	atomic_subtract_int(&old_policy->ref_count, 1);
386 	KKASSERT(old_policy->ref_count >= 0);
387 
388 	dp->d_sched_policy = &dsched_noop_policy;
389 	old_policy->teardown(dsched_get_disk_priv(dp));
390 	policy_destroy(dp);
391 
392 	/* Bring everything back to life */
393 	dsched_set_policy(dp, new_policy);
394 	lockmgr(&dsched_lock, LK_RELEASE);
395 
396 	return 0;
397 }
398 
399 
400 /*
401  * Loads a given policy and attaches it to the specified disk.
402  * Also initializes the scheduler core (diskctx and tdios) for the policy.
403  */
404 void
405 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
406 {
407 	int locked = 0;
408 
409 	/* Check if the dsched lock is already held; if not, acquire it */
410 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
411 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
412 		locked = 1;
413 	}
414 
415 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
416 
417 	policy_new(dp, new_policy);
418 	new_policy->prepare(dsched_get_disk_priv(dp));
419 	dp->d_sched_policy = new_policy;
420 
421 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
422 
423 	atomic_add_int(&new_policy->ref_count, 1);
424 	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
425 	    new_policy->name);
426 
427 	/* If we acquired the lock above, release it again */
428 	if (locked)
429 		lockmgr(&dsched_lock, LK_RELEASE);
430 }
431 
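/*
 * Looks up a registered policy by name; takes the dsched lock if the
 * caller does not already hold it exclusively.
 */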
432 struct dsched_policy*
433 dsched_find_policy(char *search)
434 {
435 	struct dsched_policy *policy;
436 	struct dsched_policy *policy_found = NULL;
437 	int locked = 0;
438 
439 	/* Check if the dsched lock is already held; if not, acquire it */
440 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
441 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
442 		locked = 1;
443 	}
444 
445 	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
446 		if (!strcmp(policy->name, search)) {
447 			policy_found = policy;
448 			break;
449 		}
450 	}
451 
452 	/* If we acquired the lock above, release it again */
453 	if (locked)
454 		lockmgr(&dsched_lock, LK_RELEASE);
455 
456 	return policy_found;
457 }
458 
459 struct disk*
460 dsched_find_disk(char *search)
461 {
462 	struct disk *dp_found = NULL;
463 	struct disk *dp = NULL;
464 
465 	while ((dp = disk_enumerate(dp))) {
466 		if (!strcmp(dp->d_cdev->si_name, search)) {
467 			dp_found = dp;
468 			break;
469 		}
470 	}
471 
472 	return dp_found;
473 }
474 
475 struct disk*
476 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
477 {
478 	while ((dp = disk_enumerate(dp))) {
479 		if (dp->d_sched_policy == policy)
480 			return dp;
481 	}
482 
483 	return NULL;
484 }
485 
486 struct dsched_policy *
487 dsched_policy_enumerate(struct dsched_policy *pol)
488 {
489 	if (!pol)
490 		return (TAILQ_FIRST(&dsched_policy_list));
491 	else
492 		return (TAILQ_NEXT(pol, link));
493 }
494 
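/*
 * Completes a bio with ENXIO without performing any I/O.  Policies can use
 * this to cancel bios they still have queued (e.g. from their cancel_all
 * method).
 */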
495 void
496 dsched_cancel_bio(struct bio *bp)
497 {
498 	bp->bio_buf->b_error = ENXIO;
499 	bp->bio_buf->b_flags |= B_ERROR;
500 	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
501 
502 	biodone(bp);
503 }
504 
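/*
 * Passes a bio straight to the raw device, bypassing any scheduling.
 */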
505 void
506 dsched_strategy_raw(struct disk *dp, struct bio *bp)
507 {
508 	/*
509 	 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
510 	 * to avoid panics
511 	 */
512 	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
513 	if (bp->bio_track != NULL) {
514 		dsched_debug(LOG_INFO,
515 		    "dsched_strategy_raw sees non-NULL bio_track!! "
516 		    "bio: %p\n", bp);
517 		bp->bio_track = NULL;
518 	}
519 	dev_dstrategy(dp->d_rawdev, bp);
520 }
521 
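/*
 * Dispatches a bio synchronously: the request is cloned onto a pbuf, sent
 * to the raw device and waited upon; the residual count and error are
 * copied back before the original bio is completed.
 */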
522 void
523 dsched_strategy_sync(struct disk *dp, struct bio *bio)
524 {
525 	struct buf *bp, *nbp;
526 	struct bio *nbio;
527 
528 	bp = bio->bio_buf;
529 
530 	nbp = getpbuf(NULL);
531 	nbio = &nbp->b_bio1;
532 
533 	nbp->b_cmd = bp->b_cmd;
534 	nbp->b_bufsize = bp->b_bufsize;
535 	nbp->b_runningbufspace = bp->b_runningbufspace;
536 	nbp->b_bcount = bp->b_bcount;
537 	nbp->b_resid = bp->b_resid;
538 	nbp->b_data = bp->b_data;
539 #if 0
540 	/*
541 	 * Buffers undergoing device I/O do not need a kvabase/size.
542 	 */
543 	nbp->b_kvabase = bp->b_kvabase;
544 	nbp->b_kvasize = bp->b_kvasize;
545 #endif
546 	nbp->b_dirtyend = bp->b_dirtyend;
547 
548 	nbio->bio_done = biodone_sync;
549 	nbio->bio_flags |= BIO_SYNC;
550 	nbio->bio_track = NULL;
551 
552 	nbio->bio_caller_info1.ptr = dp;
553 	nbio->bio_offset = bio->bio_offset;
554 
555 	dev_dstrategy(dp->d_rawdev, nbio);
556 	biowait(nbio, "dschedsync");
557 	bp->b_resid = nbp->b_resid;
558 	bp->b_error = nbp->b_error;
559 	biodone(bio);
560 #if 0
561 	nbp->b_kvabase = NULL;
562 	nbp->b_kvasize = 0;
563 #endif
564 	relpbuf(nbp, NULL);
565 }
566 
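/*
 * Dispatches a bio asynchronously with a caller-supplied biodone routine;
 * the disk and the policy's private data are attached to the pushed bio
 * and the dispatch time is recorded in bio_caller_info3.tv.
 */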
567 void
568 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
569 {
570 	struct bio *nbio;
571 
572 	nbio = push_bio(bio);
573 	nbio->bio_done = done;
574 	nbio->bio_offset = bio->bio_offset;
575 
576 	dsched_set_bio_dp(nbio, dp);
577 	dsched_set_bio_priv(nbio, priv);
578 
579 	getmicrotime(&nbio->bio_caller_info3.tv);
580 	dev_dstrategy(dp->d_rawdev, nbio);
581 }
582 
583 /*
584  * A special biodone callback function
585  * used by policies that implement request polling.
586  */
587 static void
588 request_polling_biodone(struct bio *bp)
589 {
590 	struct dsched_disk_ctx *diskctx = NULL;
591 	struct disk *dp = NULL;
592 	struct bio *obio;
593 	struct dsched_policy *policy;
594 
595 	dp = dsched_get_bio_dp(bp);
596 	policy = dp->d_sched_policy;
597 	diskctx = dsched_get_disk_priv(dp);
598 	KKASSERT(diskctx && policy);
599 	dsched_disk_ctx_ref(diskctx);
600 
601 	/*
602 	 * XXX:
603 	 * the bio_done function must not block!
604 	 */
605 	if (diskctx->dp->d_sched_policy->bio_done)
606 		diskctx->dp->d_sched_policy->bio_done(bp);
607 
608 	obio = pop_bio(bp);
609 	biodone(obio);
610 
611 	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
612 
613 	/*
614 	 * Call the polling function.
615 	 * XXX: the polling function must not block!
616 	 */
617 	if (policy->polling_func)
618 		policy->polling_func(diskctx);
619 	else
620 		dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
621 	dsched_disk_ctx_unref(diskctx);
622 }
623 
624 /*
625  * A special dsched strategy used by policies that implement request
626  * polling (i.e. provide a polling function).
627  *
628  * The strategy is just like dsched_strategy_async(), but the biodone
629  * callback is set to a preset one.
630  *
631  * If the policy needs its own biodone callback, it should register it
632  * in the policy structure (bio_done field).
633  *
634  * The current_tag_queue_depth counter is maintained by this function
635  * and by request_polling_biodone().
636  */
637 
638 void
639 dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
640 {
641 	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
642 	dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
643 }
644 
645 /*
646  * Ref and deref various structures.  The 1->0 transition of the reference
647  * count actually transitions 1->0x80000000 and causes the object to be
648  * destroyed.  It is possible for transitory references to occur on the
649  * object while it is being destroyed.  We use bit 31 to indicate that
650  * destruction is in progress and to prevent nested destructions.
651  */
652 void
653 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
654 {
655 	int refcount;
656 
657 	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
658 }
659 
660 void
661 dsched_thread_io_ref(struct dsched_thread_io *tdio)
662 {
663 	int refcount;
664 
665 	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
666 }
667 
668 void
669 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
670 {
671 	int refcount;
672 
673 	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
674 }
675 
676 void
677 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
678 {
679 	int refs;
680 	int nrefs;
681 
682 	/*
683 	 * Handle 1->0 transitions for diskctx and nested destruction
684 	 * recursions.  If the refs are already in destruction mode (bit 31
685 	 * set) on the 1->0 transition we don't try to destruct it again.
686 	 *
687 	 * 0x80000001->0x80000000 transitions are handled normally and
688  * thus avoid nested destruction.
689 	 */
690 	for (;;) {
691 		refs = diskctx->refcount;
692 		cpu_ccfence();
693 		nrefs = refs - 1;
694 
695 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
696 		if (nrefs) {
697 			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
698 				break;
699 			continue;
700 		}
701 		nrefs = 0x80000000;
702 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
703 			dsched_disk_ctx_destroy(diskctx);
704 			break;
705 		}
706 	}
707 }
708 
709 static
710 void
711 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
712 {
713 	struct dsched_thread_io	*tdio;
714 
715 #if 0
716 	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
717 	print_backtrace(4);
718 #endif
719 	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
720 	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
721 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
722 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
723 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
724 		tdio->diskctx = NULL;
725 		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
726 		dsched_thread_io_unref(tdio);
727 	}
728 	lockmgr(&diskctx->lock, LK_RELEASE);
729 	if (diskctx->dp->d_sched_policy->destroy_diskctx)
730 		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
731 	KKASSERT(diskctx->refcount == 0x80000000);
732 	objcache_put(dsched_diskctx_cache, diskctx);
733 	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
734 }
735 
736 void
737 dsched_thread_io_unref(struct dsched_thread_io *tdio)
738 {
739 	int refs;
740 	int nrefs;
741 
742 	/*
743 	 * Handle 1->0 transitions for tdio and nested destruction
744 	 * recursions.  If the refs are already in destruction mode (bit 31
745 	 * set) on the 1->0 transition we don't try to destruct it again.
746 	 *
747 	 * 0x80000001->0x80000000 transitions are handled normally and
748  * thus avoid nested destruction.
749 	 */
750 	for (;;) {
751 		refs = tdio->refcount;
752 		cpu_ccfence();
753 		nrefs = refs - 1;
754 
755 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
756 		if (nrefs) {
757 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
758 				break;
759 			continue;
760 		}
761 		nrefs = 0x80000000;
762 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
763 			dsched_thread_io_destroy(tdio);
764 			break;
765 		}
766 	}
767 }
768 
769 static void
770 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
771 {
772 	struct dsched_thread_ctx *tdctx;
773 	struct dsched_disk_ctx	*diskctx;
774 
775 #if 0
776 	kprintf("tdio (%p) destruction started, trace:\n", tdio);
777 	print_backtrace(8);
778 #endif
779 	KKASSERT(tdio->qlength == 0);
780 
781 	while ((diskctx = tdio->diskctx) != NULL) {
782 		dsched_disk_ctx_ref(diskctx);
783 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
784 		if (diskctx != tdio->diskctx) {
785 			lockmgr(&diskctx->lock, LK_RELEASE);
786 			dsched_disk_ctx_unref(diskctx);
787 			continue;
788 		}
789 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
790 		if (diskctx->dp->d_sched_policy->destroy_tdio)
791 			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
792 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
793 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
794 		tdio->diskctx = NULL;
795 		lockmgr(&diskctx->lock, LK_RELEASE);
796 		dsched_disk_ctx_unref(diskctx);
797 	}
798 	while ((tdctx = tdio->tdctx) != NULL) {
799 		dsched_thread_ctx_ref(tdctx);
800 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
801 		if (tdctx != tdio->tdctx) {
802 			lockmgr(&tdctx->lock, LK_RELEASE);
803 			dsched_thread_ctx_unref(tdctx);
804 			continue;
805 		}
806 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
807 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
808 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
809 		tdio->tdctx = NULL;
810 		lockmgr(&tdctx->lock, LK_RELEASE);
811 		dsched_thread_ctx_unref(tdctx);
812 	}
813 	KKASSERT(tdio->refcount == 0x80000000);
814 	objcache_put(dsched_tdio_cache, tdio);
815 	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
816 #if 0
817 	dsched_disk_ctx_unref(diskctx);
818 #endif
819 }
820 
821 void
822 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
823 {
824 	int refs;
825 	int nrefs;
826 
827 	/*
828 	 * Handle 1->0 transitions for tdctx and nested destruction
829 	 * recursions.  If the refs are already in destruction mode (bit 31
830 	 * set) on the 1->0 transition we don't try to destruct it again.
831 	 *
832 	 * 0x80000001->0x80000000 transitions are handled normally and
833  * thus avoid nested destruction.
834 	 */
835 	for (;;) {
836 		refs = tdctx->refcount;
837 		cpu_ccfence();
838 		nrefs = refs - 1;
839 
840 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
841 		if (nrefs) {
842 			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
843 				break;
844 			continue;
845 		}
846 		nrefs = 0x80000000;
847 		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
848 			dsched_thread_ctx_destroy(tdctx);
849 			break;
850 		}
851 	}
852 }
853 
854 static void
855 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
856 {
857 	struct dsched_thread_io	*tdio;
858 
859 #if 0
860 	kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
861 	print_backtrace(8);
862 #endif
863 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
864 
865 	lockmgr(&tdctx->lock, LK_EXCLUSIVE);
866 
867 	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
868 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
869 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
870 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
871 		tdio->tdctx = NULL;
872 		dsched_thread_io_unref(tdio);
873 	}
874 	KKASSERT(tdctx->refcount == 0x80000000);
875 	TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
876 
877 	lockmgr(&tdctx->lock, LK_RELEASE);
878 
879 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
880 
881 	objcache_put(dsched_tdctx_cache, tdctx);
882 	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
883 }
884 
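/*
 * Allocates and initializes a tdio for the given disk and (optionally)
 * thread context, links it onto the diskctx's and tdctx's tdio lists and
 * lets the policy initialize its private part via new_tdio().
 */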
885 struct dsched_thread_io *
886 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
887     struct dsched_policy *pol)
888 {
889 	struct dsched_thread_io	*tdio;
890 #if 0
891 	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
892 #endif
893 	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
894 	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
895 
896 	/* XXX: maybe we do need another ref for the disk list for tdio */
897 	dsched_thread_io_ref(tdio);
898 
899 	DSCHED_THREAD_IO_LOCKINIT(tdio);
900 	tdio->dp = dp;
901 
902 	tdio->diskctx = dsched_get_disk_priv(dp);
903 	TAILQ_INIT(&tdio->queue);
904 
905 	if (pol->new_tdio)
906 		pol->new_tdio(tdio);
907 
908 	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
909 	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
910 	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
911 	lockmgr(&tdio->diskctx->lock, LK_RELEASE);
912 
913 	if (tdctx) {
914 		tdio->tdctx = tdctx;
915 		tdio->p = tdctx->p;
916 
917 		/* Put the tdio in the tdctx list */
918 		DSCHED_THREAD_CTX_LOCK(tdctx);
919 		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
920 		DSCHED_THREAD_CTX_UNLOCK(tdctx);
921 		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
922 	}
923 
924 	tdio->debug_policy = pol;
925 	tdio->debug_inited = 0xF00F1234;
926 
927 	atomic_add_int(&dsched_stats.tdio_allocations, 1);
928 	return tdio;
929 }
930 
931 
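/*
 * Allocates and initializes the per-disk scheduler context and lets the
 * policy initialize its private part via new_diskctx().
 */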
932 struct dsched_disk_ctx *
933 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
934 {
935 	struct dsched_disk_ctx *diskctx;
936 
937 	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
938 	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
939 	dsched_disk_ctx_ref(diskctx);
940 	diskctx->dp = dp;
941 	DSCHED_DISK_CTX_LOCKINIT(diskctx);
942 	TAILQ_INIT(&diskctx->tdio_list);
943 	/*
944 	 * XXX: magic number 32: most devices have a tag queue
945 	 * of depth 32.
946 	 * Better to retrieve a more precise value from the driver.
947 	 */
948 	diskctx->max_tag_queue_depth = 32;
949 	diskctx->current_tag_queue_depth = 0;
950 
951 	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
952 	if (pol->new_diskctx)
953 		pol->new_diskctx(diskctx);
954 	return diskctx;
955 }
956 
957 
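/*
 * Allocates a per-thread/process scheduler context and pre-allocates a
 * tdio for every disk currently known to the system.
 */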
958 struct dsched_thread_ctx *
959 dsched_thread_ctx_alloc(struct proc *p)
960 {
961 	struct dsched_thread_ctx	*tdctx;
962 	struct dsched_thread_io	*tdio;
963 	struct disk	*dp = NULL;
964 
965 	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
966 	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
967 	dsched_thread_ctx_ref(tdctx);
968 #if 0
969 	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
970 #endif
971 	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
972 	TAILQ_INIT(&tdctx->tdio_list);
973 	tdctx->p = p;
974 
975 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
976 	while ((dp = disk_enumerate(dp))) {
977 		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
978 	}
979 
980 	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
981 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
982 
983 	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
984 	/* XXX: no callback here */
985 	return tdctx;
986 }
987 
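/*
 * Attaches a freshly allocated diskctx to the disk and creates a tdio on
 * it for every existing thread context.
 */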
988 void
989 policy_new(struct disk *dp, struct dsched_policy *pol) {
990 	struct dsched_thread_ctx *tdctx;
991 	struct dsched_disk_ctx *diskctx;
992 	struct dsched_thread_io *tdio;
993 
994 	diskctx = dsched_disk_ctx_alloc(dp, pol);
995 	dsched_disk_ctx_ref(diskctx);
996 	dsched_set_disk_priv(dp, diskctx);
997 
998 	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
999 		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
1000 	}
1001 }
1002 
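/*
 * Drops the diskctx references taken when the policy was attached and
 * detaches the diskctx from the disk.
 */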
1003 void
1004 policy_destroy(struct disk *dp) {
1005 	struct dsched_disk_ctx *diskctx;
1006 
1007 	diskctx = dsched_get_disk_priv(dp);
1008 	KKASSERT(diskctx != NULL);
1009 
1010 	dsched_disk_ctx_unref(diskctx); /* from prepare */
1011 	dsched_disk_ctx_unref(diskctx); /* from alloc */
1012 
1013 	dsched_set_disk_priv(dp, NULL);
1014 }
1015 
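/*
 * Called for each new buf: tags it with the issuing thread's dsched
 * context (if any) so that dsched_queue() can later find the matching
 * tdio.
 */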
1016 void
1017 dsched_new_buf(struct buf *bp)
1018 {
1019 	struct dsched_thread_ctx	*tdctx = NULL;
1020 
1021 	if (dsched_inited == 0)
1022 		return;
1023 
1024 	if (curproc != NULL) {
1025 		tdctx = dsched_get_proc_priv(curproc);
1026 	} else {
1027 		/* This is a kernel thread, so no proc info is available */
1028 		tdctx = dsched_get_thread_priv(curthread);
1029 	}
1030 
1031 #if 0
1032 	/*
1033 	 * XXX: hack. we don't want this assert because we aren't catching all
1034 	 *	threads. mi_startup() is still getting away without a tdctx.
1035 	 */
1036 
1037 	/* By now we should have a tdctx; if not, something bad is going on */
1038 	KKASSERT(tdctx != NULL);
1039 #endif
1040 
1041 	if (tdctx) {
1042 		dsched_thread_ctx_ref(tdctx);
1043 	}
1044 	dsched_set_buf_priv(bp, tdctx);
1045 }
1046 
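/*
 * Releases the thread context reference taken in dsched_new_buf() when a
 * buf is torn down.
 */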
1047 void
1048 dsched_exit_buf(struct buf *bp)
1049 {
1050 	struct dsched_thread_ctx	*tdctx;
1051 
1052 	tdctx = dsched_get_buf_priv(bp);
1053 	if (tdctx != NULL) {
1054 		dsched_clr_buf_priv(bp);
1055 		dsched_thread_ctx_unref(tdctx);
1056 	}
1057 }
1058 
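/*
 * Process and thread lifecycle hooks: dsched_new_proc()/dsched_new_thread()
 * allocate and attach a dsched thread context; dsched_exit_proc()/
 * dsched_exit_thread() mark it dead and drop its references.
 */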
1059 void
1060 dsched_new_proc(struct proc *p)
1061 {
1062 	struct dsched_thread_ctx	*tdctx;
1063 
1064 	if (dsched_inited == 0)
1065 		return;
1066 
1067 	KKASSERT(p != NULL);
1068 
1069 	tdctx = dsched_thread_ctx_alloc(p);
1070 	tdctx->p = p;
1071 	dsched_thread_ctx_ref(tdctx);
1072 
1073 	dsched_set_proc_priv(p, tdctx);
1074 	atomic_add_int(&dsched_stats.nprocs, 1);
1075 }
1076 
1077 
1078 void
1079 dsched_new_thread(struct thread *td)
1080 {
1081 	struct dsched_thread_ctx	*tdctx;
1082 
1083 	if (dsched_inited == 0)
1084 		return;
1085 
1086 	KKASSERT(td != NULL);
1087 
1088 	tdctx = dsched_thread_ctx_alloc(NULL);
1089 	tdctx->td = td;
1090 	dsched_thread_ctx_ref(tdctx);
1091 
1092 	dsched_set_thread_priv(td, tdctx);
1093 	atomic_add_int(&dsched_stats.nthreads, 1);
1094 }
1095 
1096 void
1097 dsched_exit_proc(struct proc *p)
1098 {
1099 	struct dsched_thread_ctx	*tdctx;
1100 
1101 	if (dsched_inited == 0)
1102 		return;
1103 
1104 	KKASSERT(p != NULL);
1105 
1106 	tdctx = dsched_get_proc_priv(p);
1107 	KKASSERT(tdctx != NULL);
1108 
1109 	tdctx->dead = 0xDEAD;
1110 	dsched_set_proc_priv(p, NULL);
1111 
1112 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1113 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1114 	atomic_subtract_int(&dsched_stats.nprocs, 1);
1115 }
1116 
1117 
1118 void
1119 dsched_exit_thread(struct thread *td)
1120 {
1121 	struct dsched_thread_ctx	*tdctx;
1122 
1123 	if (dsched_inited == 0)
1124 		return;
1125 
1126 	KKASSERT(td != NULL);
1127 
1128 	tdctx = dsched_get_thread_priv(td);
1129 	KKASSERT(tdctx != NULL);
1130 
1131 	tdctx->dead = 0xDEAD;
1132 	dsched_set_thread_priv(td, 0);
1133 
1134 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1135 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1136 	atomic_subtract_int(&dsched_stats.nthreads, 1);
1137 }
1138 
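/*
 * Helper for policies that issue I/O from their own kernel threads:
 * allocates a tdio for the current thread on the given disk.
 */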
1139 struct dsched_thread_io *
1140 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1141     struct dsched_policy *pol) {
1142 	struct dsched_thread_ctx *tdctx;
1143 	struct dsched_thread_io *tdio;
1144 
1145 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
1146 
1147 	tdctx = dsched_get_thread_priv(curthread);
1148 	KKASSERT(tdctx != NULL);
1149 	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1150 
1151 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1152 
1153 	return tdio;
1154 }
1155 
1156 /* DEFAULT NOOP POLICY */
1157 
1158 static int
1159 noop_prepare(struct dsched_disk_ctx *diskctx)
1160 {
1161 	return 0;
1162 }
1163 
1164 static void
1165 noop_teardown(struct dsched_disk_ctx *diskctx)
1166 {
1167 
1168 }
1169 
1170 static void
1171 noop_cancel(struct dsched_disk_ctx *diskctx)
1172 {
1173 
1174 }
1175 
1176 static int
1177 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1178     struct bio *bio)
1179 {
1180 	dsched_strategy_raw(diskctx->dp, bio);
1181 #if 0
1182 	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
1183 #endif
1184 	return 0;
1185 }
1186 
1187 /*
1188  * SYSINIT stuff
1189  */
1190 static void
1191 dsched_init(void)
1192 {
1193 	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1194 					   NULL, NULL, NULL,
1195 					   objcache_malloc_alloc,
1196 					   objcache_malloc_free,
1197 					   &dsched_thread_io_malloc_args );
1198 
1199 	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1200 					   NULL, NULL, NULL,
1201 					   objcache_malloc_alloc,
1202 					   objcache_malloc_free,
1203 					   &dsched_thread_ctx_malloc_args );
1204 
1205 	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1206 					   NULL, NULL, NULL,
1207 					   objcache_malloc_alloc,
1208 					   objcache_malloc_free,
1209 					   &dsched_disk_ctx_malloc_args );
1210 
1211 	bzero(&dsched_stats, sizeof(struct dsched_stats));
1212 
1213 	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1214 	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1215 
1216 	dsched_register(&dsched_noop_policy);
1217 
1218 	dsched_inited = 1;
1219 }
1220 
1221 static void
1222 dsched_uninit(void)
1223 {
1224 }
1225 
1226 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1227 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
1228 
1229 /*
1230  * SYSCTL stuff
1231  */
1232 static int
1233 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1234 {
1235 	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1236 }
1237 
1238 static int
1239 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1240 {
1241 	struct dsched_policy *pol = NULL;
1242 	int error, first = 1;
1243 
1244 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1245 
1246 	while ((pol = dsched_policy_enumerate(pol))) {
1247 		if (!first) {
1248 			error = SYSCTL_OUT(req, " ", 1);
1249 			if (error)
1250 				break;
1251 		} else {
1252 			first = 0;
1253 		}
1254 		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1255 		if (error)
1256 			break;
1257 
1258 	}
1259 
1260 	lockmgr(&dsched_lock, LK_RELEASE);
1261 
1262 	error = SYSCTL_OUT(req, "", 1);
1263 
1264 	return error;
1265 }
1266 
1267 static int
1268 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1269 {
1270 	char buf[DSCHED_POLICY_NAME_LENGTH];
1271 	struct dsched_disk_ctx *diskctx = arg1;
1272 	struct dsched_policy *pol = NULL;
1273 	int error;
1274 
1275 	if (diskctx == NULL) {
1276 		return 0;
1277 	}
1278 
1279 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1280 
1281 	pol = diskctx->dp->d_sched_policy;
1282 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1283 
1284 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1285 	if (error || req->newptr == NULL) {
1286 		lockmgr(&dsched_lock, LK_RELEASE);
1287 		return (error);
1288 	}
1289 
1290 	pol = dsched_find_policy(buf);
1291 	if (pol == NULL) {
1292 		lockmgr(&dsched_lock, LK_RELEASE);
1293 		return 0;
1294 	}
1295 
1296 	dsched_switch(diskctx->dp, pol);
1297 
1298 	lockmgr(&dsched_lock, LK_RELEASE);
1299 
1300 	return error;
1301 }
1302 
1303 static int
1304 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1305 {
1306 	char buf[DSCHED_POLICY_NAME_LENGTH];
1307 	struct dsched_policy *pol = NULL;
1308 	int error;
1309 
1310 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1311 
1312 	pol = default_policy;
1313 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1314 
1315 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1316 	if (error || req->newptr == NULL) {
1317 		lockmgr(&dsched_lock, LK_RELEASE);
1318 		return (error);
1319 	}
1320 
1321 	pol = dsched_find_policy(buf);
1322 	if (pol == NULL) {
1323 		lockmgr(&dsched_lock, LK_RELEASE);
1324 		return 0;
1325 	}
1326 
1327 	default_set = 1;
1328 	default_policy = pol;
1329 
1330 	lockmgr(&dsched_lock, LK_RELEASE);
1331 
1332 	return error;
1333 }
1334 
1335 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1336     "Disk Scheduler Framework (dsched) magic");
1337 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1338     "List of disks and their policies");
1339 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1340     0, "Enable dsched debugging");
1341 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1342     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1343     "dsched statistics");
1344 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1345     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1346 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1347     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
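
/*
 * Illustrative sysctl usage (disk and policy names depend on the system
 * and on which policy modules are loaded):
 *
 *	sysctl dsched.policies			# list registered policies
 *	sysctl dsched.policy.default=noop	# set the default policy
 *	sysctl dsched.policy.ad0=noop		# switch one disk's policy
 */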
1348 
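/*
 * Creates the per-disk sysctl context on first use and adds a
 * dsched.policy.<disk name> node through which the disk's current policy
 * can be read and changed at runtime.
 */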
1349 static void
1350 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1351 {
1352 	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1353 		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1354 		sysctl_ctx_init(&diskctx->sysctl_ctx);
1355 	}
1356 
1357 	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1358 	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1359 	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
1360 }
1361