xref: /dragonfly/sys/kern/kern_dsched.c (revision 38b930d0)
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
50 #include <sys/buf2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
54 
55 TAILQ_HEAD(tdio_list_head, dsched_thread_io);
56 
57 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
58 
59 static dsched_prepare_t		noop_prepare;
60 static dsched_teardown_t	noop_teardown;
61 static dsched_cancel_t		noop_cancel;
62 static dsched_queue_t		noop_queue;
63 
64 static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
65 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
66 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
67 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
68 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
69 
70 static struct dsched_thread_io *dsched_thread_io_alloc(
71 		struct disk *dp, struct dsched_thread_ctx *tdctx,
72 		struct dsched_policy *pol, int tdctx_locked);
73 
74 static int	dsched_inited = 0;
75 static int	default_set = 0;
76 
77 struct lock	dsched_lock;
78 static int	dsched_debug_enable = 0;
79 
80 struct dsched_stats	dsched_stats;
81 
82 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
83 	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
84 struct objcache_malloc_args dsched_thread_io_malloc_args = {
85 	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
86 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
87 	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
88 
89 static struct objcache	*dsched_diskctx_cache;
90 static struct objcache	*dsched_tdctx_cache;
91 static struct objcache	*dsched_tdio_cache;
92 
93 struct lock	dsched_tdctx_lock;
94 
95 static struct dsched_policy_head dsched_policy_list =
96 		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
97 
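/*
 * The built-in noop policy below implements only the mandatory policy
 * methods (prepare, teardown, cancel_all, bio_queue).  The optional
 * hooks referenced elsewhere in this file (new_tdio, destroy_tdio,
 * new_diskctx, destroy_diskctx, bio_done, polling_func) are
 * NULL-checked before use and may be left unset, as they are here.
 */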
98 static struct dsched_policy dsched_noop_policy = {
99 	.name = "noop",
100 
101 	.prepare = noop_prepare,
102 	.teardown = noop_teardown,
103 	.cancel_all = noop_cancel,
104 	.bio_queue = noop_queue
105 };
106 
107 static struct dsched_policy *default_policy = &dsched_noop_policy;
108 
109 /*
110  * dsched_debug() is a SYSCTL- and TUNABLE-controlled debug output
111  * function using kvprintf().
112  */
113 int
114 dsched_debug(int level, char *fmt, ...)
115 {
116 	__va_list ap;
117 
118 	__va_start(ap, fmt);
119 	if (level <= dsched_debug_enable)
120 		kvprintf(fmt, ap);
121 	__va_end(ap);
122 
123 	return 0;
124 }
125 
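/*
 * Illustrative usage: debug output can be enabled at run time through
 * the dsched.debug sysctl declared near the end of this file, e.g.
 *
 *	sysctl dsched.debug=4
 *
 * Messages whose level is less than or equal to the configured value
 * are printed.
 */
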
126 /*
127  * Called on disk_create().
128  * Tries to read which policy to use from loader.conf; if none is
129  * specified, the default policy is used.
130  */
131 void
132 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
133 {
134 	char tunable_key[SPECNAMELEN + 48];
135 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
136 	char *ptr;
137 	struct dsched_policy *policy = NULL;
138 
139 	/* Also look for serno stuff? */
140 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
141 
142 	ksnprintf(tunable_key, sizeof(tunable_key),
143 		  "dsched.policy.%s%d", head_name, unit);
144 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
145 	    sizeof(sched_policy)) != 0) {
146 		policy = dsched_find_policy(sched_policy);
147 	}
148 
149 	ksnprintf(tunable_key, sizeof(tunable_key),
150 		  "dsched.policy.%s", head_name);
151 
152 	for (ptr = tunable_key; *ptr; ptr++) {
153 		if (*ptr == '/')
154 			*ptr = '-';
155 	}
156 	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
157 	    sizeof(sched_policy)) != 0)) {
158 		policy = dsched_find_policy(sched_policy);
159 	}
160 
161 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
162 	if (!policy && !default_set &&
163 	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
164 			       sizeof(sched_policy)) != 0)) {
165 		policy = dsched_find_policy(sched_policy);
166 	}
167 
168 	if (!policy) {
169 		if (!default_set && bootverbose) {
170 			dsched_debug(0,
171 				     "No policy for %s%d specified, "
172 				     "or policy not found\n",
173 				     head_name, unit);
174 		}
175 		dsched_set_policy(dp, default_policy);
176 	} else {
177 		dsched_set_policy(dp, policy);
178 	}
179 
180 	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
181 		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
182 	else
183 		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
184 	for (ptr = tunable_key; *ptr; ptr++) {
185 		if (*ptr == '/')
186 			*ptr = '-';
187 	}
188 	dsched_sysctl_add_disk(
189 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
190 	    tunable_key);
191 
192 	lockmgr(&dsched_lock, LK_RELEASE);
193 }
194 
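/*
 * Illustrative loader.conf usage for the tunables scanned above; the
 * device and policy names are examples only:
 *
 *	dsched.policy.da0="fq"		(policy for one specific disk)
 *	dsched.policy.da="fq"		(policy for a whole head name)
 *	dsched.policy.default="fq"	(fallback for all disks)
 *
 * Slashes in device names (e.g. mapper/foo) are converted to dashes
 * when the tunable/sysctl name is formed.
 */
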
195 /*
196  * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
197  * there's any policy associated with the serial number of the device.
198  */
199 void
200 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
201 {
202 	char tunable_key[SPECNAMELEN + 48];
203 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
204 	struct dsched_policy *policy = NULL;
205 
206 	if (info->d_serialno == NULL)
207 		return;
208 
209 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
210 
211 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
212 	    info->d_serialno);
213 
214 	if((TUNABLE_STR_FETCH(tunable_key, sched_policy,
215 	    sizeof(sched_policy)) != 0)) {
216 		policy = dsched_find_policy(sched_policy);
217 	}
218 
219 	if (policy) {
220 		dsched_switch(dp, policy);
221 	}
222 
223 	dsched_sysctl_add_disk(
224 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
225 	    info->d_serialno);
226 
227 	lockmgr(&dsched_lock, LK_RELEASE);
228 }
229 
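/*
 * Illustrative: a policy can also be bound to a drive serial number,
 * e.g. in loader.conf (serial number and policy name are examples
 * only):
 *
 *	dsched.policy.S1ATNSAF123456B="fq"
 */
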
230 /*
231  * Called on disk_destroy().
232  * Shuts down the scheduler core and cancels all remaining bios.
233  */
234 void
235 dsched_disk_destroy_callback(struct disk *dp)
236 {
237 	struct dsched_policy *old_policy;
238 	struct dsched_disk_ctx *diskctx;
239 
240 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
241 
242 	diskctx = dsched_get_disk_priv(dp);
243 
244 	old_policy = dp->d_sched_policy;
245 	dp->d_sched_policy = &dsched_noop_policy;
246 	old_policy->cancel_all(dsched_get_disk_priv(dp));
247 	old_policy->teardown(dsched_get_disk_priv(dp));
248 
249 	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
250 		sysctl_ctx_free(&diskctx->sysctl_ctx);
251 
252 	policy_destroy(dp);
253 	atomic_subtract_int(&old_policy->ref_count, 1);
254 	KKASSERT(old_policy->ref_count >= 0);
255 
256 	lockmgr(&dsched_lock, LK_RELEASE);
257 }
258 
259 
260 /*
261  * Caller must have dp->diskctx locked
262  */
263 void
264 dsched_queue(struct disk *dp, struct bio *bio)
265 {
266 	struct dsched_thread_ctx	*tdctx;
267 	struct dsched_thread_io		*tdio;
268 	struct dsched_disk_ctx		*diskctx;
269 	int	found;
270 	int	error;
271 
272 	if (dp->d_sched_policy == &dsched_noop_policy) {
273 		dsched_clr_buf_priv(bio->bio_buf);
274 		atomic_add_int(&dsched_stats.no_tdctx, 1);
275 		dsched_strategy_raw(dp, bio);
276 		return;
277 	}
278 
279 	found = 0;
280 	error = 0;
281 	tdctx = dsched_get_buf_priv(bio->bio_buf);
282 	if (tdctx == NULL) {
283 		/* We don't handle this case, let dsched dispatch */
284 		atomic_add_int(&dsched_stats.no_tdctx, 1);
285 		dsched_strategy_raw(dp, bio);
286 		return;
287 	}
288 
289 	DSCHED_THREAD_CTX_LOCK(tdctx);
290 
291 	/*
292 	 * XXX:
293 	 * iterate in reverse to make sure we find the most up-to-date
294 	 * tdio for a given disk. After a switch it may take some time
295 	 * for everything to clean up.
296 	 */
297 	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
298 		if (tdio->dp == dp) {
299 			dsched_thread_io_ref(tdio);
300 			break;
301 		}
302 	}
303 	if (tdio == NULL) {
304 		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy, 1);
305 		dsched_thread_io_ref(tdio);
306 	}
307 
308 	DSCHED_THREAD_CTX_UNLOCK(tdctx);
309 	dsched_clr_buf_priv(bio->bio_buf);
310 	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
311 
312 	KKASSERT(tdio != NULL);
313 	diskctx = dsched_get_disk_priv(dp);
314 	dsched_disk_ctx_ref(diskctx);
315 
316 	if (dp->d_sched_policy != &dsched_noop_policy)
317 		KKASSERT(tdio->debug_policy == dp->d_sched_policy);
318 
319 	KKASSERT(tdio->debug_inited == 0xF00F1234);
320 
321 	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
322 
323 	if (error) {
324 		dsched_strategy_raw(dp, bio);
325 	}
326 	dsched_disk_ctx_unref(diskctx);
327 	dsched_thread_io_unref(tdio);
328 }
329 
330 
331 /*
332  * Called from module_init or module_attach of each policy;
333  * registers the policy in the local policy list.
334  */
335 int
336 dsched_register(struct dsched_policy *d_policy)
337 {
338 	struct dsched_policy *policy;
339 	int error = 0;
340 
341 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
342 
343 	policy = dsched_find_policy(d_policy->name);
344 
345 	if (!policy) {
346 		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
347 		atomic_add_int(&d_policy->ref_count, 1);
348 	} else {
349 		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
350 		    d_policy->name);
351 		error = EEXIST;
352 	}
353 
354 	lockmgr(&dsched_lock, LK_RELEASE);
355 	return error;
356 }
357 
358 /*
359  * Called from module_detach of each policy;
360  * unregisters the policy.
361  */
362 int
363 dsched_unregister(struct dsched_policy *d_policy)
364 {
365 	struct dsched_policy *policy;
366 
367 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
368 	policy = dsched_find_policy(d_policy->name);
369 
370 	if (policy) {
371 		if (policy->ref_count > 1) {
372 			lockmgr(&dsched_lock, LK_RELEASE);
373 			return EBUSY;
374 		}
375 		TAILQ_REMOVE(&dsched_policy_list, policy, link);
376 		atomic_subtract_int(&policy->ref_count, 1);
377 		KKASSERT(policy->ref_count == 0);
378 	}
379 	lockmgr(&dsched_lock, LK_RELEASE);
380 
381 	return 0;
382 }
383 
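/*
 * Illustrative registration from a policy module's event handler; the
 * function and policy names are hypothetical:
 *
 *	case MOD_LOAD:
 *		error = dsched_register(&mypol_policy);
 *		break;
 *	case MOD_UNLOAD:
 *		error = dsched_unregister(&mypol_policy);
 *		break;
 */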
384 
385 /*
386  * Switches the policy by first removing the old one and then
387  * enabling the new one.
388  */
389 int
390 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
391 {
392 	struct dsched_policy *old_policy;
393 
394 	/* If we are asked to set the same policy, do nothing */
395 	if (dp->d_sched_policy == new_policy)
396 		return 0;
397 
398 	/* lock everything down, diskwise */
399 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
400 	old_policy = dp->d_sched_policy;
401 
402 	atomic_subtract_int(&old_policy->ref_count, 1);
403 	KKASSERT(old_policy->ref_count >= 0);
404 
405 	dp->d_sched_policy = &dsched_noop_policy;
406 	old_policy->teardown(dsched_get_disk_priv(dp));
407 	policy_destroy(dp);
408 
409 	/* Bring everything back to life */
410 	dsched_set_policy(dp, new_policy);
411 	lockmgr(&dsched_lock, LK_RELEASE);
412 
413 	return 0;
414 }
415 
416 
417 /*
418  * Loads a given policy and attaches it to the specified disk.
419  * Also initializes the core for the policy
420  */
421 void
422 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
423 {
424 	int locked = 0;
425 
426 	/* Check if it is locked already; if not, acquire the dsched lock */
427 	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
428 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
429 		locked = 1;
430 	}
431 
432 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
433 
434 	policy_new(dp, new_policy);
435 	new_policy->prepare(dsched_get_disk_priv(dp));
436 	dp->d_sched_policy = new_policy;
437 	atomic_add_int(&new_policy->ref_count, 1);
438 
439 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
440 
441 	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
442 	    new_policy->name);
443 
444 	/* If we acquired the lock, we also get rid of it */
445 	if (locked)
446 		lockmgr(&dsched_lock, LK_RELEASE);
447 }
448 
449 struct dsched_policy*
450 dsched_find_policy(char *search)
451 {
452 	struct dsched_policy *policy;
453 	struct dsched_policy *policy_found = NULL;
454 	int locked = 0;
455 
456 	/* Check if it is locked already; if not, acquire the dsched lock */
457 	if ((lockstatus(&dsched_lock, curthread)) != LK_EXCLUSIVE) {
458 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
459 		locked = 1;
460 	}
461 
462 	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
463 		if (!strcmp(policy->name, search)) {
464 			policy_found = policy;
465 			break;
466 		}
467 	}
468 
469 	/* If we acquired the lock, we also get rid of it */
470 	if (locked)
471 		lockmgr(&dsched_lock, LK_RELEASE);
472 
473 	return policy_found;
474 }
475 
476 /*
477  * Returns ref'd disk
478  */
479 struct disk *
480 dsched_find_disk(char *search)
481 {
482 	struct disk marker;
483 	struct disk *dp = NULL;
484 
485 	while ((dp = disk_enumerate(&marker, dp)) != NULL) {
486 		if (strcmp(dp->d_cdev->si_name, search) == 0) {
487 			disk_enumerate_stop(&marker, NULL);
488 			/* leave ref on dp */
489 			break;
490 		}
491 	}
492 	return dp;
493 }
494 
495 struct disk *
496 dsched_disk_enumerate(struct disk *marker, struct disk *dp,
497 		      struct dsched_policy *policy)
498 {
499 	while ((dp = disk_enumerate(marker, dp)) != NULL) {
500 		if (dp->d_sched_policy == policy)
501 			break;
502 	}
503 	return dp;
504 }
505 
506 struct dsched_policy *
507 dsched_policy_enumerate(struct dsched_policy *pol)
508 {
509 	if (!pol)
510 		return (TAILQ_FIRST(&dsched_policy_list));
511 	else
512 		return (TAILQ_NEXT(pol, link));
513 }
514 
515 void
516 dsched_cancel_bio(struct bio *bp)
517 {
518 	bp->bio_buf->b_error = ENXIO;
519 	bp->bio_buf->b_flags |= B_ERROR;
520 	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
521 
522 	biodone(bp);
523 }
524 
525 void
526 dsched_strategy_raw(struct disk *dp, struct bio *bp)
527 {
528 	/*
529 	 * Ideally, this stuff shouldn't be needed... but just in case, we leave it in
530 	 * to avoid panics
531 	 */
532 	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
533 	if(bp->bio_track != NULL) {
534 		dsched_debug(LOG_INFO,
535 		    "dsched_strategy_raw sees non-NULL bio_track!! "
536 		    "bio: %p\n", bp);
537 		bp->bio_track = NULL;
538 	}
539 	dev_dstrategy(dp->d_rawdev, bp);
540 }
541 
542 void
543 dsched_strategy_sync(struct disk *dp, struct bio *bio)
544 {
545 	struct buf *bp, *nbp;
546 	struct bio *nbio;
547 
548 	bp = bio->bio_buf;
549 
550 	nbp = getpbuf(NULL);
551 	nbio = &nbp->b_bio1;
552 
553 	nbp->b_cmd = bp->b_cmd;
554 	nbp->b_bufsize = bp->b_bufsize;
555 	nbp->b_runningbufspace = bp->b_runningbufspace;
556 	nbp->b_bcount = bp->b_bcount;
557 	nbp->b_resid = bp->b_resid;
558 	nbp->b_data = bp->b_data;
559 #if 0
560 	/*
561 	 * Buffers undergoing device I/O do not need a kvabase/size.
562 	 */
563 	nbp->b_kvabase = bp->b_kvabase;
564 	nbp->b_kvasize = bp->b_kvasize;
565 #endif
566 	nbp->b_dirtyend = bp->b_dirtyend;
567 
568 	nbio->bio_done = biodone_sync;
569 	nbio->bio_flags |= BIO_SYNC;
570 	nbio->bio_track = NULL;
571 
572 	nbio->bio_caller_info1.ptr = dp;
573 	nbio->bio_offset = bio->bio_offset;
574 
575 	dev_dstrategy(dp->d_rawdev, nbio);
576 	biowait(nbio, "dschedsync");
577 	bp->b_resid = nbp->b_resid;
578 	bp->b_error = nbp->b_error;
579 	biodone(bio);
580 #if 0
581 	nbp->b_kvabase = NULL;
582 	nbp->b_kvasize = 0;
583 #endif
584 	relpbuf(nbp, NULL);
585 }
586 
587 void
588 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
589 {
590 	struct bio *nbio;
591 
592 	nbio = push_bio(bio);
593 	nbio->bio_done = done;
594 	nbio->bio_offset = bio->bio_offset;
595 
596 	dsched_set_bio_dp(nbio, dp);
597 	dsched_set_bio_priv(nbio, priv);
598 
599 	getmicrotime(&nbio->bio_caller_info3.tv);
600 	dev_dstrategy(dp->d_rawdev, nbio);
601 }
602 
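/*
 * Illustrative use of dsched_strategy_async() from a policy's
 * bio_queue method; all mypol_* names are hypothetical:
 *
 *	static int
 *	mypol_queue(struct dsched_disk_ctx *diskctx,
 *		    struct dsched_thread_io *tdio, struct bio *bio)
 *	{
 *		dsched_strategy_async(diskctx->dp, bio, mypol_completed, tdio);
 *		return 0;
 *	}
 *
 * The mypol_completed() biodone callback is expected to pop_bio() the
 * pushed bio and biodone() the original, as request_polling_biodone()
 * does below.
 */
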
603 /*
604  * A special biodone callback function
605  * used by policies that implement request polling.
606  */
607 static void
608 request_polling_biodone(struct bio *bp)
609 {
610 	struct dsched_disk_ctx *diskctx = NULL;
611 	struct disk *dp = NULL;
612 	struct bio *obio;
613 	struct dsched_policy *policy;
614 
615 	dp = dsched_get_bio_dp(bp);
616 	policy = dp->d_sched_policy;
617 	diskctx = dsched_get_disk_priv(dp);
618 	KKASSERT(diskctx && policy);
619 	dsched_disk_ctx_ref(diskctx);
620 
621 	/*
622 	 * XXX:
623  * the bio_done function must not block!
624 	 */
625 	if (diskctx->dp->d_sched_policy->bio_done)
626 		diskctx->dp->d_sched_policy->bio_done(bp);
627 
628 	obio = pop_bio(bp);
629 	biodone(obio);
630 
631 	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);
632 
633 	/*
634 	 * Call the polling function.
635 	 * XXX: the polling function must not block!
636 	 */
637 	if (policy->polling_func)
638 		policy->polling_func(diskctx);
639 	else
640 		dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
641 	dsched_disk_ctx_unref(diskctx);
642 }
643 
644 /*
645  * A special dsched strategy used by policies that implement request
646  * polling (i.e. provide a polling function).
647  *
648  * The strategy is just like dsched_strategy_async(), but the biodone
649  * callback is set to a preset one.
650  *
651  * If the policy needs its own biodone callback, it should register
652  * it in the policy structure (bio_done field).
653  *
654  * The current_tag_queue_depth is maintained by this function and by
655  * the request_polling_biodone() function.
656  */
657 
658 void
659 dsched_strategy_request_polling(struct disk *dp, struct bio *bio, struct dsched_disk_ctx *diskctx)
660 {
661 	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
662 	dsched_strategy_async(dp, bio, request_polling_biodone, dsched_get_bio_priv(bio));
663 }
664 
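/*
 * Illustrative policy-side setup for request polling; the policy and
 * function names are hypothetical and the mandatory methods are
 * omitted for brevity:
 *
 *	static struct dsched_policy mypol_policy = {
 *		.name = "mypol",
 *		.bio_done = mypol_bio_done,	(optional completion hook)
 *		.polling_func = mypol_poll,	(dispatches queued bios)
 *	};
 */
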
665 /*
666  * Ref and deref various structures.  The 1->0 transition of the reference
667  * count actually transitions 1->0x80000000 and causes the object to be
668  * destroyed.  It is possible for transitory references to occur on the
669  * object while it is being destroyed.  We use bit 31 to indicate that
670  * destruction is in progress and to prevent nested destructions.
671  */
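/*
 * Worked example of the refcount encoding used below (illustrative):
 *
 *	2		two live references
 *	1		last reference; the next unref performs the
 *			1 -> 0x80000000 transition and destroys the object
 *	0x80000001	destruction in progress plus one transitory ref;
 *			releasing that ref is a plain decrement back to
 *			0x80000000 and does not destroy the object again
 */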
672 void
673 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
674 {
675 	int refcount;
676 
677 	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
678 }
679 
680 void
681 dsched_thread_io_ref(struct dsched_thread_io *tdio)
682 {
683 	int refcount;
684 
685 	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
686 }
687 
688 void
689 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
690 {
691 	int refcount;
692 
693 	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
694 }
695 
696 void
697 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
698 {
699 	int refs;
700 	int nrefs;
701 
702 	/*
703 	 * Handle 1->0 transitions for diskctx and nested destruction
704 	 * recursions.  If the refs are already in destruction mode (bit 31
705 	 * set) on the 1->0 transition we don't try to destruct it again.
706 	 *
707 	 * 0x80000001->0x80000000 transitions are handled normally and
708  * thus avoid nested destruction.
709 	 */
710 	for (;;) {
711 		refs = diskctx->refcount;
712 		cpu_ccfence();
713 		nrefs = refs - 1;
714 
715 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
716 		if (nrefs) {
717 			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
718 				break;
719 			continue;
720 		}
721 		nrefs = 0x80000000;
722 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
723 			dsched_disk_ctx_destroy(diskctx);
724 			break;
725 		}
726 	}
727 }
728 
729 static
730 void
731 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
732 {
733 	struct dsched_thread_io	*tdio;
734 	int refs;
735 	int nrefs;
736 
737 #if 0
738 	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
739 	print_backtrace(4);
740 #endif
741 	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
742 	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
743 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
744 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
745 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
746 		tdio->diskctx = NULL;
747 		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
748 		lockmgr(&diskctx->lock, LK_RELEASE);
749 		dsched_thread_io_unref_destroy(tdio);
750 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
751 	}
752 	lockmgr(&diskctx->lock, LK_RELEASE);
753 
754 	/*
755 	 * Expect diskctx->refcount to be 0x80000000.  If it isn't someone
756 	 * else still has a temporary ref on the diskctx and we have to
757 	 * transition it back to an undestroyed-state (albeit without any
758 	 * associations), so the other user destroys it properly when the
759 	 * ref is released.
760 	 */
761 	while ((refs = diskctx->refcount) != 0x80000000) {
762 		kprintf("dsched_disk_ctx: destroy race diskctx=%p\n", diskctx);
763 		cpu_ccfence();
764 		KKASSERT(refs & 0x80000000);
765 		nrefs = refs & 0x7FFFFFFF;
766 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
767 			return;
768 	}
769 
770 	/*
771 	 * Really for sure now.
772 	 */
773 	if (diskctx->dp->d_sched_policy->destroy_diskctx)
774 		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
775 	objcache_put(dsched_diskctx_cache, diskctx);
776 	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
777 }
778 
779 void
780 dsched_thread_io_unref(struct dsched_thread_io *tdio)
781 {
782 	int refs;
783 	int nrefs;
784 
785 	/*
786 	 * Handle 1->0 transitions for tdio and nested destruction
787 	 * recursions.  If the refs are already in destruction mode (bit 31
788 	 * set) on the 1->0 transition we don't try to destruct it again.
789 	 *
790 	 * 0x80000001->0x80000000 transitions are handled normally and
791  * thus avoid nested destruction.
792 	 */
793 	for (;;) {
794 		refs = tdio->refcount;
795 		cpu_ccfence();
796 		nrefs = refs - 1;
797 
798 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
799 		if (nrefs) {
800 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
801 				break;
802 			continue;
803 		}
804 		nrefs = 0x80000000;
805 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
806 			dsched_thread_io_destroy(tdio);
807 			break;
808 		}
809 	}
810 }
811 
812 /*
813  * Unref and destroy the tdio even if additional refs are present.
814  */
815 static
816 void
817 dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
818 {
819 	int refs;
820 	int nrefs;
821 
822 	/*
823 	 * If not already transitioned to destroy-in-progress we transition
824  * to destroy-in-progress, clean up our ref, and destroy the tdio.
825 	 */
826 	for (;;) {
827 		refs = tdio->refcount;
828 		cpu_ccfence();
829 		nrefs = refs - 1;
830 
831 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
832 		if (nrefs & 0x80000000) {
833 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
834 				break;
835 			continue;
836 		}
837 		nrefs |= 0x80000000;
838 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
839 			dsched_thread_io_destroy(tdio);
840 			break;
841 		}
842 	}
843 }
844 
845 static void
846 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
847 {
848 	struct dsched_thread_ctx *tdctx;
849 	struct dsched_disk_ctx	*diskctx;
850 	int refs;
851 	int nrefs;
852 
853 #if 0
854 	kprintf("tdio (%p) destruction started, trace:\n", tdio);
855 	print_backtrace(8);
856 #endif
857 	KKASSERT(tdio->qlength == 0);
858 
859 	while ((diskctx = tdio->diskctx) != NULL) {
860 		dsched_disk_ctx_ref(diskctx);
861 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
862 		if (diskctx != tdio->diskctx) {
863 			lockmgr(&diskctx->lock, LK_RELEASE);
864 			dsched_disk_ctx_unref(diskctx);
865 			continue;
866 		}
867 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
868 		if (diskctx->dp->d_sched_policy->destroy_tdio)
869 			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
870 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
871 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
872 		tdio->diskctx = NULL;
873 		dsched_thread_io_unref(tdio);
874 		lockmgr(&diskctx->lock, LK_RELEASE);
875 		dsched_disk_ctx_unref(diskctx);
876 	}
877 	while ((tdctx = tdio->tdctx) != NULL) {
878 		dsched_thread_ctx_ref(tdctx);
879 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
880 		if (tdctx != tdio->tdctx) {
881 			lockmgr(&tdctx->lock, LK_RELEASE);
882 			dsched_thread_ctx_unref(tdctx);
883 			continue;
884 		}
885 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
886 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
887 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
888 		tdio->tdctx = NULL;
889 		dsched_thread_io_unref(tdio);
890 		lockmgr(&tdctx->lock, LK_RELEASE);
891 		dsched_thread_ctx_unref(tdctx);
892 	}
893 
894 	/*
895 	 * Expect tdio->refcount to be 0x80000000.  If it isn't someone else
896 	 * still has a temporary ref on the tdio and we have to transition
897 	 * it back to an undestroyed-state (albeit without any associations)
898 	 * so the other user destroys it properly when the ref is released.
899 	 */
900 	while ((refs = tdio->refcount) != 0x80000000) {
901 		kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
902 		cpu_ccfence();
903 		KKASSERT(refs & 0x80000000);
904 		nrefs = refs & 0x7FFFFFFF;
905 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
906 			return;
907 	}
908 
909 	/*
910 	 * Really for sure now.
911 	 */
912 	objcache_put(dsched_tdio_cache, tdio);
913 	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
914 }
915 
916 void
917 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
918 {
919 	int refs;
920 	int nrefs;
921 
922 	/*
923 	 * Handle 1->0 transitions for tdctx and nested destruction
924 	 * recursions.  If the refs are already in destruction mode (bit 31
925 	 * set) on the 1->0 transition we don't try to destruct it again.
926 	 *
927 	 * 0x80000001->0x80000000 transitions are handled normally and
928  * thus avoid nested destruction.
929 	 */
930 	for (;;) {
931 		refs = tdctx->refcount;
932 		cpu_ccfence();
933 		nrefs = refs - 1;
934 
935 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
936 		if (nrefs) {
937 			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
938 				break;
939 			continue;
940 		}
941 		nrefs = 0x80000000;
942 		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
943 			dsched_thread_ctx_destroy(tdctx);
944 			break;
945 		}
946 	}
947 }
948 
949 static void
950 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
951 {
952 	struct dsched_thread_io	*tdio;
953 
954 	lockmgr(&tdctx->lock, LK_EXCLUSIVE);
955 
956 	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
957 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
958 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
959 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
960 		tdio->tdctx = NULL;
961 		lockmgr(&tdctx->lock, LK_RELEASE);	/* avoid deadlock */
962 		dsched_thread_io_unref_destroy(tdio);
963 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
964 	}
965 	KKASSERT(tdctx->refcount == 0x80000000);
966 
967 	lockmgr(&tdctx->lock, LK_RELEASE);
968 
969 	objcache_put(dsched_tdctx_cache, tdctx);
970 	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
971 }
972 
973 /*
974  * Allocates a tdio and associates it with the given disk and, if
975  * provided, the given tdctx.
975  */
976 static
977 struct dsched_thread_io *
978 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
979 		       struct dsched_policy *pol, int tdctx_locked)
980 {
981 	struct dsched_thread_io	*tdio;
982 #if 0
983 	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
984 #endif
985 	tdio = objcache_get(dsched_tdio_cache, M_INTWAIT);
986 	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
987 
988 	dsched_thread_io_ref(tdio);	/* prevent ripout */
989 	dsched_thread_io_ref(tdio);	/* for diskctx ref */
990 
991 	DSCHED_THREAD_IO_LOCKINIT(tdio);
992 	tdio->dp = dp;
993 
994 	tdio->diskctx = dsched_get_disk_priv(dp);
995 	TAILQ_INIT(&tdio->queue);
996 
997 	if (pol->new_tdio)
998 		pol->new_tdio(tdio);
999 
1000 	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
1001 	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
1002 	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
1003 
1004 	if (tdctx) {
1005 		/*
1006 		 * Put the tdio in the tdctx list.  Inherit the temporary
1007 		 * ref (one ref for each list).
1008 		 */
1009 		if (tdctx_locked == 0)
1010 			DSCHED_THREAD_CTX_LOCK(tdctx);
1011 		tdio->tdctx = tdctx;
1012 		tdio->p = tdctx->p;
1013 		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
1014 		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
1015 		if (tdctx_locked == 0)
1016 			DSCHED_THREAD_CTX_UNLOCK(tdctx);
1017 	} else {
1018 		dsched_thread_io_unref(tdio);
1019 	}
1020 
1021 	tdio->debug_policy = pol;
1022 	tdio->debug_inited = 0xF00F1234;
1023 
1024 	atomic_add_int(&dsched_stats.tdio_allocations, 1);
1025 
1026 	return(tdio);
1027 }
1028 
1029 
1030 struct dsched_disk_ctx *
1031 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
1032 {
1033 	struct dsched_disk_ctx *diskctx;
1034 
1035 	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
1036 	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
1037 	dsched_disk_ctx_ref(diskctx);
1038 	diskctx->dp = dp;
1039 	DSCHED_DISK_CTX_LOCKINIT(diskctx);
1040 	TAILQ_INIT(&diskctx->tdio_list);
1041 	/*
1042 	 * XXX: magic number 32: most devices have a tag queue of
1043 	 * depth 32.  It would be better to retrieve a more precise
1044 	 * value from the driver.
1045 	 */
1046 	diskctx->max_tag_queue_depth = 32;
1047 	diskctx->current_tag_queue_depth = 0;
1048 
1049 	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
1050 	if (pol->new_diskctx)
1051 		pol->new_diskctx(diskctx);
1052 	return diskctx;
1053 }
1054 
1055 
1056 struct dsched_thread_ctx *
1057 dsched_thread_ctx_alloc(struct proc *p)
1058 {
1059 	struct dsched_thread_ctx	*tdctx;
1060 
1061 	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
1062 	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
1063 	dsched_thread_ctx_ref(tdctx);
1064 #if 0
1065 	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
1066 #endif
1067 	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
1068 	TAILQ_INIT(&tdctx->tdio_list);
1069 	tdctx->p = p;
1070 
1071 	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
1072 	/* XXX: no callback here */
1073 
1074 	return tdctx;
1075 }
1076 
1077 void
1078 policy_new(struct disk *dp, struct dsched_policy *pol)
1079 {
1080 	struct dsched_disk_ctx *diskctx;
1081 
1082 	diskctx = dsched_disk_ctx_alloc(dp, pol);
1083 	dsched_disk_ctx_ref(diskctx);
1084 	dsched_set_disk_priv(dp, diskctx);
1085 }
1086 
1087 void
1088 policy_destroy(struct disk *dp) {
1089 	struct dsched_disk_ctx *diskctx;
1090 
1091 	diskctx = dsched_get_disk_priv(dp);
1092 	KKASSERT(diskctx != NULL);
1093 
1094 	dsched_disk_ctx_unref(diskctx); /* from prepare */
1095 	dsched_disk_ctx_unref(diskctx); /* from alloc */
1096 
1097 	dsched_set_disk_priv(dp, NULL);
1098 }
1099 
1100 void
1101 dsched_new_buf(struct buf *bp)
1102 {
1103 	struct dsched_thread_ctx	*tdctx = NULL;
1104 
1105 	if (dsched_inited == 0)
1106 		return;
1107 
1108 	if (curproc != NULL) {
1109 		tdctx = dsched_get_proc_priv(curproc);
1110 	} else {
1111 		/* This is a kernel thread, so no proc info is available */
1112 		tdctx = dsched_get_thread_priv(curthread);
1113 	}
1114 
1115 #if 0
1116 	/*
1117 	 * XXX: hack. We don't want this assert because we aren't catching all
1118 	 *	threads. mi_startup() is still getting away without a tdctx.
1119 	 */
1120 
1121 	/* By now we should have a tdctx; if not, something bad is going on. */
1122 	KKASSERT(tdctx != NULL);
1123 #endif
1124 
1125 	if (tdctx) {
1126 		dsched_thread_ctx_ref(tdctx);
1127 	}
1128 	dsched_set_buf_priv(bp, tdctx);
1129 }
1130 
1131 void
1132 dsched_exit_buf(struct buf *bp)
1133 {
1134 	struct dsched_thread_ctx	*tdctx;
1135 
1136 	tdctx = dsched_get_buf_priv(bp);
1137 	if (tdctx != NULL) {
1138 		dsched_clr_buf_priv(bp);
1139 		dsched_thread_ctx_unref(tdctx);
1140 	}
1141 }
1142 
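/*
 * Attach a new tdctx to process p.  The tdctx ends up with two refs
 * (one from dsched_thread_ctx_alloc() and one taken explicitly); both
 * are dropped again in dsched_exit_proc().
 */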
1143 void
1144 dsched_new_proc(struct proc *p)
1145 {
1146 	struct dsched_thread_ctx	*tdctx;
1147 
1148 	if (dsched_inited == 0)
1149 		return;
1150 
1151 	KKASSERT(p != NULL);
1152 
1153 	tdctx = dsched_thread_ctx_alloc(p);
1154 	tdctx->p = p;
1155 	dsched_thread_ctx_ref(tdctx);
1156 
1157 	dsched_set_proc_priv(p, tdctx);
1158 	atomic_add_int(&dsched_stats.nprocs, 1);
1159 }
1160 
1161 
1162 void
1163 dsched_new_thread(struct thread *td)
1164 {
1165 	struct dsched_thread_ctx	*tdctx;
1166 
1167 	if (dsched_inited == 0)
1168 		return;
1169 
1170 	KKASSERT(td != NULL);
1171 
1172 	tdctx = dsched_thread_ctx_alloc(NULL);
1173 	tdctx->td = td;
1174 	dsched_thread_ctx_ref(tdctx);
1175 
1176 	dsched_set_thread_priv(td, tdctx);
1177 	atomic_add_int(&dsched_stats.nthreads, 1);
1178 }
1179 
1180 void
1181 dsched_exit_proc(struct proc *p)
1182 {
1183 	struct dsched_thread_ctx	*tdctx;
1184 
1185 	if (dsched_inited == 0)
1186 		return;
1187 
1188 	KKASSERT(p != NULL);
1189 
1190 	tdctx = dsched_get_proc_priv(p);
1191 	KKASSERT(tdctx != NULL);
1192 
1193 	tdctx->dead = 0xDEAD;
1194 	dsched_set_proc_priv(p, NULL);
1195 
1196 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1197 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1198 	atomic_subtract_int(&dsched_stats.nprocs, 1);
1199 }
1200 
1201 
1202 void
1203 dsched_exit_thread(struct thread *td)
1204 {
1205 	struct dsched_thread_ctx	*tdctx;
1206 
1207 	if (dsched_inited == 0)
1208 		return;
1209 
1210 	KKASSERT(td != NULL);
1211 
1212 	tdctx = dsched_get_thread_priv(td);
1213 	KKASSERT(tdctx != NULL);
1214 
1215 	tdctx->dead = 0xDEAD;
1216 	dsched_set_thread_priv(td, 0);
1217 
1218 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1219 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1220 	atomic_subtract_int(&dsched_stats.nthreads, 1);
1221 }
1222 
1223 /*
1224  * Allocates a ref'd tdio for the current thread on the policy's disk.
1225  *
1226  * The tdio may have additional refs for the diskctx and tdctx it resides on.
1227  */
1228 void
1229 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1230 			      struct dsched_policy *pol)
1231 {
1232 	struct dsched_thread_ctx *tdctx;
1233 
1234 	tdctx = dsched_get_thread_priv(curthread);
1235 	KKASSERT(tdctx != NULL);
1236 	dsched_thread_io_alloc(diskctx->dp, tdctx, pol, 0);
1237 }
1238 
1239 /* DEFAULT NOOP POLICY */
1240 
1241 static int
1242 noop_prepare(struct dsched_disk_ctx *diskctx)
1243 {
1244 	return 0;
1245 }
1246 
1247 static void
1248 noop_teardown(struct dsched_disk_ctx *diskctx)
1249 {
1250 
1251 }
1252 
1253 static void
1254 noop_cancel(struct dsched_disk_ctx *diskctx)
1255 {
1256 
1257 }
1258 
1259 static int
1260 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1261 	   struct bio *bio)
1262 {
1263 	dsched_strategy_raw(diskctx->dp, bio);
1264 #if 0
1265 	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
1266 #endif
1267 	return 0;
1268 }
1269 
1270 /*
1271  * SYSINIT stuff
1272  */
1273 static void
1274 dsched_init(void)
1275 {
1276 	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1277 					   NULL, NULL, NULL,
1278 					   objcache_malloc_alloc,
1279 					   objcache_malloc_free,
1280 					   &dsched_thread_io_malloc_args );
1281 
1282 	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1283 					   NULL, NULL, NULL,
1284 					   objcache_malloc_alloc,
1285 					   objcache_malloc_free,
1286 					   &dsched_thread_ctx_malloc_args );
1287 
1288 	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1289 					   NULL, NULL, NULL,
1290 					   objcache_malloc_alloc,
1291 					   objcache_malloc_free,
1292 					   &dsched_disk_ctx_malloc_args );
1293 
1294 	bzero(&dsched_stats, sizeof(struct dsched_stats));
1295 
1296 	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1297 	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1298 
1299 	dsched_register(&dsched_noop_policy);
1300 
1301 	dsched_inited = 1;
1302 }
1303 
1304 static void
1305 dsched_uninit(void)
1306 {
1307 }
1308 
1309 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1310 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
1311 
1312 /*
1313  * SYSCTL stuff
1314  */
1315 static int
1316 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1317 {
1318 	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1319 }
1320 
1321 static int
1322 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1323 {
1324 	struct dsched_policy *pol = NULL;
1325 	int error, first = 1;
1326 
1327 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1328 
1329 	while ((pol = dsched_policy_enumerate(pol))) {
1330 		if (!first) {
1331 			error = SYSCTL_OUT(req, " ", 1);
1332 			if (error)
1333 				break;
1334 		} else {
1335 			first = 0;
1336 		}
1337 		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1338 		if (error)
1339 			break;
1340 
1341 	}
1342 
1343 	lockmgr(&dsched_lock, LK_RELEASE);
1344 
1345 	error = SYSCTL_OUT(req, "", 1);
1346 
1347 	return error;
1348 }
1349 
1350 static int
1351 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1352 {
1353 	char buf[DSCHED_POLICY_NAME_LENGTH];
1354 	struct dsched_disk_ctx *diskctx = arg1;
1355 	struct dsched_policy *pol = NULL;
1356 	int error;
1357 
1358 	if (diskctx == NULL) {
1359 		return 0;
1360 	}
1361 
1362 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1363 
1364 	pol = diskctx->dp->d_sched_policy;
1365 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1366 
1367 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1368 	if (error || req->newptr == NULL) {
1369 		lockmgr(&dsched_lock, LK_RELEASE);
1370 		return (error);
1371 	}
1372 
1373 	pol = dsched_find_policy(buf);
1374 	if (pol == NULL) {
1375 		lockmgr(&dsched_lock, LK_RELEASE);
1376 		return 0;
1377 	}
1378 
1379 	dsched_switch(diskctx->dp, pol);
1380 
1381 	lockmgr(&dsched_lock, LK_RELEASE);
1382 
1383 	return error;
1384 }
1385 
1386 static int
1387 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1388 {
1389 	char buf[DSCHED_POLICY_NAME_LENGTH];
1390 	struct dsched_policy *pol = NULL;
1391 	int error;
1392 
1393 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1394 
1395 	pol = default_policy;
1396 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1397 
1398 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1399 	if (error || req->newptr == NULL) {
1400 		lockmgr(&dsched_lock, LK_RELEASE);
1401 		return (error);
1402 	}
1403 
1404 	pol = dsched_find_policy(buf);
1405 	if (pol == NULL) {
1406 		lockmgr(&dsched_lock, LK_RELEASE);
1407 		return 0;
1408 	}
1409 
1410 	default_set = 1;
1411 	default_policy = pol;
1412 
1413 	lockmgr(&dsched_lock, LK_RELEASE);
1414 
1415 	return error;
1416 }
1417 
1418 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1419     "Disk Scheduler Framework (dsched) magic");
1420 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1421     "List of disks and their policies");
1422 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1423     0, "Enable dsched debugging");
1424 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1425     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1426     "dsched statistics");
1427 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1428     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1429 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1430     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
1431 
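/*
 * Illustrative sysctl usage; disk and policy names are examples only:
 *
 *	sysctl dsched.policies			(list available policies)
 *	sysctl dsched.policy.default=fq		(set the default policy)
 *	sysctl dsched.policy.da0=fq		(switch one disk's policy)
 *
 * Per-disk nodes under dsched.policy are created by
 * dsched_sysctl_add_disk() below.
 */
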
1432 static void
1433 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1434 {
1435 	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1436 		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1437 		sysctl_ctx_init(&diskctx->sysctl_ctx);
1438 	}
1439 
1440 	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1441 	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1442 	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
1443 }
1444