xref: /dragonfly/sys/kern/kern_dsched.c (revision 92fc8b5c)
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
50 #include <sys/buf2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
54 
55 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
56 
57 static dsched_prepare_t		noop_prepare;
58 static dsched_teardown_t	noop_teardown;
59 static dsched_cancel_t		noop_cancel;
60 static dsched_queue_t		noop_queue;
61 
62 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
63 static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
64 static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
65 static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
66 
67 static int	dsched_inited = 0;
68 static int	default_set = 0;
69 
70 struct lock	dsched_lock;
71 static int	dsched_debug_enable = 0;
72 
73 struct dsched_stats	dsched_stats;
74 
75 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
76 	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
77 struct objcache_malloc_args dsched_thread_io_malloc_args = {
78 	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
79 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
80 	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
81 
82 static struct objcache	*dsched_diskctx_cache;
83 static struct objcache	*dsched_tdctx_cache;
84 static struct objcache	*dsched_tdio_cache;
85 
86 TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
87 		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
88 
89 struct lock	dsched_tdctx_lock;
90 
91 static struct dsched_policy_head dsched_policy_list =
92 		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
93 
94 static struct dsched_policy dsched_noop_policy = {
95 	.name = "noop",
96 
97 	.prepare = noop_prepare,
98 	.teardown = noop_teardown,
99 	.cancel_all = noop_cancel,
100 	.bio_queue = noop_queue
101 };
102 
103 static struct dsched_policy *default_policy = &dsched_noop_policy;
104 
105 /*
106  * dsched_debug() is a SYSCTL- and TUNABLE-controlled debug output function
107  * using kvprintf.
108  */
109 int
110 dsched_debug(int level, char *fmt, ...)
111 {
112 	__va_list ap;
113 
114 	__va_start(ap, fmt);
115 	if (level <= dsched_debug_enable)
116 		kvprintf(fmt, ap);
117 	__va_end(ap);
118 
119 	return 0;
120 }
121 
122 /*
123  * Called on disk_create().
124  * Tries to read which policy to use from loader.conf; if none is
125  * specified, the default policy is used.
126  */
127 void
128 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
129 {
130 	char tunable_key[SPECNAMELEN + 48];
131 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
132 	char *ptr;
133 	struct dsched_policy *policy = NULL;
134 
135 	/* Also look for serno stuff? */
136 	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
137 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
138 
139 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
140 	    head_name, unit);
141 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
142 	    sizeof(sched_policy)) != 0) {
143 		policy = dsched_find_policy(sched_policy);
144 	}
145 
146 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
147 	    head_name);
148 	for (ptr = tunable_key; *ptr; ptr++) {
149 		if (*ptr == '/')
150 			*ptr = '-';
151 	}
152 	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
153 	    sizeof(sched_policy)) != 0)) {
154 		policy = dsched_find_policy(sched_policy);
155 	}
156 
157 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
158 	if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
159 	    sizeof(sched_policy)) != 0)) {
160 		policy = dsched_find_policy(sched_policy);
161 	}
162 
163 	if (!policy) {
164 		if (!default_set && bootverbose) {
165 			dsched_debug(0,
166 				     "No policy for %s%d specified, "
167 				     "or policy not found\n",
168 				     head_name, unit);
169 		}
170 		dsched_set_policy(dp, default_policy);
171 	} else {
172 		dsched_set_policy(dp, policy);
173 	}
174 
175 	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
176 		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
177 	else
178 		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
179 	for (ptr = tunable_key; *ptr; ptr++) {
180 		if (*ptr == '/')
181 			*ptr = '-';
182 	}
183 	dsched_sysctl_add_disk(
184 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
185 	    tunable_key);
186 
187 	lockmgr(&dsched_lock, LK_RELEASE);
188 }
189 
190 /*
191  * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
192  * there's any policy associated with the serial number of the device.
193  */
194 void
195 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
196 {
197 	char tunable_key[SPECNAMELEN + 48];
198 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
199 	struct dsched_policy *policy = NULL;
200 
201 	if (info->d_serialno == NULL)
202 		return;
203 
204 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
205 
206 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
207 	    info->d_serialno);
208 
209 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
210 	    sizeof(sched_policy)) != 0) {
211 		policy = dsched_find_policy(sched_policy);
212 	}
213 
214 	if (policy) {
215 		dsched_switch(dp, policy);
216 	}
217 
218 	dsched_sysctl_add_disk(
219 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
220 	    info->d_serialno);
221 
222 	lockmgr(&dsched_lock, LK_RELEASE);
223 }
224 
225 /*
226  * Called on disk_destroy()
227  * shuts down the scheduler core and cancels all remaining bios
228  */
229 void
230 dsched_disk_destroy_callback(struct disk *dp)
231 {
232 	struct dsched_policy *old_policy;
233 	struct dsched_disk_ctx *diskctx;
234 
235 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
236 
237 	diskctx = dsched_get_disk_priv(dp);
238 
239 	old_policy = dp->d_sched_policy;
240 	dp->d_sched_policy = &dsched_noop_policy;
241 	old_policy->cancel_all(dsched_get_disk_priv(dp));
242 	old_policy->teardown(dsched_get_disk_priv(dp));
243 
244 	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
245 		sysctl_ctx_free(&diskctx->sysctl_ctx);
246 
247 	policy_destroy(dp);
248 	atomic_subtract_int(&old_policy->ref_count, 1);
249 	KKASSERT(old_policy->ref_count >= 0);
250 
251 	lockmgr(&dsched_lock, LK_RELEASE);
252 }
253 
254 
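/*
 * Entry point for a bio coming down from the disk layer.  Looks up the
 * issuing thread's per-disk tdio and hands the bio to the active policy's
 * bio_queue method; bios without a thread context, or bios the policy
 * refuses, are dispatched directly via dsched_strategy_raw().
 */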
255 void
256 dsched_queue(struct disk *dp, struct bio *bio)
257 {
258 	struct dsched_thread_ctx	*tdctx;
259 	struct dsched_thread_io		*tdio;
260 	struct dsched_disk_ctx		*diskctx;
261 
262 	int found = 0, error = 0;
263 
264 	tdctx = dsched_get_buf_priv(bio->bio_buf);
265 	if (tdctx == NULL) {
266 		/* No thread context; bypass the policy and dispatch directly */
267 		atomic_add_int(&dsched_stats.no_tdctx, 1);
268 		dsched_strategy_raw(dp, bio);
269 		return;
270 	}
271 
272 	DSCHED_THREAD_CTX_LOCK(tdctx);
273 
274 	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
275 	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
276 		if (tdio->dp == dp) {
277 			dsched_thread_io_ref(tdio);
278 			found = 1;
279 			break;
280 		}
281 	}
282 
283 	DSCHED_THREAD_CTX_UNLOCK(tdctx);
284 	dsched_clr_buf_priv(bio->bio_buf);
285 	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
286 
287 	KKASSERT(found == 1);
288 	diskctx = dsched_get_disk_priv(dp);
289 	dsched_disk_ctx_ref(diskctx);
290 	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
291 
292 	if (error) {
293 		dsched_strategy_raw(dp, bio);
294 	}
295 	dsched_disk_ctx_unref(diskctx);
296 	dsched_thread_io_unref(tdio);
297 }
298 
299 
300 /*
301  * Called from each policy's module_init or module_attach;
302  * registers the policy in the local policy list.
303  */
304 int
305 dsched_register(struct dsched_policy *d_policy)
306 {
307 	struct dsched_policy *policy;
308 	int error = 0;
309 
310 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
311 
312 	policy = dsched_find_policy(d_policy->name);
313 
314 	if (!policy) {
315 		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
316 		atomic_add_int(&d_policy->ref_count, 1);
317 	} else {
318 		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
319 		    d_policy->name);
320 		error = EEXIST;
321 	}
322 
323 	lockmgr(&dsched_lock, LK_RELEASE);
324 	return error;
325 }
326 
327 /*
328  * Called from each policy's module_detach;
329  * unregisters the policy.
330  */
331 int
332 dsched_unregister(struct dsched_policy *d_policy)
333 {
334 	struct dsched_policy *policy;
335 
336 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
337 	policy = dsched_find_policy(d_policy->name);
338 
339 	if (policy) {
340 		if (policy->ref_count > 1) {
341 			lockmgr(&dsched_lock, LK_RELEASE);
342 			return EBUSY;
343 		}
344 		TAILQ_REMOVE(&dsched_policy_list, policy, link);
345 		atomic_subtract_int(&policy->ref_count, 1);
346 		KKASSERT(policy->ref_count == 0);
347 	}
348 	lockmgr(&dsched_lock, LK_RELEASE);
349 	return 0;
350 }
351 
352 
353 /*
354  * switches the policy by first removing the old one and then
355  * enabling the new one.
356  */
357 int
358 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
359 {
360 	struct dsched_policy *old_policy;
361 
362 	/* If we are asked to set the same policy, do nothing */
363 	if (dp->d_sched_policy == new_policy)
364 		return 0;
365 
366 	/* lock everything down, diskwise */
367 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
368 	old_policy = dp->d_sched_policy;
369 
370 	atomic_subtract_int(&old_policy->ref_count, 1);
371 	KKASSERT(old_policy->ref_count >= 0);
372 
373 	dp->d_sched_policy = &dsched_noop_policy;
374 	old_policy->teardown(dsched_get_disk_priv(dp));
375 	policy_destroy(dp);
376 
377 	/* Bring everything back to life */
378 	dsched_set_policy(dp, new_policy);
379 	lockmgr(&dsched_lock, LK_RELEASE);
380 	return 0;
381 }
382 
383 
384 /*
385  * Loads a given policy and attaches it to the specified disk.
386  * Also initializes the core for the policy
387  */
388 void
389 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
390 {
391 	int locked = 0;
392 
393 	/* Check if the dsched lock is already held; if not, acquire it */
394 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
395 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
396 		locked = 1;
397 	}
398 
399 	policy_new(dp, new_policy);
400 	new_policy->prepare(dsched_get_disk_priv(dp));
401 	dp->d_sched_policy = new_policy;
402 	atomic_add_int(&new_policy->ref_count, 1);
403 	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
404 	    new_policy->name);
405 
406 	/* If we acquired the lock here, release it again */
407 	if (locked)
408 		lockmgr(&dsched_lock, LK_RELEASE);
409 }
410 
411 struct dsched_policy*
412 dsched_find_policy(char *search)
413 {
414 	struct dsched_policy *policy;
415 	struct dsched_policy *policy_found = NULL;
416 	int locked = 0;
417 
418 	/* Check if the dsched lock is already held; if not, acquire it */
419 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
420 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
421 		locked = 1;
422 	}
423 
424 	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
425 		if (!strcmp(policy->name, search)) {
426 			policy_found = policy;
427 			break;
428 		}
429 	}
430 
431 	/* If we acquired the lock here, release it again */
432 	if (locked)
433 		lockmgr(&dsched_lock, LK_RELEASE);
434 
435 	return policy_found;
436 }
437 
438 struct disk*
439 dsched_find_disk(char *search)
440 {
441 	struct disk *dp_found = NULL;
442 	struct disk *dp = NULL;
443 
444 	while ((dp = disk_enumerate(dp))) {
445 		if (!strcmp(dp->d_cdev->si_name, search)) {
446 			dp_found = dp;
447 			break;
448 		}
449 	}
450 
451 	return dp_found;
452 }
453 
454 struct disk*
455 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
456 {
457 	while ((dp = disk_enumerate(dp))) {
458 		if (dp->d_sched_policy == policy)
459 			return dp;
460 	}
461 
462 	return NULL;
463 }
464 
465 struct dsched_policy *
466 dsched_policy_enumerate(struct dsched_policy *pol)
467 {
468 	if (!pol)
469 		return (TAILQ_FIRST(&dsched_policy_list));
470 	else
471 		return (TAILQ_NEXT(pol, link));
472 }
473 
474 void
475 dsched_cancel_bio(struct bio *bp)
476 {
477 	bp->bio_buf->b_error = ENXIO;
478 	bp->bio_buf->b_flags |= B_ERROR;
479 	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
480 
481 	biodone(bp);
482 }
483 
484 void
485 dsched_strategy_raw(struct disk *dp, struct bio *bp)
486 {
487 	/*
488 	 * Ideally, this stuff shouldn't be needed... but just in case, we
489 	 * leave it in to avoid panics.
490 	 */
491 	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
492 	if (bp->bio_track != NULL) {
493 		dsched_debug(LOG_INFO,
494 		    "dsched_strategy_raw sees non-NULL bio_track!! "
495 		    "bio: %p\n", bp);
496 		bp->bio_track = NULL;
497 	}
498 	dev_dstrategy(dp->d_rawdev, bp);
499 }
500 
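/*
 * Synchronous dispatch helper for policies: clones the bio into a private
 * pbuf, issues it to the raw device, waits for completion with biowait()
 * and then completes the original bio.
 */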
501 void
502 dsched_strategy_sync(struct disk *dp, struct bio *bio)
503 {
504 	struct buf *bp, *nbp;
505 	struct bio *nbio;
506 
507 	bp = bio->bio_buf;
508 
509 	nbp = getpbuf(NULL);
510 	nbio = &nbp->b_bio1;
511 
512 	nbp->b_cmd = bp->b_cmd;
513 	nbp->b_bufsize = bp->b_bufsize;
514 	nbp->b_runningbufspace = bp->b_runningbufspace;
515 	nbp->b_bcount = bp->b_bcount;
516 	nbp->b_resid = bp->b_resid;
517 	nbp->b_data = bp->b_data;
518 #if 0
519 	/*
520 	 * Buffers undergoing device I/O do not need a kvabase/size.
521 	 */
522 	nbp->b_kvabase = bp->b_kvabase;
523 	nbp->b_kvasize = bp->b_kvasize;
524 #endif
525 	nbp->b_dirtyend = bp->b_dirtyend;
526 
527 	nbio->bio_done = biodone_sync;
528 	nbio->bio_flags |= BIO_SYNC;
529 	nbio->bio_track = NULL;
530 
531 	nbio->bio_caller_info1.ptr = dp;
532 	nbio->bio_offset = bio->bio_offset;
533 
534 	dev_dstrategy(dp->d_rawdev, nbio);
535 	biowait(nbio, "dschedsync");
536 	bp->b_resid = nbp->b_resid;
537 	bp->b_error = nbp->b_error;
538 	biodone(bio);
539 #if 0
540 	nbp->b_kvabase = NULL;
541 	nbp->b_kvasize = 0;
542 #endif
543 	relpbuf(nbp, NULL);
544 }
545 
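/*
 * Asynchronous dispatch helper for policies: pushes a new bio onto the
 * chain, records the dispatch time and the policy's private data, and
 * sends it to the raw device; `done' is called on completion.
 */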
546 void
547 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
548 {
549 	struct bio *nbio;
550 
551 	nbio = push_bio(bio);
552 	nbio->bio_done = done;
553 	nbio->bio_offset = bio->bio_offset;
554 
555 	dsched_set_bio_dp(nbio, dp);
556 	dsched_set_bio_priv(nbio, priv);
557 
558 	getmicrotime(&nbio->bio_caller_info3.tv);
559 	dev_dstrategy(dp->d_rawdev, nbio);
560 }
561 
562 /*
563  * Ref and deref various structures.  The 1->0 transition of the reference
564  * count actually transitions 1->0x80000000 and causes the object to be
565  * destroyed.  It is possible for transitory references to occur on the
566  * object while it is being destroyed.  We use bit 31 to indicate that
567  * destruction is in progress and to prevent nested destructions.
568  */
569 void
570 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
571 {
572 	int refcount;
573 
574 	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
575 }
576 
577 void
578 dsched_thread_io_ref(struct dsched_thread_io *tdio)
579 {
580 	int refcount;
581 
582 	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
583 }
584 
585 void
586 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
587 {
588 	int refcount;
589 
590 	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
591 }
592 
593 void
594 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
595 {
596 	int refs;
597 	int nrefs;
598 
599 	/*
600 	 * Handle 1->0 transitions for diskctx and nested destruction
601 	 * recursions.  If the refs are already in destruction mode (bit 31
602 	 * set) on the 1->0 transition we don't try to destruct it again.
603 	 *
604 	 * 0x80000001->0x80000000 transitions are handled normally and
605 	 * thus avoid nested destruction.
606 	 */
607 	for (;;) {
608 		refs = diskctx->refcount;
609 		cpu_ccfence();
610 		nrefs = refs - 1;
611 
612 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
613 		if (nrefs) {
614 			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
615 				break;
616 			continue;
617 		}
618 		nrefs = 0x80000000;
619 		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
620 			dsched_disk_ctx_destroy(diskctx);
621 			break;
622 		}
623 	}
624 }
625 
626 static
627 void
628 dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
629 {
630 	struct dsched_thread_io	*tdio;
631 
632 #if 0
633 	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
634 	print_backtrace(4);
635 #endif
636 	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
637 	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
638 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
639 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
640 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
641 		tdio->diskctx = NULL;
642 		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
643 		dsched_thread_io_unref(tdio);
644 	}
645 	lockmgr(&diskctx->lock, LK_RELEASE);
646 	if (diskctx->dp->d_sched_policy->destroy_diskctx)
647 		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
648 	KKASSERT(diskctx->refcount == 0x80000000);
649 	objcache_put(dsched_diskctx_cache, diskctx);
650 	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
651 }
652 
653 void
654 dsched_thread_io_unref(struct dsched_thread_io *tdio)
655 {
656 	int refs;
657 	int nrefs;
658 
659 	/*
660 	 * Handle 1->0 transitions for tdio and nested destruction
661 	 * recursions.  If the refs are already in destruction mode (bit 31
662 	 * set) on the 1->0 transition we don't try to destruct it again.
663 	 *
664 	 * 0x80000001->0x80000000 transitions are handled normally and
665 	 * thus avoid nested destruction.
666 	 */
667 	for (;;) {
668 		refs = tdio->refcount;
669 		cpu_ccfence();
670 		nrefs = refs - 1;
671 
672 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
673 		if (nrefs) {
674 			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
675 				break;
676 			continue;
677 		}
678 		nrefs = 0x80000000;
679 		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
680 			dsched_thread_io_destroy(tdio);
681 			break;
682 		}
683 	}
684 }
685 
686 static void
687 dsched_thread_io_destroy(struct dsched_thread_io *tdio)
688 {
689 	struct dsched_thread_ctx *tdctx;
690 	struct dsched_disk_ctx	*diskctx;
691 
692 #if 0
693 	kprintf("tdio (%p) destruction started, trace:\n", tdio);
694 	print_backtrace(8);
695 #endif
696 	KKASSERT(tdio->qlength == 0);
697 
698 	while ((diskctx = tdio->diskctx) != NULL) {
699 		dsched_disk_ctx_ref(diskctx);
700 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
701 		if (diskctx != tdio->diskctx) {
702 			lockmgr(&diskctx->lock, LK_RELEASE);
703 			dsched_disk_ctx_unref(diskctx);
704 			continue;
705 		}
706 		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
707 		if (diskctx->dp->d_sched_policy->destroy_tdio)
708 			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
709 		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
710 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
711 		tdio->diskctx = NULL;
712 		lockmgr(&diskctx->lock, LK_RELEASE);
713 		dsched_disk_ctx_unref(diskctx);
714 	}
715 	while ((tdctx = tdio->tdctx) != NULL) {
716 		dsched_thread_ctx_ref(tdctx);
717 		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
718 		if (tdctx != tdio->tdctx) {
719 			lockmgr(&tdctx->lock, LK_RELEASE);
720 			dsched_thread_ctx_unref(tdctx);
721 			continue;
722 		}
723 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
724 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
725 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
726 		tdio->tdctx = NULL;
727 		lockmgr(&tdctx->lock, LK_RELEASE);
728 		dsched_thread_ctx_unref(tdctx);
729 	}
730 	KKASSERT(tdio->refcount == 0x80000000);
731 	objcache_put(dsched_tdio_cache, tdio);
732 	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
733 #if 0
734 	dsched_disk_ctx_unref(diskctx);
735 #endif
736 }
737 
738 void
739 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
740 {
741 	int refs;
742 	int nrefs;
743 
744 	/*
745 	 * Handle 1->0 transitions for tdctx and nested destruction
746 	 * recursions.  If the refs are already in destruction mode (bit 31
747 	 * set) on the 1->0 transition we don't try to destruct it again.
748 	 *
749 	 * 0x80000001->0x80000000 transitions are handled normally and
750 	 * thus avoid nested destruction.
751 	 */
752 	for (;;) {
753 		refs = tdctx->refcount;
754 		cpu_ccfence();
755 		nrefs = refs - 1;
756 
757 		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
758 		if (nrefs) {
759 			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
760 				break;
761 			continue;
762 		}
763 		nrefs = 0x80000000;
764 		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
765 			dsched_thread_ctx_destroy(tdctx);
766 			break;
767 		}
768 	}
769 }
770 
771 static void
772 dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
773 {
774 	struct dsched_thread_io	*tdio;
775 
776 #if 0
777 	kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
778 	print_backtrace(8);
779 #endif
780 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
781 
782 	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
783 		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
784 		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
785 		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
786 		tdio->tdctx = NULL;
787 		dsched_thread_io_unref(tdio);
788 	}
789 	KKASSERT(tdctx->refcount == 0x80000000);
790 	TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
791 
792 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
793 
794 	objcache_put(dsched_tdctx_cache, tdctx);
795 	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
796 }
797 
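/*
 * Allocates and initializes a tdio (per-thread, per-disk I/O context),
 * links it onto the disk's tdio list and, if a thread context is given,
 * onto that tdctx's list as well.  The policy's new_tdio callback is
 * invoked if present.
 */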
798 struct dsched_thread_io *
799 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
800     struct dsched_policy *pol)
801 {
802 	struct dsched_thread_io	*tdio;
803 #if 0
804 	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
805 #endif
806 	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
807 	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
808 
809 	/* XXX: maybe we do need another ref for the disk list for tdio */
810 	dsched_thread_io_ref(tdio);
811 
812 	DSCHED_THREAD_IO_LOCKINIT(tdio);
813 	tdio->dp = dp;
814 
815 	tdio->diskctx = dsched_get_disk_priv(dp);
816 	TAILQ_INIT(&tdio->queue);
817 
818 	if (pol->new_tdio)
819 		pol->new_tdio(tdio);
820 
821 	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
822 	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
823 	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
824 	lockmgr(&tdio->diskctx->lock, LK_RELEASE);
825 
826 	if (tdctx) {
827 		tdio->tdctx = tdctx;
828 		tdio->p = tdctx->p;
829 
830 		/* Put the tdio in the tdctx list */
831 		DSCHED_THREAD_CTX_LOCK(tdctx);
832 		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
833 		DSCHED_THREAD_CTX_UNLOCK(tdctx);
834 		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
835 	}
836 
837 	atomic_add_int(&dsched_stats.tdio_allocations, 1);
838 	return tdio;
839 }
840 
841 
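/*
 * Allocates and initializes the per-disk context for a policy and invokes
 * the policy's new_diskctx callback if present.
 */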
842 struct dsched_disk_ctx *
843 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
844 {
845 	struct dsched_disk_ctx *diskctx;
846 
847 	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
848 	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
849 	dsched_disk_ctx_ref(diskctx);
850 	diskctx->dp = dp;
851 	DSCHED_DISK_CTX_LOCKINIT(diskctx);
852 	TAILQ_INIT(&diskctx->tdio_list);
853 
854 	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
855 	if (pol->new_diskctx)
856 		pol->new_diskctx(diskctx);
857 	return diskctx;
858 }
859 
860 
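/*
 * Allocates the per-thread (or per-process) context and pre-creates a tdio
 * for every disk currently known to the system, using each disk's active
 * policy.
 */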
861 struct dsched_thread_ctx *
862 dsched_thread_ctx_alloc(struct proc *p)
863 {
864 	struct dsched_thread_ctx	*tdctx;
865 	struct dsched_thread_io	*tdio;
866 	struct disk	*dp = NULL;
867 
868 	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
869 	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
870 	dsched_thread_ctx_ref(tdctx);
871 #if 0
872 	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
873 #endif
874 	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
875 	TAILQ_INIT(&tdctx->tdio_list);
876 	tdctx->p = p;
877 
878 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
879 	while ((dp = disk_enumerate(dp))) {
880 		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
881 	}
882 
883 	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
884 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
885 
886 	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
887 	/* XXX: no callback here */
888 	return tdctx;
889 }
890 
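/*
 * Attaches a freshly allocated diskctx to the disk and creates a tdio for
 * every existing thread context so the new policy sees all current
 * threads.
 */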
891 void
892 policy_new(struct disk *dp, struct dsched_policy *pol) {
893 	struct dsched_thread_ctx *tdctx;
894 	struct dsched_disk_ctx *diskctx;
895 	struct dsched_thread_io *tdio;
896 
897 	diskctx = dsched_disk_ctx_alloc(dp, pol);
898 	dsched_disk_ctx_ref(diskctx);
899 	dsched_set_disk_priv(dp, diskctx);
900 
901 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
902 	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
903 		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
904 	}
905 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
906 
907 }
908 
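/*
 * Drops the references on the disk's diskctx (one from policy setup, one
 * from allocation) and clears the disk's private pointer; the diskctx is
 * destroyed once the last reference goes away.
 */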
909 void
910 policy_destroy(struct disk *dp) {
911 	struct dsched_disk_ctx *diskctx;
912 
913 	diskctx = dsched_get_disk_priv(dp);
914 	KKASSERT(diskctx != NULL);
915 
916 	dsched_disk_ctx_unref(diskctx); /* from prepare */
917 	dsched_disk_ctx_unref(diskctx); /* from alloc */
918 
919 	dsched_set_disk_priv(dp, NULL);
920 }
921 
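/*
 * Called when a new buf is set up: tags the buf with the current thread's
 * (or process's) tdctx and takes a reference on it; the reference is
 * dropped again in dsched_exit_buf() or dsched_queue().
 */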
922 void
923 dsched_new_buf(struct buf *bp)
924 {
925 	struct dsched_thread_ctx	*tdctx = NULL;
926 
927 	if (dsched_inited == 0)
928 		return;
929 
930 	if (curproc != NULL) {
931 		tdctx = dsched_get_proc_priv(curproc);
932 	} else {
933 		/* This is a kernel thread, so no proc info is available */
934 		tdctx = dsched_get_thread_priv(curthread);
935 	}
936 
937 #if 0
938 	/*
939 	 * XXX: hack. We don't want this assert because we aren't catching all
940 	 *	threads; mi_startup() is still getting away without a tdctx.
941 	 */
942 
943 	/* By now we should have a tdctx; if not, something bad is going on */
944 	KKASSERT(tdctx != NULL);
945 #endif
946 
947 	if (tdctx) {
948 		dsched_thread_ctx_ref(tdctx);
949 	}
950 	dsched_set_buf_priv(bp, tdctx);
951 }
952 
953 void
954 dsched_exit_buf(struct buf *bp)
955 {
956 	struct dsched_thread_ctx	*tdctx;
957 
958 	tdctx = dsched_get_buf_priv(bp);
959 	if (tdctx != NULL) {
960 		dsched_clr_buf_priv(bp);
961 		dsched_thread_ctx_unref(tdctx);
962 	}
963 }
964 
965 void
966 dsched_new_proc(struct proc *p)
967 {
968 	struct dsched_thread_ctx	*tdctx;
969 
970 	if (dsched_inited == 0)
971 		return;
972 
973 	KKASSERT(p != NULL);
974 
975 	tdctx = dsched_thread_ctx_alloc(p);
976 	tdctx->p = p;
977 	dsched_thread_ctx_ref(tdctx);
978 
979 	dsched_set_proc_priv(p, tdctx);
980 	atomic_add_int(&dsched_stats.nprocs, 1);
981 }
982 
983 
984 void
985 dsched_new_thread(struct thread *td)
986 {
987 	struct dsched_thread_ctx	*tdctx;
988 
989 	if (dsched_inited == 0)
990 		return;
991 
992 	KKASSERT(td != NULL);
993 
994 	tdctx = dsched_thread_ctx_alloc(NULL);
995 	tdctx->td = td;
996 	dsched_thread_ctx_ref(tdctx);
997 
998 	dsched_set_thread_priv(td, tdctx);
999 	atomic_add_int(&dsched_stats.nthreads, 1);
1000 }
1001 
1002 void
1003 dsched_exit_proc(struct proc *p)
1004 {
1005 	struct dsched_thread_ctx	*tdctx;
1006 
1007 	if (dsched_inited == 0)
1008 		return;
1009 
1010 	KKASSERT(p != NULL);
1011 
1012 	tdctx = dsched_get_proc_priv(p);
1013 	KKASSERT(tdctx != NULL);
1014 
1015 	tdctx->dead = 0xDEAD;
1016 	dsched_set_proc_priv(p, NULL);
1017 
1018 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1019 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1020 	atomic_subtract_int(&dsched_stats.nprocs, 1);
1021 }
1022 
1023 
1024 void
1025 dsched_exit_thread(struct thread *td)
1026 {
1027 	struct dsched_thread_ctx	*tdctx;
1028 
1029 	if (dsched_inited == 0)
1030 		return;
1031 
1032 	KKASSERT(td != NULL);
1033 
1034 	tdctx = dsched_get_thread_priv(td);
1035 	KKASSERT(tdctx != NULL);
1036 
1037 	tdctx->dead = 0xDEAD;
1038 	dsched_set_thread_priv(td, NULL);
1039 
1040 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
1041 	dsched_thread_ctx_unref(tdctx); /* one for ref */
1042 	atomic_subtract_int(&dsched_stats.nthreads, 1);
1043 }
1044 
1045 struct dsched_thread_io *
1046 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
1047     struct dsched_policy *pol) {
1048 	struct dsched_thread_ctx *tdctx;
1049 	struct dsched_thread_io *tdio;
1050 
1051 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
1052 
1053 	tdctx = dsched_get_thread_priv(curthread);
1054 	KKASSERT(tdctx != NULL);
1055 	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
1056 
1057 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
1058 
1059 	return tdio;
1060 }
1061 
1062 /* DEFAULT NOOP POLICY */
1063 
1064 static int
1065 noop_prepare(struct dsched_disk_ctx *diskctx)
1066 {
1067 	return 0;
1068 }
1069 
1070 static void
1071 noop_teardown(struct dsched_disk_ctx *diskctx)
1072 {
1073 
1074 }
1075 
1076 static void
1077 noop_cancel(struct dsched_disk_ctx *diskctx)
1078 {
1079 
1080 }
1081 
1082 static int
1083 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
1084     struct bio *bio)
1085 {
1086 	dsched_strategy_raw(diskctx->dp, bio);
1087 #if 0
1088 	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
1089 #endif
1090 	return 0;
1091 }
1092 
1093 /*
1094  * SYSINIT stuff
1095  */
1096 static void
1097 dsched_init(void)
1098 {
1099 	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
1100 					   NULL, NULL, NULL,
1101 					   objcache_malloc_alloc,
1102 					   objcache_malloc_free,
1103 					   &dsched_thread_io_malloc_args );
1104 
1105 	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1106 					   NULL, NULL, NULL,
1107 					   objcache_malloc_alloc,
1108 					   objcache_malloc_free,
1109 					   &dsched_thread_ctx_malloc_args );
1110 
1111 	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1112 					   NULL, NULL, NULL,
1113 					   objcache_malloc_alloc,
1114 					   objcache_malloc_free,
1115 					   &dsched_disk_ctx_malloc_args );
1116 
1117 	bzero(&dsched_stats, sizeof(struct dsched_stats));
1118 
1119 	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1120 	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1121 
1122 	dsched_register(&dsched_noop_policy);
1123 
1124 	dsched_inited = 1;
1125 }
1126 
1127 static void
1128 dsched_uninit(void)
1129 {
1130 }
1131 
1132 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1133 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
1134 
1135 /*
1136  * SYSCTL stuff
1137  */
1138 static int
1139 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1140 {
1141 	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1142 }
1143 
1144 static int
1145 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1146 {
1147 	struct dsched_policy *pol = NULL;
1148 	int error, first = 1;
1149 
1150 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1151 
1152 	while ((pol = dsched_policy_enumerate(pol))) {
1153 		if (!first) {
1154 			error = SYSCTL_OUT(req, " ", 1);
1155 			if (error)
1156 				break;
1157 		} else {
1158 			first = 0;
1159 		}
1160 		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1161 		if (error)
1162 			break;
1163 
1164 	}
1165 
1166 	lockmgr(&dsched_lock, LK_RELEASE);
1167 
1168 	error = SYSCTL_OUT(req, "", 1);
1169 
1170 	return error;
1171 }
1172 
1173 static int
1174 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1175 {
1176 	char buf[DSCHED_POLICY_NAME_LENGTH];
1177 	struct dsched_disk_ctx *diskctx = arg1;
1178 	struct dsched_policy *pol = NULL;
1179 	int error;
1180 
1181 	if (diskctx == NULL) {
1182 		return 0;
1183 	}
1184 
1185 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1186 
1187 	pol = diskctx->dp->d_sched_policy;
1188 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1189 
1190 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1191 	if (error || req->newptr == NULL) {
1192 		lockmgr(&dsched_lock, LK_RELEASE);
1193 		return (error);
1194 	}
1195 
1196 	pol = dsched_find_policy(buf);
1197 	if (pol == NULL) {
1198 		lockmgr(&dsched_lock, LK_RELEASE);
1199 		return 0;
1200 	}
1201 
1202 	dsched_switch(diskctx->dp, pol);
1203 
1204 	lockmgr(&dsched_lock, LK_RELEASE);
1205 
1206 	return error;
1207 }
1208 
1209 static int
1210 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1211 {
1212 	char buf[DSCHED_POLICY_NAME_LENGTH];
1213 	struct dsched_policy *pol = NULL;
1214 	int error;
1215 
1216 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1217 
1218 	pol = default_policy;
1219 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1220 
1221 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1222 	if (error || req->newptr == NULL) {
1223 		lockmgr(&dsched_lock, LK_RELEASE);
1224 		return (error);
1225 	}
1226 
1227 	pol = dsched_find_policy(buf);
1228 	if (pol == NULL) {
1229 		lockmgr(&dsched_lock, LK_RELEASE);
1230 		return 0;
1231 	}
1232 
1233 	default_set = 1;
1234 	default_policy = pol;
1235 
1236 	lockmgr(&dsched_lock, LK_RELEASE);
1237 
1238 	return error;
1239 }
1240 
1241 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1242     "Disk Scheduler Framework (dsched) magic");
1243 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1244     "List of disks and their policies");
1245 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1246     0, "Enable dsched debugging");
1247 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1248     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1249     "dsched statistics");
1250 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1251     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1252 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1253     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
1254 
1255 static void
1256 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1257 {
1258 	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1259 		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1260 		sysctl_ctx_init(&diskctx->sysctl_ctx);
1261 	}
1262 
1263 	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1264 	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1265 	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
1266 }
1267