xref: /dragonfly/sys/kern/kern_dsched.c (revision 10cbe914)
1 /*
2  * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Alex Hornung <ahornung@gmail.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/proc.h>
38 #include <sys/sysctl.h>
39 #include <sys/buf.h>
40 #include <sys/conf.h>
41 #include <sys/diskslice.h>
42 #include <sys/disk.h>
43 #include <sys/malloc.h>
44 #include <machine/md_var.h>
45 #include <sys/ctype.h>
46 #include <sys/syslog.h>
47 #include <sys/device.h>
48 #include <sys/msgport.h>
49 #include <sys/msgport2.h>
50 #include <sys/buf2.h>
51 #include <sys/dsched.h>
52 #include <sys/fcntl.h>
53 #include <machine/varargs.h>
54 
55 MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");
56 
57 static dsched_prepare_t		noop_prepare;
58 static dsched_teardown_t	noop_teardown;
59 static dsched_cancel_t		noop_cancel;
60 static dsched_queue_t		noop_queue;
61 
62 static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
63 
64 static int	dsched_inited = 0;
65 static int	default_set = 0;
66 
67 struct lock	dsched_lock;
68 static int	dsched_debug_enable = 0;
69 
70 struct dsched_stats	dsched_stats;
71 
72 struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
73 	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
74 struct objcache_malloc_args dsched_thread_io_malloc_args = {
75 	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
76 struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
77 	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };
78 
79 static struct objcache	*dsched_diskctx_cache;
80 static struct objcache	*dsched_tdctx_cache;
81 static struct objcache	*dsched_tdio_cache;
82 
83 TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
84 		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);
85 
86 struct lock	dsched_tdctx_lock;
87 
88 static struct dsched_policy_head dsched_policy_list =
89 		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
90 
91 static struct dsched_policy dsched_noop_policy = {
92 	.name = "noop",
93 
94 	.prepare = noop_prepare,
95 	.teardown = noop_teardown,
96 	.cancel_all = noop_cancel,
97 	.bio_queue = noop_queue
98 };
99 
100 static struct dsched_policy *default_policy = &dsched_noop_policy;
101 
102 /*
103  * dsched_debug() is a debug output function whose verbosity is controlled
104  * by the dsched.debug sysctl and tunable; it prints via kvprintf().
105  */
106 int
107 dsched_debug(int level, char *fmt, ...)
108 {
109 	__va_list ap;
110 
111 	__va_start(ap, fmt);
112 	if (level <= dsched_debug_enable)
113 		kvprintf(fmt, ap);
114 	__va_end(ap);
115 
116 	return 0;
117 }
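
/*
 * Illustrative use (a sketch, not taken from a real policy): a call such as
 *
 *	dsched_debug(LOG_INFO, "queued bio %p\n", bio);
 *
 * produces output only while the dsched.debug sysctl/tunable is >= LOG_INFO.
 */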
118 
119 /*
120  * Called on disk_create().
121  * Tries to read which policy to use from loader.conf; if none is
122  * specified, the default policy is used.
123  */
124 void
125 dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
126 {
127 	char tunable_key[SPECNAMELEN + 48];
128 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
129 	char *ptr;
130 	struct dsched_policy *policy = NULL;
131 
132 	/* Also look for serno stuff? */
133 	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
134 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
135 
136 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
137 	    head_name, unit);
138 	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
139 	    sizeof(sched_policy)) != 0) {
140 		policy = dsched_find_policy(sched_policy);
141 	}
142 
143 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
144 	    head_name);
145 	for (ptr = tunable_key; *ptr; ptr++) {
146 		if (*ptr == '/')
147 			*ptr = '-';
148 	}
149 	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
150 	    sizeof(sched_policy)) != 0)) {
151 		policy = dsched_find_policy(sched_policy);
152 	}
153 
154 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
155 	if (!policy && !default_set && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
156 	    sizeof(sched_policy)) != 0)) {
157 		policy = dsched_find_policy(sched_policy);
158 	}
159 
160 	if (!policy) {
161 		if (!default_set) {
162 			dsched_debug(0, "No policy for %s%d specified, "
163 			    "or policy not found\n", head_name, unit);
164 		}
165 		dsched_set_policy(dp, default_policy);
166 	} else {
167 		dsched_set_policy(dp, policy);
168 	}
169 
170 	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
171 		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
172 	else
173 		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
174 	for (ptr = tunable_key; *ptr; ptr++) {
175 		if (*ptr == '/')
176 			*ptr = '-';
177 	}
178 	dsched_sysctl_add_disk(
179 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
180 	    tunable_key);
181 
182 	lockmgr(&dsched_lock, LK_RELEASE);
183 }
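
/*
 * Example loader.conf entries matched by the lookups above (disk names are
 * illustrative; "noop" is the policy registered in this file, and any '/'
 * in a name, e.g. mapper/*, is written as '-'):
 *
 *	dsched.policy.da0="noop"	# a specific unit
 *	dsched.policy.da="noop"		# every unit of a driver
 *	dsched.policy.default="noop"	# global default
 */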
184 
185 /*
186  * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
187  * there's any policy associated with the serial number of the device.
188  */
189 void
190 dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
191 {
192 	char tunable_key[SPECNAMELEN + 48];
193 	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
194 	struct dsched_policy *policy = NULL;
195 
196 	if (info->d_serialno == NULL)
197 		return;
198 
199 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
200 
201 	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
202 	    info->d_serialno);
203 
204 	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
205 	    sizeof(sched_policy)) != 0)) {
206 		policy = dsched_find_policy(sched_policy);
207 	}
208 
209 	if (policy) {
210 		dsched_switch(dp, policy);
211 	}
212 
213 	dsched_sysctl_add_disk(
214 	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
215 	    info->d_serialno);
216 
217 	lockmgr(&dsched_lock, LK_RELEASE);
218 }
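
/*
 * Example of the serial-number keyed tunable checked above (the serial
 * number shown is made up for illustration):
 *
 *	dsched.policy.WD-WCAV51234567="noop"
 */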
219 
220 /*
221  * Called on disk_destroy().
222  * Shuts down the scheduler core and cancels all remaining bios.
223  */
224 void
225 dsched_disk_destroy_callback(struct disk *dp)
226 {
227 	struct dsched_policy *old_policy;
228 	struct dsched_disk_ctx *diskctx;
229 
230 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
231 
232 	diskctx = dsched_get_disk_priv(dp);
233 
234 	old_policy = dp->d_sched_policy;
235 	dp->d_sched_policy = &dsched_noop_policy;
236 	old_policy->cancel_all(dsched_get_disk_priv(dp));
237 	old_policy->teardown(dsched_get_disk_priv(dp));
238 
239 	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
240 		sysctl_ctx_free(&diskctx->sysctl_ctx);
241 
242 	policy_destroy(dp);
243 	atomic_subtract_int(&old_policy->ref_count, 1);
244 	KKASSERT(old_policy->ref_count >= 0);
245 
246 	lockmgr(&dsched_lock, LK_RELEASE);
247 }
248 
249 
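/*
 * Entry point for all queued disk I/O: look up the issuing thread's
 * per-disk tdio and hand the bio to the active policy's bio_queue hook;
 * fall back to raw dispatch when no thread context exists or the policy
 * declines the bio.
 */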
250 void
251 dsched_queue(struct disk *dp, struct bio *bio)
252 {
253 	struct dsched_thread_ctx	*tdctx;
254 	struct dsched_thread_io		*tdio;
255 	struct dsched_disk_ctx		*diskctx;
256 
257 	int found = 0, error = 0;
258 
259 	tdctx = dsched_get_buf_priv(bio->bio_buf);
260 	if (tdctx == NULL) {
261 		/* No tdctx; bypass the policy and dispatch the bio directly */
262 		atomic_add_int(&dsched_stats.no_tdctx, 1);
263 		dsched_strategy_raw(dp, bio);
264 		return;
265 	}
266 
267 	DSCHED_THREAD_CTX_LOCK(tdctx);
268 
269 	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
270 	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
271 		if (tdio->dp == dp) {
272 			dsched_thread_io_ref(tdio);
273 			found = 1;
274 			break;
275 		}
276 	}
277 
278 	DSCHED_THREAD_CTX_UNLOCK(tdctx);
279 	dsched_clr_buf_priv(bio->bio_buf);
280 	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */
281 
282 	KKASSERT(found == 1);
283 	diskctx = dsched_get_disk_priv(dp);
284 	dsched_disk_ctx_ref(diskctx);
285 	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);
286 
287 	if (error) {
288 		dsched_strategy_raw(dp, bio);
289 	}
290 	dsched_disk_ctx_unref(diskctx);
291 	dsched_thread_io_unref(tdio);
292 }
293 
294 
295 /*
296  * Called from each policy's module_init or module_attach;
297  * registers the policy in the local policy list.
298  */
299 int
300 dsched_register(struct dsched_policy *d_policy)
301 {
302 	struct dsched_policy *policy;
303 	int error = 0;
304 
305 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
306 
307 	policy = dsched_find_policy(d_policy->name);
308 
309 	if (!policy) {
310 		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
311 		atomic_add_int(&d_policy->ref_count, 1);
312 	} else {
313 		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
314 		    d_policy->name);
315 		error = EEXIST;
316 	}
317 
318 	lockmgr(&dsched_lock, LK_RELEASE);
319 	return error;
320 }
321 
322 /*
323  * Called from each policy's module_detach;
324  * unregisters the policy.
325  */
326 int
327 dsched_unregister(struct dsched_policy *d_policy)
328 {
329 	struct dsched_policy *policy;
330 
331 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
332 	policy = dsched_find_policy(d_policy->name);
333 
334 	if (policy) {
335 		if (policy->ref_count > 1) {
336 			lockmgr(&dsched_lock, LK_RELEASE);
337 			return EBUSY;
338 		}
339 		TAILQ_REMOVE(&dsched_policy_list, policy, link);
340 		atomic_subtract_int(&policy->ref_count, 1);
341 		KKASSERT(policy->ref_count == 0);
342 	}
343 	lockmgr(&dsched_lock, LK_RELEASE);
344 	return 0;
345 }
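
/*
 * Sketch of how a policy module would use the two functions above from its
 * modevent handler ('my_policy' and the handler shape are assumptions for
 * illustration, not code from this file):
 *
 *	case MOD_LOAD:
 *		error = dsched_register(&my_policy);
 *		break;
 *	case MOD_UNLOAD:
 *		error = dsched_unregister(&my_policy);
 *		break;
 */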
346 
347 
348 /*
349  * Switches the policy by first tearing down the old one and then
350  * enabling the new one.
351  */
352 int
353 dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
354 {
355 	struct dsched_policy *old_policy;
356 
357 	/* If we are asked to set the same policy, do nothing */
358 	if (dp->d_sched_policy == new_policy)
359 		return 0;
360 
361 	/* lock everything down, diskwise */
362 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
363 	old_policy = dp->d_sched_policy;
364 
365 	atomic_subtract_int(&old_policy->ref_count, 1);
366 	KKASSERT(old_policy->ref_count >= 0);
367 
368 	dp->d_sched_policy = &dsched_noop_policy;
369 	old_policy->teardown(dsched_get_disk_priv(dp));
370 	policy_destroy(dp);
371 
372 	/* Bring everything back to life */
373 	dsched_set_policy(dp, new_policy);
374 	lockmgr(&dsched_lock, LK_RELEASE);
375 	return 0;
376 }
377 
378 
379 /*
380  * Loads a given policy and attaches it to the specified disk.
381  * Also initializes the core for the policy.
382  */
383 void
384 dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
385 {
386 	int locked = 0;
387 
388 	/* Acquire the dsched lock unless this thread already holds it */
389 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
390 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
391 		locked = 1;
392 	}
393 
394 	policy_new(dp, new_policy);
395 	new_policy->prepare(dsched_get_disk_priv(dp));
396 	dp->d_sched_policy = new_policy;
397 	atomic_add_int(&new_policy->ref_count, 1);
398 	kprintf("disk scheduler: set policy of %s to %s\n", dp->d_cdev->si_name,
399 	    new_policy->name);
400 
401 	/* If we acquired the lock, we also get rid of it */
402 	if (locked)
403 		lockmgr(&dsched_lock, LK_RELEASE);
404 }
405 
406 struct dsched_policy*
407 dsched_find_policy(char *search)
408 {
409 	struct dsched_policy *policy;
410 	struct dsched_policy *policy_found = NULL;
411 	int locked = 0;
412 
413 	/* Acquire the dsched lock unless this thread already holds it */
414 	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
415 		lockmgr(&dsched_lock, LK_EXCLUSIVE);
416 		locked = 1;
417 	}
418 
419 	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
420 		if (!strcmp(policy->name, search)) {
421 			policy_found = policy;
422 			break;
423 		}
424 	}
425 
426 	/* If we acquired the lock, we also get rid of it */
427 	if (locked)
428 		lockmgr(&dsched_lock, LK_RELEASE);
429 
430 	return policy_found;
431 }
432 
433 struct disk*
434 dsched_find_disk(char *search)
435 {
436 	struct disk *dp_found = NULL;
437 	struct disk *dp = NULL;
438 
439 	while ((dp = disk_enumerate(dp))) {
440 		if (!strcmp(dp->d_cdev->si_name, search)) {
441 			dp_found = dp;
442 			break;
443 		}
444 	}
445 
446 	return dp_found;
447 }
448 
449 struct disk*
450 dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
451 {
452 	while ((dp = disk_enumerate(dp))) {
453 		if (dp->d_sched_policy == policy)
454 			return dp;
455 	}
456 
457 	return NULL;
458 }
459 
460 struct dsched_policy *
461 dsched_policy_enumerate(struct dsched_policy *pol)
462 {
463 	if (!pol)
464 		return (TAILQ_FIRST(&dsched_policy_list));
465 	else
466 		return (TAILQ_NEXT(pol, link));
467 }
468 
469 void
470 dsched_cancel_bio(struct bio *bp)
471 {
472 	bp->bio_buf->b_error = ENXIO;
473 	bp->bio_buf->b_flags |= B_ERROR;
474 	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;
475 
476 	biodone(bp);
477 }
478 
479 void
480 dsched_strategy_raw(struct disk *dp, struct bio *bp)
481 {
482 	/*
483 	 * Ideally, these checks shouldn't be needed... but we leave them
484 	 * in to avoid panics, just in case.
485 	 */
486 	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
487 	if (bp->bio_track != NULL) {
488 		dsched_debug(LOG_INFO,
489 		    "dsched_strategy_raw sees non-NULL bio_track!! "
490 		    "bio: %p\n", bp);
491 		bp->bio_track = NULL;
492 	}
493 	dev_dstrategy(dp->d_rawdev, bp);
494 }
495 
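/*
 * Synchronous dispatch: clone the buf into a pbuf, issue it with BIO_SYNC
 * via the raw device and biowait() for completion, then copy resid/error
 * back and biodone() the original bio.  The caller blocks until the I/O
 * completes.
 */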
496 void
497 dsched_strategy_sync(struct disk *dp, struct bio *bio)
498 {
499 	struct buf *bp, *nbp;
500 	struct bio *nbio;
501 
502 	bp = bio->bio_buf;
503 
504 	nbp = getpbuf(NULL);
505 	nbio = &nbp->b_bio1;
506 
507 	nbp->b_cmd = bp->b_cmd;
508 	nbp->b_bufsize = bp->b_bufsize;
509 	nbp->b_runningbufspace = bp->b_runningbufspace;
510 	nbp->b_bcount = bp->b_bcount;
511 	nbp->b_resid = bp->b_resid;
512 	nbp->b_data = bp->b_data;
513 #if 0
514 	/*
515 	 * Buffers undergoing device I/O do not need a kvabase/size.
516 	 */
517 	nbp->b_kvabase = bp->b_kvabase;
518 	nbp->b_kvasize = bp->b_kvasize;
519 #endif
520 	nbp->b_dirtyend = bp->b_dirtyend;
521 
522 	nbio->bio_done = biodone_sync;
523 	nbio->bio_flags |= BIO_SYNC;
524 	nbio->bio_track = NULL;
525 
526 	nbio->bio_caller_info1.ptr = dp;
527 	nbio->bio_offset = bio->bio_offset;
528 
529 	dev_dstrategy(dp->d_rawdev, nbio);
530 	biowait(nbio, "dschedsync");
531 	bp->b_resid = nbp->b_resid;
532 	bp->b_error = nbp->b_error;
533 	biodone(bio);
534 #if 0
535 	nbp->b_kvabase = NULL;
536 	nbp->b_kvasize = 0;
537 #endif
538 	relpbuf(nbp, NULL);
539 }
540 
541 void
542 dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
543 {
544 	struct bio *nbio;
545 
546 	nbio = push_bio(bio);
547 	nbio->bio_done = done;
548 	nbio->bio_offset = bio->bio_offset;
549 
550 	dsched_set_bio_dp(nbio, dp);
551 	dsched_set_bio_priv(nbio, priv);
552 
553 	getmicrotime(&nbio->bio_caller_info3.tv);
554 	dev_dstrategy(dp->d_rawdev, nbio);
555 }
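
/*
 * Sketch of policy-side use (callback name hypothetical): dispatch a
 * dequeued bio and pass per-request state through the private pointer:
 *
 *	dsched_strategy_async(dp, bio, my_policy_completed, tdio);
 *
 * The completion hook receives the pushed bio and can retrieve the data
 * stored here via dsched_set_bio_priv().
 */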
556 
557 void
558 dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
559 {
560 	int refcount;
561 
562 	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
563 
564 	KKASSERT(refcount >= 0);
565 }
566 
567 void
568 dsched_thread_io_ref(struct dsched_thread_io *tdio)
569 {
570 	int refcount;
571 
572 	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
573 
574 	KKASSERT(refcount >= 0);
575 }
576 
577 void
578 dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
579 {
580 	int refcount;
581 
582 	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
583 
584 	KKASSERT(refcount >= 0);
585 }
586 
587 void
588 dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
589 {
590 	struct dsched_thread_io	*tdio, *tdio2;
591 	int refcount;
592 
593 	refcount = atomic_fetchadd_int(&diskctx->refcount, -1);
594 
596 	KKASSERT(refcount >= 0 || refcount <= -0x400);
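
	/*
	 * fetchadd returns the pre-decrement value, so 1 means this call
	 * dropped the last reference.  The 0x400 bias applied below keeps
	 * the count strongly negative during destruction, so a stray
	 * ref/unref on a dying object trips the assertion above.
	 */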
597 
598 	if (refcount == 1) {
599 		atomic_subtract_int(&diskctx->refcount, 0x400); /* mark as: in destruction */
600 #if 0
601 		kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
602 		print_backtrace(4);
603 #endif
604 		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
605 		TAILQ_FOREACH_MUTABLE(tdio, &diskctx->tdio_list, dlink, tdio2) {
606 			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
607 			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
608 			dsched_thread_io_unref(tdio);
609 		}
610 		lockmgr(&diskctx->lock, LK_RELEASE);
611 		if (diskctx->dp->d_sched_policy->destroy_diskctx)
612 			diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
613 		objcache_put(dsched_diskctx_cache, diskctx);
614 		atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
615 	}
616 }
617 
618 void
619 dsched_thread_io_unref(struct dsched_thread_io *tdio)
620 {
621 	struct dsched_thread_ctx	*tdctx;
622 	struct dsched_disk_ctx	*diskctx;
623 	int refcount;
624 
625 	refcount = atomic_fetchadd_int(&tdio->refcount, -1);
626 
627 	KKASSERT(refcount >= 0 || refcount <= -0x400);
628 
629 	if (refcount == 1) {
630 		atomic_subtract_int(&tdio->refcount, 0x400); /* mark as: in destruction */
631 #if 0
632 		kprintf("tdio (%p) destruction started, trace:\n", tdio);
633 		print_backtrace(8);
634 #endif
635 		diskctx = tdio->diskctx;
636 		KKASSERT(diskctx != NULL);
637 		KKASSERT(tdio->qlength == 0);
638 
639 		if (tdio->flags & DSCHED_LINKED_DISK_CTX) {
640 			lockmgr(&diskctx->lock, LK_EXCLUSIVE);
641 
642 			TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
643 			tdio->flags &= ~DSCHED_LINKED_DISK_CTX;
644 
645 			lockmgr(&diskctx->lock, LK_RELEASE);
646 		}
647 
648 		if (tdio->flags & DSCHED_LINKED_THREAD_CTX) {
649 			tdctx = tdio->tdctx;
650 			KKASSERT(tdctx != NULL);
651 
652 			lockmgr(&tdctx->lock, LK_EXCLUSIVE);
653 
654 			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
655 			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
656 
657 			lockmgr(&tdctx->lock, LK_RELEASE);
658 		}
659 		if (tdio->diskctx->dp->d_sched_policy->destroy_tdio)
660 			tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);
661 		objcache_put(dsched_tdio_cache, tdio);
662 		atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
663 #if 0
664 		dsched_disk_ctx_unref(diskctx);
665 #endif
666 	}
667 }
668 
669 void
670 dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
671 {
672 	struct dsched_thread_io	*tdio, *tdio2;
673 	int refcount;
674 
675 	refcount = atomic_fetchadd_int(&tdctx->refcount, -1);
676 
677 	KKASSERT(refcount >= 0 || refcount <= -0x400);
678 
679 	if (refcount == 1) {
680 		atomic_subtract_int(&tdctx->refcount, 0x400); /* mark as: in destruction */
681 #if 0
682 		kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
683 		print_backtrace(8);
684 #endif
685 		DSCHED_GLOBAL_THREAD_CTX_LOCK();
686 
687 		TAILQ_FOREACH_MUTABLE(tdio, &tdctx->tdio_list, link, tdio2) {
688 			TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
689 			tdio->flags &= ~DSCHED_LINKED_THREAD_CTX;
690 			dsched_thread_io_unref(tdio);
691 		}
692 		TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);
693 
694 		DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
695 
696 		objcache_put(dsched_tdctx_cache, tdctx);
697 		atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
698 	}
699 }
700 
701 
702 struct dsched_thread_io *
703 dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
704     struct dsched_policy *pol)
705 {
706 	struct dsched_thread_io	*tdio;
707 #if 0
708 	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
709 #endif
710 	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
711 	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);
712 
713 	/* XXX: maybe we do need another ref for the disk list for tdio */
714 	dsched_thread_io_ref(tdio);
715 
716 	DSCHED_THREAD_IO_LOCKINIT(tdio);
717 	tdio->dp = dp;
718 
719 	tdio->diskctx = dsched_get_disk_priv(dp);
720 	TAILQ_INIT(&tdio->queue);
721 
722 	if (pol->new_tdio)
723 		pol->new_tdio(tdio);
724 
725 	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
726 	tdio->flags |= DSCHED_LINKED_DISK_CTX;
727 
728 	if (tdctx) {
729 		tdio->tdctx = tdctx;
730 		tdio->p = tdctx->p;
731 
732 		/* Put the tdio in the tdctx list */
733 		DSCHED_THREAD_CTX_LOCK(tdctx);
734 		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
735 		DSCHED_THREAD_CTX_UNLOCK(tdctx);
736 		tdio->flags |= DSCHED_LINKED_THREAD_CTX;
737 	}
738 
739 	atomic_add_int(&dsched_stats.tdio_allocations, 1);
740 	return tdio;
741 }
742 
743 
744 struct dsched_disk_ctx *
745 dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
746 {
747 	struct dsched_disk_ctx *diskctx;
748 
749 	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
750 	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
751 	dsched_disk_ctx_ref(diskctx);
752 	diskctx->dp = dp;
753 	DSCHED_DISK_CTX_LOCKINIT(diskctx);
754 	TAILQ_INIT(&diskctx->tdio_list);
755 
756 	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
757 	if (pol->new_diskctx)
758 		pol->new_diskctx(diskctx);
759 	return diskctx;
760 }
761 
762 
763 struct dsched_thread_ctx *
764 dsched_thread_ctx_alloc(struct proc *p)
765 {
766 	struct dsched_thread_ctx	*tdctx;
767 	struct dsched_thread_io	*tdio;
768 	struct disk	*dp = NULL;
769 
770 	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
771 	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
772 	dsched_thread_ctx_ref(tdctx);
773 #if 0
774 	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
775 #endif
776 	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
777 	TAILQ_INIT(&tdctx->tdio_list);
778 	tdctx->p = p;
779 
780 	/* XXX */
781 	while ((dp = disk_enumerate(dp))) {
782 		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
783 	}
784 
785 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
786 	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
787 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
788 
789 	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
790 	/* XXX: no callback here */
791 	return tdctx;
792 }
793 
794 void
795 policy_new(struct disk *dp, struct dsched_policy *pol)
{
796 	struct dsched_thread_ctx *tdctx;
797 	struct dsched_disk_ctx *diskctx;
798 	struct dsched_thread_io *tdio;
799 
800 	diskctx = dsched_disk_ctx_alloc(dp, pol);
801 	dsched_disk_ctx_ref(diskctx);
802 	dsched_set_disk_priv(dp, diskctx);
803 
804 	DSCHED_GLOBAL_THREAD_CTX_LOCK();
805 	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
806 		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
807 	}
808 	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
811 
812 void
813 policy_destroy(struct disk *dp)
{
814 	struct dsched_disk_ctx *diskctx;
815 
816 	diskctx = dsched_get_disk_priv(dp);
817 	KKASSERT(diskctx != NULL);
818 
819 	dsched_disk_ctx_unref(diskctx); /* from prepare */
820 	dsched_disk_ctx_unref(diskctx); /* from alloc */
821 
822 	dsched_set_disk_priv(dp, NULL);
823 }
824 
825 void
826 dsched_new_buf(struct buf *bp)
827 {
828 	struct dsched_thread_ctx	*tdctx = NULL;
829 
830 	if (dsched_inited == 0)
831 		return;
832 
833 	if (curproc != NULL) {
834 		tdctx = dsched_get_proc_priv(curproc);
835 	} else {
836 		/* This is a kernel thread, so no proc info is available */
837 		tdctx = dsched_get_thread_priv(curthread);
838 	}
839 
840 #if 0
841 	/*
842 	 * XXX: hack. We don't want this assert because we aren't catching
843 	 *	all threads; mi_startup() still gets away without a tdctx.
844 	 */
845 
846 	/* By now we should have a tdctx; if not, something bad is going on */
847 	KKASSERT(tdctx != NULL);
848 #endif
849 
850 	if (tdctx) {
851 		dsched_thread_ctx_ref(tdctx);
852 	}
853 	dsched_set_buf_priv(bp, tdctx);
854 }
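
/*
 * The tdctx reference taken in dsched_new_buf() is dropped again either in
 * dsched_queue() once the buf reaches a disk, or in dsched_exit_buf() below
 * if it never does.
 */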
855 
856 void
857 dsched_exit_buf(struct buf *bp)
858 {
859 	struct dsched_thread_ctx	*tdctx;
860 
861 	tdctx = dsched_get_buf_priv(bp);
862 	if (tdctx != NULL) {
863 		dsched_clr_buf_priv(bp);
864 		dsched_thread_ctx_unref(tdctx);
865 	}
866 }
867 
868 void
869 dsched_new_proc(struct proc *p)
870 {
871 	struct dsched_thread_ctx	*tdctx;
872 
873 	if (dsched_inited == 0)
874 		return;
875 
876 	KKASSERT(p != NULL);
877 
878 	tdctx = dsched_thread_ctx_alloc(p);
879 	tdctx->p = p;
880 	dsched_thread_ctx_ref(tdctx);
881 
882 	dsched_set_proc_priv(p, tdctx);
883 	atomic_add_int(&dsched_stats.nprocs, 1);
884 }
885 
886 
887 void
888 dsched_new_thread(struct thread *td)
889 {
890 	struct dsched_thread_ctx	*tdctx;
891 
892 	if (dsched_inited == 0)
893 		return;
894 
895 	KKASSERT(td != NULL);
896 
897 	tdctx = dsched_thread_ctx_alloc(NULL);
898 	tdctx->td = td;
899 	dsched_thread_ctx_ref(tdctx);
900 
901 	dsched_set_thread_priv(td, tdctx);
902 	atomic_add_int(&dsched_stats.nthreads, 1);
903 }
904 
905 void
906 dsched_exit_proc(struct proc *p)
907 {
908 	struct dsched_thread_ctx	*tdctx;
909 
910 	if (dsched_inited == 0)
911 		return;
912 
913 	KKASSERT(p != NULL);
914 
915 	tdctx = dsched_get_proc_priv(p);
916 	KKASSERT(tdctx != NULL);
917 
918 	tdctx->dead = 0xDEAD;
919 	dsched_set_proc_priv(p, NULL);
920 
921 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
922 	dsched_thread_ctx_unref(tdctx); /* one for ref */
923 	atomic_subtract_int(&dsched_stats.nprocs, 1);
924 }
925 
926 
927 void
928 dsched_exit_thread(struct thread *td)
929 {
930 	struct dsched_thread_ctx	*tdctx;
931 
932 	if (dsched_inited == 0)
933 		return;
934 
935 	KKASSERT(td != NULL);
936 
937 	tdctx = dsched_get_thread_priv(td);
938 	KKASSERT(tdctx != NULL);
939 
940 	tdctx->dead = 0xDEAD;
941 	dsched_set_thread_priv(td, NULL);
942 
943 	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
944 	dsched_thread_ctx_unref(tdctx); /* one for ref */
945 	atomic_subtract_int(&dsched_stats.nthreads, 1);
946 }
947 
948 struct dsched_thread_io *
949 dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
950     struct dsched_policy *pol)
{
951 	struct dsched_thread_ctx *tdctx;
952 	struct dsched_thread_io *tdio;
953 
954 	tdctx = dsched_get_thread_priv(curthread);
955 	KKASSERT(tdctx != NULL);
956 
957 	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);
958 	return tdio;
959 }
960 
961 /* DEFAULT NOOP POLICY */
962 
963 static int
964 noop_prepare(struct dsched_disk_ctx *diskctx)
965 {
966 	return 0;
967 }
968 
969 static void
970 noop_teardown(struct dsched_disk_ctx *diskctx)
971 {
972 
973 }
974 
975 static void
976 noop_cancel(struct dsched_disk_ctx *diskctx)
977 {
978 
979 }
980 
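/*
 * A bio_queue hook that returns nonzero makes dsched_queue() fall back to
 * raw dispatch; noop dispatches directly itself and reports success.
 */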
981 static int
982 noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
983     struct bio *bio)
984 {
985 	dsched_strategy_raw(diskctx->dp, bio);
986 #if 0
987 	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
988 #endif
989 	return 0;
990 }
991 
992 /*
993  * SYSINIT stuff
994  */
995 static void
996 dsched_init(void)
997 {
998 	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
999 					   NULL, NULL, NULL,
1000 					   objcache_malloc_alloc,
1001 					   objcache_malloc_free,
1002 					   &dsched_thread_io_malloc_args);
1003 
1004 	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
1005 					   NULL, NULL, NULL,
1006 					   objcache_malloc_alloc,
1007 					   objcache_malloc_free,
1008 					   &dsched_thread_ctx_malloc_args);
1009 
1010 	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
1011 					   NULL, NULL, NULL,
1012 					   objcache_malloc_alloc,
1013 					   objcache_malloc_free,
1014 					   &dsched_disk_ctx_malloc_args);
1015 
1016 	bzero(&dsched_stats, sizeof(struct dsched_stats));
1017 
1018 	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
1019 	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();
1020 
1021 	dsched_register(&dsched_noop_policy);
1022 
1023 	dsched_inited = 1;
1024 }
1025 
1026 static void
1027 dsched_uninit(void)
1028 {
1029 }
1030 
1031 SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
1032 SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
1033 
1034 /*
1035  * SYSCTL stuff
1036  */
1037 static int
1038 sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
1039 {
1040 	return (sysctl_handle_opaque(oidp, &dsched_stats, sizeof(struct dsched_stats), req));
1041 }
1042 
1043 static int
1044 sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
1045 {
1046 	struct dsched_policy *pol = NULL;
1047 	int error, first = 1;
1048 
1049 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1050 
1051 	while ((pol = dsched_policy_enumerate(pol))) {
1052 		if (!first) {
1053 			error = SYSCTL_OUT(req, " ", 1);
1054 			if (error)
1055 				break;
1056 		} else {
1057 			first = 0;
1058 		}
1059 		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
1060 		if (error)
1061 			break;
1062 
1063 	}
1064 
1065 	lockmgr(&dsched_lock, LK_RELEASE);
1066 
1067 	error = SYSCTL_OUT(req, "", 1);
1068 
1069 	return error;
1070 }
1071 
1072 static int
1073 sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
1074 {
1075 	char buf[DSCHED_POLICY_NAME_LENGTH];
1076 	struct dsched_disk_ctx *diskctx = arg1;
1077 	struct dsched_policy *pol = NULL;
1078 	int error;
1079 
1080 	if (diskctx == NULL) {
1081 		return 0;
1082 	}
1083 
1084 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1085 
1086 	pol = diskctx->dp->d_sched_policy;
1087 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1088 
1089 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1090 	if (error || req->newptr == NULL) {
1091 		lockmgr(&dsched_lock, LK_RELEASE);
1092 		return (error);
1093 	}
1094 
1095 	pol = dsched_find_policy(buf);
1096 	if (pol == NULL) {
1097 		lockmgr(&dsched_lock, LK_RELEASE);
1098 		return 0;
1099 	}
1100 
1101 	dsched_switch(diskctx->dp, pol);
1102 
1103 	lockmgr(&dsched_lock, LK_RELEASE);
1104 
1105 	return error;
1106 }
1107 
1108 static int
1109 sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
1110 {
1111 	char buf[DSCHED_POLICY_NAME_LENGTH];
1112 	struct dsched_policy *pol = NULL;
1113 	int error;
1114 
1115 	lockmgr(&dsched_lock, LK_EXCLUSIVE);
1116 
1117 	pol = default_policy;
1118 	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);
1119 
1120 	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
1121 	if (error || req->newptr == NULL) {
1122 		lockmgr(&dsched_lock, LK_RELEASE);
1123 		return (error);
1124 	}
1125 
1126 	pol = dsched_find_policy(buf);
1127 	if (pol == NULL) {
1128 		lockmgr(&dsched_lock, LK_RELEASE);
1129 		return 0;
1130 	}
1131 
1132 	default_set = 1;
1133 	default_policy = pol;
1134 
1135 	lockmgr(&dsched_lock, LK_RELEASE);
1136 
1137 	return error;
1138 }
1139 
1140 SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
1141     "Disk Scheduler Framework (dsched) magic");
1142 SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
1143     "List of disks and their policies");
1144 SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
1145     0, "Enable dsched debugging");
1146 SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
1147     0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
1148     "dsched statistics");
1149 SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
1150     NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
1151 SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
1152     NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
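
/*
 * Example sysctl(8) usage of the knobs above (disk name illustrative):
 *
 *	sysctl dsched.policies			# list registered policies
 *	sysctl dsched.policy.default=noop	# change the default policy
 *	sysctl dsched.policy.da0=noop		# switch one disk's policy
 */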
1153 
1154 static void
1155 dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
1156 {
1157 	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
1158 		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
1159 		sysctl_ctx_init(&diskctx->sysctl_ctx);
1160 	}
1161 
1162 	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dsched_policy),
1163 	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
1164 	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
1165 }
1166