/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx,
    char *name);
static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);

static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;
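
/*
 * Note (editorial): the objcaches above hand out fixed, maximum-sized
 * objects (DSCHED_*_MAX_SZ); the allocation helpers below bzero() the
 * full size, which presumably lets a single cache back policies with
 * differently sized per-object private data.
 */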

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);

static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;

/*
 * dsched_debug() is a debug output function controlled by the
 * dsched.debug sysctl and tunable; messages at or below the current
 * debug level are printed via kvprintf().
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}

/*
 * Called on disk_create().  Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	char *ptr;
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	/* kprintf("dsched_disk_create_callback() for disk %s%d\n", head_name, unit); */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	/* First preference: the per-device tunable, head_name + unit */
	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s%d",
	    head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	/* Second preference: the per-driver tunable, head_name only */
	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    head_name);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	/* Last preference: the global default tunable */
	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set && bootverbose) {
			dsched_debug(0,
			    "No policy for %s%d specified, "
			    "or policy not found\n",
			    head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
	else
		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d",
		    head_name, unit);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}
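
/*
 * Example loader.conf(5) entries matching the lookup order above (the
 * device and policy names here are illustrative only):
 *
 *	dsched.policy.da0="fq"		# one specific disk (head_name+unit)
 *	dsched.policy.da="fq"		# all disks of this driver
 *	dsched.policy.default="noop"	# fallback for everything else
 *
 * Slashes in device names are rewritten to dashes for the driver-level
 * lookup and for the sysctl node name added below.
 */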

/*
 * Called from disk_setdiskinfo() (or rather _setdiskinfo()).  This checks
 * whether any policy is associated with the serial number of the device.
 */
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
	    info->d_serialno);

	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy) {
		dsched_switch(dp, policy);
	}

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}

/*
 * Called on disk_destroy().  Shuts down the scheduler core and cancels
 * all remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}


void
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;
	struct dsched_disk_ctx *diskctx;

	int found = 0, error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* No thread context; bypass the scheduler and dispatch raw */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	TAILQ_FOREACH(tdio, &tdctx->tdio_list, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx);	/* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);
	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error) {
		dsched_strategy_raw(dp, bio);
	}
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);
}
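
/*
 * Summary of the dispatch path above: dsched_new_buf() attaches the
 * current thread's tdctx to each buf; dsched_queue() then looks up the
 * tdio matching the target disk and hands the bio to the active policy's
 * bio_queue() hook, falling back to dsched_strategy_raw() if the policy
 * returns an error or no thread context exists.
 */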

/*
 * Called from each policy's module_init or module_attach; registers the
 * policy in the global policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR,
		    "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return error;
}

/*
 * Called from each policy's module_detach; unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return EBUSY;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
int
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return 0;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
	return 0;
}


/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if we already hold dsched_lock; if not, acquire it */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;
	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n",
	    dp->d_cdev->si_name, new_policy->name);

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}
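
/*
 * Both dsched_set_policy() above and dsched_find_policy() below may be
 * entered with dsched_lock already held (e.g. from
 * dsched_disk_create_callback()), so each checks lockstatus() first and
 * only acquires/releases the lock if the caller did not already own it.
 */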

struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if we already hold dsched_lock; if not, acquire it */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, release it again */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}

struct disk *
dsched_find_disk(char *search)
{
	struct disk *dp_found = NULL;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(dp))) {
		if (!strcmp(dp->d_cdev->si_name, search)) {
			dp_found = dp;
			break;
		}
	}

	return dp_found;
}

struct disk *
dsched_disk_enumerate(struct disk *dp, struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(dp))) {
		if (dp->d_sched_policy == policy)
			return dp;
	}

	return NULL;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}

void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}

void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, these checks shouldn't be needed, but we keep them in
	 * to avoid panics, just in case.
	 */
	KASSERT(dp->d_rawdev != NULL,
	    ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}

void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}

void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done,
    void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}

/*
 * Ref and deref various structures.  The 1->0 transition of the reference
 * count actually transitions 1->0x80000000 and causes the object to be
 * destroyed.  It is possible for transitory references to occur on the
 * object while it is being destroyed.  We use bit 31 to indicate that
 * destruction is in progress and to prevent nested destructions.
 */
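
/*
 * Worked example of the destruction protocol: dropping the last regular
 * reference takes the count from 1 to 0x80000000 (bit 31 set), so the
 * destructor runs exactly once.  A transitory ref/unref taken while
 * destruction is in progress goes 0x80000001 -> 0x80000000 and is handled
 * as a normal decrement, so the destructor is not entered a second time.
 * Each destructor asserts refcount == 0x80000000 before freeing.
 */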

void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	atomic_fetchadd_int(&diskctx->refcount, 1);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	atomic_fetchadd_int(&tdio->refcount, 1);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	atomic_fetchadd_int(&tdctx->refcount, 1);
}

void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for diskctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = diskctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
			dsched_disk_ctx_destroy(diskctx);
			break;
		}
	}
}

static void
dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io *tdio;

#if 0
	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
	print_backtrace(4);
#endif
	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
		dsched_thread_io_unref(tdio);
	}
	lockmgr(&diskctx->lock, LK_RELEASE);
	if (diskctx->dp->d_sched_policy->destroy_diskctx)
		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
	KKASSERT(diskctx->refcount == 0x80000000);
	objcache_put(dsched_diskctx_cache, diskctx);
	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
}
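
/*
 * dsched_thread_io_unref() and dsched_thread_ctx_unref() below follow the
 * same 1->0x80000000 destruction protocol as dsched_disk_ctx_unref()
 * above; only the destroyed object type differs.
 */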

void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdio and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}

static void
dsched_thread_io_destroy(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;

#if 0
	kprintf("tdio (%p) destruction started, trace:\n", tdio);
	print_backtrace(8);
#endif
	KKASSERT(tdio->qlength == 0);

	while ((diskctx = tdio->diskctx) != NULL) {
		dsched_disk_ctx_ref(diskctx);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		if (diskctx != tdio->diskctx) {
			/* Raced against a concurrent unlink; retry */
			lockmgr(&diskctx->lock, LK_RELEASE);
			dsched_disk_ctx_unref(diskctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		if (diskctx->dp->d_sched_policy->destroy_tdio)
			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_disk_ctx_unref(diskctx);
	}
	while ((tdctx = tdio->tdctx) != NULL) {
		dsched_thread_ctx_ref(tdctx);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
		if (tdctx != tdio->tdctx) {
			/* Raced against a concurrent unlink; retry */
			lockmgr(&tdctx->lock, LK_RELEASE);
			dsched_thread_ctx_unref(tdctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		lockmgr(&tdctx->lock, LK_RELEASE);
		dsched_thread_ctx_unref(tdctx);
	}
	KKASSERT(tdio->refcount == 0x80000000);
	objcache_put(dsched_tdio_cache, tdio);
	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
#if 0
	dsched_disk_ctx_unref(diskctx);
#endif
}

void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
			dsched_thread_ctx_destroy(tdctx);
			break;
		}
	}
}

static void
dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io *tdio;

#if 0
	kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
	print_backtrace(8);
#endif
	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		dsched_thread_io_unref(tdio);
	}
	KKASSERT(tdctx->refcount == 0x80000000);
	TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	objcache_put(dsched_tdctx_cache, tdctx);
	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
}

struct dsched_thread_io *
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_io *tdio;
#if 0
	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));
#endif
	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	/* XXX: maybe we do need another ref for the disk list for tdio */
	dsched_thread_io_ref(tdio);

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	if (pol->new_tdio)
		pol->new_tdio(tdio);

	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
	lockmgr(&tdio->diskctx->lock, LK_RELEASE);

	if (tdctx) {
		tdio->tdctx = tdctx;
		tdio->p = tdctx->p;

		/* Put the tdio in the tdctx list */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
	}

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
	return tdio;
}


struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}

struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;
	struct disk *dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);
	tdctx->p = p;

	/* Allocate a tdio for every known disk up front */
	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	while ((dp = disk_enumerate(dp))) {
		tdio = dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);
	}

	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}

void
policy_new(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	struct dsched_thread_io *tdio;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	/* Give every existing thread context a tdio for this disk */
	DSCHED_GLOBAL_THREAD_CTX_LOCK();
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link) {
		tdio = dsched_thread_io_alloc(dp, tdctx, pol);
	}
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}

void
policy_destroy(struct disk *dp)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx);	/* from prepare */
	dsched_disk_ctx_unref(diskctx);	/* from alloc */

	dsched_set_disk_priv(dp, NULL);
}

void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack.  We don't want this assert because we aren't catching
	 * all threads.  mi_startup() is still getting away without a tdctx.
	 */

	/* by now we should have a tdctx.  if not, something bad is going on */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx) {
		dsched_thread_ctx_ref(tdctx);
	}
	dsched_set_buf_priv(bp, tdctx);
}

void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}

void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	tdctx->p = p;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}


void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	tdctx->td = td;
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}

void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, NULL);

	dsched_thread_ctx_unref(tdctx);	/* one for alloc, */
	dsched_thread_ctx_unref(tdctx);	/* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}

void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx);	/* one for alloc, */
	dsched_thread_ctx_unref(tdctx);	/* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}

struct dsched_thread_io *
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);
	tdio = dsched_thread_io_alloc(diskctx->dp, tdctx, pol);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	return tdio;
}

/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{

}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{

}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
    struct bio *bio)
{
	/* Pass the bio straight through to the disk, no scheduling */
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}

/*
 * SYSINIT stuff
 */
static void
dsched_init(void)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
	    NULL, NULL, NULL,
	    objcache_malloc_alloc,
	    objcache_malloc_free,
	    &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void)
{
}

SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST,
    dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY,
    dsched_uninit, NULL);

/*
 * SYSCTL stuff
 */
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats,
	    sizeof(struct dsched_stats), req));
}

static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}
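
/*
 * Example usage of the sysctl handlers defined here and below (disk and
 * policy names are illustrative):
 *
 *	$ sysctl dsched.policies		# list registered policies
 *	$ sysctl dsched.policy.da0=fq		# switch one disk's policy
 *	$ sysctl dsched.policy.default=noop	# set the default policy
 */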

static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}

SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");

static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}