/*
 * Copyright (c) 2003-2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Each cpu in a system has its own self-contained light weight kernel
 * thread scheduler, which means that generally speaking we only need
 * to use a critical section to avoid problems.  Foreign thread
 * scheduling is queued via (async) IPIs.
 */
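
/*
 * Editor's aside (illustrative sketch, not part of the original file):
 * because each cpu owns its run queue, local scheduling needs only a
 * critical section, mirroring what lwkt_schedule_self() does below:
 *
 *	crit_enter();
 *	_lwkt_enqueue(td);	- td->td_gd == mycpu
 *	crit_exit();
 *
 * Scheduling a thread owned by another cpu instead goes through
 * lwkt_send_ipiq(), never by touching the foreign queue directly.
 */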

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/kinfo.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>
#include <sys/spinlock.h>
#include <sys/ktr.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/dsched.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <machine/stdarg.h>
#include <machine/smp.h>

#if !defined(KTR_CTXSW)
#define KTR_CTXSW	KTR_ALL
#endif
KTR_INFO_MASTER(ctxsw);
KTR_INFO(KTR_CTXSW, ctxsw, sw, 0, "#cpu[%d].td = %p",
	 sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, pre, 1, "#cpu[%d].td = %p",
	 sizeof(int) + sizeof(struct thread *));
KTR_INFO(KTR_CTXSW, ctxsw, newtd, 2, "#threads[%p].name = %s",
	 sizeof(struct thread *) + sizeof(char *));
KTR_INFO(KTR_CTXSW, ctxsw, deadtd, 3, "#threads[%p].name = <dead>",
	 sizeof(struct thread *));
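
/*
 * Editor's note (sketch): the trace points above are fired from the
 * context switch paths below, e.g.
 *
 *	KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
 *
 * and can be examined with ktrdump(8) when the kernel is built with
 * KTR support.
 */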

static MALLOC_DEFINE(M_THREAD, "thread", "lwkt threads");

#ifdef INVARIANTS
static int panic_on_cscount = 0;
#endif
static __int64_t switch_count = 0;
static __int64_t preempt_hit = 0;
static __int64_t preempt_miss = 0;
static __int64_t preempt_weird = 0;
static __int64_t token_contention_count __debugvar = 0;
static int lwkt_use_spin_port;
static struct objcache *thread_cache;

#ifdef SMP
static void lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame);
#endif
static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);

extern void cpu_heavy_restore(void);
extern void cpu_lwkt_restore(void);
extern void cpu_kthread_restore(void);
extern void cpu_idle_restore(void);

/*
 * We can make all thread ports use the spin backend instead of the thread
 * backend.  This should only be set to debug the spin backend.
 */
TUNABLE_INT("lwkt.use_spin_port", &lwkt_use_spin_port);

#ifdef INVARIANTS
SYSCTL_INT(_lwkt, OID_AUTO, panic_on_cscount, CTLFLAG_RW, &panic_on_cscount, 0,
    "Panic if attempting to switch lwkt's while mastering cpusync");
#endif
SYSCTL_QUAD(_lwkt, OID_AUTO, switch_count, CTLFLAG_RW, &switch_count, 0,
    "Number of switched threads");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_hit, CTLFLAG_RW, &preempt_hit, 0,
    "Successful preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_miss, CTLFLAG_RW, &preempt_miss, 0,
    "Failed preemption events");
SYSCTL_QUAD(_lwkt, OID_AUTO, preempt_weird, CTLFLAG_RW, &preempt_weird, 0,
    "Number of weird preemption events");
#ifdef INVARIANTS
SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
    &token_contention_count, 0, "Spinning due to token contention");
#endif
static int fairq_enable = 1;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
    &fairq_enable, 0, "Turn on fairq priority accumulators");
static int lwkt_spin_loops = 10;
SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
    &lwkt_spin_loops, 0, "Scheduler spin loops before halting the cpu");
static int lwkt_spin_delay = 1;
SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW,
    &lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto");
static int lwkt_spin_method = 1;
SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW,
    &lwkt_spin_method, 0, "LWKT scheduler behavior when contended");
static int lwkt_spin_fatal = 0;	/* disabled */
SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW,
    &lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic");
static int preempt_enable = 1;
SYSCTL_INT(_lwkt, OID_AUTO, preempt_enable, CTLFLAG_RW,
    &preempt_enable, 0, "Enable preemption");

static __cachealign int lwkt_cseq_rindex;
static __cachealign int lwkt_cseq_windex;

/*
 * These helper procedures handle the runq, they can only be called from
 * within a critical section.
 *
 * WARNING! Prior to SMP being brought up it is possible to enqueue and
 * dequeue threads belonging to other cpus, so be sure to use td->td_gd
 * instead of 'mycpu' when referencing the globaldata structure.  Once
 * SMP is live, enqueuing and dequeuing only occurs on the current cpu.
 */
static __inline
void
_lwkt_dequeue(thread_t td)
{
    if (td->td_flags & TDF_RUNQ) {
        struct globaldata *gd = td->td_gd;

        td->td_flags &= ~TDF_RUNQ;
        TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
        gd->gd_fairq_total_pri -= td->td_pri;
        if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
            atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
    }
}

/*
 * Priority enqueue.
 *
 * NOTE: There are a limited number of lwkt threads runnable since user
 *	 processes only schedule one at a time per cpu.
 */
static __inline
void
_lwkt_enqueue(thread_t td)
{
    thread_t xtd;

    if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) {
        struct globaldata *gd = td->td_gd;

        td->td_flags |= TDF_RUNQ;
        xtd = TAILQ_FIRST(&gd->gd_tdrunq);
        if (xtd == NULL) {
            TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
            atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
        } else {
            while (xtd && xtd->td_pri > td->td_pri)
                xtd = TAILQ_NEXT(xtd, td_threadq);
            if (xtd)
                TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
            else
                TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
        }
        gd->gd_fairq_total_pri += td->td_pri;
    }
}

static __boolean_t
_lwkt_thread_ctor(void *obj, void *privdata, int ocflags)
{
    struct thread *td = (struct thread *)obj;

    td->td_kstack = NULL;
    td->td_kstack_size = 0;
    td->td_flags = TDF_ALLOCATED_THREAD;
    return (1);
}

static void
_lwkt_thread_dtor(void *obj, void *privdata)
{
    struct thread *td = (struct thread *)obj;

    KASSERT(td->td_flags & TDF_ALLOCATED_THREAD,
        ("_lwkt_thread_dtor: not allocated from objcache"));
    KASSERT((td->td_flags & TDF_ALLOCATED_STACK) && td->td_kstack &&
        td->td_kstack_size > 0,
        ("_lwkt_thread_dtor: corrupted stack"));
    kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
}

/*
 * Initialize the lwkt subsystem.
 */
void
lwkt_init(void)
{
    /* An objcache has 2 magazines per CPU so divide cache size by 2. */
    thread_cache = objcache_create_mbacked(M_THREAD, sizeof(struct thread),
                        NULL, CACHE_NTHREADS/2,
                        _lwkt_thread_ctor, _lwkt_thread_dtor, NULL);
}

/*
 * Schedule a thread to run.  As the current thread we can always safely
 * schedule ourselves, and a shortcut procedure is provided for that
 * function.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_schedule_self(thread_t td)
{
    KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
    crit_enter_quick(td);
    KASSERT(td != &td->td_gd->gd_idlethread,
            ("lwkt_schedule_self(): scheduling gd_idlethread is illegal!"));
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
    crit_exit_quick(td);
}

/*
 * Deschedule a thread.
 *
 * (non-blocking, self contained on a per cpu basis)
 */
void
lwkt_deschedule_self(thread_t td)
{
    crit_enter_quick(td);
    _lwkt_dequeue(td);
    crit_exit_quick(td);
}

/*
 * LWKTs operate on a per-cpu basis
 *
 * WARNING!  Called from early boot, 'mycpu' may not work yet.
 */
void
lwkt_gdinit(struct globaldata *gd)
{
    TAILQ_INIT(&gd->gd_tdrunq);
    TAILQ_INIT(&gd->gd_tdallq);
}

/*
 * Create a new thread.  The thread must be associated with a process context
 * or LWKT start address before it can be scheduled.  If the target cpu is
 * -1 the thread will be created on the current cpu.
 *
 * If you intend to create a thread without a process context this function
 * does everything except load the startup and switcher function.
 */
thread_t
lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
{
    globaldata_t gd = mycpu;
    void *stack;

    /*
     * If static thread storage is not supplied allocate a thread.  Reuse
     * a cached free thread if possible.  gd_freetd is used to keep an
     * exiting thread intact through the exit.
     */
    if (td == NULL) {
        crit_enter_gd(gd);
        if ((td = gd->gd_freetd) != NULL) {
            KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|
                                      TDF_RUNQ)) == 0);
            gd->gd_freetd = NULL;
        } else {
            td = objcache_get(thread_cache, M_WAITOK);
            KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|
                                      TDF_RUNQ)) == 0);
        }
        crit_exit_gd(gd);
        KASSERT((td->td_flags &
                 (TDF_ALLOCATED_THREAD|TDF_RUNNING)) == TDF_ALLOCATED_THREAD,
                ("lwkt_alloc_thread: corrupted td flags 0x%X", td->td_flags));
        flags |= td->td_flags & (TDF_ALLOCATED_THREAD|TDF_ALLOCATED_STACK);
    }

    /*
     * Try to reuse cached stack.
     */
    if ((stack = td->td_kstack) != NULL && td->td_kstack_size != stksize) {
        if (flags & TDF_ALLOCATED_STACK) {
            kmem_free(&kernel_map, (vm_offset_t)stack, td->td_kstack_size);
            stack = NULL;
        }
    }
    if (stack == NULL) {
        stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
        flags |= TDF_ALLOCATED_STACK;
    }
    if (cpu < 0)
        lwkt_init_thread(td, stack, stksize, flags, gd);
    else
        lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
    return (td);
}

/*
 * Initialize a preexisting thread structure.  This function is used by
 * lwkt_alloc_thread() and also used to initialize the per-cpu idlethread.
 *
 * All threads start out in a critical section at a priority of
 * TDPRI_KERN_DAEMON.  Higher level code will modify the priority as
 * appropriate.  This function may send an IPI message when the
 * requested cpu is not the current cpu and consequently gd_tdallq may
 * not be initialized synchronously from the point of view of the originating
 * cpu.
 *
 * NOTE! We have to be careful with regard to creating threads for other cpus
 * if SMP has not yet been activated.
 */
#ifdef SMP

static void
lwkt_init_thread_remote(void *arg)
{
    thread_t td = arg;

    /*
     * Protected by critical section held by IPI dispatch
     */
    TAILQ_INSERT_TAIL(&td->td_gd->gd_tdallq, td, td_allq);
}

#endif

/*
 * lwkt core thread structural initialization.
 *
 * NOTE: All threads are initialized as mpsafe threads.
 */
void
lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
                 struct globaldata *gd)
{
    globaldata_t mygd = mycpu;

    bzero(td, sizeof(struct thread));
    td->td_kstack = stack;
    td->td_kstack_size = stksize;
    td->td_flags = flags;
    td->td_gd = gd;
    td->td_pri = TDPRI_KERN_DAEMON;
    td->td_critcount = 1;
    td->td_toks_stop = &td->td_toks_base;
    if (lwkt_use_spin_port)
        lwkt_initport_spin(&td->td_msgport);
    else
        lwkt_initport_thread(&td->td_msgport, td);
    pmap_init_thread(td);
#ifdef SMP
    /*
     * Normally initializing a thread for a remote cpu requires sending an
     * IPI.  However, the idlethread is setup before the other cpus are
     * activated so we have to treat it as a special case.  XXX manipulation
     * of gd_tdallq requires the BGL.
     */
    if (gd == mygd || td == &gd->gd_idlethread) {
        crit_enter_gd(mygd);
        TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
        crit_exit_gd(mygd);
    } else {
        lwkt_send_ipiq(gd, lwkt_init_thread_remote, td);
    }
#else
    crit_enter_gd(mygd);
    TAILQ_INSERT_TAIL(&gd->gd_tdallq, td, td_allq);
    crit_exit_gd(mygd);
#endif

    dsched_new_thread(td);
}
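
#if 0
/*
 * Editor's sketch (illustrative, compiled out): bringing up a kernel
 * thread on a specific cpu with the primitives above.  This mirrors
 * what lwkt_create() at the bottom of this file does; example_func is
 * a hypothetical entry point.
 */
static void
example_spawn(void (*example_func)(void *), void *arg, int cpu)
{
    thread_t td;

    td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, cpu, 0);
    cpu_set_thread_handler(td, lwkt_exit, example_func, arg);
    lwkt_set_comm(td, "example");
    lwkt_schedule(td);
}
#endif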

void
lwkt_set_comm(thread_t td, const char *ctl, ...)
{
    __va_list va;

    __va_start(va, ctl);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), ctl, va);
    __va_end(va);
    KTR_LOG(ctxsw_newtd, td, &td->td_comm[0]);
}

void
lwkt_hold(thread_t td)
{
    atomic_add_int(&td->td_refs, 1);
}

void
lwkt_rele(thread_t td)
{
    KKASSERT(td->td_refs > 0);
    atomic_add_int(&td->td_refs, -1);
}

void
lwkt_wait_free(thread_t td)
{
    while (td->td_refs)
        tsleep(td, 0, "tdreap", hz);
}

void
lwkt_free_thread(thread_t td)
{
    KKASSERT(td->td_refs == 0);
    KKASSERT((td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK|TDF_RUNQ)) == 0);
    if (td->td_flags & TDF_ALLOCATED_THREAD) {
        objcache_put(thread_cache, td);
    } else if (td->td_flags & TDF_ALLOCATED_STACK) {
        /* client-allocated struct with internally allocated stack */
        KASSERT(td->td_kstack && td->td_kstack_size > 0,
            ("lwkt_free_thread: corrupted stack"));
        kmem_free(&kernel_map, (vm_offset_t)td->td_kstack, td->td_kstack_size);
        td->td_kstack = NULL;
        td->td_kstack_size = 0;
    }
    KTR_LOG(ctxsw_deadtd, td);
}
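
/*
 * Editor's aside (usage sketch): lwkt_hold()/lwkt_rele() pin a thread
 * structure across a blocking reference, and a reaper drains the
 * references before final disposal:
 *
 *	lwkt_hold(td);		- pin td while we use it
 *	...
 *	lwkt_rele(td);		- drop our reference
 *
 *	lwkt_wait_free(td);	- reaper: sleep until td_refs reaches 0
 *	lwkt_free_thread(td);
 */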


/*
 * Switch to the next runnable lwkt.  If no LWKTs are runnable then
 * switch to the idlethread.  Switching must occur within a critical
 * section to avoid races with the scheduling queue.
 *
 * We always have full control over our cpu's run queue.  Other cpus
 * that wish to manipulate our queue must use the cpu_*msg() calls to
 * talk to our cpu, so a critical section is all that is needed and
 * the result is very, very fast thread switching.
 *
 * The LWKT scheduler uses a fixed priority model and round-robins at
 * each priority level.  User process scheduling is a totally
 * different beast and LWKT priorities should not be confused with
 * user process priorities.
 *
 * PREEMPTION NOTE: Preemption occurs via lwkt_preempt().  lwkt_switch()
 * is not called by the current thread in the preemption case, only when
 * the preempting thread blocks (in order to return to the original thread).
 *
 * SPECIAL NOTE ON SWITCH ATOMICITY: Certain operations such as thread
 * migration and tsleep deschedule the current lwkt thread and call
 * lwkt_switch().  In particular, the target cpu of the migration fully
 * expects the thread to become non-runnable and can deadlock against
 * cpusync operations if we run any IPIs prior to switching the thread out.
 *
 * WE MUST BE VERY CAREFUL NOT TO RUN SPLZ DIRECTLY OR INDIRECTLY IF
 * THE CURRENT THREAD HAS BEEN DESCHEDULED!
 */
void
lwkt_switch(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t ntd;
    thread_t xtd;
    int spinning = lwkt_spin_loops;	/* loops before HLTing */
    int reqflags;
    int cseq;
    int oseq;
    int fatal_count;

    /*
     * Switching from within a 'fast' (non thread switched) interrupt or IPI
     * is illegal.  However, we may have to do it anyway if we hit a fatal
     * kernel trap or we have panicked.
     *
     * If this case occurs save and restore the interrupt nesting level.
     */
    if (gd->gd_intr_nesting_level) {
        int savegdnest;
        int savegdtrap;

        if (gd->gd_trap_nesting_level == 0 && panic_cpu_gd != mycpu) {
            panic("lwkt_switch: Attempt to switch from a "
                  "fast interrupt, ipi, or hard code section, "
                  "td %p\n",
                  td);
        } else {
            savegdnest = gd->gd_intr_nesting_level;
            savegdtrap = gd->gd_trap_nesting_level;
            gd->gd_intr_nesting_level = 0;
            gd->gd_trap_nesting_level = 0;
            if ((td->td_flags & TDF_PANICWARN) == 0) {
                td->td_flags |= TDF_PANICWARN;
                kprintf("Warning: thread switch from interrupt, IPI, "
                        "or hard code section.\n"
                        "thread %p (%s)\n", td, td->td_comm);
                print_backtrace(-1);
            }
            lwkt_switch();
            gd->gd_intr_nesting_level = savegdnest;
            gd->gd_trap_nesting_level = savegdtrap;
            return;
        }
    }

    /*
     * Passive release (used to transition from user to kernel mode
     * when we block or switch rather than when we enter the kernel).
     * This function is NOT called if we are switching into a preemption
     * or returning from a preemption.  Typically this causes us to lose
     * our current process designation (if we have one) and become a true
     * LWKT thread, and may also hand the current process designation to
     * another process and schedule its thread.
     */
    if (td->td_release)
        td->td_release(td);

    crit_enter_gd(gd);
    if (TD_TOKS_HELD(td))
        lwkt_relalltokens(td);

    /*
     * We had better not be holding any spin locks, but don't get into an
     * endless panic loop.
     */
    KASSERT(gd->gd_spinlocks_wr == 0 || panicstr != NULL,
            ("lwkt_switch: still holding %d exclusive spinlocks!",
             gd->gd_spinlocks_wr));


#ifdef SMP
#ifdef INVARIANTS
    if (td->td_cscount) {
        kprintf("Diagnostic: attempt to switch while mastering cpusync: %p\n",
                td);
        if (panic_on_cscount)
            panic("switching while mastering cpusync");
    }
#endif
#endif

    /*
     * If we had preempted another thread on this cpu, resume the preempted
     * thread.  This occurs transparently, whether the preempted thread
     * was scheduled or not (it may have been preempted after descheduling
     * itself).
     *
     * We have to setup the MP lock for the original thread after backing
     * out the adjustment that was made to curthread when the original
     * was preempted.
     */
    if ((ntd = td->td_preempted) != NULL) {
        KKASSERT(ntd->td_flags & TDF_PREEMPT_LOCK);
        ntd->td_flags |= TDF_PREEMPT_DONE;

        /*
         * The interrupt may have woken a thread up, we need to properly
         * set the reschedule flag if the originally interrupted thread is
         * at a lower priority.
         */
        if (TAILQ_FIRST(&gd->gd_tdrunq) &&
            TAILQ_FIRST(&gd->gd_tdrunq)->td_pri > ntd->td_pri) {
            need_lwkt_resched();
        }
        /* YYY release mp lock on switchback if original doesn't need it */
        goto havethread_preempted;
    }

    /*
     * Implement round-robin fairq with priority insertion.  The priority
     * insertion is handled by _lwkt_enqueue()
     *
     * If we cannot obtain ownership of the tokens we cannot immediately
     * schedule the target thread.
     *
     * Reminder: Again, we cannot afford to run any IPIs in this path if
     * the current thread has been descheduled.
     */
    for (;;) {
        /*
         * Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request)
         * and set RQF_WAKEUP (prevent unnecessary IPIs from being
         * received).
         */
        for (;;) {
            reqflags = gd->gd_reqflags;
            if (atomic_cmpset_int(&gd->gd_reqflags, reqflags,
                                  (reqflags & ~RQF_AST_LWKT_RESCHED) |
                                  RQF_WAKEUP)) {
                break;
            }
        }
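
        /*
         * Editor's aside (sketch): the update above is the standard
         * compare-and-set retry idiom for an atomic read-modify-write
         * of selected flag bits, equivalent to:
         *
         *	do {
         *		reqflags = gd->gd_reqflags;
         *	} while (!atomic_cmpset_int(&gd->gd_reqflags, reqflags,
         *		 (reqflags & ~RQF_AST_LWKT_RESCHED) | RQF_WAKEUP));
         */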

        /*
         * Hotpath - pull the head of the run queue and attempt to schedule
         * it.  Fairq exhaustion moves the task to the end of the list.  If
         * no threads are runnable we switch to the idle thread.
         */
        for (;;) {
            ntd = TAILQ_FIRST(&gd->gd_tdrunq);

            if (ntd == NULL) {
                /*
                 * Runq is empty, switch to idle and clear RQF_WAKEUP
                 * to allow it to halt.
                 */
                ntd = &gd->gd_idlethread;
#ifdef SMP
                if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
                    ASSERT_NO_TOKENS_HELD(ntd);
#endif
                cpu_time.cp_msg[0] = 0;
                cpu_time.cp_stallpc = 0;
                atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
                goto haveidle;
            }

            if (ntd->td_fairq_accum >= 0)
                break;

            /*splz_check(); cannot do this here, see above */
            lwkt_fairq_accumulate(gd, ntd);
            TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
            TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);
        }

        /*
         * Hotpath - schedule ntd.  Leaves RQF_WAKEUP set to prevent
         * unwanted decontention IPIs.
         *
         * NOTE: For UP there is no mplock and lwkt_getalltokens()
         *	 always succeeds.
         */
        if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
            goto havethread;

        /*
         * Coldpath (SMP only since tokens always succeed on UP)
         *
         * We had some contention on the thread we wanted to schedule.
         * What we do now is try to find a thread that we can schedule
         * in its stead until decontention reschedules on our cpu.
         *
         * The coldpath scan does NOT rearrange threads in the run list
         * and it also ignores the accumulator.
         *
         * We do not immediately schedule a user priority thread, instead
         * we record it in xtd and continue looking for kernel threads.
         * A cpu can only have one user priority thread (normally) so just
         * record the first one.
         *
         * NOTE: This scan will also include threads whose fairq's were
         *	 accumulated in the first loop.
         */
        ++token_contention_count;
        xtd = NULL;
        while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
            /*
             * Try to switch to this thread.  If the thread is running at
             * user priority we clear WAKEUP to allow decontention IPIs
             * (since this thread is simply running until the one we wanted
             * decontends), and we make sure that LWKT_RESCHED is not set.
             *
             * Otherwise for kernel threads we leave WAKEUP set to avoid
             * unnecessary decontention IPIs.
             */
            if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
                if (xtd == NULL)
                    xtd = ntd;
                continue;
            }

            /*
             * Do not let the fairq get too negative.  Even though we are
             * ignoring it atm once the scheduler decontends a very negative
             * thread will get moved to the end of the queue.
             */
            if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) {
                if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
                    ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
                goto havethread;
            }

            /*
             * Well fubar, this thread is contended as well, loop
             */
            /* */
        }

        /*
         * We exhausted the run list but we may have recorded a user
         * thread to try.  We have three choices based on
         * lwkt.spin_method.
         *
         * (0) Atomically clear RQF_WAKEUP in order to receive decontention
         *     IPIs (to interrupt the user process) and test
         *     RQF_AST_LWKT_RESCHED at the same time.
         *
         *     This results in significant decontention IPI traffic but may
         *     be more responsive.
         *
         * (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI.
         *     An automatic LWKT reschedule will occur on the next hardclock
         *     (typically 100hz).
         *
         *     This results in no decontention IPI traffic but may be less
         *     responsive.  This is the default.
         *
         * (2) Refuse to schedule the user process at this time.
         *
         *     This is highly experimental and should not be used under
         *     normal circumstances.  This can cause a user process to
         *     get starved out in situations where kernel threads are
         *     fighting each other for tokens.
         */
        if (xtd) {
            ntd = xtd;

            switch(lwkt_spin_method) {
            case 0:
                for (;;) {
                    reqflags = gd->gd_reqflags;
                    if (atomic_cmpset_int(&gd->gd_reqflags,
                                          reqflags,
                                          reqflags & ~RQF_WAKEUP)) {
                        break;
                    }
                }
                break;
            case 1:
                reqflags = gd->gd_reqflags;
                break;
            default:
                goto skip;
                break;
            }
            if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 &&
                (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
            ) {
                if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
                    ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
                goto havethread;
            }

skip:
            /*
             * Make sure RQF_WAKEUP is set if we failed to schedule the
             * user thread to prevent the idle thread from halting.
             */
            atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP);
        }

        /*
         * We exhausted the run list, meaning that all runnable threads
         * are contended.
         */
        cpu_pause();
        ntd = &gd->gd_idlethread;
#ifdef SMP
        if (gd->gd_trap_nesting_level == 0 && panicstr == NULL)
            ASSERT_NO_TOKENS_HELD(ntd);
        /* contention case, do not clear contention mask */
#endif

        /*
         * Ok, we might want to spin a few times as some tokens are held for
         * very short periods of time and IPI overhead is 1uS or worse
         * (meaning it is usually better to spin).  Regardless we have to
         * call splz_check() to be sure to service any interrupts blocked
         * by our critical section, otherwise we could livelock e.g. IPIs.
         *
         * The IPI mechanic is really a last resort.  In nearly all other
         * cases RQF_WAKEUP is left set to prevent decontention IPIs.
         *
         * When we decide not to spin we clear RQF_WAKEUP and switch to
         * the idle thread.  Clearing RQF_WAKEUP allows the idle thread
         * to halt and decontended tokens will issue an IPI to us.  The
         * idle thread will check for pending reschedules already set
         * (RQF_AST_LWKT_RESCHED) before actually halting so we don't have
         * to here.
         *
         * Also, if TDF_RUNQ is not set the current thread is trying to
         * deschedule, possibly in an atomic fashion.  We cannot afford to
         * stay here.
         */
        if (spinning <= 0 || (td->td_flags & TDF_RUNQ) == 0) {
            atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
            goto haveidle;
        }
        --spinning;

        /*
         * When spinning a delay is required both to avoid livelocks from
         * token order reversals (a thread may be trying to acquire multiple
         * tokens), and also to reduce cpu cache management traffic.
         *
         * In order to scale to a large number of CPUs we use a time slot
         * resequencer to force contending cpus into non-contending
         * time-slots.  The scheduler may still contend with the lock holder
         * but will not (generally) contend with all the other cpus
         * trying to get the same token.
         *
         * The resequencer uses a FIFO counter mechanic.  The owner of the
         * rindex at the head of the FIFO is allowed to pull itself off
         * the FIFO and fetchadd is used to enter into the FIFO.  This bit
         * of code is VERY cache friendly and forces all spinning schedulers
         * into their own time slots.
         *
         * This code has been tested to 48 cpus and caps the cache
         * contention load at ~1uS intervals regardless of the number of
         * cpus.  Scaling beyond 64 cpus might require additional smarts
         * (such as separate FIFOs for specific token cases).
         *
         * WARNING! We can't call splz_check() or anything else here as
         *	    it could cause a deadlock.
         */
#ifdef __amd64__
        if ((read_rflags() & PSL_I) == 0) {
            cpu_enable_intr();
            panic("lwkt_switch() called with interrupts disabled");
        }
#endif
        cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
        fatal_count = lwkt_spin_fatal;
        while ((oseq = lwkt_cseq_rindex) != cseq) {
            cpu_ccfence();
#if !defined(_KERNEL_VIRTUAL)
            if (cpu_mi_feature & CPU_MI_MONITOR) {
                cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq);
            } else
#endif
            {
                DELAY(1);
                cpu_lfence();
            }
            if (fatal_count && --fatal_count == 0)
                panic("lwkt_switch: fatal spin wait");
        }
        cseq = lwkt_spin_delay;	/* don't trust the system operator */
        cpu_ccfence();
        if (cseq < 1)
            cseq = 1;
        if (cseq > 1000)
            cseq = 1000;
        DELAY(cseq);
        atomic_add_int(&lwkt_cseq_rindex, 1);
        splz_check();	/* ok, we already checked that td is still scheduled */
        /* highest level for(;;) loop */
    }
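
    /*
     * Editor's aside (sketch): the time-slot resequencer above behaves
     * like a ticket lock used for pacing rather than mutual exclusion:
     *
     *	cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1); - take a ticket
     *	while (lwkt_cseq_rindex != cseq)		  - wait for our turn
     *		cpu_pause();
     *	DELAY(lwkt_spin_delay);				  - exclusive time slot
     *	atomic_add_int(&lwkt_cseq_rindex, 1);		  - pass to next cpu
     */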

havethread:
    /*
     * We must always decrement td_fairq_accum on non-idle threads just
     * in case a thread never gets a tick due to being in a continuous
     * critical section.  The page-zeroing code does this, for example.
     *
     * If the thread we came up with is a higher or equal priority versus
     * the thread at the head of the queue we move our thread to the
     * front.  This way we can always check the front of the queue.
     *
     * Clear gd_idle_repeat when doing a normal switch to a non-idle
     * thread.
     */
    ++gd->gd_cnt.v_swtch;
    --ntd->td_fairq_accum;
    ntd->td_wmesg = NULL;
    xtd = TAILQ_FIRST(&gd->gd_tdrunq);
    if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
        TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
        TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
    }
    gd->gd_idle_repeat = 0;

havethread_preempted:
    /*
     * If the new target does not need the MP lock and we are holding it,
     * release the MP lock.  If the new target requires the MP lock we have
     * already acquired it for the target.
     */
    ;
haveidle:
    KASSERT(ntd->td_critcount,
            ("priority problem in lwkt_switch %d %d",
             td->td_critcount, ntd->td_critcount));

    if (td != ntd) {
        ++switch_count;
        KTR_LOG(ctxsw_sw, gd->gd_cpuid, ntd);
        td->td_switch(ntd);
    }
    /* NOTE: current cpu may have changed after switch */
    crit_exit_quick(td);
}

/*
 * Request that the target thread preempt the current thread.  Preemption
 * only works under a specific set of conditions:
 *
 *	- We are not preempting ourselves
 *	- The target thread is owned by the current cpu
 *	- We are not currently being preempted
 *	- The target is not currently being preempted
 *	- We are not holding any spin locks
 *	- The target thread is not holding any tokens
 *	- We are able to satisfy the target's MP lock requirements (if any).
 *
 * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION.  Typically
 * this is called via lwkt_schedule() through the td_preemptable callback.
 * critcount is the managed critical priority that we should ignore in order
 * to determine whether preemption is possible (aka usually just the crit
 * priority of lwkt_schedule() itself).
 *
 * XXX at the moment we run the target thread in a critical section during
 * the preemption in order to prevent the target from taking interrupts
 * that *WE* can't.  Preemption is strictly limited to interrupt threads
 * and interrupt-like threads, outside of a critical section, and the
 * preempted source thread will be resumed the instant the target blocks
 * whether or not the source is scheduled (i.e. preemption is supposed to
 * be as transparent as possible).
 */
void
lwkt_preempt(thread_t ntd, int critcount)
{
    struct globaldata *gd = mycpu;
    thread_t td;
    int save_gd_intr_nesting_level;

    /*
     * The caller has put us in a critical section.  We can only preempt
     * if the caller of the caller was not in a critical section (basically
     * a local interrupt), as determined by the 'critcount' parameter.  We
     * also can't preempt if the caller is holding any spinlocks (even if
     * he isn't in a critical section).  This also handles the tokens test.
     *
     * YYY The target thread must be in a critical section (else it must
     * inherit our critical section?  I dunno yet).
     *
     * Set need_lwkt_resched() unconditionally for now YYY.
     */
    KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));

    if (preempt_enable == 0) {
        ++preempt_miss;
        return;
    }

    td = gd->gd_curthread;
    if (ntd->td_pri <= td->td_pri) {
        ++preempt_miss;
        return;
    }
    if (td->td_critcount > critcount) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
#ifdef SMP
    if (ntd->td_gd != gd) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
#endif
    /*
     * We don't have to check spinlocks here as they will also bump
     * td_critcount.
     *
     * Do not try to preempt if the target thread is holding any tokens.
     * We could try to acquire the tokens but this case is so rare there
     * is no need to support it.
     */
    KKASSERT(gd->gd_spinlocks_wr == 0);

    if (TD_TOKS_HELD(ntd)) {
        ++preempt_miss;
        need_lwkt_resched();
        return;
    }
    if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
        ++preempt_weird;
        need_lwkt_resched();
        return;
    }
    if (ntd->td_preempted) {
        ++preempt_hit;
        need_lwkt_resched();
        return;
    }

    /*
     * Since we are able to preempt the current thread, there is no need to
     * call need_lwkt_resched().
     *
     * We must temporarily clear gd_intr_nesting_level around the switch
     * since switchouts from the target thread are allowed (they will just
     * return to our thread), and since the target thread has its own stack.
     */
    ++preempt_hit;
    ntd->td_preempted = td;
    td->td_flags |= TDF_PREEMPT_LOCK;
    KTR_LOG(ctxsw_pre, gd->gd_cpuid, ntd);
    save_gd_intr_nesting_level = gd->gd_intr_nesting_level;
    gd->gd_intr_nesting_level = 0;
    td->td_switch(ntd);
    gd->gd_intr_nesting_level = save_gd_intr_nesting_level;

    KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
    ntd->td_preempted = NULL;
    td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
}
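
/*
 * Editor's aside (sketch of the preemption lifecycle above): suppose
 * interrupt thread I preempts regular thread T on the same cpu.
 *
 *	lwkt_preempt(I):   I->td_preempted = T, T gets TDF_PREEMPT_LOCK,
 *			   and we switch directly to I.
 *	I blocks/yields:   lwkt_switch() (running as I) finds
 *			   I->td_preempted == T, sets TDF_PREEMPT_DONE on
 *			   T and resumes it, whether or not T is on the
 *			   run queue.
 *	back in T:	   lwkt_preempt() clears I->td_preempted and T's
 *			   TDF_PREEMPT_LOCK/TDF_PREEMPT_DONE flags.
 */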

/*
 * Conditionally call splz() if gd_reqflags indicates work is pending.
 * This will work inside a critical section but not inside a hard code
 * section.
 *
 * (self contained on a per cpu basis)
 */
void
splz_check(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) &&
        gd->gd_intr_nesting_level == 0 &&
        td->td_nest_count < 2)
    {
        splz();
    }
}

/*
 * This version is integrated into crit_exit, reqflags has already
 * been tested but td_critcount has not.
 *
 * We only want to execute the splz() on the 1->0 transition of
 * critcount and not in a hard code section or if too deeply nested.
 */
void
lwkt_maybe_splz(thread_t td)
{
    globaldata_t gd = td->td_gd;

    if (td->td_critcount == 0 &&
        gd->gd_intr_nesting_level == 0 &&
        td->td_nest_count < 2)
    {
        splz();
    }
}

/*
 * This function is used to negotiate a passive release of the current
 * process/lwp designation with the user scheduler, allowing the user
 * scheduler to schedule another user thread.  The related kernel thread
 * (curthread) continues running in the released state.
 */
void
lwkt_passive_release(struct thread *td)
{
    struct lwp *lp = td->td_lwp;

    td->td_release = NULL;
    lwkt_setpri_self(TDPRI_KERN_USER);
    lp->lwp_proc->p_usched->release_curproc(lp);
}


/*
 * This implements a normal yield.  This routine is virtually a nop if
 * there is nothing to yield to but it will always run any pending interrupts
 * if called from a critical section.
 *
 * This yield is designed for kernel threads without a user context.
 *
 * (self contained on a per cpu basis)
 */
void
lwkt_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;
    thread_t xtd;

    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
        splz();
    if (td->td_fairq_accum < 0) {
        lwkt_schedule_self(curthread);
        lwkt_switch();
    } else {
        xtd = TAILQ_FIRST(&gd->gd_tdrunq);
        if (xtd && xtd->td_pri > td->td_pri) {
            lwkt_schedule_self(curthread);
            lwkt_switch();
        }
    }
}
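
/*
 * Editor's aside (usage sketch): a cpu-bound kernel loop typically
 * calls lwkt_yield() periodically so that other LWKTs can run, e.g.
 *
 *	for (i = 0; i < count; ++i) {
 *		do_unit_of_work(i);	- hypothetical work function
 *		lwkt_yield();
 *	}
 */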

/*
 * This yield is designed for kernel threads with a user context.
 *
 * The kernel acting on behalf of the user is potentially cpu-bound,
 * this function will efficiently allow other threads to run and also
 * switch to other processes by releasing.
 *
 * The lwkt_user_yield() function is designed to have very low overhead
 * if no yield is determined to be needed.
 */
void
lwkt_user_yield(void)
{
    globaldata_t gd = mycpu;
    thread_t td = gd->gd_curthread;

    /*
     * Always run any pending interrupts in case we are in a critical
     * section.
     */
    if ((gd->gd_reqflags & RQF_IDLECHECK_MASK) && td->td_nest_count < 2)
        splz();

    /*
     * Switch (which forces a release) if another kernel thread needs
     * the cpu, if userland wants us to resched, or if our kernel
     * quantum has run out.
     */
    if (lwkt_resched_wanted() ||
        user_resched_wanted() ||
        td->td_fairq_accum < 0)
    {
        lwkt_switch();
    }

#if 0
    /*
     * Reacquire the current process if we are released.
     *
     * XXX not implemented atm.  The kernel may be holding locks and such,
     *     so we want the thread to continue to receive cpu.
     */
    if (td->td_release == NULL && lp) {
        lp->lwp_proc->p_usched->acquire_curproc(lp);
        td->td_release = lwkt_passive_release;
        lwkt_setpri_self(TDPRI_USER_NORM);
    }
#endif
}

/*
 * Generic schedule.  Possibly schedule threads belonging to other cpus and
 * deal with threads that might be blocked on a wait queue.
 *
 * We have a little helper inline function which does additional work after
 * the thread has been enqueued, including dealing with preemption and
 * setting need_lwkt_resched() (which prevents the kernel from returning
 * to userland until it has processed higher priority threads).
 *
 * It is possible for this routine to be called after a failed _enqueue
 * (due to the target thread migrating, sleeping, or otherwise blocked).
 * We have to check that the thread is actually on the run queue!
 *
 * reschedok is an optimized constant propagated from lwkt_schedule() or
 * lwkt_schedule_noresched().  By default it is non-zero, causing a
 * reschedule to be requested if the target thread has a higher priority.
 * The port messaging code will set MSG_NORESCHED and cause reschedok to
 * be 0, preventing undesired reschedules.
 */
static __inline
void
_lwkt_schedule_post(globaldata_t gd, thread_t ntd, int ccount, int reschedok)
{
    thread_t otd;

    if (ntd->td_flags & TDF_RUNQ) {
        if (ntd->td_preemptable && reschedok) {
            ntd->td_preemptable(ntd, ccount);	/* YYY +token */
        } else if (reschedok) {
            otd = curthread;
            if (ntd->td_pri > otd->td_pri)
                need_lwkt_resched();
        }

        /*
         * Give the thread a little fair share scheduler bump if it
         * has been asleep for a while.  This is primarily to avoid
         * a degenerate case for interrupt threads where accumulator
         * crosses into negative territory unnecessarily.
         */
        if (ntd->td_fairq_lticks != ticks) {
            ntd->td_fairq_lticks = ticks;
            ntd->td_fairq_accum += gd->gd_fairq_total_pri;
            if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd))
                ntd->td_fairq_accum = TDFAIRQ_MAX(gd);
        }
    }
}

static __inline
void
_lwkt_schedule(thread_t td, int reschedok)
{
    globaldata_t mygd = mycpu;

    KASSERT(td != &td->td_gd->gd_idlethread,
            ("lwkt_schedule(): scheduling gd_idlethread is illegal!"));
    KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
    crit_enter_gd(mygd);
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    if (td == mygd->gd_curthread) {
        _lwkt_enqueue(td);
    } else {
        /*
         * If we own the thread, there is no race (since we are in a
         * critical section).  If we do not own the thread there might
         * be a race but the target cpu will deal with it.
         */
#ifdef SMP
        if (td->td_gd == mygd) {
            _lwkt_enqueue(td);
            _lwkt_schedule_post(mygd, td, 1, reschedok);
        } else {
            lwkt_send_ipiq3(td->td_gd, lwkt_schedule_remote, td, 0);
        }
#else
        _lwkt_enqueue(td);
        _lwkt_schedule_post(mygd, td, 1, reschedok);
#endif
    }
    crit_exit_gd(mygd);
}

void
lwkt_schedule(thread_t td)
{
    _lwkt_schedule(td, 1);
}

void
lwkt_schedule_noresched(thread_t td)
{
    _lwkt_schedule(td, 0);
}

#ifdef SMP

/*
 * When scheduled remotely, if frame != NULL the IPIQ is being run via
 * doreti or an interrupt and preemption can be allowed.
 *
 * To allow preemption we have to drop the critical section so only
 * one is present in _lwkt_schedule_post.
 */
static void
lwkt_schedule_remote(void *arg, int arg2, struct intrframe *frame)
{
    thread_t td = curthread;
    thread_t ntd = arg;

    if (frame && ntd->td_preemptable) {
        crit_exit_noyield(td);
        _lwkt_schedule(ntd, 1);
        crit_enter_quick(td);
    } else {
        _lwkt_schedule(ntd, 1);
    }
}

/*
 * Thread migration using a 'Pull' method.  The thread may or may not be
 * the current thread.  It MUST be descheduled and in a stable state.
 * lwkt_giveaway() must be called on the cpu owning the thread.
 *
 * At any point after lwkt_giveaway() is called, the target cpu may
 * 'pull' the thread by calling lwkt_acquire().
 *
 * We have to make sure the thread is not sitting on a per-cpu tsleep
 * queue or it will blow up when it moves to another cpu.
 *
 * MPSAFE - must be called under very specific conditions.
 */
void
lwkt_giveaway(thread_t td)
{
    globaldata_t gd = mycpu;

    crit_enter_gd(gd);
    if (td->td_flags & TDF_TSLEEPQ)
        tsleep_remove(td);
    KKASSERT(td->td_gd == gd);
    TAILQ_REMOVE(&gd->gd_tdallq, td, td_allq);
    td->td_flags |= TDF_MIGRATING;
    crit_exit_gd(gd);
}

void
lwkt_acquire(thread_t td)
{
    globaldata_t gd;
    globaldata_t mygd;

    KKASSERT(td->td_flags & TDF_MIGRATING);
    gd = td->td_gd;
    mygd = mycpu;
    if (gd != mycpu) {
        cpu_lfence();
        KKASSERT((td->td_flags & TDF_RUNQ) == 0);
        crit_enter_gd(mygd);
        DEBUG_PUSH_INFO("lwkt_acquire");
        while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
            lwkt_process_ipiq();
#endif
            cpu_lfence();
        }
        DEBUG_POP_INFO();
        cpu_mfence();
        td->td_gd = mygd;
        TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
        td->td_flags &= ~TDF_MIGRATING;
        crit_exit_gd(mygd);
    } else {
        crit_enter_gd(mygd);
        TAILQ_INSERT_TAIL(&mygd->gd_tdallq, td, td_allq);
        td->td_flags &= ~TDF_MIGRATING;
        crit_exit_gd(mygd);
    }
}

#endif
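
/*
 * Editor's aside (usage sketch): the pull-migration handshake above is
 * driven from two cpus; illustrative only:
 *
 *	cpu A (owner):	lwkt_giveaway(td);	- td descheduled and stable
 *	cpu B (target):	lwkt_acquire(td);	- pull td onto this cpu
 *			lwkt_schedule(td);	- now runnable here
 */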

/*
 * Generic deschedule.  Descheduling threads other than your own should be
 * done only in carefully controlled circumstances.  Descheduling is
 * asynchronous.
 *
 * This function may block if the cpu has run out of messages.
 */
void
lwkt_deschedule(thread_t td)
{
    crit_enter();
#ifdef SMP
    if (td == curthread) {
        _lwkt_dequeue(td);
    } else {
        if (td->td_gd == mycpu) {
            _lwkt_dequeue(td);
        } else {
            lwkt_send_ipiq(td->td_gd, (ipifunc1_t)lwkt_deschedule, td);
        }
    }
#else
    _lwkt_dequeue(td);
#endif
    crit_exit();
}

/*
 * Set the target thread's priority.  This routine does not automatically
 * switch to a higher priority thread, LWKT threads are not designed for
 * continuous priority changes.  Yield if you want to switch.
 */
void
lwkt_setpri(thread_t td, int pri)
{
    KKASSERT(td->td_gd == mycpu);
    if (td->td_pri != pri) {
        KKASSERT(pri >= 0);
        crit_enter();
        if (td->td_flags & TDF_RUNQ) {
            _lwkt_dequeue(td);
            td->td_pri = pri;
            _lwkt_enqueue(td);
        } else {
            td->td_pri = pri;
        }
        crit_exit();
    }
}

/*
 * Set the initial priority for a thread prior to it being scheduled for
 * the first time.  The thread MUST NOT be scheduled before or during
 * this call.  The thread may be assigned to a cpu other than the current
 * cpu.
 *
 * Typically used after a thread has been created with TDF_STOPPREQ,
 * and before the thread is initially scheduled.
 */
void
lwkt_setpri_initial(thread_t td, int pri)
{
    KKASSERT(pri >= 0);
    KKASSERT((td->td_flags & TDF_RUNQ) == 0);
    td->td_pri = pri;
}

void
lwkt_setpri_self(int pri)
{
    thread_t td = curthread;

    KKASSERT(pri >= 0 && pri <= TDPRI_MAX);
    crit_enter();
    if (td->td_flags & TDF_RUNQ) {
        _lwkt_dequeue(td);
        td->td_pri = pri;
        _lwkt_enqueue(td);
    } else {
        td->td_pri = pri;
    }
    crit_exit();
}

/*
 * 1/hz tick (typically 10ms) x TDFAIRQ_SCALE (typ 8) = 80ms full cycle.
 *
 * Example: two competing threads, same priority N.  Each decrements by
 * (2*N) per tick and is recharged with N*8, so each thread will get
 * 4 ticks.
 */
void
lwkt_fairq_schedulerclock(thread_t td)
{
    globaldata_t gd;

    if (fairq_enable) {
        while (td) {
            gd = td->td_gd;
            if (td != &gd->gd_idlethread) {
                td->td_fairq_accum -= gd->gd_fairq_total_pri;
                if (td->td_fairq_accum < -TDFAIRQ_MAX(gd))
                    td->td_fairq_accum = -TDFAIRQ_MAX(gd);
                if (td->td_fairq_accum < 0)
                    need_lwkt_resched();
                td->td_fairq_lticks = ticks;
            }
            td = td->td_preempted;
        }
    }
}

static void
lwkt_fairq_accumulate(globaldata_t gd, thread_t td)
{
    td->td_fairq_accum += td->td_pri * TDFAIRQ_SCALE;
    if (td->td_fairq_accum > TDFAIRQ_MAX(td->td_gd))
        td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
}
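
/*
 * Editor's aside (worked example for the fairq arithmetic above):
 * with two runnable threads of priority N, gd_fairq_total_pri == 2*N.
 * The running thread loses 2*N from td_fairq_accum per tick and is
 * recharged with N * TDFAIRQ_SCALE (N*8) when it is rotated, so it
 * runs (N*8) / (2*N) = 4 ticks per rotation regardless of N.
 */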

/*
 * Migrate the current thread to the specified cpu.
 *
 * This is accomplished by descheduling ourselves from the current cpu,
 * moving our thread to the tdallq of the target cpu, IPI messaging the
 * target cpu, and switching out.  TDF_MIGRATING prevents scheduling
 * races while the thread is being migrated.
 *
 * We must be sure to remove ourselves from the current cpu's tsleepq
 * before potentially moving to another queue.  The thread can be on
 * a tsleepq due to a left-over tsleep_interlock().
 */
#ifdef SMP
static void lwkt_setcpu_remote(void *arg);
#endif

void
lwkt_setcpu_self(globaldata_t rgd)
{
#ifdef SMP
    thread_t td = curthread;

    if (td->td_gd != rgd) {
        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ)
            tsleep_remove(td);
        td->td_flags |= TDF_MIGRATING;
        lwkt_deschedule_self(td);
        TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
        lwkt_send_ipiq(rgd, (ipifunc1_t)lwkt_setcpu_remote, td);
        lwkt_switch();
        /* we are now on the target cpu */
        TAILQ_INSERT_TAIL(&rgd->gd_tdallq, td, td_allq);
        crit_exit_quick(td);
    }
#endif
}

void
lwkt_migratecpu(int cpuid)
{
#ifdef SMP
    globaldata_t rgd;

    rgd = globaldata_find(cpuid);
    lwkt_setcpu_self(rgd);
#endif
}

/*
 * Remote IPI for cpu migration (called while in a critical section so we
 * do not have to enter another one).  The thread has already been moved to
 * our cpu's allq, but we must wait for the thread to be completely switched
 * out on the originating cpu before we schedule it on ours or the stack
 * state may be corrupt.  We clear TDF_MIGRATING after flushing the GD
 * change to main memory.
 *
 * XXX The use of TDF_MIGRATING might not be sufficient to avoid races
 * against wakeups.  It is best if this interface is used only when there
 * are no pending events that might try to schedule the thread.
 */
#ifdef SMP
static void
lwkt_setcpu_remote(void *arg)
{
    thread_t td = arg;
    globaldata_t gd = mycpu;
    int retry = 10000000;

    DEBUG_PUSH_INFO("lwkt_setcpu_remote");
    while (td->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) {
#ifdef SMP
        lwkt_process_ipiq();
#endif
        cpu_lfence();
        cpu_pause();
        if (--retry == 0) {
            kprintf("lwkt_setcpu_remote: td->td_flags %08x\n",
                    td->td_flags);
            retry = 10000000;
        }
    }
    DEBUG_POP_INFO();
    td->td_gd = gd;
    cpu_mfence();
    td->td_flags &= ~TDF_MIGRATING;
    KKASSERT(td->td_lwp == NULL || (td->td_lwp->lwp_flag & LWP_ONRUNQ) == 0);
    _lwkt_enqueue(td);
}
#endif

struct lwp *
lwkt_preempted_proc(void)
{
    thread_t td = curthread;
    while (td->td_preempted)
        td = td->td_preempted;
    return (td->td_lwp);
}

/*
 * Create a kernel process/thread/whatever.  It shares its address space
 * with proc0 - ie: kernel only.
 *
 * NOTE!  By default new threads are created with the MP lock held.  A
 * thread which does not require the MP lock should release it by calling
 * rel_mplock() at the start of the new thread.
 */
int
lwkt_create(void (*func)(void *), void *arg, struct thread **tdp,
            thread_t template, int tdflags, int cpu, const char *fmt, ...)
{
    thread_t td;
    __va_list ap;

    td = lwkt_alloc_thread(template, LWKT_THREAD_STACK, cpu,
                           tdflags);
    if (tdp)
        *tdp = td;
    cpu_set_thread_handler(td, lwkt_exit, func, arg);

    /*
     * Set up arg0 for 'ps' etc
     */
    __va_start(ap, fmt);
    kvsnprintf(td->td_comm, sizeof(td->td_comm), fmt, ap);
    __va_end(ap);

    /*
     * Schedule the thread to run
     */
    if ((td->td_flags & TDF_STOPREQ) == 0)
        lwkt_schedule(td);
    else
        td->td_flags &= ~TDF_STOPREQ;
    return 0;
}
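
#if 0
/*
 * Editor's sketch (illustrative, compiled out): a minimal lwkt_create()
 * consumer.  example_main is a hypothetical thread function; it must
 * call lwkt_exit() when done.  cpu -1 means "current cpu" per
 * lwkt_alloc_thread() above.
 */
static void
example_main(void *arg)
{
    kprintf("hello from %s\n", curthread->td_comm);
    lwkt_exit();
}

static void
example_start(void)
{
    thread_t td;

    lwkt_create(example_main, NULL, &td, NULL, 0, -1, "example");
}
#endif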

/*
 * Destroy an LWKT thread.  Warning!  This function is not called when
 * a process exits, cpu_proc_exit() directly calls cpu_thread_exit() and
 * uses a different reaping mechanism.
 */
void
lwkt_exit(void)
{
    thread_t td = curthread;
    thread_t std;
    globaldata_t gd;

    /*
     * Do any cleanup that might block here
     */
    if (td->td_flags & TDF_VERBOSE)
        kprintf("kthread %p %s has exited\n", td, td->td_comm);
    caps_exit(td);
    biosched_done(td);
    dsched_exit_thread(td);

    /*
     * Get us into a critical section to interlock gd_freetd and loop
     * until we can get it freed.
     *
     * We have to cache the current td in gd_freetd because objcache_put()ing
     * it would rip it out from under us while our thread is still active.
     */
    gd = mycpu;
    crit_enter_quick(td);
    while ((std = gd->gd_freetd) != NULL) {
        KKASSERT((std->td_flags & (TDF_RUNNING|TDF_PREEMPT_LOCK)) == 0);
        gd->gd_freetd = NULL;
        objcache_put(thread_cache, std);
    }

    /*
     * Remove thread resources from kernel lists and deschedule us for
     * the last time.  We cannot block after this point or we may end
     * up with a stale td on the tsleepq.
     */
    if (td->td_flags & TDF_TSLEEPQ)
        tsleep_remove(td);
    lwkt_deschedule_self(td);
    lwkt_remove_tdallq(td);
    KKASSERT(td->td_refs == 0);

    /*
     * Final cleanup
     */
    KKASSERT(gd->gd_freetd == NULL);
    if (td->td_flags & TDF_ALLOCATED_THREAD)
        gd->gd_freetd = td;
    cpu_thread_exit();
}

void
lwkt_remove_tdallq(thread_t td)
{
    KKASSERT(td->td_gd == mycpu);
    TAILQ_REMOVE(&td->td_gd->gd_tdallq, td, td_allq);
}

/*
 * Code reduction and branch prediction improvements.  Call/return
 * overhead on modern cpus often degenerates into 0 cycles due to
 * the cpu's branch prediction hardware and return pc cache.  We
 * can take advantage of this by not inlining medium-complexity
 * functions and we can also reduce the branch prediction impact
 * by collapsing perfectly predictable branches into a single
 * procedure instead of duplicating it.
 *
 * Is any of this noticeable?  Probably not, so I'll take the
 * smaller code size.
 */
void
crit_exit_wrapper(__DEBUG_CRIT_ARG__)
{
    _crit_exit(mycpu __DEBUG_CRIT_PASS_ARG__);
}

void
crit_panic(void)
{
    thread_t td = curthread;
    int lcrit = td->td_critcount;

    td->td_critcount = 0;
    panic("td_critcount is/would-go negative! %p %d", td, lcrit);
    /* NOT REACHED */
}

#ifdef SMP

/*
 * Called from debugger/panic on cpus which have been stopped.  We must still
 * process the IPIQ while stopped, even if we were stopped while in a critical
 * section (XXX).
 *
 * If we are dumping also try to process any pending interrupts.  This may
 * or may not work depending on the state of the cpu at the point it was
 * stopped.
 */
void
lwkt_smp_stopped(void)
{
    globaldata_t gd = mycpu;

    crit_enter_gd(gd);
    if (dumping) {
        lwkt_process_ipiq();
        splz();
    } else {
        lwkt_process_ipiq();
    }
    crit_exit_gd(gd);
}

#endif