/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.26 2008/11/01 23:31:19 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)
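/*
 * Illustrative sketch (not part of the scheduler; the ex_* names are
 * hypothetical).  lwp_priority packs the scheduling class and the
 * 0..MAXPRI-1 level into one integer: class = priority & ~PRIMASK,
 * level = priority & PRIMASK, run queue index = level / PPQ.  For
 * example PRIBASE_NORMAL + 53 lands on normal-class queue 53 / 4 == 13.
 */
#if 0
static int
ex_prio_class(int priority)
{
	return (priority & ~PRIMASK);
}

static int
ex_rqindex(int priority)
{
	return ((priority & PRIMASK) / PPQ);
}
#endif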
/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void need_user_resched_remote(void *dummy);
#endif
static struct lwp *chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
static cpumask_t bsd4_curprocmask = -1;	/* currently running a user process */
static cpumask_t bsd4_rdyprocmask;	/* ready to accept a user process */
static int	 bsd4_runqcount;
#ifdef SMP
static volatile int bsd4_scancpu;
#endif
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
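/*
 * Illustrative sketch of the queuebits mechanism described above (not part
 * of the scheduler; the ex_* names are hypothetical).  Each non-empty queue
 * has its bit set, so locating the best busy queue is a single
 * find-first-set.  The kernel uses bsfl(), which returns a 0-based index;
 * ffs() here is the 1-based libc equivalent.
 */
#if 0
#include <strings.h>			/* ffs() */

static unsigned int ex_queuebits;	/* one bit per non-empty queue */

static void
ex_mark_nonempty(int qi)		/* a queue gained an lwp */
{
	ex_queuebits |= 1U << qi;
}

static void
ex_mark_empty(int qi)			/* a queue drained */
{
	ex_queuebits &= ~(1U << qi);
}

static int
ex_first_busy(void)			/* best (lowest) busy queue, or -1 */
{
	return (ex_queuebits ? ffs(ex_queuebits) - 1 : -1);
}
#endif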
SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
    "Number of runnable lwps on the run queues");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
    &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
    &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0,
    "Print debug information for this pid");
#ifdef SMP
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
    &remote_resched_nonaffinity, 0, "Number of remote rescheds without affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
    &remote_resched_affinity, 0, "Number of remote rescheds with affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
    &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
    &usched_bsd4_rrinterval, 0, "Round-robin interval in scheduler ticks");
static int usched_bsd4_decay = 8;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
    &usched_bsd4_decay, 0, "Extra decay when not running");
static int usched_bsd4_batch_time = 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW,
    &usched_bsd4_batch_time, 0, "Minimum batch counter value");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	spin_init(&bsd4_spin);
	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&bsd4_queues[i]);
		TAILQ_INIT(&bsd4_rtqueues[i]);
		TAILQ_INIT(&bsd4_idqueues[i]);
	}
	atomic_clear_cpumask(&bsd4_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)

/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel has already depressed our LWKT priority so we must not switch
 * until we have either assigned or disposed of the thread.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	struct lwp *olp;

	crit_enter();
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);
	}

	/*
	 * Loop until we are the current user thread.
	 */
	do {
		/*
		 * Reload after a switch or setrunqueue/switch possibly
		 * moved us to another cpu.
		 */
		/*clear_lwkt_resched();*/
		gd = mycpu;
		dd = &bsd4_pcpu[gd->gd_cpuid];

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current-thread designation
		 * on this cpu because if we are running, that other thread
		 * cannot be, so it is safe to deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current lwp designation from the
			 * olp that was previously assigned to this cpu.
			 */
			olp = dd->uschedcp;
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
		} else {
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 */
			lwkt_deschedule(lp->lwp_thread);
			bsd4_setrunqueue(lp);
			lwkt_switch();
		}

		/*
		 * Other threads at our current user priority have already
		 * put in their bids, but we must run any kernel threads
		 * at higher priorities, and we could lose our bid to
		 * another thread trying to return to user mode in the
		 * process.
		 *
		 * If we lose our bid we will be descheduled and put on
		 * the run queue.  When we are reactivated we will have
		 * another chance.
		 */
		if (lwkt_resched_wanted() ||
		    lp->lwp_thread->td_fairq_accum < 0) {
			lwkt_switch();
		}
	} while (dd->uschedcp != lp);

	crit_exit();
	KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
}

/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp == lp) {
		crit_enter();
		KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
		dd->uschedcp = NULL;	/* don't let lp be selected */
		dd->upri = PRIBASE_NULL;
		atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		bsd4_select_curproc(gd);
		crit_exit();
	}
}

/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
	if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		spin_unlock(&bsd4_spin);
#ifdef SMP
		lwkt_acquire(nlp->lwp_thread);
#endif
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#if 0
	} else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		lwkt_schedule(&dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#endif
	crit_exit_gd(gd);
}

/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
#ifdef SMP
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;
#endif

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flag, lp->lwp_flag));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

#ifndef SMP
	/*
	 * If we are not SMP we do not have a scheduler helper to kick
	 * and must directly activate the process if none are scheduled.
	 *
	 * This is really only an issue when bootstrapping init since
	 * the caller in all other cases will be a user process, and
	 * even if released (dd->uschedcp == NULL), that process will
	 * kickstart the scheduler when it returns to user mode from
	 * the kernel.
	 */
	if (dd->uschedcp == NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		dd->uschedcp = lp;
		dd->upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		crit_exit();
		return;
	}
#endif

#ifdef SMP
	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);
#endif

	/*
	 * We lose control of lp the moment we release the spinlock after
	 * having placed lp on the queue.  i.e. another cpu could pick it
	 * up and it could exit, or its priority could be further adjusted,
	 * or something like that.
	 */
	spin_lock(&bsd4_spin);
	bsd4_setrunqueue_locked(lp);

#ifdef SMP
	/*
	 * Kick the scheduler helper on one of the other cpu's
	 * and request a reschedule if appropriate.
	 *
	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
	 *	 look for cpus without designated lps, then we look for
	 *	 cpus with designated lps with a worse priority than our
	 *	 process.
	 */
	++bsd4_scancpu;
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
	       smp_active_mask & usched_global_cpumask;

	while (mask) {
		/*
		 * ~(CPUMASK(cpuid) - 1) selects cpu ids >= cpuid, so the
		 * scan starts at the round-robin point and wraps back to
		 * the low cpus when nothing at or above cpuid is ready.
		 */
		tmpmask = ~(CPUMASK(cpuid) - 1);
		if (mask & tmpmask)
			cpuid = BSFCPUMASK(mask & tmpmask);
		else
			cpuid = BSFCPUMASK(mask);
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
			goto found;
		mask &= ~CPUMASK(cpuid);
	}

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = bsd4_curprocmask & bsd4_rdyprocmask &
	       lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;

	while (mask) {
		tmpmask = ~(CPUMASK(cpuid) - 1);
		if (mask & tmpmask)
			cpuid = BSFCPUMASK(mask & tmpmask);
		else
			cpuid = BSFCPUMASK(mask);
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			goto found;
		mask &= ~CPUMASK(cpuid);
	}

	/*
	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
	 * and round-robin.  Other cpus will pickup as they release their
	 * current lwps or become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case, we do not
	 * set the user resched flag because
	 */
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
		cpuid = 0;
	}
	gd = globaldata_find(cpuid);
	dd = &bsd4_pcpu[cpuid];
found:
	if (gd == mycpu) {
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			if (dd->uschedcp == NULL) {
				lwkt_schedule(&dd->helper_thread);
			} else {
				need_user_resched();
			}
		}
	} else {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
		else
			lwkt_schedule(&dd->helper_thread);
	}
#else
	/*
	 * Request a reschedule if appropriate.
	 */
	spin_unlock(&bsd4_spin);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		need_user_resched();
	}
#endif
	crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active.
	 */
	KKASSERT(gd->gd_spinlocks_wr == 0);

	bsd4_resetpriority(lp);
#if 0
	/*
	 * If we can't call bsd4_resetpriority for some reason we must call
	 * need_user_resched().
	 */
	need_user_resched();
#endif
}

/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if (ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}

		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));

		if (usched_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}
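/*
 * Illustrative sketch of the decay step above (not part of the scheduler;
 * the ex_* name is hypothetical).  The update is a weighted average,
 * new = (old * d + sample) / (d + 1), with d = usched_bsd4_decay (8 by
 * default), so each recalculation moves estcpu about 1/9th of the way
 * toward the instantaneous sample; roughly 25 recalculations close 95%
 * of the gap.
 */
#if 0
static int
ex_decay_estcpu(int old_estcpu, int sample, int decay_factor)
{
	if (decay_factor < 1)
		decay_factor = 1;
	return (ESTCPULIM((old_estcpu * decay_factor + sample) /
			  (decay_factor + 1)));
}
#endif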
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			      (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384 * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 */
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_flag & LWP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
#ifdef SMP
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				atomic_clear_cpumask(&bsd4_rdyprocmask,
						     CPUMASK(reschedcpu));
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       need_user_resched_remote, NULL);
			}
#else
			spin_unlock(&bsd4_spin);
			need_user_resched();
#endif
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}
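/*
 * Illustrative sketch of the RTP_PRIO_NORMAL arithmetic above (not part of
 * the scheduler; the ex_* name is hypothetical).  With the constants at the
 * top of this file the nice term contributes 0-80, the estcpu term 0-128,
 * and the sum is rescaled onto 0-127.  For example nice 0, lwp_batch ==
 * BATCHMAX and estcpu == ESTCPUMAX / 2 gives (40 + 64) * 128 / 210 == 63,
 * i.e. a final priority of PRIBASE_NORMAL + 63.
 */
#if 0
static int
ex_normal_priority(int nice, int estcpu, int batch)
{
	int pri;

	estcpu = estcpu * (batch + BATCHMAX) / (BATCHMAX * 2);
	pri = (nice - PRIO_MIN) * PPQ / NICEPPQ;	/* 0-80 */
	pri += estcpu * PPQ / ESTCPUPPQ;		/* 0-128 */
	pri = pri * MAXPRI /
	      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
	return (PRIBASE_NORMAL + (pri & PRIMASK));
}
#endif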
/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}
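/*
 * Illustrative sketch of the fork-cost arithmetic above (not part of the
 * scheduler; the ex_* name is hypothetical).  Each fork makes the child
 * four queues (ESTCPUPPQ * 4) less desirable and charges the parent 1/16
 * of a queue, so a parent has to fork 16 times before it accumulates one
 * full priority queue worth of estcpu itself.
 */
#if 0
static void
ex_fork_estcpu(int *parent_estcpu, int *child_estcpu)
{
	*child_estcpu = ESTCPULIM(*parent_estcpu + ESTCPUPPQ * 4);
	*parent_estcpu = ESTCPULIM(*parent_estcpu + ESTCPUPPQ / 16);
}
#endif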
/*
 * Called when a parent waits for a child.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

#ifdef SMP
again:
#endif
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

#ifdef SMP
	while ((lp->lwp_cpumask & cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}
#endif

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

#ifdef SMP
	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}
#endif

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_flag & LWP_ONRUNQ) != 0, ("not on runq6!"));
	lp->lwp_flag &= ~LWP_ONRUNQ;
	return lp;
}

#ifdef SMP

static
void
need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();
	lwkt_schedule(&dd->helper_thread);
}

#endif

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT scheduler.
 * The user process scheduler only manages user processes but it uses LWKT
 * underneath, and a user process operating in the kernel will often be
 * 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_flag & LWP_ONRUNQ);
	lp->lwp_flag &= ~LWP_ONRUNQ;
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex had previously been calculated
 * onto the appropriate run queue.  Determine if the addition requires
 * a reschedule on a cpu and return the cpuid or -1.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_flag & LWP_ONRUNQ) == 0);
	lp->lwp_flag |= LWP_ONRUNQ;
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule, calculate the best cpu for the job.
	 *
	 * Always run reschedules on the LWP's original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
#ifdef SMP
	cpumask_t tmpmask;
	int tmpid;
#endif

	gd = mycpu;
	cpuid = gd->gd_cpuid;		/* doesn't change */
	mask = gd->gd_cpumask;		/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		lwkt_deschedule_self(gd->gd_curthread);
		spin_lock(&bsd4_spin);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfies the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if ((bsd4_curprocmask & mask) == 0) {
			/*
			 * No thread is currently scheduled.
			 */
			KKASSERT(dd->uschedcp == NULL);
			if ((nlp = chooseproc_locked(NULL)) != NULL) {
				atomic_set_cpumask(&bsd4_curprocmask, mask);
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
#ifdef SMP
				lwkt_acquire(nlp->lwp_thread);
#endif
				lwkt_schedule(nlp->lwp_thread);
			} else {
				spin_unlock(&bsd4_spin);
			}
		} else if (bsd4_runqcount) {
			if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
#ifdef SMP
				lwkt_acquire(nlp->lwp_thread);
#endif
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * CHAINING CONDITION TRAIN
				 *
				 * We could not deal with the scheduler wakeup
				 * request on this cpu, locate a ready scheduler
				 * with no current lp assignment and chain to it.
				 *
				 * This ensures that a wakeup race which fails due
				 * to priority test does not leave other unscheduled
				 * cpus idle when the runqueue is not empty.
				 */
				tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask &
					  smp_active_mask;
				if (tmpmask) {
					/*
					 * Chain to the helper thread of the
					 * cpu we found (tmpid).  Do not
					 * clobber our own gd/dd, which the
					 * rest of the loop relies on.
					 */
					tmpid = BSFCPUMASK(tmpmask);
					atomic_clear_cpumask(&bsd4_rdyprocmask,
							     CPUMASK(tmpid));
					spin_unlock(&bsd4_spin);
					lwkt_schedule(
					    &bsd4_pcpu[tmpid].helper_thread);
				} else {
					spin_unlock(&bsd4_spin);
				}
			}
		} else {
			/*
			 * The runq is empty.
			 */
			spin_unlock(&bsd4_spin);
		}
		crit_exit_gd(gd);
		lwkt_switch();
	}
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		kprintf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		bsd4_pcpu_t dd = &bsd4_pcpu[i];
		cpumask_t mask = CPUMASK(i);

		if ((mask & smp_active_mask) == 0)
			continue;

		if (bootverbose)
			kprintf(" %d", i);

		lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
			    TDF_STOPREQ, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		if (i)
			atomic_clear_cpumask(&bsd4_curprocmask, mask);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);
		dd->upri = PRIBASE_NULL;
	}
	if (bootverbose)
		kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_thread_cpu_init, NULL)

#endif