/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)
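
/*
 * The PRIBASE_* constants stack the four scheduling classes into
 * non-overlapping bands of MAXPRI (128) priority levels each:
 * realtime 0-127, normal 128-255, idle 256-383 and kernel thread
 * 384-511, with PRIBASE_NULL marking 'no user priority'.  Masking
 * with PRIMASK recovers the priority within a class; masking with
 * ~PRIMASK recovers the class itself.
 */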

#define NQS	32			/* 32 run queues */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void need_user_resched_remote(void *dummy);
#endif
static struct lwp *chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
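 *
 * For example, with PPQ = 4 a normal-class priority of 53 maps to
 * queue index 53 / 4 = 13 and sets bit 13 in bsd4_queuebits, so a
 * chooser only needs a bsfl() on the bitmask to locate the best
 * non-empty queue instead of scanning all 32 of them.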
 */
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
static cpumask_t bsd4_curprocmask = -1;	/* currently running a user process */
static cpumask_t bsd4_rdyprocmask;	/* ready to accept a user process */
static int	 bsd4_runqcount;
#ifdef SMP
static volatile int bsd4_scancpu;
#endif
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
    "Number of runnable lwps waiting on the run queues");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
    &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
    &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0,
    "Print debug information for this pid");
#ifdef SMP
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
    &remote_resched_nonaffinity, 0, "Number of non-affinity remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
    &remote_resched_affinity, 0, "Number of affinity remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
    &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
    &usched_bsd4_rrinterval, 0, "Round-robin interval in scheduler clock ticks");
static int usched_bsd4_decay = 8;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
    &usched_bsd4_decay, 0, "Extra decay when not running");
static int usched_bsd4_batch_time = 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_batch_time, CTLFLAG_RW,
    &usched_bsd4_batch_time, 0, "Minimum batch counter value");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	spin_init(&bsd4_spin);
	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&bsd4_queues[i]);
		TAILQ_INIT(&bsd4_rtqueues[i]);
		TAILQ_INIT(&bsd4_idqueues[i]);
	}
	atomic_clear_cpumask(&bsd4_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)

/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel has already depressed our LWKT priority so we must not switch
 * until we have either assigned or disposed of the thread.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	thread_t td;
#if 0
	struct lwp *olp;
#endif

	/*
	 * Make sure we aren't sitting on a tsleep queue.
	 */
	td = lp->lwp_thread;
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);
	}

	/*
	 * Loop until we are the current user thread
	 */
	gd = mycpu;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	do {
		/*
		 * Process any pending events and higher priority threads.
		 */
		lwkt_yield();

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current thread designation
		 * on this cpu since if we are running that other thread
		 * must not be, so we can safely deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current cpu's lwp designation
			 * away simply by replacing it.  The other thread
			 * will stall when it tries to return to userland.
			 */
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			/*
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
			*/
		} else {
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 *
			 * When we are reactivated we will have another
			 * chance.
			 */
			lwkt_deschedule(lp->lwp_thread);
			bsd4_setrunqueue(lp);
			lwkt_switch();
			/*
			 * Reload after a switch or setrunqueue/switch possibly
			 * moved us to another cpu.
			 */
			gd = mycpu;
			dd = &bsd4_pcpu[gd->gd_cpuid];
		}
	} while (dd->uschedcp != lp);

	crit_exit_quick(td);
	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}

/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp == lp) {
		crit_enter();
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
		dd->uschedcp = NULL;	/* don't let lp be selected */
		dd->upri = PRIBASE_NULL;
		atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		bsd4_select_curproc(gd);
		crit_exit();
	}
}

/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
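 *
 * If the run queues turn out to be empty no selection is made; the
 * cpu simply stays idle until a later bsd4_setrunqueue() or a helper
 * thread wakeup hands it work.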
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
	if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		spin_unlock(&bsd4_spin);
#ifdef SMP
		lwkt_acquire(nlp->lwp_thread);
#endif
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#if 0
	} else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		lwkt_schedule(&dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#endif
	crit_exit_gd(gd);
}

/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
#ifdef SMP
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;
#endif

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

#ifndef SMP
	/*
	 * If we are not SMP we do not have a scheduler helper to kick
	 * and must directly activate the process if none are scheduled.
	 *
	 * This is really only an issue when bootstrapping init since
	 * the caller in all other cases will be a user process, and
	 * even if released (dd->uschedcp == NULL), that process will
	 * kickstart the scheduler when it returns to user mode from
	 * the kernel.
	 */
	if (dd->uschedcp == NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		dd->uschedcp = lp;
		dd->upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		crit_exit();
		return;
	}
#endif

#ifdef SMP
	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
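	 *
	 * lwkt_giveaway() releases this cpu's LWKT ownership of the
	 * thread so that whichever cpu eventually pulls the lwp off the
	 * run queue can lwkt_acquire() it without bouncing it back here.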
480 */ 481 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) 482 lwkt_giveaway(lp->lwp_thread); 483 #endif 484 485 /* 486 * We lose control of lp the moment we release the spinlock after 487 * having placed lp on the queue. i.e. another cpu could pick it 488 * up and it could exit, or its priority could be further adjusted, 489 * or something like that. 490 */ 491 spin_lock(&bsd4_spin); 492 bsd4_setrunqueue_locked(lp); 493 494 #ifdef SMP 495 /* 496 * Kick the scheduler helper on one of the other cpu's 497 * and request a reschedule if appropriate. 498 * 499 * NOTE: We check all cpus whos rdyprocmask is set. First we 500 * look for cpus without designated lps, then we look for 501 * cpus with designated lps with a worse priority than our 502 * process. 503 */ 504 ++bsd4_scancpu; 505 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 506 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 507 smp_active_mask & usched_global_cpumask; 508 509 while (mask) { 510 tmpmask = ~(CPUMASK(cpuid) - 1); 511 if (mask & tmpmask) 512 cpuid = BSFCPUMASK(mask & tmpmask); 513 else 514 cpuid = BSFCPUMASK(mask); 515 gd = globaldata_find(cpuid); 516 dd = &bsd4_pcpu[cpuid]; 517 518 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) 519 goto found; 520 mask &= ~CPUMASK(cpuid); 521 } 522 523 /* 524 * Then cpus which might have a currently running lp 525 */ 526 mask = bsd4_curprocmask & bsd4_rdyprocmask & 527 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask; 528 529 while (mask) { 530 tmpmask = ~(CPUMASK(cpuid) - 1); 531 if (mask & tmpmask) 532 cpuid = BSFCPUMASK(mask & tmpmask); 533 else 534 cpuid = BSFCPUMASK(mask); 535 gd = globaldata_find(cpuid); 536 dd = &bsd4_pcpu[cpuid]; 537 538 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) 539 goto found; 540 mask &= ~CPUMASK(cpuid); 541 } 542 543 /* 544 * If we cannot find a suitable cpu we reload from bsd4_scancpu 545 * and round-robin. Other cpus will pickup as they release their 546 * current lwps or become ready. 547 * 548 * Avoid a degenerate system lockup case if usched_global_cpumask 549 * is set to 0 or otherwise does not cover lwp_cpumask. 550 * 551 * We only kick the target helper thread in this case, we do not 552 * set the user resched flag because 553 */ 554 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 555 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) { 556 cpuid = 0; 557 } 558 gd = globaldata_find(cpuid); 559 dd = &bsd4_pcpu[cpuid]; 560 found: 561 if (gd == mycpu) { 562 spin_unlock(&bsd4_spin); 563 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 564 if (dd->uschedcp == NULL) { 565 lwkt_schedule(&dd->helper_thread); 566 } else { 567 need_user_resched(); 568 } 569 } 570 } else { 571 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 572 spin_unlock(&bsd4_spin); 573 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) 574 lwkt_send_ipiq(gd, need_user_resched_remote, NULL); 575 else 576 lwkt_schedule(&dd->helper_thread); 577 } 578 #else 579 /* 580 * Request a reschedule if appropriate. 581 */ 582 spin_unlock(&bsd4_spin); 583 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 584 need_user_resched(); 585 } 586 #endif 587 crit_exit(); 588 } 589 590 /* 591 * This routine is called from a systimer IPI. It MUST be MP-safe and 592 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on 593 * each cpu. 
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active ones.
	 */
	KKASSERT(gd->gd_spinlocks_wr == 0);

	bsd4_resetpriority(lp);
#if 0
	/*
	 * if we can't call bsd4_resetpriority for some reason we must call
	 * need_user_resched().
	 */
	need_user_resched();
#endif
}

/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if (ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
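		 *
		 * For example, an lwp that ran for every tick of the
		 * measurement interval on an otherwise idle single-cpu
		 * system (lwp_cpticks == ttlticks, bsd4_runqcount 0)
		 * works out to the full ESTCPUMAX of 16384.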
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}

		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));

		if (usched_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			      (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
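		 *
		 * At the endpoints: a fully interactive lwp (lwp_batch 0)
		 * is charged estcpu * BATCHMAX / (BATCHMAX * 2) = 50% of
		 * its estcpu, while a fully batchy lwp (lwp_batch ==
		 * BATCHMAX) is charged 100%.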
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece	 Adds (0-40) * 2	0-80
		 * estcpu	 Adds 16384 * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 */
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
#ifdef SMP
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				atomic_clear_cpumask(&bsd4_rdyprocmask,
						     CPUMASK(reschedcpu));
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       need_user_resched_remote, NULL);
			}
#else
			spin_unlock(&bsd4_spin);
			need_user_resched();
#endif
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
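 *
 * With the stock constants this works out to: the child starts
 * ESTCPUPPQ * 4 = 2048 estcpu units (four of the 32 queues) below the
 * parent, and each fork docks the parent ESTCPUPPQ / 16 = 32 units.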
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a parent waits for a child.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

#ifdef SMP
again:
#endif
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

#ifdef SMP
	while ((lp->lwp_cpumask & cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}
#endif

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
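	 *
	 * Concretely, <chklp> is kept unless its priority is worse than
	 * the selected lwp's by at least one full queue (PPQ = 4
	 * priority units).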
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

#ifdef SMP
	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}
#endif

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	return lp;
}

#ifdef SMP

static
void
need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();
	lwkt_schedule(&dd->helper_thread);
}

#endif

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT scheduler.
 * The user process scheduler only manages user processes but it uses LWKT
 * underneath, and a user process operating in the kernel will often be
 * 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex have previously been calculated
 * onto the appropriate run queue.  The caller is responsible for deciding
 * whether the addition requires a reschedule on a cpu.
 *
 * NOTE: Lower priorities are better priorities.
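 *
 * lwp_rqindex was derived in bsd4_resetpriority() as
 * (lwp_priority & PRIMASK) / PPQ, e.g. a normal-class lwp_priority
 * of 181 (PRIBASE_NORMAL + 53) lands on queue index 53 / 4 = 13.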
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule; the caller (bsd4_setrunqueue()) calculates
	 * the best cpu for the job.
	 *
	 * Always run reschedules on the LWP's original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
#ifdef SMP
	cpumask_t tmpmask;
	int tmpid;
#endif

	gd = mycpu;
	cpuid = gd->gd_cpuid;		/* doesn't change */
	mask = gd->gd_cpumask;		/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		lwkt_deschedule_self(gd->gd_curthread);
		spin_lock(&bsd4_spin);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfies the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if ((bsd4_curprocmask & mask) == 0) {
			/*
			 * No thread is currently scheduled.
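			 * Our bit in bsd4_curprocmask is clear, so any
			 * runnable lwp chooseproc_locked() hands us can
			 * be adopted as this cpu's user process directly.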
1256 */ 1257 KKASSERT(dd->uschedcp == NULL); 1258 if ((nlp = chooseproc_locked(NULL)) != NULL) { 1259 atomic_set_cpumask(&bsd4_curprocmask, mask); 1260 dd->upri = nlp->lwp_priority; 1261 dd->uschedcp = nlp; 1262 spin_unlock(&bsd4_spin); 1263 #ifdef SMP 1264 lwkt_acquire(nlp->lwp_thread); 1265 #endif 1266 lwkt_schedule(nlp->lwp_thread); 1267 } else { 1268 spin_unlock(&bsd4_spin); 1269 } 1270 } else if (bsd4_runqcount) { 1271 if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) { 1272 dd->upri = nlp->lwp_priority; 1273 dd->uschedcp = nlp; 1274 spin_unlock(&bsd4_spin); 1275 #ifdef SMP 1276 lwkt_acquire(nlp->lwp_thread); 1277 #endif 1278 lwkt_schedule(nlp->lwp_thread); 1279 } else { 1280 /* 1281 * CHAINING CONDITION TRAIN 1282 * 1283 * We could not deal with the scheduler wakeup 1284 * request on this cpu, locate a ready scheduler 1285 * with no current lp assignment and chain to it. 1286 * 1287 * This ensures that a wakeup race which fails due 1288 * to priority test does not leave other unscheduled 1289 * cpus idle when the runqueue is not empty. 1290 */ 1291 tmpmask = ~bsd4_curprocmask & bsd4_rdyprocmask & 1292 smp_active_mask; 1293 if (tmpmask) { 1294 tmpid = BSFCPUMASK(tmpmask); 1295 tmpdd = &bsd4_pcpu[tmpid]; 1296 atomic_clear_cpumask(&bsd4_rdyprocmask, 1297 CPUMASK(tmpid)); 1298 spin_unlock(&bsd4_spin); 1299 lwkt_schedule(&tmpdd->helper_thread); 1300 } else { 1301 spin_unlock(&bsd4_spin); 1302 } 1303 } 1304 } else { 1305 /* 1306 * The runq is empty. 1307 */ 1308 spin_unlock(&bsd4_spin); 1309 } 1310 1311 /* 1312 * We're descheduled unless someone scheduled us. Switch away. 1313 * Exiting the critical section will cause splz() to be called 1314 * for us if interrupts and such are pending. 1315 */ 1316 crit_exit_gd(gd); 1317 lwkt_switch(); 1318 } 1319 } 1320 1321 /* 1322 * Setup our scheduler helpers. Note that curprocmask bit 0 has already 1323 * been cleared by rqinit() and we should not mess with it further. 1324 */ 1325 static void 1326 sched_thread_cpu_init(void) 1327 { 1328 int i; 1329 1330 if (bootverbose) 1331 kprintf("start scheduler helpers on cpus:"); 1332 1333 for (i = 0; i < ncpus; ++i) { 1334 bsd4_pcpu_t dd = &bsd4_pcpu[i]; 1335 cpumask_t mask = CPUMASK(i); 1336 1337 if ((mask & smp_active_mask) == 0) 1338 continue; 1339 1340 if (bootverbose) 1341 kprintf(" %d", i); 1342 1343 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, 1344 TDF_NOSTART, i, "usched %d", i); 1345 1346 /* 1347 * Allow user scheduling on the target cpu. cpu #0 has already 1348 * been enabled in rqinit(). 1349 */ 1350 if (i) 1351 atomic_clear_cpumask(&bsd4_curprocmask, mask); 1352 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1353 dd->upri = PRIBASE_NULL; 1354 } 1355 if (bootverbose) 1356 kprintf("\n"); 1357 } 1358 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, 1359 sched_thread_cpu_init, NULL) 1360 1361 #endif 1362 1363