/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.7 2005/12/23 10:08:14 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */

/*
 * NICEPPQ	- number of nice units per priority queue
 * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
 *		  100% cpu.
 */
#define NICEPPQ		2
#define ESTCPURAMP	4
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
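
/*
 * Worked example of the constants above (derived values, for illustration
 * only): with ESTCPUPPQ == 512 and NQS == 32, ESTCPUMAX is 16384 estcpu
 * units spread across the 32 normal-class queues.  With ESTCPURAMP == 4,
 * ESTCPUINCR is 128, so a thread running at 100% cpu accumulates one
 * queue's worth of estcpu (512 units) in roughly four scheduler ticks,
 * which is the "4 ticks to bump the queue" behavior described in
 * bsd4_schedulerclock() below.
 */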

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_origcpu	lwp_usdata.bsd4.origcpu
#define lwp_estcpu	lwp_usdata.bsd4.estcpu

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);

static void bsd4_recalculate_estcpu(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_select_curproc,
	bsd4_setrunqueue,
	bsd4_remrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL			/* setcpumask not supported */
};

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
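
/*
 * For example, if only queues 3 and 17 are non-empty then
 * queuebits == (1 << 3) | (1 << 17), and bsfl(queuebits) returns 3, so
 * chooseproc() below finds the lowest-indexed (best priority) busy queue
 * with a single scan of the mask rather than walking all 32 queues.
 */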
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;	/* currently running a user process */
static cpumask_t rdyprocmask;		/* ready to accept a user process */
static int	 runqcount;
#ifdef SMP
static int	 scancpu;
#endif

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
	&usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
	&usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
#ifdef SMP
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
	&remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
	&remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
	&remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
	&choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
	&usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
	&usched_bsd4_decay, 0, "");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&queues[i]);
		TAILQ_INIT(&rtqueues[i]);
		TAILQ_INIT(&idqueues[i]);
	}
	atomic_clear_int(&curprocmask, 1);
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule.
 * It selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a priority better than or equal to that of the process that would
 * otherwise be chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
static
struct lwp *
chooseproc(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which;
	u_int32_t pri;

	if (rtqueuebits) {
		pri = bsfl(rtqueuebits);
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (queuebits) {
		pri = bsfl(queuebits);
		q = &queues[pri];
		which = &queuebits;
	} else if (idqueuebits) {
		pri = bsfl(idqueuebits);
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must err on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
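	/*
	 * Illustrative numbers: with PPQ == 4, a <chklp> at priority 20 is
	 * kept unless the selected <lp> is at priority 16 or better
	 * (numerically lower), i.e. <lp> has to be at least a full queue
	 * better than <chklp> to displace it.
	 */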
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

#ifdef SMP
	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check; setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}
#endif

	TAILQ_REMOVE(q, lp, lwp_procq);
	--runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	return lp;
}

#ifdef SMP
/*
 * called via an ipi message to reschedule on another cpu.
 */
static
void
need_user_resched_remote(void *dummy)
{
	need_user_resched();
}

#endif

/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! A thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	struct rq *q;
	struct globaldata *gd;
	int pri;
	int cpuid;
	u_int32_t needresched;
#ifdef SMP
	int count;
	cpumask_t mask;
#endif

	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
	crit_enter();
	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flag));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
	 */
	gd = lp->lwp_thread->td_gd;

	/*
	 * Because recalculate is only called once or twice for long sleeps,
	 * not every second forever while the process is sleeping, we have
	 * to manually call it to resynchronize p_cpbase on wakeup or it
	 * will wrap if the process was sleeping long enough (e.g. ~10 min
	 * with the ACPI timer) and really mess up the nticks calculation.
	 */
	if (lp->lwp_slptime) {
		bsd4_recalculate_estcpu(lp);
		lp->lwp_slptime = 0;
	}
	/*
	 * We have not been released; make sure that we are not the currently
	 * designated process.
	 */
	KKASSERT(gd->gd_uschedcp != lp);

	/*
	 * Check cpu affinity.  The associated thread is stable at the
	 * moment.  Note that we may be checking another cpu here so we
	 * have to be careful.  We are currently protected by the BGL.
	 *
	 * This allows us to avoid actually queueing the process.
	 * acquire_curproc() will handle any threads we mistakenly schedule.
	 */
	cpuid = gd->gd_cpuid;

	if ((curprocmask & (1 << cpuid)) == 0) {
		atomic_set_int(&curprocmask, 1 << cpuid);
		gd->gd_uschedcp = lp;
		gd->gd_upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
		crit_exit();
#ifdef SMP
		if (gd != mycpu)
			++remote_resched_affinity;
#endif
		return;
	}

	/*
	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
	 * to place this process on the userland scheduler's run queue for
	 * action by the target cpu.
	 */
	++runqcount;
	lp->lwp_proc->p_flag |= P_ONRUNQ;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		pri = (lp->lwp_priority & PRIMASK) / PPQ;
		q = &queues[pri];
		queuebits |= 1 << pri;
		needresched = (queuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &rtqueues[pri];
		rtqueuebits |= 1 << pri;
		needresched = (rtqueuebits & ((1 << pri) - 1));
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		pri = (u_int8_t)lp->lwp_rtprio.prio;
		q = &idqueues[pri];
		idqueuebits |= 1 << pri;
		needresched = (idqueuebits & ((1 << pri) - 1));
	} else {
		needresched = 0;
		panic("setrunqueue: invalid rtprio type");
	}
	KKASSERT(pri < 32);
	lp->lwp_rqindex = pri;		/* remember the queue index */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);

#ifdef SMP
	/*
	 * Either wakeup other cpus' user thread schedulers or request
	 * preemption on other cpus (which will also wakeup a HLTed cpu).
	 *
	 * NOTE!  gd and cpuid may still be our 'hint', not our current
	 * cpu info.
	 */

	count = runqcount;

	/*
	 * Check cpu affinity for user preemption (when the curprocmask bit
	 * is set).  Note that gd_upri is a speculative field (we modify
	 * another cpu's gd_upri to avoid sending ipiq storms).
	 */
	if (gd == mycpu) {
		if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				gd->gd_rrcount = 0;
				need_user_resched();
				--count;
			} else if (gd->gd_uschedcp == lp && needresched) {
				gd->gd_rrcount = 0;
				need_user_resched();
				--count;
			}
		}
	} else if (remote_resched) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
			--count;
			++remote_resched_affinity;
		}
	}

	/*
	 * No affinity; first schedule to any cpus that do not have a current
	 * process.  If there is a free cpu we always schedule to it.
	 */
	if (count &&
	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
	    (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
		if (!mask)
			printf("lwp %d/%d nocpu to schedule it on\n",
			       lp->lwp_proc->p_pid, lp->lwp_tid);
		while (mask && count) {
			cpuid = bsfl(mask);
			KKASSERT((curprocmask & (1 << cpuid)) == 0);
			atomic_clear_int(&rdyprocmask, 1 << cpuid);
			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
			--count;
			mask &= ~(1 << cpuid);
		}
	}

	/*
	 * If there are still runnable processes try to wakeup a random
	 * cpu that is running a much lower priority process in order to
	 * preempt on it.  Note that gd_upri is only a hint, so we can
	 * overwrite it from the wrong cpu.  If we can't find one, we
	 * are SOL.
	 *
	 * We depress the priority check so multiple cpu bound programs
	 * do not bounce between cpus.  Remember that the clock interrupt
	 * will also cause all cpus to reschedule.
	 *
	 * We must mask against rdyprocmask or we will race in the boot
	 * code (before all cpus have working scheduler helpers), plus
	 * some cpus might not be operational and/or not configured to
	 * handle user processes.
	 */
	if (count && remote_resched && ncpus > 1) {
		cpuid = scancpu;
		do {
			if (++cpuid == ncpus)
				cpuid = 0;
		} while (cpuid == mycpu->gd_cpuid);
		scancpu = cpuid;

		if (rdyprocmask & (1 << cpuid)) {
			gd = globaldata_find(cpuid);

			if (lp->lwp_priority < gd->gd_upri - PPQ) {
				gd->gd_upri = lp->lwp_priority;
				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
				++remote_resched_nonaffinity;
			}
		}
	}
#else
	if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
		if (lp->lwp_priority < gd->gd_upri - PPQ) {
			gd->gd_upri = lp->lwp_priority;
			gd->gd_rrcount = 0;
			need_user_resched();
		} else if (gd->gd_uschedcp == lp && needresched) {
			gd->gd_rrcount = 0;
			need_user_resched();
		}
	}
#endif
	crit_exit();
}

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'... there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
	crit_enter();
	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
	--runqcount;
	KKASSERT(runqcount >= 0);
	pri = lp->lwp_rqindex;
	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
		q = &queues[pri];
		which = &queuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
		q = &rtqueues[pri];
		which = &rtqueuebits;
	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
		q = &idqueues[pri];
		which = &idqueuebits;
	} else {
		panic("remrunqueue: invalid rtprio type");
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
	crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
		gd->gd_rrcount = 0;
		need_user_resched();
	}

	/*
	 * As the process accumulates cpu time p_estcpu is bumped and may
	 * push the process into another scheduling queue.  It typically
	 * takes 4 ticks to bump the queue.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);

	/*
	 * Reducing p_origcpu over time causes more of our estcpu to be
	 * returned to the parent when we exit.  This is a small tweak
	 * for the batch detection heuristic.
	 */
	if (lp->lwp_origcpu)
		--lp->lwp_origcpu;

	/* XXX optimize, avoid lock if no reset is required */
	if (try_mplock()) {
		bsd4_resetpriority(lp);
		rel_mplock();
	}
}

/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * usable.
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	int cpuid;
	globaldata_t gd = mycpu;

	KKASSERT(lp->lwp_thread->td_gd == gd);
	crit_enter();
	cpuid = gd->gd_cpuid;

	if (gd->gd_uschedcp == lp) {
		if (try_mplock()) {
			/*
			 * If we can obtain the MP lock we can directly
			 * select the next current process.
			 *
			 * bsd4_select_curproc() will adjust curprocmask
			 * for us.
			 */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			bsd4_select_curproc(gd);
			rel_mplock();
		} else {
			/*
			 * If we cannot obtain the MP lock schedule our
			 * helper thread to select the next current
			 * process.
			 *
			 * This is the only place where we adjust curprocmask
			 * and rdyprocmask without holding the MP lock.
			 */
			gd->gd_uschedcp = NULL;
			gd->gd_upri = PRIBASE_NULL;
			atomic_clear_int(&curprocmask, 1 << cpuid);
			if (runqcount && (rdyprocmask & (1 << cpuid))) {
				atomic_clear_int(&rdyprocmask, 1 << cpuid);
				lwkt_schedule(&mycpu->gd_schedthread);
			}
		}
	}
	crit_exit();
}

/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *	uschedcp designation that it previously held, typically by
 *	calling acquire_curproc() again.
 * NOTE: May not block
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;
	void *old;

	clear_user_resched();
	get_mplock();

	/*
	 * Choose the next designated current user process.
	 * Note that we cannot schedule gd_schedthread
	 * if runqcount is 0 without creating a scheduling
	 * loop.
	 *
	 * We do not clear the user resched request here,
	 * we need to test it later when we re-acquire.
	 *
	 * NOTE: chooseproc returns NULL if the chosen lwp
	 * is gd_uschedcp. XXX needs cleanup.
	 */
	old = gd->gd_uschedcp;
	if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
		atomic_set_int(&curprocmask, 1 << cpuid);
		gd->gd_upri = nlp->lwp_priority;
		gd->gd_uschedcp = nlp;
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else if (gd->gd_uschedcp) {
		gd->gd_upri = gd->gd_uschedcp->lwp_priority;
		KKASSERT(curprocmask & (1 << cpuid));
	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
		/*gd->gd_uschedcp = NULL;*/
		atomic_clear_int(&curprocmask, 1 << cpuid);
		atomic_clear_int(&rdyprocmask, 1 << cpuid);
		lwkt_schedule(&gd->gd_schedthread);
	} else {
		/*gd->gd_uschedcp = NULL;*/
		atomic_clear_int(&curprocmask, 1 << cpuid);
	}
	rel_mplock();
}

/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * This function is only called just prior to returning to user mode.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 *
 * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
 * ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;

	get_mplock();
	crit_enter();

	/*
	 * Recalculate our priority and put us back on the userland
	 * scheduler's runq.
	 *
	 * Only increment the involuntary context switch count if the
	 * setrunqueue call did not immediately schedule us.
	 */
	KKASSERT(lp == gd->gd_curthread->td_lwp);
	bsd4_recalculate_estcpu(lp);
	lwkt_deschedule_self(gd->gd_curthread);
	bsd4_setrunqueue(lp);
	if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
		++lp->lwp_stats->p_ru.ru_nivcsw;
	lwkt_switch();

	/*
	 * Because we put ourselves back on the userland scheduler's run
	 * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
	 */
	gd = mycpu;

	/*
	 * We better be the current process when we wake up, and we had
	 * better not be on the run queue.
	 */
	KKASSERT(gd->gd_uschedcp == lp);
	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);

	crit_exit();
	rel_mplock();
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	int newpriority;
	int opq;
	int npq;

	ASSERT_MP_LOCK_HELD(curthread);

	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}

	/*
	 * NORMAL priorities fall through.  These are based on niceness
	 * and cpu use.  Lower numbers == higher priorities.
	 *
	 * Calculate our priority based on our niceness and estimated cpu.
	 * Note that the nice value adjusts the baseline, which affects
	 * cpu bursts but does not affect overall cpu use between cpu-bound
	 * processes.  The use of the nice field in the decay calculation
	 * controls the overall cpu use.
	 *
	 * This isn't an exact calculation.  We fit the full nice and
	 * estcpu range into the priority range so the actual PPQ value
	 * is incorrect, but it's still a reasonable way to think about it.
	 */
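	/*
	 * Worked example (numbers derived from the constants above, for
	 * illustration only): with PPQ == 4, NICEPPQ == 2, ESTCPUPPQ == 512,
	 * PRIO_RANGE == 41 and ESTCPUMAX == 16384, the denominator below is
	 * 41 * 4 / 2 + 16384 * 4 / 512 = 210.  A nice 0 process at half of
	 * ESTCPUMAX (estcpu 8192) computes (20 * 4 / 2 + 8192 * 4 / 512) *
	 * 128 / 210 = 63, which lands in normal-class queue 63 / PPQ = 15.
	 */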
	newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
	newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
	newpriority = newpriority * MAXPRI /
		      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
	newpriority = MAX(newpriority, 0);		/* sanity */
	npq = newpriority / PPQ;
	crit_enter();
	opq = (lp->lwp_priority & PRIMASK) / PPQ;
	if (lp->lwp_proc->p_stat == SRUN &&
	    (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		bsd4_remrunqueue(lp);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
		bsd4_setrunqueue(lp);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
	}
	crit_exit();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
	lp->lwp_origcpu = lp->lwp_estcpu;
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
}

/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *plp, struct lwp *lp)
{
	int delta;

	if (plp->lwp_proc->p_pid != 1) {
		delta = lp->lwp_estcpu - lp->lwp_origcpu;
		if (delta > 0)
			plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
	}
}

/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	int loadfac;
	int ndecay;
	int nticks;
	int nleft;

	ASSERT_MP_LOCK_HELD(curthread);

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.
		 */
		nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
		updatepcpu(lp, lp->lwp_cpticks, nticks);

		if ((nleft = nticks - lp->lwp_cpticks) < 0)
			nleft = 0;
		if (usched_debug == lp->lwp_proc->p_pid) {
			printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
				lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
				lp->lwp_cpticks, nticks, nleft);
		}

		/*
		 * Calculate a decay value based on ticks remaining scaled
		 * down by the instantaneous load and p_nice.
		 */
		if ((loadfac = runqcount) < 2)
			loadfac = 2;
		ndecay = nleft * usched_bsd4_decay * 2 *
			 (PRIO_MAX * 2 - lp->lwp_proc->p_nice) /
			 (loadfac * PRIO_MAX * 2);

		/*
		 * Adjust p_estcpu.
		 * Handle a border case where batch jobs can get stalled
		 * long enough to decay to zero when they shouldn't.
		 */
		if (lp->lwp_estcpu > ndecay * 2)
			lp->lwp_estcpu -= ndecay;
		else
			lp->lwp_estcpu >>= 1;

		if (usched_debug == lp->lwp_proc->p_pid)
			printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);

		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
	}
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd = mycpu;
	int cpuid = gd->gd_cpuid;		/* doesn't change */
	u_int32_t cpumask = 1 << cpuid;		/* doesn't change */

	ASSERT_MP_LOCK_HELD(curthread);
	for (;;) {
		struct lwp *nlp;

		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		atomic_set_int(&rdyprocmask, cpumask);
		crit_enter_quick(gd->gd_curthread);
		if ((curprocmask & cpumask) == 0 &&
		    (nlp = chooseproc(NULL)) != NULL) {
			atomic_set_int(&curprocmask, cpumask);
			gd->gd_upri = nlp->lwp_priority;
			gd->gd_uschedcp = nlp;
			lwkt_acquire(nlp->lwp_thread);
			lwkt_schedule(nlp->lwp_thread);
		}
		crit_exit_quick(gd->gd_curthread);
		lwkt_switch();
	}
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		printf("start scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		globaldata_t dgd = globaldata_find(i);
		cpumask_t mask = 1 << i;

		if ((mask & smp_active_mask) == 0)
			continue;

		if (bootverbose)
			printf(" %d", i);

		lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
			    TDF_STOPREQ, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in rqinit().
		 */
		if (i)
			atomic_clear_int(&curprocmask, mask);
		atomic_set_int(&rdyprocmask, mask);
	}
	if (bootverbose)
		printf("\n");
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif