/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.1 2005/06/30 16:38:49 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI                  128
#define PRIMASK                 (MAXPRI - 1)
#define PRIBASE_REALTIME        0
#define PRIBASE_NORMAL          MAXPRI
#define PRIBASE_IDLE            (MAXPRI * 2)
#define PRIBASE_THREAD          (MAXPRI * 3)
#define PRIBASE_NULL            (MAXPRI * 4)

#define NQS     32                      /* 32 run queues. */
#define PPQ     (MAXPRI / NQS)          /* priorities per queue */

/*
 * NICEPPQ      - number of nice units per priority queue
 * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ    - number of estcpu units per priority queue
 * ESTCPUMAX    - number of estcpu units
 * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick at
 *                100% cpu.
 */
#define NICEPPQ         2
#define ESTCPURAMP      4
#define ESTCPUPPQ       512
#define ESTCPUMAX       (ESTCPUPPQ * NQS)
#define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)    min((v), ESTCPUMAX)
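
/*
 * Derived values, for reference: ESTCPUMAX = 512 * 32 = 16384 estcpu
 * units and ESTCPUINCR = 512 / 4 = 128, so a process running at 100%
 * cpu accumulates one priority queue's worth of estcpu (ESTCPUPPQ)
 * every ESTCPURAMP = 4 scheduler ticks.  Assuming the usual
 * <sys/resource.h> values PRIO_MIN = -20 and PRIO_MAX = 20, PRIO_RANGE
 * evaluates to 41 nice levels.
 */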

TAILQ_HEAD(rq, proc);

#define p_priority      p_usdata.bsd4.priority
#define p_rqindex       p_usdata.bsd4.rqindex
#define p_origcpu       p_usdata.bsd4.origcpu
#define p_estcpu        p_usdata.bsd4.estcpu

static void bsd4_acquire_curproc(struct proc *p);
static void bsd4_release_curproc(struct proc *p);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct proc *p);
static void bsd4_remrunqueue(struct proc *p);
static void bsd4_schedulerclock(struct proc *p, sysclock_t period,
                                sysclock_t cpstamp);
static void bsd4_resetpriority(struct proc *p);
static void bsd4_forking(struct proc *pp, struct proc *p);
static void bsd4_exiting(struct proc *pp, struct proc *p);

static void bsd4_recalculate_estcpu(struct proc *p);

struct usched usched_bsd4 = {
        { NULL },
        "bsd4", "Original DragonFly Scheduler",
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_select_curproc,
        bsd4_setrunqueue,
        bsd4_remrunqueue,
        bsd4_schedulerclock,
        bsd4_recalculate_estcpu,
        bsd4_resetpriority,
        bsd4_forking,
        bsd4_exiting
};

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
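
/*
 * Example of the bitmask encoding described above: if only queues 3
 * and 5 hold processes, queuebits == 0x00000028 and bsfl(queuebits)
 * returns 3, so a single word scan locates the best (lowest-index)
 * non-empty queue.
 */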

static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;      /* currently running a user process */
static cpumask_t rdyprocmask;           /* ready to accept a user process */
static int       runqcount;
#ifdef SMP
static int       scancpu;
#endif

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
#ifdef SMP
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
        &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
        int i;

        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);
        }
        curprocmask &= ~1;
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.  If chkp is non-NULL and chkp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chkp test has to be strict or we may
 * bounce between processes trying to acquire the current process
 * designation.
 */
static
struct proc *
chooseproc(struct proc *chkp)
{
        struct proc *p;
        struct rq *q;
        u_int32_t *which;
        u_int32_t pri;

        if (rtqueuebits) {
                pri = bsfl(rtqueuebits);
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
                q = &queues[pri];
                which = &queuebits;
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                return NULL;
        }
        p = TAILQ_FIRST(q);
        KASSERT(p, ("chooseproc: no proc on busy queue"));

        /*
         * If the passed process <chkp> is reasonably close to the selected
         * process <p>, return NULL (indicating that <chkp> should be kept).
         *
         * Note that we must err on the side of <chkp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chkp) {
                if (chkp->p_priority < p->p_priority + PPQ)
                        return(NULL);
        }
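
        /*
         * Worked example of the hysteresis test above: with PPQ = 4, if
         * <p> was taken from a queue giving it priority 48, a non-NULL
         * <chkp> is kept (NULL returned) unless chkp->p_priority >= 52,
         * i.e. unless <chkp> is at least a full queue worse than <p>.
         */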

#ifdef SMP
        /*
         * If the chosen process does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check; setrunqueue() tries to wake up the
         * correct cpu and is our front-line affinity.
         */
        if (p->p_thread->td_gd != mycpu &&
            (chkp = TAILQ_NEXT(p, p_procq)) != NULL
        ) {
                if (chkp->p_thread->td_gd == mycpu) {
                        ++choose_affinity;
                        p = chkp;
                }
        }
#endif

        TAILQ_REMOVE(q, p, p_procq);
        --runqcount;
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        p->p_flag &= ~P_ONRUNQ;
        return p;
}

#ifdef SMP
/*
 * Called via an ipi message to reschedule on another cpu.
 */
static
void
need_user_resched_remote(void *dummy)
{
        need_user_resched();
}

#endif

/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
static void
bsd4_setrunqueue(struct proc *p)
{
        struct rq *q;
        struct globaldata *gd;
        int pri;
        int cpuid;
        u_int32_t needresched;
#ifdef SMP
        int count;
        cpumask_t mask;
#endif

        crit_enter();
        KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((p->p_flag & P_ONRUNQ) == 0,
            ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
        KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
         */
        gd = p->p_thread->td_gd;
        p->p_slptime = 0;

        /*
         * We have not been released, make sure that we are not the currently
         * designated process.
         */
        KKASSERT(gd->gd_uschedcp != p);

        /*
         * Check cpu affinity.  The associated thread is stable at the
         * moment.  Note that we may be checking another cpu here so we
         * have to be careful.  We are currently protected by the BGL.
         *
         * This allows us to avoid actually queueing the process.
         * acquire_curproc() will handle any threads we mistakenly schedule.
         */
        cpuid = gd->gd_cpuid;

        if ((curprocmask & (1 << cpuid)) == 0) {
                curprocmask |= 1 << cpuid;
                gd->gd_uschedcp = p;
                gd->gd_upri = p->p_priority;
                lwkt_schedule(p->p_thread);
                /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
                crit_exit();
#ifdef SMP
                if (gd != mycpu)
                        ++remote_resched_affinity;
#endif
                return;
        }
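
        /*
         * Note on the fast path above: a clear curprocmask bit means the
         * target cpu has no designated user process at all, so the process
         * is handed straight to the LWKT scheduler and never touches the
         * run queues or the wakeup logic below.
         */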

        /*
         * gd and cpuid may still 'hint' at another cpu.  Even so we have
         * to place this process on the userland scheduler's run queue for
         * action by the target cpu.
         */
        ++runqcount;
        p->p_flag |= P_ONRUNQ;
        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
                pri = (p->p_priority & PRIMASK) / PPQ;
                q = &queues[pri];
                queuebits |= 1 << pri;
                needresched = (queuebits & ((1 << pri) - 1));
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)p->p_rtprio.prio;
                q = &rtqueues[pri];
                rtqueuebits |= 1 << pri;
                needresched = (rtqueuebits & ((1 << pri) - 1));
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)p->p_rtprio.prio;
                q = &idqueues[pri];
                idqueuebits |= 1 << pri;
                needresched = (idqueuebits & ((1 << pri) - 1));
        } else {
                needresched = 0;
                panic("setrunqueue: invalid rtprio type");
        }
        KKASSERT(pri < 32);
        p->p_rqindex = pri;     /* remember the queue index */
        TAILQ_INSERT_TAIL(q, p, p_procq);
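
        /*
         * The needresched computations above test whether any better
         * (lower-index) queue in the same class is already occupied.
         * For example, with pri = 5 the mask (1 << 5) - 1 == 0x1f covers
         * queues 0-4, so needresched is non-zero exactly when a higher
         * priority process is already waiting.
         */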

#ifdef SMP
        /*
         * Either wake up other cpus' user thread schedulers or request
         * preemption on other cpus (which will also wakeup a HLT).
         *
         * NOTE! gd and cpuid may still be our 'hint', not our current
         * cpu info.
         */

        count = runqcount;

        /*
         * Check cpu affinity for user preemption (when the curprocmask bit
         * is set).  Note that gd_upri is a speculative field (we modify
         * another cpu's gd_upri to avoid sending ipiq storms).
         */
        if (gd == mycpu) {
                if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
                        if (p->p_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = p->p_priority;
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        } else if (gd->gd_uschedcp == p && needresched) {
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        }
                }
        } else if (remote_resched) {
                if (p->p_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = p->p_priority;
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                        --count;
                        ++remote_resched_affinity;
                }
        }

        /*
         * No affinity; first schedule to any cpus that do not have a
         * current process.  If there is a free cpu we always schedule to it.
         */
        if (count &&
            (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (p->p_flag & P_PASSIVE_ACQ) == 0) {
                if (!mask)
                        printf("PROC %d nocpu to schedule it on\n", p->p_pid);
                while (mask && count) {
                        cpuid = bsfl(mask);
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        --count;
                        mask &= ~(1 << cpuid);
                }
        }

        /*
         * If there are still runnable processes try to wake up a random
         * cpu that is running a much lower priority process in order to
         * preempt on it.  Note that gd_upri is only a hint, so we can
         * overwrite it from the wrong cpu.  If we can't find one, we
         * are SOL.
         *
         * We depress the priority check so multiple cpu bound programs
         * do not bounce between cpus.  Remember that the clock interrupt
         * will also cause all cpus to reschedule.
         *
         * We must mask against rdyprocmask or we will race in the boot
         * code (before all cpus have working scheduler helpers), plus
         * some cpus might not be operational and/or not configured to
         * handle user processes.
         */
        if (count && remote_resched && ncpus > 1) {
                cpuid = scancpu;
                do {
                        if (++cpuid == ncpus)
                                cpuid = 0;
                } while (cpuid == mycpu->gd_cpuid);
                scancpu = cpuid;

                if (rdyprocmask & (1 << cpuid)) {
                        gd = globaldata_find(cpuid);

                        if (p->p_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = p->p_priority;
                                lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                                ++remote_resched_nonaffinity;
                        }
                }
        }
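
        /*
         * Example of the rotor above: on a 4-cpu system with scancpu == 1
         * and mycpu == cpu 2, the loop advances 1 -> 2 (skipped, our own
         * cpu) -> 3, so cpu 3 is probed this time and the rotor resumes
         * from there on the next pass.
         */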
#else
        if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
                if (p->p_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = p->p_priority;
                        gd->gd_rrcount = 0;
                        need_user_resched();
                } else if (gd->gd_uschedcp == p && needresched) {
                        gd->gd_rrcount = 0;
                        need_user_resched();
                }
        }
#endif
        crit_exit();
}

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct proc *p)
{
        struct rq *q;
        u_int32_t *which;
        u_int8_t pri;

        crit_enter();
        KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        p->p_flag &= ~P_ONRUNQ;
        --runqcount;
        KKASSERT(runqcount >= 0);
        pri = p->p_rqindex;
        if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
                q = &queues[pri];
                which = &queuebits;
        } else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
                   p->p_rtprio.type == RTP_PRIO_FIFO) {
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                panic("remrunqueue: invalid rtprio type");
        }
        TAILQ_REMOVE(q, p, p_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                        ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
        }
        crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
static
void
bsd4_schedulerclock(struct proc *p, sysclock_t period, sysclock_t cpstamp)
{
        globaldata_t gd = mycpu;

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
                gd->gd_rrcount = 0;
                need_user_resched();
        }

        /*
         * As the process accumulates cpu time p_estcpu is bumped and may
         * push the process into another scheduling queue.  It typically
         * takes 4 ticks to bump the queue.
         */
        p->p_estcpu = ESTCPULIM(p->p_estcpu + ESTCPUINCR);

        /*
         * Reducing p_origcpu over time causes more of our estcpu to be
         * returned to the parent when we exit.  This is a small tweak
         * for the batch detection heuristic.
         */
        if (p->p_origcpu)
                --p->p_origcpu;

        /* XXX optimize, avoid lock if no reset is required */
        if (try_mplock()) {
                bsd4_resetpriority(p);
                rel_mplock();
        }
}
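
/*
 * Sizing note for the clock handler above: usched_bsd4_rrinterval is
 * initialized to (ESTCPUFREQ + 9) / 10, i.e. roughly one tenth of the
 * scheduler clock frequency, which is what makes the round-robin fire
 * about 10 times a second as the comment says.  Each tick also adds
 * ESTCPUINCR = 128 estcpu, so a fully cpu-bound process crosses one
 * ESTCPUPPQ = 512 queue boundary every 4 ticks.
 */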

/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().  try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * useable.
 */
static void
bsd4_release_curproc(struct proc *p)
{
        int cpuid;
        globaldata_t gd = mycpu;

        KKASSERT(p->p_thread->td_gd == gd);
        crit_enter();
        cpuid = gd->gd_cpuid;

        if (gd->gd_uschedcp == p) {
                if (try_mplock()) {
                        /*
                         * YYY when the MP lock is not assumed (see else) we
                         * will have to check that gd_uschedcp is still == p
                         * after acquisition of the MP lock
                         */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        bsd4_select_curproc(gd);
                        rel_mplock();
                } else {
                        KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        /* YYY uschedcp and curprocmask */
                        if (runqcount && (rdyprocmask & (1 << cpuid))) {
                                rdyprocmask &= ~(1 << cpuid);
                                lwkt_schedule(&mycpu->gd_schedthread);
                        }
                }
        }
        crit_exit();
}

/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *       uschedcp designation that it previously held, typically by
 *       calling acquire_curproc() again.
 * NOTE: May not block
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
        struct proc *np;
        int cpuid = gd->gd_cpuid;
        void *old;

        clear_user_resched();

        /*
         * Choose the next designated current user process.
         * Note that we cannot schedule gd_schedthread
         * if runqcount is 0 without creating a scheduling
         * loop.
         *
         * NOTE: chooseproc returns NULL if the chosen proc
         * is gd_uschedcp.  XXX needs cleanup.
         */
        old = gd->gd_uschedcp;
        if ((np = chooseproc(gd->gd_uschedcp)) != NULL) {
                curprocmask |= 1 << cpuid;
                gd->gd_upri = np->p_priority;
                gd->gd_uschedcp = np;
                lwkt_acquire(np->p_thread);
                lwkt_schedule(np->p_thread);
        } else if (gd->gd_uschedcp) {
                gd->gd_upri = gd->gd_uschedcp->p_priority;
                KKASSERT(curprocmask & (1 << cpuid));
        } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
                rdyprocmask &= ~(1 << cpuid);
                lwkt_schedule(&gd->gd_schedthread);
        } else {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
        }
}

/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curproc does not match gd_uschedcp.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curproc
 * designation to be handed to us (if the setrunqueue didn't do it).
 */
static void
bsd4_acquire_curproc(struct proc *p)
{
        globaldata_t gd = mycpu;

        crit_enter();
        ++p->p_stats->p_ru.ru_nivcsw;

        /*
         * Loop until we become the current process.
         */
        do {
                KKASSERT(p == gd->gd_curthread->td_proc);
                bsd4_recalculate_estcpu(p);
                lwkt_deschedule_self(gd->gd_curthread);
                bsd4_setrunqueue(p);
                lwkt_switch();

                /*
                 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
                 */
                gd = mycpu;
        } while (gd->gd_uschedcp != p);

        crit_exit();

        /*
         * That's it.  Cleanup, we are done.  The caller can return to
         * user mode now.
         */
        KKASSERT((p->p_flag & P_ONRUNQ) == 0);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct proc *p)
{
        int newpriority;
        int opq;
        int npq;

        /*
         * Set p_priority for general process comparisons
         */
        switch(p->p_rtprio.type) {
        case RTP_PRIO_REALTIME:
                p->p_priority = PRIBASE_REALTIME + p->p_rtprio.prio;
                return;
        case RTP_PRIO_NORMAL:
                break;
        case RTP_PRIO_IDLE:
                p->p_priority = PRIBASE_IDLE + p->p_rtprio.prio;
                return;
        case RTP_PRIO_THREAD:
                p->p_priority = PRIBASE_THREAD + p->p_rtprio.prio;
                return;
        }

        /*
         * NORMAL priorities fall through.  These are based on niceness
         * and cpu use.  Lower numbers == higher priorities.
         *
         * Calculate our priority based on our niceness and estimated cpu.
         * Note that the nice value adjusts the baseline, which affects
         * cpu bursts but does not affect overall cpu use between cpu-bound
         * processes.  The use of the nice field in the decay calculation
         * controls the overall cpu use.
         *
         * This isn't an exact calculation.  We fit the full nice and
         * estcpu range into the priority range so the actual PPQ value
         * is incorrect, but it's still a reasonable way to think about it.
         */
        newpriority = (p->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
        newpriority += p->p_estcpu * PPQ / ESTCPUPPQ;
        newpriority = newpriority * MAXPRI /
                    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
        newpriority = MAX(newpriority, 0);              /* sanity */
        npq = newpriority / PPQ;
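
        /*
         * Worked example of the scaling above (assuming the usual
         * PRIO_MIN = -20, PRIO_MAX = 20): the denominator is
         * 41*4/2 + 16384*4/512 = 82 + 128 = 210.  The nicest idle
         * process (nice -20, estcpu 0) maps to priority 0, queue 0,
         * while a nice +20 process at ESTCPUMAX maps to
         * (80 + 128) * 128 / 210 = 126, queue 31.
         */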
        crit_enter();
        opq = (p->p_priority & PRIMASK) / PPQ;
        if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
                /*
                 * We have to move the process to another queue
                 */
                bsd4_remrunqueue(p);
                p->p_priority = PRIBASE_NORMAL + newpriority;
                bsd4_setrunqueue(p);
        } else {
                /*
                 * We can just adjust the priority and it will be picked
                 * up later.
                 */
                KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
                p->p_priority = PRIBASE_NORMAL + newpriority;
        }
        crit_exit();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
 */
static void
bsd4_forking(struct proc *pp, struct proc *p)
{
        p->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);
        p->p_origcpu = p->p_estcpu;
        pp->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);
}

/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
static void
bsd4_exiting(struct proc *pp, struct proc *p)
{
        int delta;

        if (pp->p_pid != 1) {
                delta = p->p_estcpu - p->p_origcpu;
                if (delta > 0)
                        pp->p_estcpu = ESTCPULIM(pp->p_estcpu + delta);
        }
}

/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static
void
bsd4_recalculate_estcpu(struct proc *p)
{
        globaldata_t gd = mycpu;
        sysclock_t cpbase;
        int loadfac;
        int ndecay;
        int nticks;
        int nleft;

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (p->p_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                p->p_estcpu = p->p_estcpu >> 1;
                bsd4_resetpriority(p);
                p->p_cpbase = cpbase;
                p->p_cpticks = 0;
        } else if (p->p_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.
                 */
                nticks = (cpbase - p->p_cpbase) / gd->gd_schedclock.periodic;
                updatepcpu(p, p->p_cpticks, nticks);

                if ((nleft = nticks - p->p_cpticks) < 0)
                        nleft = 0;
                if (usched_debug == p->p_pid) {
                        printf("pid %d estcpu %d cpticks %d nticks %d nleft %d",
                                p->p_pid, p->p_estcpu,
                                p->p_cpticks, nticks, nleft);
                }

                /*
                 * Calculate a decay value based on ticks remaining scaled
                 * down by the instantaneous load and p_nice.
                 */
                if ((loadfac = runqcount) < 2)
                        loadfac = 2;
                ndecay = nleft * usched_bsd4_decay * 2 *
                         (PRIO_MAX * 2 - p->p_nice) / (loadfac * PRIO_MAX * 2);
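
                /*
                 * Worked example of the decay formula: with the default
                 * usched_bsd4_decay = ESTCPUINCR / 2 = 64 and p_nice = 0,
                 * the nice ratio (PRIO_MAX*2 - 0) / (PRIO_MAX*2) cancels
                 * to 1, leaving ndecay = nleft * 128 / loadfac.  An
                 * otherwise idle system (loadfac clamped to 2) therefore
                 * decays 64 estcpu per unused tick; larger runqcount
                 * loads shrink the decay proportionally.
                 */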

                /*
                 * Adjust p_estcpu.  Handle a border case where batch jobs
                 * can get stalled long enough to decay to zero when they
                 * shouldn't.
                 */
                if (p->p_estcpu > ndecay * 2)
                        p->p_estcpu -= ndecay;
                else
                        p->p_estcpu >>= 1;

                if (usched_debug == p->p_pid)
                        printf(" ndecay %d estcpu %d\n", ndecay, p->p_estcpu);

                bsd4_resetpriority(p);
                p->p_cpbase = cpbase;
                p->p_cpticks = 0;
        }
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
        globaldata_t gd = mycpu;
        int cpuid = gd->gd_cpuid;               /* doesn't change */
        u_int32_t cpumask = 1 << cpuid;         /* doesn't change */

        get_mplock();                   /* hold the MP lock */
        for (;;) {
                struct proc *np;

                lwkt_deschedule_self(gd->gd_curthread); /* interlock */
                rdyprocmask |= cpumask;
                crit_enter_quick(gd->gd_curthread);
                if ((curprocmask & cpumask) == 0 &&
                    (np = chooseproc(NULL)) != NULL) {
                        curprocmask |= cpumask;
                        gd->gd_upri = np->p_priority;
                        gd->gd_uschedcp = np;
                        lwkt_acquire(np->p_thread);
                        lwkt_schedule(np->p_thread);
                }
                crit_exit_quick(gd->gd_curthread);
                lwkt_switch();
        }
}
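
/*
 * Note on the interlock in the helper loop above: the thread deschedules
 * itself before setting its rdyprocmask bit, so a wakeup sent by
 * setrunqueue() any time after the bit becomes visible simply makes the
 * subsequent lwkt_switch() return immediately and the loop rescans the
 * run queues.  The intent is that no wakeup can be lost in the window
 * between the mask update and the switch.
 */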

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
        int i;

        if (bootverbose)
                printf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                globaldata_t dgd = globaldata_find(i);
                cpumask_t mask = 1 << i;

                if ((mask & smp_active_mask) == 0)
                        continue;

                if (bootverbose)
                        printf(" %d", i);

                lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
                            TDF_STOPREQ, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has
                 * already been enabled in rqinit().
                 */
                if (i)
                        curprocmask &= ~mask;
                rdyprocmask |= mask;
        }
        if (bootverbose)
                printf("\n");
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif