/*
 * Copyright (c) 2012-2017 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>,
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * and many others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

int dfly_rebalanced;

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICE_QS	- maximum queues nice can shift the process
 * EST_QS	- maximum queues estcpu can shift the process
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 *
 * Remember that NICE runs over the whole -20 to +20 range.
 */
#define NICE_QS		24	/* -20 to +20 shift in whole queues */
#define EST_QS		20	/* 0-MAX shift in whole queues */
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * EST_QS)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
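/*
 * Illustrative arithmetic (editor's sketch, not part of the scheduler
 * proper): with MAXPRI 128 and NQS 32, PPQ works out to 4, so a
 * normal-class lwp_priority maps onto a run queue by dividing the
 * masked priority by PPQ.  ESTCPUMAX works out to
 * ESTCPUPPQ * EST_QS = 512 * 20 = 10240.
 */
#if 0
/* Example only: how a priority would map to a queue index. */
static __inline int
example_rqindex(int priority)
{
	return ((priority & PRIMASK) / PPQ);	/* e.g. 57 -> queue 14 */
}
#endif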
TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.dfly.priority
#define lwp_forked	lwp_usdata.dfly.forked
#define lwp_rqindex	lwp_usdata.dfly.rqindex
#define lwp_estcpu	lwp_usdata.dfly.estcpu
#define lwp_estfast	lwp_usdata.dfly.estfast
#define lwp_uload	lwp_usdata.dfly.uload
#define lwp_rqtype	lwp_usdata.dfly.rqtype
#define lwp_qcpu	lwp_usdata.dfly.qcpu
#define lwp_rrcount	lwp_usdata.dfly.rrcount

/*
 * DFly scheduler pcpu structure.  Note that the pcpu uload field must
 * be 64-bits to avoid overflowing in the situation where more than 32768
 * processes are on a single cpu's queue.  Since high-end systems can
 * easily run 900,000+ processes, we have to deal with it.
 */
struct usched_dfly_pcpu {
	struct spinlock	spin;
	struct thread	*helper_thread;
	u_short		scancpu;
	short		upri;
	long		uload;		/* 64-bits to avoid overflow (1) */
	int		ucount;
	int		flags;
	struct lwp	*uschedcp;
	struct rq	queues[NQS];
	struct rq	rtqueues[NQS];
	struct rq	idqueues[NQS];
	u_int32_t	queuebits;
	u_int32_t	rtqueuebits;
	u_int32_t	idqueuebits;
	int		runqcount;
	int		cpuid;
	cpumask_t	cpumask;
	cpu_node_t	*cpunode;
} __cachealign;

/*
 * Reflecting bits in the global atomic masks allows us to avoid
 * a certain degree of global ping-ponging.
 */
#define DFLY_PCPU_RDYMASK	0x0001	/* reflect rdyprocmask */
#define DFLY_PCPU_CURMASK	0x0002	/* reflect curprocmask */

typedef struct usched_dfly_pcpu	*dfly_pcpu_t;

static void dfly_acquire_curproc(struct lwp *lp);
static void dfly_release_curproc(struct lwp *lp);
static void dfly_select_curproc(globaldata_t gd);
static void dfly_setrunqueue(struct lwp *lp);
static void dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp);
static void dfly_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void dfly_recalculate_estcpu(struct lwp *lp);
static void dfly_resetpriority(struct lwp *lp);
static void dfly_forking(struct lwp *plp, struct lwp *lp);
static void dfly_exiting(struct lwp *lp, struct proc *);
static void dfly_uload_update(struct lwp *lp);
static void dfly_yield(struct lwp *lp);
static void dfly_changeqcpu_locked(struct lwp *lp,
				dfly_pcpu_t dd, dfly_pcpu_t rdd);
static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
static void dfly_need_user_resched_remote(void *dummy);
static struct lwp *dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
					struct lwp *chklp, int worst);
static void dfly_remrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
static void dfly_setrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
static void dfly_changedcpu(struct lwp *lp);

struct usched usched_dfly = {
	{ NULL },
	"dfly", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	dfly_acquire_curproc,
	dfly_release_curproc,
	dfly_setrunqueue,
	dfly_schedulerclock,
	dfly_recalculate_estcpu,
	dfly_resetpriority,
	dfly_forking,
	dfly_exiting,
	dfly_uload_update,
	NULL,			/* setcpumask not supported */
	dfly_yield,
	dfly_changedcpu
};
/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then an ffs() to find the first busy
 * queue.
 *
 * curprocmask is used to publish cpus with assigned curprocs to the rest
 * of the cpus.  In certain situations curprocmask may leave a bit set
 * (e.g. a yield or a token-based yield) even though dd->uschedcp is
 * NULL'd out temporarily.
 */
/* currently running a user process */
static cpumask_t dfly_curprocmask = CPUMASK_INITIALIZER_ALLONES;
static cpumask_t dfly_rdyprocmask;	/* ready to accept a user process */
static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
static struct sysctl_oid *usched_dfly_sysctl_tree;

/* Debug info exposed through debug.* sysctl */

static int usched_dfly_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, dfly_scdebug, CTLFLAG_RW,
	   &usched_dfly_debug, 0,
	   "Print debug information for this pid");

static int usched_dfly_pid_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, dfly_pid_debug, CTLFLAG_RW,
	   &usched_dfly_pid_debug, 0,
	   "Print KTR debug information for this pid");

static int usched_dfly_chooser = 0;
SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
	   &usched_dfly_chooser, 0,
	   "Print KTR debug information for this pid");

/*
 * WARNING!
 *
 * The fork bias can have a large effect on the system in the face of a
 * make -j N or other high-forking applications.
 *
 * Larger values are much less invasive vs other things that
 * might be running in the system, but can cause exec chains
 * such as those typically generated by make to have higher
 * latencies in the face of modest load.
 *
 * Lower values are more invasive but have reduced latencies
 * for such exec chains.
 *
 *	make -j 10 buildkernel example, build times:
 *
 *	     +0	3:04
 *	     +1	3:14	-5.2%	<-- default
 *	     +2	3:22	-8.9%
 *
 * This issue occurs due to the way the scheduler affinity heuristics work.
 * There is no way to really 'fix' the affinity heuristics because when it
 * comes right down to it trying to instantly schedule a process on an
 * available cpu (even if it will become unavailable a microsecond later)
 * tends to cause processes to shift around between cpus and sockets too much
 * and breaks the affinity.
 *
 * NOTE: Heavily concurrent builds typically have enough things on the pan
 *	 that they remain time-efficient even with a higher bias.
 */
static int usched_dfly_forkbias = 1;
SYSCTL_INT(_debug, OID_AUTO, dfly_forkbias, CTLFLAG_RW,
	   &usched_dfly_forkbias, 0,
	   "Fork bias for estcpu in whole queues");
/*
 * Tuning usched_dfly - configurable through kern.usched_dfly.
 *
 * weight1 - Tries to keep threads on their current cpu.  If you
 *	     make this value too large the scheduler will not be
 *	     able to load-balance large loads.
 *
 * weight2 - If non-zero, detects thread pairs undergoing synchronous
 *	     communications and tries to move them closer together.
 *	     Behavior is adjusted by bit 4 of features (0x10).
 *
 *	     WARNING!  Weight2 is a ridiculously sensitive parameter,
 *	     a small value is recommended.
 *
 * weight3 - Weighting based on the number of recently runnable threads
 *	     on the userland scheduling queue (ignoring their loads).
 *	     A nominal value here prevents high-priority (low-load)
 *	     threads from accumulating on one cpu core when other
 *	     cores are available.
 *
 *	     This value should be left fairly small relative to weight1
 *	     and weight4.
 *
 * weight4 - Weighting based on other cpu queues being available
 *	     or running processes with higher lwp_priority's.
 *
 *	     This allows a thread to migrate to another nearby cpu if it
 *	     is unable to run on the current cpu based on the other cpu
 *	     being idle or running a lower priority (higher lwp_priority)
 *	     thread.  This value should be large enough to override weight1.
 *
 * features - These flags can be set or cleared to enable or disable various
 *	      features.
 *
 *	      0x01	Enable idle-cpu pulling			(default)
 *	      0x02	Enable proactive pushing		(default)
 *	      0x04	Enable rebalancing rover		(default)
 *	      0x08	Enable more proactive pushing		(default)
 *	      0x10	(flip weight2 limit on same cpu)	(default)
 *	      0x20	choose best cpu for forked process
 *	      0x40	choose current cpu for forked process
 *	      0x80	choose random cpu for forked process	(default)
 */
static int usched_dfly_smt = 0;
static int usched_dfly_cache_coherent = 0;
static int usched_dfly_weight1 = 200;	/* keep thread on current cpu */
static int usched_dfly_weight2 = 180;	/* synchronous peer's current cpu */
static int usched_dfly_weight3 = 40;	/* number of threads on queue */
static int usched_dfly_weight4 = 160;	/* availability of idle cores */
static int usched_dfly_features = 0x8F;	/* allow pulls */
static int usched_dfly_fast_resched = PPQ / 2; /* delta priority / resched */
static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */
static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
static int usched_dfly_decay = 8;

/* KTR debug printings */

KTR_INFO_MASTER(usched);

#if !defined(KTR_USCHED_DFLY)
#define	KTR_USCHED_DFLY	KTR_ALL
#endif

KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
    "USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int old_cpuid, int curr);
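/*
 * Worked numbers (editor's note, illustration only): usched_dfly_rrinterval
 * is (ESTCPUFREQ + 9) / 10 scheduler ticks, i.e. roughly one tenth of the
 * ESTCPUFREQ tick rate, which is what produces the "round-robin 10 times
 * a second" behavior described in dfly_schedulerclock() below.
 */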
/*
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 */
static void
dfly_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	dfly_pcpu_t dd;
	dfly_pcpu_t rdd;
	thread_t td;
	int force_resched;

	/*
	 * Make sure we aren't sitting on a tsleep queue.
	 */
	td = lp->lwp_thread;
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	dfly_recalculate_estcpu(lp);

	gd = mycpu;
	dd = &dfly_pcpu[gd->gd_cpuid];

	/*
	 * Process any pending interrupts/ipi's, then handle reschedule
	 * requests.  dfly_release_curproc() will try to assign a new
	 * uschedcp that isn't us and otherwise NULL it out.
	 */
	force_resched = 0;
	if ((td->td_mpflags & TDF_MP_BATCH_DEMARC) &&
	    lp->lwp_rrcount >= usched_dfly_rrinterval / 2) {
		force_resched = 1;
	}

	if (user_resched_wanted()) {
		if (dd->uschedcp == lp)
			force_resched = 1;
		clear_user_resched();
		dfly_release_curproc(lp);
	}

	/*
	 * Loop until we are the current user thread.
	 *
	 * NOTE: dd spinlock not held at top of loop.
	 */
	if (dd->uschedcp == lp)
		lwkt_yield_quick();

	while (dd->uschedcp != lp) {
		lwkt_yield_quick();

		spin_lock(&dd->spin);

		/* This lwp is an outcast; force reschedule. */
		if (__predict_false(
		    CPUMASK_TESTBIT(lp->lwp_cpumask, gd->gd_cpuid) == 0) &&
		    (rdd = dfly_choose_best_queue(lp)) != dd) {
			dfly_changeqcpu_locked(lp, dd, rdd);
			spin_unlock(&dd->spin);
			lwkt_deschedule(lp->lwp_thread);
			dfly_setrunqueue_dd(rdd, lp);
			lwkt_switch();
			gd = mycpu;
			dd = &dfly_pcpu[gd->gd_cpuid];
			continue;
		}

		if (force_resched &&
		    (usched_dfly_features & 0x08) &&
		    (rdd = dfly_choose_best_queue(lp)) != dd) {
			/*
			 * We are not or are no longer the current lwp and a
			 * forced reschedule was requested.  Figure out the
			 * best cpu to run on (our current cpu will be given
			 * significant weight).
			 *
			 * (if a reschedule was not requested we want to
			 *  move this step after the uschedcp tests).
			 */
			dfly_changeqcpu_locked(lp, dd, rdd);
			spin_unlock(&dd->spin);
			lwkt_deschedule(lp->lwp_thread);
			dfly_setrunqueue_dd(rdd, lp);
			lwkt_switch();
			gd = mycpu;
			dd = &dfly_pcpu[gd->gd_cpuid];
			continue;
		}

		/*
		 * Either no reschedule was requested or the best queue was
		 * dd, and no current process has been selected.  We can
		 * trivially become the current lwp on the current cpu.
		 */
		if (dd->uschedcp == NULL) {
			atomic_clear_int(&lp->lwp_thread->td_mpflags,
					 TDF_MP_DIDYIELD);
			if ((dd->flags & DFLY_PCPU_CURMASK) == 0) {
				ATOMIC_CPUMASK_ORBIT(dfly_curprocmask,
						     gd->gd_cpuid);
				dd->flags |= DFLY_PCPU_CURMASK;
			}
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			KKASSERT(lp->lwp_qcpu == dd->cpuid);
			spin_unlock(&dd->spin);
			break;
		}

		/*
		 * Can we steal the current designated user thread?
		 *
		 * If we do the other thread will stall when it tries to
		 * return to userland, possibly rescheduling elsewhere.
		 * Set need_user_resched() to get the thread to cycle soonest.
		 *
		 * It is important to do a masked test to avoid the edge
		 * case where two near-equal-priority threads are constantly
		 * interrupting each other.
		 *
		 * In the exact match case another thread has already gained
		 * uschedcp and lowered its priority, if we steal it the
		 * other thread will stay stuck on the LWKT runq and not
		 * push to another cpu.  So don't steal on equal-priority even
		 * though it might appear to be more beneficial due to not
		 * having to switch back to the other thread's context.
		 *
		 * usched_dfly_fast_resched requires that two threads be
		 * significantly far apart in priority in order to interrupt.
		 *
		 * If better but not sufficiently far apart, the current
		 * uschedcp will be interrupted at the next scheduler clock.
		 */
		if (dd->uschedcp &&
		   (dd->upri & ~PPQMASK) >
		   (lp->lwp_priority & ~PPQMASK) + usched_dfly_fast_resched) {
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			KKASSERT(lp->lwp_qcpu == dd->cpuid);
			need_user_resched();
			spin_unlock(&dd->spin);
			break;
		}

		/*
		 * Requeue us at lwp_priority, which recalculate_estcpu()
		 * set for us.  Reset the rrcount to force placement
		 * at the end of the queue.
		 *
		 * We used to move ourselves to the worst queue, but
		 * this creates a fairly serious priority inversion
		 * problem.
		 */
		if (lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) {
			spin_unlock(&dd->spin);
			lp->lwp_rrcount = usched_dfly_rrinterval;
			lp->lwp_rqindex = (lp->lwp_priority & PRIMASK) / PPQ;

			lwkt_deschedule(lp->lwp_thread);
			dfly_setrunqueue_dd(dd, lp);
			atomic_clear_int(&lp->lwp_thread->td_mpflags,
					 TDF_MP_DIDYIELD);
			lwkt_switch();
			gd = mycpu;
			dd = &dfly_pcpu[gd->gd_cpuid];
			continue;
		}

		/*
		 * We are not the current lwp, figure out the best cpu
		 * to run on (our current cpu will be given significant
		 * weight).  Loop on cpu change.
		 */
		if ((usched_dfly_features & 0x02) &&
		    force_resched == 0 &&
		    (rdd = dfly_choose_best_queue(lp)) != dd) {
			dfly_changeqcpu_locked(lp, dd, rdd);
			spin_unlock(&dd->spin);
			lwkt_deschedule(lp->lwp_thread);
			dfly_setrunqueue_dd(rdd, lp);
			lwkt_switch();
			gd = mycpu;
			dd = &dfly_pcpu[gd->gd_cpuid];
			continue;
		}

		/*
		 * We cannot become the current lwp, place the lp on the
		 * run-queue of this or another cpu and deschedule ourselves.
		 *
		 * When we are reactivated we will have another chance.
		 *
		 * Reload after a switch or setrunqueue/switch possibly
		 * moved us to another cpu.
		 */
		spin_unlock(&dd->spin);
		lwkt_deschedule(lp->lwp_thread);
		dfly_setrunqueue_dd(dd, lp);
		lwkt_switch();
		gd = mycpu;
		dd = &dfly_pcpu[gd->gd_cpuid];
	}

	/*
	 * Make sure upri is synchronized, then yield to LWKT threads as
	 * needed before returning.  This could result in another reschedule.
	 * XXX
	 */
	crit_exit_quick(td);

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}
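/*
 * Illustrative sketch (editor's note) of the masked priority test used
 * above: ~PPQMASK strips the low two bits of the priority, so two
 * priorities within the same 4-level band compare equal and do not
 * preempt each other.
 */
#if 0
/* Example only */
static __inline int
example_same_band(int upri_a, int upri_b)
{
	/* e.g. 57 and 58 both mask to 56 -> same band, no steal */
	return ((upri_a & ~PPQMASK) == (upri_b & ~PPQMASK));
}
#endif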
/*
 * DFLY_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * dfly_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in dfly_setrunqueue().
 */
static void
dfly_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];

	/*
	 * Make sure td_wakefromcpu is defaulted.  This will be overwritten
	 * by wakeup().
	 */
	if (dd->uschedcp == lp) {
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
		spin_lock(&dd->spin);
		if (dd->uschedcp == lp) {
			dd->uschedcp = NULL;	/* don't let lp be selected */
			dd->upri = PRIBASE_NULL;

			/*
			 * We're just going to set it again, avoid the global
			 * cache line ping-pong.
			 */
			if ((lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) == 0) {
				if (dd->flags & DFLY_PCPU_CURMASK) {
					ATOMIC_CPUMASK_NANDBIT(dfly_curprocmask,
							       gd->gd_cpuid);
					dd->flags &= ~DFLY_PCPU_CURMASK;
				}
			}
			spin_unlock(&dd->spin);
			dfly_select_curproc(gd);
		} else {
			spin_unlock(&dd->spin);
		}
	}
}

/*
 * DFLY_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from dfly_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 */
static
void
dfly_select_curproc(globaldata_t gd)
{
	dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&dd->spin);
	nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);

	if (nlp) {
		if ((dd->flags & DFLY_PCPU_CURMASK) == 0) {
			ATOMIC_CPUMASK_ORBIT(dfly_curprocmask, cpuid);
			dd->flags |= DFLY_PCPU_CURMASK;
		}
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
#if 0
		dd->rrcount = 0;		/* reset round robin */
#endif
		spin_unlock(&dd->spin);
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&dd->spin);
	}
	crit_exit_gd(gd);
}

/*
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 * It must not be possible for anyone else to explicitly schedule this thread.
 *
 * The thread may be the current thread as a special case.
 */
static void
dfly_setrunqueue(struct lwp *lp)
{
	dfly_pcpu_t dd;
	dfly_pcpu_t rdd;

	/*
	 * First validate the process LWKT state.
	 */
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * NOTE: dd/rdd do not necessarily represent the current cpu.
	 *	 Instead they may represent the cpu the thread was last
	 *	 scheduled on or inherited by its parent.
	 */
	dd = &dfly_pcpu[lp->lwp_qcpu];
	rdd = dd;

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(rdd->uschedcp != lp);

	/*
	 * Ok, we have to setrunqueue some target cpu and request a reschedule
	 * if necessary.
	 *
	 * We have to choose the best target cpu.  It might not be the current
	 * target even if the current cpu has no running user thread (for
	 * example, because the current cpu might be a hyperthread and its
	 * sibling has a thread assigned).
	 *
	 * If we just forked it is most optimal to run the child on the same
	 * cpu just in case the parent decides to wait for it (thus getting
	 * off that cpu).  As long as there is nothing else runnable on the
	 * cpu, that is.  If we did this unconditionally a parent forking
	 * multiple children before waiting (e.g. make -j N) leaves other
	 * cpus idle that could be working.
	 */
	if (lp->lwp_forked) {
		lp->lwp_forked = 0;
		if (usched_dfly_features & 0x20)
			rdd = dfly_choose_best_queue(lp);
		else if (usched_dfly_features & 0x40)
			rdd = &dfly_pcpu[lp->lwp_qcpu];
		else if (usched_dfly_features & 0x80)
			rdd = dfly_choose_queue_simple(rdd, lp);
		else if (dfly_pcpu[lp->lwp_qcpu].runqcount)
			rdd = dfly_choose_best_queue(lp);
		else
			rdd = &dfly_pcpu[lp->lwp_qcpu];
	} else {
		rdd = dfly_choose_best_queue(lp);
		/* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
	}
	if (lp->lwp_qcpu != rdd->cpuid) {
		spin_lock(&dd->spin);
		dfly_changeqcpu_locked(lp, dd, rdd);
		spin_unlock(&dd->spin);
	}
	dfly_setrunqueue_dd(rdd, lp);
}

/*
 * Change qcpu to rdd->cpuid.  The dd the lp is CURRENTLY on must be
 * spin-locked on-call.  rdd does not have to be.
 */
static void
dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
{
	if (lp->lwp_qcpu != rdd->cpuid) {
		if (lp->lwp_mpflags & LWP_MP_ULOAD) {
			atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
			atomic_add_long(&dd->uload, -lp->lwp_uload);
			atomic_add_int(&dd->ucount, -1);
		}
		lp->lwp_qcpu = rdd->cpuid;
	}
}
/*
 * Place lp on rdd's runqueue.  Nothing is locked on call.  This function
 * also performs all necessary ancillary notification actions.
 */
static void
dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
{
	globaldata_t rgd;

	/*
	 * We might be moving the lp to another cpu's run queue, and once
	 * on the runqueue (even if it is our cpu's), another cpu can rip
	 * it away from us.
	 *
	 * TDF_MIGRATING might already be set if this is part of a
	 * remrunqueue+setrunqueue sequence.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);

	rgd = globaldata_find(rdd->cpuid);

	/*
	 * We lose control of the lp the moment we release the spinlock
	 * after having placed it on the queue.  i.e. another cpu could pick
	 * it up, or it could exit, or its priority could be further
	 * adjusted, or something like that.
	 *
	 * WARNING! rdd can point to a foreign cpu!
	 */
	spin_lock(&rdd->spin);
	dfly_setrunqueue_locked(rdd, lp);

	/*
	 * Potentially interrupt the currently-running thread
	 */
	if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK)) {
		/*
		 * Currently running thread is better or same, do not
		 * interrupt.
		 */
		spin_unlock(&rdd->spin);
	} else if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK) +
		   usched_dfly_fast_resched) {
		/*
		 * Currently running thread is not better, but not so bad
		 * that we need to interrupt it.  Let it run for one more
		 * scheduler tick.
		 */
		if (rdd->uschedcp &&
		    rdd->uschedcp->lwp_rrcount < usched_dfly_rrinterval) {
			rdd->uschedcp->lwp_rrcount = usched_dfly_rrinterval - 1;
		}
		spin_unlock(&rdd->spin);
	} else if (rgd == mycpu) {
		/*
		 * We should interrupt the currently running thread, which
		 * is on the current cpu.  However, if DIDYIELD is set we
		 * round-robin unconditionally and do not interrupt it.
		 */
		spin_unlock(&rdd->spin);
		if (rdd->uschedcp == NULL)
			wakeup_mycpu(rdd->helper_thread); /* XXX */
		if ((lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) == 0)
			need_user_resched();
	} else {
		/*
		 * We should interrupt the currently running thread, which
		 * is on a different cpu.
		 */
		spin_unlock(&rdd->spin);
		lwkt_send_ipiq(rgd, dfly_need_user_resched_remote, NULL);
	}
}
/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 */
static
void
dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active.
	 */
	KKASSERT(gd->gd_spinlocks == 0 || dumping);

	/*
	 * If lp is NULL we might be contended and lwkt_switch() may have
	 * cycled into the idle thread.  Apply the tick to the current
	 * process on this cpu if it is contended.
	 */
	if (gd->gd_curthread == &gd->gd_idlethread) {
		lp = dd->uschedcp;
		if (lp && (lp->lwp_thread == NULL ||
			   lp->lwp_thread->td_contended == 0)) {
			lp = NULL;
		}
	}

	/*
	 * Dock thread for tick
	 */
	if (lp) {
		/*
		 * Do we need to round-robin?  We round-robin 10 times a
		 * second.  This should only occur for cpu-bound batch
		 * processes.
		 */
		if (++lp->lwp_rrcount >= usched_dfly_rrinterval) {
			lp->lwp_thread->td_wakefromcpu = -1;
			need_user_resched();
		}

		/*
		 * Adjust estcpu upward using a real time equivalent
		 * calculation, and recalculate lp's priority.  Estcpu
		 * is increased such that it will cap-out over a period
		 * of one second.
		 */
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu +
					   ESTCPUMAX / ESTCPUFREQ + 1);
		dfly_resetpriority(lp);
	}

	/*
	 * Rebalance two cpus every 8 ticks, pulling the worst thread
	 * from the worst cpu's queue into a rotating cpu number.
	 *
	 * This mechanic is needed because the push algorithms can
	 * steady-state in a non-optimal configuration.  We need to mix it
	 * up a little, even if it means breaking up a paired thread, so
	 * the push algorithms can rebalance the degenerate conditions.
	 * This portion of the algorithm exists to ensure stability at the
	 * selected weightings.
	 *
	 * Because we might be breaking up optimal conditions we do not want
	 * to execute this too quickly, hence we only rebalance approximately
	 * ~7-8 times per second.  The pushes, on the other hand, are capable
	 * of moving threads to other cpus at a much higher rate.
	 *
	 * We choose the most heavily loaded thread from the worst queue
	 * in order to ensure that multiple heavy-weight threads on the same
	 * queue get broken up, and also because these threads are the most
	 * likely to be able to remain in place.  Hopefully then any pairings,
	 * if applicable, migrate to where these threads are.
	 */
	if ((usched_dfly_features & 0x04) &&
	    ((u_int)sched_ticks & 7) == 0 &&
	    (u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) {
		/*
		 * Our cpu is up.
		 */
		struct lwp *nlp;
		dfly_pcpu_t rdd;

		rdd = dfly_choose_worst_queue(dd);
		if (rdd) {
			spin_lock(&dd->spin);
			if (spin_trylock(&rdd->spin)) {
				nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
				spin_unlock(&rdd->spin);
				if (nlp == NULL)
					spin_unlock(&dd->spin);
			} else {
				spin_unlock(&dd->spin);
				nlp = NULL;
			}
		} else {
			nlp = NULL;
		}
		/* dd->spin held if nlp != NULL */

		/*
		 * Either schedule it or add it to our queue.
		 */
		if (nlp &&
		    (nlp->lwp_priority & ~PPQMASK) < (dd->upri & ~PPQMASK)) {
			if ((dd->flags & DFLY_PCPU_CURMASK) == 0) {
				ATOMIC_CPUMASK_ORMASK(dfly_curprocmask,
						      dd->cpumask);
				dd->flags |= DFLY_PCPU_CURMASK;
			}
			dd->upri = nlp->lwp_priority;
			dd->uschedcp = nlp;
#if 0
			dd->rrcount = 0;	/* reset round robin */
#endif
			spin_unlock(&dd->spin);
			lwkt_acquire(nlp->lwp_thread);
			lwkt_schedule(nlp->lwp_thread);
		} else if (nlp) {
			dfly_setrunqueue_locked(dd, nlp);
			spin_unlock(&dd->spin);
		}
	}
}
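/*
 * Worked numbers (editor's note, illustration only): dfly_schedulerclock()
 * adds ESTCPUMAX / ESTCPUFREQ + 1 to lwp_estcpu on each tick, so a thread
 * that is charged every tick accumulates roughly ESTCPUMAX worth of estcpu
 * over about one second of cpu time before ESTCPULIM() caps it.
 */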
/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Adjust p_estcpu based on our single-cpu load, p_nice, and compensate for
 * overall system load.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static
void
dfly_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;
	int ucount;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		dfly_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_estfast = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if ((ssysclock_t)ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks < 4)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate instant estcpu based on the percentage of (one)
		 * cpu used and exponentially average it into the current
		 * lwp_estcpu.
		 */
		ucount = dfly_pcpu[lp->lwp_qcpu].ucount;
		estcpu = lp->lwp_cpticks * ESTCPUMAX / ttlticks;

		/*
		 * The higher ttlticks gets, the more meaning the calculation
		 * has and the smaller our decay_factor in the exponential
		 * average.
		 *
		 * The uload calculation has been removed because it actually
		 * makes things worse, causing processes which use less cpu
		 * (such as a browser) to be pumped up and treated the same
		 * as a cpu-bound process (such as a make).  The same effect
		 * can occur with sufficient load without the uload
		 * calculation, but occurs less quickly and takes more load.
		 * In addition, the less cpu a process uses the smaller the
		 * effect of the overload.
		 */
		if (ttlticks >= hz)
			decay_factor = 1;
		else
			decay_factor = hz - ttlticks;

		lp->lwp_estcpu = ESTCPULIM(
				(lp->lwp_estcpu * ttlticks + estcpu) /
				(ttlticks + 1));
		if (usched_dfly_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d %d\n", estcpu, lp->lwp_estcpu);

		dfly_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}
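/*
 * Worked example (editor's note, illustration only) of the averaging done
 * in dfly_recalculate_estcpu() above: with ttlticks = 4, a previous
 * lwp_estcpu of 8000 and an instantaneous estcpu of 0, the new value is
 * (8000 * 4 + 0) / (4 + 1) = 6400, i.e. the new estimate is a weighted
 * blend of the old estimate and the instantaneous measurement.
 */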
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process of
 * the run queue, and also may be called normally with the process on or
 * off the run queue.
 */
static void
dfly_resetpriority(struct lwp *lp)
{
	dfly_pcpu_t rdd;
	int newpriority;
	u_short newrqtype;
	int rcpu;
	int checkpri;
	int estcpu;
	int delta_uload;

	crit_enter();

	/*
	 * Lock the scheduler (lp) belongs to.  This can be on a different
	 * cpu.  Handle races.  This loop breaks out with the appropriate
	 * rdd locked.
	 */
	for (;;) {
		rcpu = lp->lwp_qcpu;
		cpu_ccfence();
		rdd = &dfly_pcpu[rcpu];
		spin_lock(&rdd->spin);
		if (rcpu == lp->lwp_qcpu)
			break;
		spin_unlock(&rdd->spin);
	}

	/*
	 * Calculate the new priority and queue type
	 */
	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			     (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Calculate the new priority.
		 *
		 * nice contributes up to NICE_QS queues (typ 32 - full range)
		 * estcpu contributes up to EST_QS queues (typ 24)
		 *
		 * A nice +20 process receives 1/10 cpu vs nice+0.  Processes
		 * niced more than 20 apart may receive no cpu, so a cpu-bound
		 * nice -20 can prevent a nice +5 from getting any cpu.  A
		 * nice+0, being in the middle, always gets some cpu no
		 * matter what.
		 */
		estcpu = lp->lwp_estcpu;
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) *
			      (NICE_QS * PPQ) / PRIO_RANGE;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		if (newpriority < 0)
			newpriority = 0;
		if (newpriority >= MAXPRI)
			newpriority = MAXPRI - 1;
		newpriority += PRIBASE_NORMAL;
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The LWKT scheduler doesn't dive usched structures, give it a hint
	 * on the relative priority of user threads running in the kernel.
	 * The LWKT scheduler will always ensure that a user thread running
	 * in the kernel will get cpu some time, regardless of its upri,
	 * but can decide not to instantly switch from one kernel or user
	 * mode user thread to a kernel-mode user thread when it has a less
	 * desirable user priority.
	 *
	 * td_upri has normal sense (higher values are more desirable), so
	 * negate it (this is a different field lp->lwp_priority)
	 */
	lp->lwp_thread->td_upri = -(newpriority & usched_dfly_swmask);

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 *
	 * Since uload is ~PPQMASK masked, no modifications are necessary if
	 * we end up in the same run queue.
	 *
	 * Reset rrcount if moving to a higher-priority queue, otherwise
	 * retain rrcount.
	 */
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		if (lp->lwp_priority < newpriority)
			lp->lwp_rrcount = 0;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			dfly_remrunqueue_locked(rdd, lp);
			lp->lwp_priority = newpriority;
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			dfly_setrunqueue_locked(rdd, lp);
			checkpri = 1;
		} else {
			lp->lwp_priority = newpriority;
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
	} else {
		/*
		 * In the same PPQ, uload cannot change.
		 */
		lp->lwp_priority = newpriority;
		checkpri = 1;
		rcpu = -1;
	}

	/*
	 * Adjust effective load.
	 *
	 * Calculate load then scale up or down geometrically based on p_nice.
	 * Processes niced up (positive) are less important, and processes
	 * niced downward (negative) are more important.  The higher the
	 * uload, the more important the thread.
	 */
	/* 0-511, 0-100% cpu */
	delta_uload = lp->lwp_estcpu / NQS;
	delta_uload -= delta_uload * lp->lwp_proc->p_nice / (PRIO_MAX + 1);
	delta_uload -= lp->lwp_uload;
	if (lp->lwp_uload + delta_uload < -32767) {
		delta_uload = -32768 - lp->lwp_uload;
	} else if (lp->lwp_uload + delta_uload > 32767) {
		delta_uload = 32767 - lp->lwp_uload;
	}
	lp->lwp_uload += delta_uload;
	if (lp->lwp_mpflags & LWP_MP_ULOAD)
		atomic_add_long(&dfly_pcpu[lp->lwp_qcpu].uload, delta_uload);

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (rcpu >= 0) {
		if (CPUMASK_TESTBIT(dfly_rdyprocmask, rcpu) &&
		    (checkpri == 0 ||
		     (rdd->upri & ~PRIMASK) >
		     (lp->lwp_priority & ~PRIMASK))) {
			if (rcpu == mycpu->gd_cpuid) {
				spin_unlock(&rdd->spin);
				need_user_resched();
			} else {
				spin_unlock(&rdd->spin);
				lwkt_send_ipiq(globaldata_find(rcpu),
					       dfly_need_user_resched_remote,
					       NULL);
			}
		} else {
			spin_unlock(&rdd->spin);
		}
	} else {
		spin_unlock(&rdd->spin);
	}
	crit_exit();
}
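/*
 * Worked example (editor's note, illustration only) for the
 * RTP_PRIO_NORMAL case in dfly_resetpriority() above: a nice +0 process
 * has p_nice - PRIO_MIN = 20, contributing
 * 20 * (NICE_QS * PPQ) / PRIO_RANGE = 20 * 96 / 41 = 46, and an estcpu of
 * 2048 contributes 2048 * PPQ / ESTCPUPPQ = 16, giving a user priority of
 * 62 (run queue 62 / PPQ = 15) before PRIBASE_NORMAL is added in.
 */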
static
void
dfly_yield(struct lwp *lp)
{
	if (lp->lwp_qcpu != mycpu->gd_cpuid)
		return;
	KKASSERT(lp == curthread->td_lwp);

	/*
	 * Don't set need_user_resched() or mess with rrcount or anything.
	 * the TDF flag will override everything as long as we release.
	 */
	atomic_set_int(&lp->lwp_thread->td_mpflags, TDF_MP_DIDYIELD);
	dfly_release_curproc(lp);
}

/*
 * Thread was forcefully migrated to another cpu.  Normally forced migrations
 * are used for iterations and the kernel returns to the original cpu before
 * returning and this is not needed.  However, if the kernel migrates a
 * thread to another cpu and wants to leave it there, it has to call this
 * scheduler helper.
 *
 * Note that the lwkt_migratecpu() function also released the thread, so
 * we don't have to worry about that.
 */
static
void
dfly_changedcpu(struct lwp *lp)
{
	dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
	dfly_pcpu_t rdd = &dfly_pcpu[mycpu->gd_cpuid];

	if (dd != rdd) {
		spin_lock(&dd->spin);
		dfly_changeqcpu_locked(lp, dd, rdd);
		spin_unlock(&dd->spin);
	}
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is batchier than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).
 *
 * fast
 *
 * XXX lwp should be "spawning" instead of "forking"
 */
static void
dfly_forking(struct lwp *plp, struct lwp *lp)
{
	int estcpu;

	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu +
				   ESTCPUPPQ * usched_dfly_forkbias);
	lp->lwp_forked = 1;
	lp->lwp_estfast = 0;

	/*
	 * Even though the lp will be scheduled specially the first time
	 * due to lp->lwp_forked, it is important to initialize lwp_qcpu
	 * to avoid favoring a fixed cpu.
	 */
#if 0
	static uint16_t save_cpu;
	lp->lwp_qcpu = ++save_cpu % ncpus;
#else
	lp->lwp_qcpu = plp->lwp_qcpu;
	if (CPUMASK_TESTBIT(lp->lwp_cpumask, lp->lwp_qcpu) == 0)
		lp->lwp_qcpu = BSFCPUMASK(lp->lwp_cpumask);
#endif

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly this makes both the
	 * parent and child more batchy.
	 */
	estcpu = plp->lwp_estcpu + ESTCPUPPQ / 16;
	plp->lwp_estcpu = ESTCPULIM(estcpu);
}
/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().  We have to clean out any ULOAD accounting before
 * we can let the lp go.  The dd->spin lock is not needed for uload
 * updates.
 *
 * Scheduler dequeueing has already occurred, no further action in that
 * regard is needed.
 */
static void
dfly_exiting(struct lwp *lp, struct proc *child_proc)
{
	dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];

	if (lp->lwp_mpflags & LWP_MP_ULOAD) {
		atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
		atomic_add_long(&dd->uload, -lp->lwp_uload);
		atomic_add_int(&dd->ucount, -1);
	}
}

/*
 * This function cannot block in any way, but spinlocks are ok.
 *
 * Update the uload based on the state of the thread (whether it is going
 * to sleep or running again).  The uload is meant to be a longer-term
 * load and not an instantaneous load.
 */
static void
dfly_uload_update(struct lwp *lp)
{
	dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];

	if (lp->lwp_thread->td_flags & TDF_RUNQ) {
		if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
			spin_lock(&dd->spin);
			if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
				atomic_set_int(&lp->lwp_mpflags,
					       LWP_MP_ULOAD);
				atomic_add_long(&dd->uload, lp->lwp_uload);
				atomic_add_int(&dd->ucount, 1);
			}
			spin_unlock(&dd->spin);
		}
	} else if (lp->lwp_slptime > 0) {
		if (lp->lwp_mpflags & LWP_MP_ULOAD) {
			spin_lock(&dd->spin);
			if (lp->lwp_mpflags & LWP_MP_ULOAD) {
				atomic_clear_int(&lp->lwp_mpflags,
						 LWP_MP_ULOAD);
				atomic_add_long(&dd->uload, -lp->lwp_uload);
				atomic_add_int(&dd->ucount, -1);
			}
			spin_unlock(&dd->spin);
		}
	}
}
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * Must be called with rdd->spin locked.  The spinlock is left intact through
 * the entire routine.  dd->spin does not have to be locked.
 *
 * If worst is non-zero this function finds the worst thread instead of the
 * best thread (used by the schedulerclock-based rover).
 */
static
struct lwp *
dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
		       struct lwp *chklp, int worst)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;

	/*
	 * Select best or worst process.  Once selected, clear the bit
	 * in our local variable (idqbits, tsqbits, or rtqbits) just
	 * in case we have to loop.
	 */
	rtqbits = rdd->rtqueuebits;
	tsqbits = rdd->queuebits;
	idqbits = rdd->idqueuebits;

loopfar:
	if (worst) {
		if (idqbits) {
			pri = bsrl(idqbits);
			idqbits &= ~(1U << pri);
			q = &rdd->idqueues[pri];
			which = &rdd->idqueuebits;
		} else if (tsqbits) {
			pri = bsrl(tsqbits);
			tsqbits &= ~(1U << pri);
			q = &rdd->queues[pri];
			which = &rdd->queuebits;
		} else if (rtqbits) {
			pri = bsrl(rtqbits);
			rtqbits &= ~(1U << pri);
			q = &rdd->rtqueues[pri];
			which = &rdd->rtqueuebits;
		} else {
			return (NULL);
		}
		lp = TAILQ_LAST(q, rq);
	} else {
		if (rtqbits) {
			pri = bsfl(rtqbits);
			rtqbits &= ~(1U << pri);
			q = &rdd->rtqueues[pri];
			which = &rdd->rtqueuebits;
		} else if (tsqbits) {
			pri = bsfl(tsqbits);
			tsqbits &= ~(1U << pri);
			q = &rdd->queues[pri];
			which = &rdd->queuebits;
		} else if (idqbits) {
			pri = bsfl(idqbits);
			idqbits &= ~(1U << pri);
			q = &rdd->idqueues[pri];
			which = &rdd->idqueuebits;
		} else {
			return (NULL);
		}
		lp = TAILQ_FIRST(q);
	}
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

loopnear:
	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	/*
	 * When rdd != dd, we have to make sure that the process we
	 * are pulling is allowed to run on our cpu.  This alternative
	 * path is a bit more expensive but it's not considered to be
	 * in the critical path.
	 */
	if (rdd != dd && CPUMASK_TESTBIT(lp->lwp_cpumask, dd->cpuid) == 0) {
		if (worst)
			lp = TAILQ_PREV(lp, rq, lwp_procq);
		else
			lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp)
			goto loopnear;
		goto loopfar;
	}

	KTR_COND_LOG(usched_chooseproc,
	    lp->lwp_proc->p_pid == usched_dfly_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	TAILQ_REMOVE(q, lp, lwp_procq);
	--rdd->runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);

	/*
	 * If we are choosing a process from rdd with the intent to
	 * move it to dd, lwp_qcpu must be adjusted while rdd's spinlock
	 * is still held.
	 */
	if (rdd != dd) {
		if (lp->lwp_mpflags & LWP_MP_ULOAD) {
			atomic_add_long(&rdd->uload, -lp->lwp_uload);
			atomic_add_int(&rdd->ucount, -1);
		}
		lp->lwp_qcpu = dd->cpuid;
		atomic_add_long(&dd->uload, lp->lwp_uload);
		atomic_add_int(&dd->ucount, 1);
		atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
	}
	return lp;
}
/*
 * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
 *
 * Choose a cpu node to schedule lp on, hopefully nearby its current
 * node.
 *
 * We give the current node a modest advantage for obvious reasons.
 *
 * We also give the node the thread was woken up FROM a slight advantage
 * in order to try to schedule paired threads which synchronize/block waiting
 * for each other fairly close to each other.  Similarly in a network setting
 * this feature will also attempt to place a user process near the kernel
 * protocol thread that is feeding it data.  THIS IS A CRITICAL PART of the
 * algorithm as it heuristically groups synchronizing processes for locality
 * of reference in multi-socket systems.
 *
 * We check against running processes and give a big advantage if there
 * are none running.
 *
 * The caller will normally dfly_setrunqueue() lp on the returned queue.
 *
 * When the topology is known choose a cpu whose group has, in aggregate,
 * the lowest weighted load.
 */
static
dfly_pcpu_t
dfly_choose_best_queue(struct lwp *lp)
{
	cpumask_t wakemask;
	cpumask_t mask;
	cpu_node_t *cpup;
	cpu_node_t *cpun;
	cpu_node_t *cpub;
	dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
	dfly_pcpu_t rdd;
	int wakecpu;
	int cpuid;
	int n;
	int count;
	long load;
	long lowest_load;

	/*
	 * When the topology is unknown choose a random cpu that is hopefully
	 * idle.
	 */
	if (dd->cpunode == NULL)
		return (dfly_choose_queue_simple(dd, lp));

	/*
	 * Pairing mask
	 */
	if ((wakecpu = lp->lwp_thread->td_wakefromcpu) >= 0)
		wakemask = dfly_pcpu[wakecpu].cpumask;
	else
		CPUMASK_ASSZERO(wakemask);

	/*
	 * When the topology is known choose a cpu whose group has, in
	 * aggregate, the lowest weighted load.
	 */
	cpup = root_cpu_node;
	rdd = dd;

	while (cpup) {
		/*
		 * Degenerate case super-root
		 */
		if (cpup->child_no == 1) {
			cpup = cpup->child_node[0];
			continue;
		}

		/*
		 * Terminal cpunode
		 */
		if (cpup->child_no == 0) {
			rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
			break;
		}

		cpub = NULL;
		lowest_load = 0x7FFFFFFFFFFFFFFFLLU;

		for (n = 0; n < cpup->child_no; ++n) {
			/*
			 * Accumulate load information for all cpus
			 * which are members of this node.
			 */
			cpun = cpup->child_node[n];
			mask = cpun->members;
			CPUMASK_ANDMASK(mask, usched_global_cpumask);
			CPUMASK_ANDMASK(mask, smp_active_mask);
			CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
			if (CPUMASK_TESTZERO(mask))
				continue;

			count = 0;
			load = 0;

			while (CPUMASK_TESTNZERO(mask)) {
				cpuid = BSFCPUMASK(mask);
				rdd = &dfly_pcpu[cpuid];
				load += rdd->uload;
				load += rdd->ucount * usched_dfly_weight3;

				if (rdd->uschedcp == NULL &&
				    rdd->runqcount == 0 &&
				    globaldata_find(cpuid)->gd_tdrunqcount == 0
				) {
					load -= usched_dfly_weight4;
				}
#if 0
				else if (rdd->upri > lp->lwp_priority + PPQ) {
					load -= usched_dfly_weight4 / 2;
				}
#endif
				CPUMASK_NANDBIT(mask, cpuid);
				++count;
			}

			/*
			 * Compensate if the lp is already accounted for in
			 * the aggregate uload for this mask set.  We want
			 * to calculate the loads as if lp were not present,
			 * otherwise the calculation is bogus.
			 */
			if ((lp->lwp_mpflags & LWP_MP_ULOAD) &&
			    CPUMASK_TESTMASK(dd->cpumask, cpun->members)) {
				load -= lp->lwp_uload;
				load -= usched_dfly_weight3;
			}

			load /= count;

			/*
			 * Advantage the cpu group (lp) is already on.
			 */
			if (CPUMASK_TESTMASK(cpun->members, dd->cpumask))
				load -= usched_dfly_weight1;

			/*
			 * Advantage the cpu group we want to pair (lp) to,
			 * but don't let it go to the exact same cpu as
			 * the wakecpu target.
			 *
			 * We do this by checking whether cpun is a
			 * terminal node or not.  All cpun's at the same
			 * level will either all be terminal or all not
			 * terminal.
			 *
			 * If it is and we match we disadvantage the load.
			 * If it is and we don't match we advantage the load.
			 *
			 * Also note that we are effectively disadvantaging
			 * all-but-one by the same amount, so it won't affect
			 * the weight1 factor for the all-but-one nodes.
			 */
			if (CPUMASK_TESTMASK(cpun->members, wakemask)) {
				if (cpun->child_no != 0) {
					/* advantage */
					load -= usched_dfly_weight2;
				} else {
					if (usched_dfly_features & 0x10)
						load += usched_dfly_weight2;
					else
						load -= usched_dfly_weight2;
				}
			}

			/*
			 * Calculate the best load
			 */
			if (cpub == NULL || lowest_load > load ||
			    (lowest_load == load &&
			     CPUMASK_TESTMASK(cpun->members, dd->cpumask))
			) {
				lowest_load = load;
				cpub = cpun;
			}
		}
		cpup = cpub;
	}
	/* Dispatch this outcast to a proper CPU. */
	if (__predict_false(CPUMASK_TESTBIT(lp->lwp_cpumask, rdd->cpuid) == 0))
		rdd = &dfly_pcpu[BSFCPUMASK(lp->lwp_cpumask)];
	if (usched_dfly_chooser > 0) {
		--usched_dfly_chooser;		/* only N lines */
		kprintf("lp %02d->%02d %s\n",
			lp->lwp_qcpu, rdd->cpuid, lp->lwp_proc->p_comm);
	}
	return (rdd);
}
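/*
 * Worked example (editor's note, illustration only) of the weighting
 * performed in dfly_choose_best_queue() above, using the default weights:
 * a node whose cpus are completely idle is credited usched_dfly_weight4
 * (160), the node lp is already on is credited usched_dfly_weight1 (200),
 * each recently runnable thread on a node costs usched_dfly_weight3 (40),
 * and the wakeup-source node is adjusted by usched_dfly_weight2 (180) in
 * the direction selected by feature bit 0x10.  The child node with the
 * lowest resulting load wins at each level of the topology.
 */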
/*
 * USED TO PULL RUNNABLE LWPS FROM THE MOST LOADED CPU.
 *
 * Choose the worst queue close to dd's cpu node with a non-empty runq
 * that is NOT dd.  Also require that the moving of the highest-load thread
 * from rdd to dd does not cause the uloads to cross each other.
 *
 * This is used by the thread chooser when the current cpu's queues are
 * empty to steal a thread from another cpu's queue.  We want to offload
 * the most heavily-loaded queue.
 */
static
dfly_pcpu_t
dfly_choose_worst_queue(dfly_pcpu_t dd)
{
	cpumask_t mask;
	cpu_node_t *cpup;
	cpu_node_t *cpun;
	cpu_node_t *cpub;
	dfly_pcpu_t rdd;
	int cpuid;
	int n;
	int count;
	long load;
	long highest_load;
#if 0
	int pri;
	int hpri;
#endif

	/*
	 * When the topology is unknown choose a random cpu that is hopefully
	 * idle.
	 */
	if (dd->cpunode == NULL) {
		return (NULL);
	}

	/*
	 * When the topology is known choose a cpu whose group has, in
	 * aggregate, the highest weighted load.
	 */
	cpup = root_cpu_node;
	rdd = dd;
	while (cpup) {
		/*
		 * Degenerate case super-root
		 */
		if (cpup->child_no == 1) {
			cpup = cpup->child_node[0];
			continue;
		}

		/*
		 * Terminal cpunode
		 */
		if (cpup->child_no == 0) {
			rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
			break;
		}

		cpub = NULL;
		highest_load = 0;

		for (n = 0; n < cpup->child_no; ++n) {
			/*
			 * Accumulate load information for all cpus
			 * which are members of this node.
			 */
			cpun = cpup->child_node[n];
			mask = cpun->members;
			CPUMASK_ANDMASK(mask, usched_global_cpumask);
			CPUMASK_ANDMASK(mask, smp_active_mask);
			if (CPUMASK_TESTZERO(mask))
				continue;

			count = 0;
			load = 0;

			while (CPUMASK_TESTNZERO(mask)) {
				cpuid = BSFCPUMASK(mask);
				rdd = &dfly_pcpu[cpuid];
				load += rdd->uload;
				load += (long)rdd->ucount * usched_dfly_weight3;

				if (rdd->uschedcp == NULL &&
				    rdd->runqcount == 0 &&
				    globaldata_find(cpuid)->gd_tdrunqcount == 0
				) {
					load -= usched_dfly_weight4;
				}
#if 0
				else if (rdd->upri > dd->upri + PPQ) {
					load -= usched_dfly_weight4 / 2;
				}
#endif
				CPUMASK_NANDBIT(mask, cpuid);
				++count;
			}
			load /= count;

			/*
			 * Prefer candidates which are somewhat closer to
			 * our cpu.
			 */
			if (CPUMASK_TESTMASK(dd->cpumask, cpun->members))
				load += usched_dfly_weight1;

			/*
			 * The best candidate is the one with the worst
			 * (highest) load.
			 */
			if (cpub == NULL || highest_load < load ||
			    (highest_load == load &&
			     CPUMASK_TESTMASK(cpun->members, dd->cpumask))) {
				highest_load = load;
				cpub = cpun;
			}
		}
		cpup = cpub;
	}

	/*
	 * We never return our own node (dd), and only return a remote
	 * node if its load is significantly worse than ours (i.e. where
	 * stealing a thread would be considered reasonable).
	 *
	 * This also helps us avoid breaking paired threads apart which
	 * can have disastrous effects on performance.
	 */
	if (rdd == dd)
		return(NULL);

#if 0
	hpri = 0;
	if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits)))
		hpri = pri;
	if (rdd->queuebits && hpri < (pri = bsrl(rdd->queuebits)))
		hpri = pri;
	if (rdd->idqueuebits && hpri < (pri = bsrl(rdd->idqueuebits)))
		hpri = pri;
	hpri *= PPQ;
	if (rdd->uload - hpri < dd->uload + hpri)
		return(NULL);
#endif
	return (rdd);
}

static
dfly_pcpu_t
dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp)
{
	dfly_pcpu_t rdd;
	cpumask_t tmpmask;
	cpumask_t mask;
	int cpubase;
	int cpuid;

	/*
	 * Fall back to the original heuristic, selecting a random cpu,
	 * first checking the cpus not currently running a user thread.
	 *
	 * Use cpuid as the base cpu in our scan, first checking
	 * cpuid...(ncpus-1), then 0...(cpuid-1).  This avoids favoring
	 * lower-numbered cpus.
	 */
	++dd->scancpu;		/* SMP race ok */
	mask = dfly_rdyprocmask;
	CPUMASK_NANDMASK(mask, dfly_curprocmask);
	CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	cpubase = (int)(dd->scancpu % ncpus);
	CPUMASK_ASSBMASK(tmpmask, cpubase);
	CPUMASK_INVMASK(tmpmask);
	CPUMASK_ANDMASK(tmpmask, mask);
	while (CPUMASK_TESTNZERO(tmpmask)) {
		cpuid = BSFCPUMASK(tmpmask);
		rdd = &dfly_pcpu[cpuid];

		if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
			goto found;
		CPUMASK_NANDBIT(tmpmask, cpuid);
	}

	CPUMASK_ASSBMASK(tmpmask, cpubase);
	CPUMASK_ANDMASK(tmpmask, mask);
	while (CPUMASK_TESTNZERO(tmpmask)) {
		cpuid = BSFCPUMASK(tmpmask);
		rdd = &dfly_pcpu[cpuid];

		if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
			goto found;
		CPUMASK_NANDBIT(tmpmask, cpuid);
	}

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = dfly_rdyprocmask;
	CPUMASK_ANDMASK(mask, dfly_curprocmask);
	CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	CPUMASK_ASSBMASK(tmpmask, cpubase);
	CPUMASK_INVMASK(tmpmask);
	CPUMASK_ANDMASK(tmpmask, mask);
	while (CPUMASK_TESTNZERO(tmpmask)) {
		cpuid = BSFCPUMASK(tmpmask);
		rdd = &dfly_pcpu[cpuid];

		if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			goto found;
		CPUMASK_NANDBIT(tmpmask, cpuid);
	}

	CPUMASK_ASSBMASK(tmpmask, cpubase);
	CPUMASK_ANDMASK(tmpmask, mask);
	while (CPUMASK_TESTNZERO(tmpmask)) {
		cpuid = BSFCPUMASK(tmpmask);
		rdd = &dfly_pcpu[cpuid];

		if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			goto found;
		CPUMASK_NANDBIT(tmpmask, cpuid);
	}

	/*
	 * If we cannot find a suitable cpu we round-robin using scancpu.
	 * Other cpus will pick up as they release their current lwps or
	 * become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case; we do not
	 * set the user resched flag.
	 */
	cpuid = cpubase;
	if (CPUMASK_TESTBIT(lp->lwp_cpumask, cpuid) == 0)
		cpuid = BSFCPUMASK(lp->lwp_cpumask);
	else if (CPUMASK_TESTBIT(usched_global_cpumask, cpuid) == 0)
		cpuid = 0;
	rdd = &dfly_pcpu[cpuid];
found:
	return (rdd);
}

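/*
 * Illustrative sketch (not part of the scheduler): the two-pass circular
 * scan used by dfly_choose_queue_simple() above, reduced to a plain 64-bit
 * cpu mask.  CPUMASK_ASSBMASK builds a mask of the bits below the base cpu;
 * inverting it first scans cpubase..(ncpus-1), then the second pass scans
 * 0..(cpubase-1).  The helper name is hypothetical and the block is
 * disabled with #if 0 (it relies on the gcc/clang __builtin_ctzll builtin).
 */
#if 0
static int
example_circular_scan(unsigned long long candidates, int cpubase)
{
	unsigned long long below;	/* bits 0..(cpubase-1) */
	unsigned long long tmp;

	below = (cpubase >= 64) ? ~0ULL : ((1ULL << cpubase) - 1);

	/* first pass: cpubase..(ncpus-1) */
	tmp = candidates & ~below;
	if (tmp)
		return (__builtin_ctzll(tmp));	/* lowest set bit >= cpubase */

	/* second pass: 0..(cpubase-1) */
	tmp = candidates & below;
	if (tmp)
		return (__builtin_ctzll(tmp));

	return (-1);			/* no candidate cpu */
}
#endif
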
static
void
dfly_need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];

	/*
	 * Flag reschedule needed
	 */
	need_user_resched();

	/*
	 * If no user thread is currently running we need to kick the helper
	 * on our cpu to recover.  Otherwise the cpu will never schedule
	 * anything again.
	 *
	 * We cannot schedule the process ourselves because this is an
	 * IPI callback and we cannot acquire spinlocks in an IPI callback.
	 *
	 * Call wakeup_mycpu to avoid sending IPIs to other CPUs.
	 */
	if (dd->uschedcp == NULL && (dd->flags & DFLY_PCPU_RDYMASK)) {
		ATOMIC_CPUMASK_NANDBIT(dfly_rdyprocmask, gd->gd_cpuid);
		dd->flags &= ~DFLY_PCPU_RDYMASK;
		wakeup_mycpu(dd->helper_thread);
	}
}

/*
 * dfly_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT scheduler.
 * The user process scheduler only manages user processes but it uses LWKT
 * underneath, and a user process operating in the kernel will often be
 * 'released' from our management.
 *
 * uload is NOT adjusted here.  It is only adjusted if the lwkt_thread goes
 * to sleep or the lwp is moved to a different runq.
 */
static void
dfly_remrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(rdd->runqcount >= 0);

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &rdd->queues[pri];
		which = &rdd->queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &rdd->rtqueues[pri];
		which = &rdd->rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &rdd->idqueues[pri];
		which = &rdd->idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	TAILQ_REMOVE(q, lp, lwp_procq);
	--rdd->runqcount;
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * dfly_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex had previously been calculated
 * onto the appropriate run queue.  Determine if the addition requires
 * a reschedule on a cpu and return the cpuid or -1.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * NOTE ON ULOAD: This variable specifies the aggregate load on a cpu, the
 *		  sum of the rough lwp_priority for all running and runnable
 *		  processes.  Lower priority processes (higher lwp_priority
 *		  values) actually DO count as more load, not less, because
 *		  these are the programs which require the most care with
 *		  regards to cpu selection.
 */
static void
dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
{
	u_int32_t *which;
	struct rq *q;
	int pri;

	KKASSERT(lp->lwp_qcpu == rdd->cpuid);

	if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
		atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
		atomic_add_long(&dfly_pcpu[lp->lwp_qcpu].uload, lp->lwp_uload);
		atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1);
	}

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &rdd->queues[pri];
		which = &rdd->queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &rdd->rtqueues[pri];
		which = &rdd->rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &rdd->idqueues[pri];
		which = &rdd->idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Place us on the selected queue.  Determine if we should be
	 * placed at the head of the queue or at the end.
	 *
	 * We are placed at the tail if our round-robin count has expired,
	 * or is about to expire and the system thinks it's a good place to
	 * round-robin, or there is already a next thread on the queue
	 * (it might be trying to pick up where it left off and we don't
	 * want to interfere).
	 */
	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++rdd->runqcount;

	if (lp->lwp_rrcount >= usched_dfly_rrinterval ||
	    (lp->lwp_rrcount >= usched_dfly_rrinterval / 2 &&
	     (lp->lwp_thread->td_mpflags & TDF_MP_BATCH_DEMARC))
	) {
		/*
		 * Place on tail
		 */
		atomic_clear_int(&lp->lwp_thread->td_mpflags,
				 TDF_MP_BATCH_DEMARC);
		lp->lwp_rrcount = 0;
		TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	} else {
		/*
		 * Retain rrcount and place on head.  Count is retained
		 * even if the queue is empty.
		 */
		TAILQ_INSERT_HEAD(q, lp, lwp_procq);
	}
	*which |= 1 << pri;
}

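/*
 * Illustrative sketch (not part of the scheduler): the bitmapped run-queue
 * bookkeeping used by dfly_remrunqueue_locked()/dfly_setrunqueue_locked()
 * above, reduced to an array of counters plus a 32-bit "queuebits" word.
 * Bit N is set while queue N is non-empty, so a chooser can locate the
 * best (lowest-index) non-empty queue with a single find-first-set.  All
 * names are hypothetical; the block is disabled with #if 0 and relies on
 * the gcc/clang __builtin_ctz builtin.
 */
#if 0
#define EX_NQS	32

struct example_runq {
	unsigned int	queuebits;	/* bit N set => queue N non-empty */
	int		qlen[EX_NQS];	/* stand-in for the TAILQs */
};

static void
example_enqueue(struct example_runq *rq, int pri)
{
	if (rq->qlen[pri]++ == 0)
		rq->queuebits |= 1U << pri;
}

static void
example_dequeue(struct example_runq *rq, int pri)
{
	if (--rq->qlen[pri] == 0)
		rq->queuebits &= ~(1U << pri);
}

static int
example_best_queue(const struct example_runq *rq)
{
	if (rq->queuebits == 0)
		return (-1);			/* nothing runnable */
	return (__builtin_ctz(rq->queuebits));	/* lowest set bit wins */
}
#endif
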
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wake up another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 */
static void
dfly_helper_thread(void *dummy)
{
	globaldata_t gd;
	dfly_pcpu_t dd;
	dfly_pcpu_t rdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;	/* doesn't change */
	mask = gd->gd_cpumask;	/* doesn't change */
	dd = &dfly_pcpu[cpuid];

	/*
	 * Since we only want to be woken up when no user processes
	 * are scheduled on a cpu, run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	tsleep(dd->helper_thread, 0, "schslp", 0);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * dfly_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		tsleep_interlock(dd->helper_thread, 0);

		spin_lock(&dd->spin);
		if ((dd->flags & DFLY_PCPU_RDYMASK) == 0) {
			ATOMIC_CPUMASK_ORMASK(dfly_rdyprocmask, mask);
			dd->flags |= DFLY_PCPU_RDYMASK;
		}
		clear_user_resched();	/* This satisfies the reschedule request */
#if 0
		dd->rrcount = 0;	/* Reset the round-robin counter */
#endif

		if (dd->runqcount || dd->uschedcp != NULL) {
			/*
			 * Threads are available.  A thread may or may not be
			 * currently scheduled.  Get the best thread already queued
			 * to this cpu.
			 */
			nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
			if (nlp) {
				if ((dd->flags & DFLY_PCPU_CURMASK) == 0) {
					ATOMIC_CPUMASK_ORMASK(dfly_curprocmask, mask);
					dd->flags |= DFLY_PCPU_CURMASK;
				}
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
#if 0
				dd->rrcount = 0;	/* reset round robin */
#endif
				spin_unlock(&dd->spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * This situation should not occur because we had
				 * at least one thread available.
				 */
				spin_unlock(&dd->spin);
			}
		} else if (usched_dfly_features & 0x01) {
			/*
			 * This cpu is devoid of runnable threads, steal a thread
			 * from another cpu.  Since we're stealing, might as well
			 * load balance at the same time.
			 *
			 * We choose the highest-loaded thread from the worst queue.
			 *
			 * NOTE! This function only returns a non-NULL rdd when
			 *	 another cpu's queue is obviously overloaded.  We
			 *	 do not want to perform the type of rebalancing
			 *	 the schedclock does here because it would result
			 *	 in insane process pulling when 'steady' state is
			 *	 partially unbalanced (e.g. 6 runnables and only
			 *	 4 cores).
			 */
			rdd = dfly_choose_worst_queue(dd);
			if (rdd && spin_trylock(&rdd->spin)) {
				nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
				spin_unlock(&rdd->spin);
			} else {
				nlp = NULL;
			}
			if (nlp) {
				if ((dd->flags & DFLY_PCPU_CURMASK) == 0) {
					ATOMIC_CPUMASK_ORMASK(dfly_curprocmask, mask);
					dd->flags |= DFLY_PCPU_CURMASK;
				}
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
#if 0
				dd->rrcount = 0;	/* reset round robin */
#endif
				spin_unlock(&dd->spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * Leave the thread on our run queue.  Another
				 * scheduler will try to pull it later.
				 */
				spin_unlock(&dd->spin);
			}
		} else {
			/*
			 * Devoid of runnable threads and not allowed to steal
			 * any.
			 */
			spin_unlock(&dd->spin);
		}

		/*
		 * We're descheduled unless someone scheduled us.  Switch away.
		 * Exiting the critical section will cause splz() to be called
		 * for us if interrupts and such are pending.
		 */
		crit_exit_gd(gd);
		tsleep(dd->helper_thread, PINTERLOCKED, "schslp", 0);
	}
}

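/*
 * Illustrative sketch (not part of the scheduler): the work-stealing
 * restraint described for dfly_choose_worst_queue() and used by the helper
 * thread above, reduced to plain integers.  An idle cpu only pulls a
 * thread when the donor remains at least as loaded as the thief after the
 * move, i.e. the move must not cause the uloads to cross each other.  The
 * names are hypothetical; disabled with #if 0.
 */
#if 0
static int
example_should_steal(long my_uload, long donor_uload, long thread_uload)
{
	/*
	 * After the move the thief gains thread_uload and the donor loses
	 * it; refuse the steal if that would leave the donor lighter than
	 * the thief.
	 */
	return (donor_uload - thread_uload >= my_uload + thread_uload);
}
#endif
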
#if 0
static int
sysctl_usched_dfly_stick_to_level(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = usched_dfly_stick_to_level;

	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
		return (EINVAL);
	usched_dfly_stick_to_level = new_val;
	return (0);
}
#endif

/*
 * Setup the queues and scheduler helpers (scheduler helpers are SMP only).
 * Note that curprocmask bit 0 has already been cleared by rqinit() and
 * we should not mess with it further.
 */
static void
usched_dfly_cpu_init(void)
{
	int i;
	int j;
	int smt_not_supported = 0;
	int cache_coherent_not_supported = 0;

	if (bootverbose)
		kprintf("Start usched_dfly helpers on cpus:\n");

	sysctl_ctx_init(&usched_dfly_sysctl_ctx);
	usched_dfly_sysctl_tree =
		SYSCTL_ADD_NODE(&usched_dfly_sysctl_ctx,
				SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
				"usched_dfly", CTLFLAG_RD, 0, "");

	for (i = 0; i < ncpus; ++i) {
		dfly_pcpu_t dd = &dfly_pcpu[i];
		cpumask_t mask;

		CPUMASK_ASSBIT(mask, i);
		if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
			continue;

		spin_init(&dd->spin, "uschedcpuinit");
		dd->cpunode = get_cpu_node_by_cpuid(i);
		dd->cpuid = i;
		CPUMASK_ASSBIT(dd->cpumask, i);
		for (j = 0; j < NQS; j++) {
			TAILQ_INIT(&dd->queues[j]);
			TAILQ_INIT(&dd->rtqueues[j]);
			TAILQ_INIT(&dd->idqueues[j]);
		}
		ATOMIC_CPUMASK_NANDBIT(dfly_curprocmask, 0);
		if (i == 0)
			dd->flags &= ~DFLY_PCPU_CURMASK;

		if (dd->cpunode == NULL) {
			smt_not_supported = 1;
			cache_coherent_not_supported = 1;
			if (bootverbose)
				kprintf ("    cpu%d - WARNING: No CPU NODE "
					 "found for cpu\n", i);
		} else {
			switch (dd->cpunode->type) {
			case THREAD_LEVEL:
				if (bootverbose)
					kprintf ("    cpu%d - HyperThreading "
						 "available. Core siblings: ",
						 i);
				break;
			case CORE_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf ("    cpu%d - No HT available, "
						 "multi-core/physical "
						 "cpu. Physical siblings: ",
						 i);
				break;
			case CHIP_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf ("    cpu%d - No HT available, "
						 "single-core/physical cpu. "
						 "Package siblings: ",
						 i);
				break;
			default:
				/* Let's go for safe defaults here */
				smt_not_supported = 1;
				cache_coherent_not_supported = 1;
				if (bootverbose)
					kprintf ("    cpu%d - Unknown cpunode->"
						 "type=%u. siblings: ",
						 i,
						 (u_int)dd->cpunode->type);
				break;
			}

			if (bootverbose) {
				if (dd->cpunode->parent_node != NULL) {
					kprint_cpuset(&dd->cpunode->
						      parent_node->members);
					kprintf("\n");
				} else {
					kprintf(" no siblings\n");
				}
			}
		}

		lwkt_create(dfly_helper_thread, NULL, &dd->helper_thread, NULL,
			    0, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		if (i) {
			ATOMIC_CPUMASK_NANDMASK(dfly_curprocmask, mask);
			dd->flags &= ~DFLY_PCPU_CURMASK;
		}
		if ((dd->flags & DFLY_PCPU_RDYMASK) == 0) {
			ATOMIC_CPUMASK_ORMASK(dfly_rdyprocmask, mask);
			dd->flags |= DFLY_PCPU_RDYMASK;
		}
		dd->upri = PRIBASE_NULL;

	}

	/* usched_dfly sysctl configurable parameters */

	SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
		       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
		       OID_AUTO, "rrinterval", CTLFLAG_RW,
		       &usched_dfly_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
		       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
		       OID_AUTO, "decay", CTLFLAG_RW,
		       &usched_dfly_decay, 0, "Extra decay when not running");

	/* Add enable/disable option for SMT scheduling if supported */
	if (smt_not_supported) {
		usched_dfly_smt = 0;
		SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
				  SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
				  OID_AUTO, "smt", CTLFLAG_RD,
				  "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
	} else {
		usched_dfly_smt = 1;
		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "smt", CTLFLAG_RW,
			       &usched_dfly_smt, 0, "Enable SMT scheduling");
	}

	/*
	 * Add enable/disable option for cache coherent scheduling
	 * if supported
	 */
	if (cache_coherent_not_supported) {
		usched_dfly_cache_coherent = 0;
		SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
				  SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
				  OID_AUTO, "cache_coherent", CTLFLAG_RD,
				  "NOT SUPPORTED", 0,
				  "Cache coherence NOT SUPPORTED");
	} else {
		usched_dfly_cache_coherent = 1;
		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "cache_coherent", CTLFLAG_RW,
			       &usched_dfly_cache_coherent, 0,
			       "Enable/Disable cache coherent scheduling");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "weight1", CTLFLAG_RW,
			       &usched_dfly_weight1, 200,
			       "Weight selection for current cpu");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "weight2", CTLFLAG_RW,
			       &usched_dfly_weight2, 180,
			       "Weight selection for wakefrom cpu");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "weight3", CTLFLAG_RW,
			       &usched_dfly_weight3, 40,
			       "Weight selection for num threads on queue");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "weight4", CTLFLAG_RW,
			       &usched_dfly_weight4, 160,
			       "Availability of other idle cpus");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "fast_resched", CTLFLAG_RW,
			       &usched_dfly_fast_resched, 0,
			       "Availability of other idle cpus");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "features", CTLFLAG_RW,
			       &usched_dfly_features, 0x8F,
			       "Allow pulls into empty queues");

		SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
			       OID_AUTO, "swmask", CTLFLAG_RW,
			       &usched_dfly_swmask, ~PPQMASK,
			       "Queue mask to force thread switch");

#if 0
		SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
				SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
				OID_AUTO, "stick_to_level",
				CTLTYPE_INT | CTLFLAG_RW,
				NULL, sizeof usched_dfly_stick_to_level,
				sysctl_usched_dfly_stick_to_level, "I",
				"Stick a process to this level.  See sysctl "
				"parameter hw.cpu_topology.level_description");
#endif
	}
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	usched_dfly_cpu_init, NULL);
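
/*
 * Example usage (editorial note, not part of the original source): the
 * knobs registered above land under kern.usched_dfly and can be inspected
 * or tuned from userland with sysctl(8), e.g.:
 *
 *	sysctl kern.usched_dfly.smt
 *	sysctl kern.usched_dfly.features
 *	sysctl kern.usched_dfly.weight1=200
 *
 * The numeric value shown is only a placeholder; appropriate settings
 * depend entirely on the workload and topology.
 */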