/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#define MAXPRI			128
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)
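
/*
 * Worked example of the priority bands above: an lwp with rtprio type
 * RTP_PRIO_NORMAL and prio 10 is assigned lwp_priority = PRIBASE_NORMAL +
 * 10 = 138, which sorts after every realtime priority (0..127) and before
 * every idle priority (256..383).  Lower numeric values are more
 * important; dummy_resetpriority() performs this mapping.
 */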

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_estcpu	lwp_usdata.bsd4.estcpu

static void dummy_acquire_curproc(struct lwp *lp);
static void dummy_release_curproc(struct lwp *lp);
static void dummy_select_curproc(globaldata_t gd);
static void dummy_setrunqueue(struct lwp *lp);
static void dummy_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void dummy_recalculate_estcpu(struct lwp *lp);
static void dummy_resetpriority(struct lwp *lp);
static void dummy_forking(struct lwp *plp, struct lwp *lp);
static void dummy_exiting(struct lwp *plp, struct proc *child);
static void dummy_uload_update(struct lwp *lp);
static void dummy_yield(struct lwp *lp);
static void dummy_changedcpu(struct lwp *lp);

struct usched usched_dummy = {
	{ NULL },
	"dummy", "Dummy DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	dummy_acquire_curproc,
	dummy_release_curproc,
	dummy_setrunqueue,
	dummy_schedulerclock,
	dummy_recalculate_estcpu,
	dummy_resetpriority,
	dummy_forking,
	dummy_exiting,
	dummy_uload_update,
	NULL,			/* setcpumask not supported */
	dummy_yield,
	dummy_changedcpu
};

struct usched_dummy_pcpu {
	int	rrcount;
	struct thread helper_thread;
	struct lwp *uschedcp;
};

typedef struct usched_dummy_pcpu *dummy_pcpu_t;

static struct usched_dummy_pcpu dummy_pcpu[MAXCPU];
static cpumask_t dummy_curprocmask = CPUMASK_INITIALIZER_ALLONES;
static cpumask_t dummy_rdyprocmask;
static struct spinlock dummy_spin;
static TAILQ_HEAD(rq, lwp) dummy_runq;
static int dummy_runqcount;

static int usched_dummy_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_dummy_rrinterval, CTLFLAG_RW,
	&usched_dummy_rrinterval, 0, "");

/*
 * Initialize the run queues at boot time; clear cpu 0 in curprocmask
 * to allow dummy scheduling on cpu 0.
 */
static void
dummyinit(void *dummy)
{
	TAILQ_INIT(&dummy_runq);
	spin_init(&dummy_spin, "uscheddummy");
	ATOMIC_CPUMASK_NANDBIT(dummy_curprocmask, 0);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, dummyinit, NULL);

/*
 * DUMMY_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * We are expected to handle userland reschedule requests here too.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
dummy_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	thread_t td = lp->lwp_thread;

	/*
	 * Possibly select another thread
	 */
	if (user_resched_wanted())
		dummy_select_curproc(gd);

	/*
	 * If this cpu has no current thread, select ourself
	 */
	if (dd->uschedcp == lp ||
	    (dd->uschedcp == NULL && TAILQ_EMPTY(&dummy_runq))) {
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		dd->uschedcp = lp;
		return;
	}

	/*
	 * If this cpu's current user process thread is not our thread,
	 * deschedule ourselves, place us on the run queue, and then
	 * switch away.
	 *
	 * We loop until we become the current process.  It's a good idea
	 * to run any passive release(s) before we mess with the scheduler
	 * so our thread is in the expected state.
	 */
	KKASSERT(dd->uschedcp != lp);
	if (td->td_release)
		td->td_release(lp->lwp_thread);
	do {
		crit_enter();
		lwkt_deschedule_self(td);
		dummy_setrunqueue(lp);
		if ((td->td_flags & TDF_RUNQ) == 0)
			++lp->lwp_ru.ru_nivcsw;
		lwkt_switch();		/* WE MAY MIGRATE TO ANOTHER CPU */
		crit_exit();
		gd = mycpu;
		dd = &dummy_pcpu[gd->gd_cpuid];
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	} while (dd->uschedcp != lp);
}
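
/*
 * For reference, the usual pairing as described above: the kernel arranges
 * a passive release (via td_release) when an lwp enters the kernel, and
 * calls dummy_acquire_curproc() when it intends to return to userland.
 * In between, another lwp may have become dd->uschedcp, which is why the
 * acquire loop above may have to requeue and switch away.
 */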

/*
 * DUMMY_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run in the kernel (at kernel priority)
 * for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * MPSAFE
 */
static void
dummy_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	if (dd->uschedcp == lp) {
		dummy_select_curproc(gd);
	}
}

/*
 * DUMMY_SELECT_CURPROC
 *
 * Select a new current process for this cpu.  This satisfies a user
 * scheduler reschedule request, so clear that too.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from dummy_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * MPSAFE
 */
static
void
dummy_select_curproc(globaldata_t gd)
{
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	struct lwp *lp;

	clear_user_resched();
	spin_lock(&dummy_spin);
	if ((lp = TAILQ_FIRST(&dummy_runq)) == NULL) {
		dd->uschedcp = NULL;
		ATOMIC_CPUMASK_NANDBIT(dummy_curprocmask, gd->gd_cpuid);
		spin_unlock(&dummy_spin);
	} else {
		--dummy_runqcount;
		TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
		atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		dd->uschedcp = lp;
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		spin_unlock(&dummy_spin);
		lwkt_acquire(lp->lwp_thread);
		lwkt_schedule(lp->lwp_thread);
	}
}

/*
 * DUMMY_SETRUNQUEUE
 *
 * This routine is called to schedule a new user process after a fork.
 * The scheduler module itself might also call this routine to place
 * the current process on the userland scheduler's run queue prior
 * to calling dummy_select_curproc().
 *
 * The caller may set LWP_PASSIVE_ACQ in lwp_flags to indicate that we should
 * attempt to leave the thread on the current cpu.
 *
 * MPSAFE
 */
static void
dummy_setrunqueue(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	cpumask_t mask;
	int cpuid;

	if (dd->uschedcp == NULL) {
		dd->uschedcp = lp;
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		lwkt_schedule(lp->lwp_thread);
	} else {
		/*
		 * Add to our global runq
		 */
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
		spin_lock(&dummy_spin);
		++dummy_runqcount;
		TAILQ_INSERT_TAIL(&dummy_runq, lp, lwp_procq);
		atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		lwkt_giveaway(lp->lwp_thread);

		/* lp = TAILQ_FIRST(&dummy_runq); */

		/*
		 * Notify the next available cpu.  Note that some form of
		 * cpu affinity could be applied here.
		 *
		 * The rdyprocmask bit records the fact that there is a
		 * process on the runq that needs service.  If the helper
		 * thread cannot find a home for it, it will forward the
		 * request to another available cpu.
		 */
		mask = dummy_rdyprocmask;
		CPUMASK_NANDMASK(mask, dummy_curprocmask);
		CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
		if (CPUMASK_TESTNZERO(mask)) {
			cpuid = BSFCPUMASK(mask);
			ATOMIC_CPUMASK_NANDBIT(dummy_rdyprocmask, cpuid);
			spin_unlock(&dummy_spin);
			lwkt_schedule(&dummy_pcpu[cpuid].helper_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
	}
}
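
/*
 * Example of the helper notification mask above, assuming four cpus: if
 * dummy_rdyprocmask = 1111 (all helpers idle), dummy_curprocmask = 0011
 * (cpus 0 and 1 already run user threads) and we are cpu 0, then
 * mask = 1111 & ~0011 & 1110 = 1100, and BSFCPUMASK(mask) wakes the
 * helper thread on cpu 2.
 */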

/*
 * This routine is called from a systimer IPI.  It must NEVER block.
 * If a lwp compatible with this scheduler is the currently running
 * thread this function is called with a non-NULL lp, otherwise it
 * will be called with a NULL lp.
 *
 * This routine is called at ESTCPUFREQ on each cpu independently.
 *
 * This routine typically queues a reschedule request, which will cause
 * the scheduler's BLAH_select_curproc() to be called as soon as possible.
 */
static
void
dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	if (lp == NULL)
		return;

	if (++dd->rrcount >= usched_dummy_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}
}
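
/*
 * Round-robin arithmetic for reference: with the typical ESTCPUFREQ of
 * 50Hz, usched_dummy_rrinterval = (50 + 9) / 10 = 5, so a running lwp
 * picks up a user reschedule request after 5 scheduler-clock ticks,
 * i.e. roughly every 100ms.
 */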

/*
 * DUMMY_RECALCULATE_ESTCPU
 *
 * Called once a second for any process that is running or has slept
 * for less than 2 seconds.
 *
 * MPSAFE
 */
static
void
dummy_recalculate_estcpu(struct lwp *lp)
{
}

/*
 * MPSAFE
 */
static
void
dummy_yield(struct lwp *lp)
{
	need_user_resched();
}

static
void
dummy_changedcpu(struct lwp *lp __unused)
{
}

/*
 * DUMMY_RESETPRIORITY
 *
 * This routine is called after the kernel has potentially modified
 * the lwp_rtprio structure.  The target process may be running or sleeping,
 * scheduled but not yet running, or owned by another cpu.  Basically,
 * it can be in virtually any state.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
dummy_resetpriority(struct lwp *lp)
{
	/* XXX spinlock usually needed */
	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		lp->lwp_priority = PRIBASE_NORMAL + lp->lwp_rtprio.prio;
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}

	/*
	 * td_upri has normal sense (higher numbers are more desirable),
	 * so negate it.
	 */
	lp->lwp_thread->td_upri = -lp->lwp_priority;
	/* XXX spinlock usually needed */
}

/*
 * DUMMY_FORKING
 *
 * Called from fork1() when a new child process is being created.  Allows
 * the scheduler to predispose the child process before it gets scheduled.
 *
 * MPSAFE
 */
static void
dummy_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = plp->lwp_estcpu;
#if 0
	++plp->lwp_estcpu;
#endif
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
dummy_exiting(struct lwp *plp, struct proc *child)
{
}

static void
dummy_uload_update(struct lwp *lp)
{
}

/*
 * SMP systems may need a scheduler helper thread.  This is how one can be
 * set up.
 *
 * We use a neat LWKT scheduling trick to interlock the helper thread.  It
 * is possible to deschedule an LWKT thread and then do some work before
 * switching away.  The thread can be rescheduled at any time, even before
 * we switch away.
 *
 * MPSAFE
 */
static void
dummy_sched_thread(void *dummy)
{
	globaldata_t gd;
	dummy_pcpu_t dd;
	struct lwp *lp;
	cpumask_t cpumask;
	cpumask_t tmpmask;
	int cpuid;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;
	dd = &dummy_pcpu[cpuid];
	CPUMASK_ASSBIT(cpumask, cpuid);

	for (;;) {
		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		ATOMIC_CPUMASK_ORBIT(dummy_rdyprocmask, cpuid);
		spin_lock(&dummy_spin);
		if (dd->uschedcp) {
			/*
			 * We raced another cpu trying to schedule a thread
			 * onto us.  If the runq isn't empty hit another
			 * free cpu.
			 */
			tmpmask = dummy_rdyprocmask;
			CPUMASK_NANDMASK(tmpmask, dummy_curprocmask);
			CPUMASK_ANDMASK(tmpmask, gd->gd_other_cpus);
			if (CPUMASK_TESTNZERO(tmpmask) && dummy_runqcount) {
				tmpid = BSFCPUMASK(tmpmask);
				KKASSERT(tmpid != cpuid);
				ATOMIC_CPUMASK_NANDBIT(dummy_rdyprocmask,
						       tmpid);
				spin_unlock(&dummy_spin);
				lwkt_schedule(&dummy_pcpu[tmpid].helper_thread);
			} else {
				spin_unlock(&dummy_spin);
			}
		} else if ((lp = TAILQ_FIRST(&dummy_runq)) != NULL) {
			--dummy_runqcount;
			TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
			atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
			dd->uschedcp = lp;
			ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, cpuid);
			spin_unlock(&dummy_spin);
			lwkt_acquire(lp->lwp_thread);
			lwkt_schedule(lp->lwp_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
		lwkt_switch();
	}
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by dummyinit() and we should not mess with it further.
 */
static void
dummy_sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		kprintf("start dummy scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		dummy_pcpu_t dd = &dummy_pcpu[i];
		cpumask_t mask;

		CPUMASK_ASSBIT(mask, i);

		if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
			continue;

		if (bootverbose)
			kprintf(" %d", i);

		lwkt_create(dummy_sched_thread, NULL, NULL, &dd->helper_thread,
			    TDF_NOSTART, i, "dsched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in dummyinit().
		 */
		if (i)
			ATOMIC_CPUMASK_NANDMASK(dummy_curprocmask, mask);
		ATOMIC_CPUMASK_ORMASK(dummy_rdyprocmask, mask);
	}
	if (bootverbose)
		kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	dummy_sched_thread_cpu_init, NULL);
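
/*
 * Note: with a stock build the active userland scheduler is normally
 * chosen at boot via the kern.user_scheduler tunable (for example,
 * kern.user_scheduler=dummy from the loader); see the registration
 * logic in kern/usched.c.
 */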