/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>

#define MAXPRI			128
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_estcpu	lwp_usdata.bsd4.estcpu

static void dummy_acquire_curproc(struct lwp *lp);
static void dummy_release_curproc(struct lwp *lp);
static void dummy_select_curproc(globaldata_t gd);
static void dummy_setrunqueue(struct lwp *lp);
static void dummy_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void dummy_recalculate_estcpu(struct lwp *lp);
static void dummy_resetpriority(struct lwp *lp);
static void dummy_forking(struct lwp *plp, struct lwp *lp);
static void dummy_exiting(struct lwp *plp, struct proc *child);
static void dummy_uload_update(struct lwp *lp);
static void dummy_yield(struct lwp *lp);
static void dummy_changedcpu(struct lwp *lp);

struct usched usched_dummy = {
	{ NULL },
	"dummy", "Dummy DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	dummy_acquire_curproc,
	dummy_release_curproc,
	dummy_setrunqueue,
	dummy_schedulerclock,
	dummy_recalculate_estcpu,
	dummy_resetpriority,
	dummy_forking,
	dummy_exiting,
	dummy_uload_update,
	NULL,			/* setcpumask not supported */
	dummy_yield,
	dummy_changedcpu
};

struct usched_dummy_pcpu {
	int	rrcount;
	struct thread *helper_thread;
	struct lwp *uschedcp;
};

typedef struct usched_dummy_pcpu *dummy_pcpu_t;

static struct usched_dummy_pcpu dummy_pcpu[MAXCPU];
static cpumask_t dummy_curprocmask = CPUMASK_INITIALIZER_ALLONES;
static cpumask_t dummy_rdyprocmask;
static struct spinlock dummy_spin;
static TAILQ_HEAD(rq, lwp) dummy_runq;
static int dummy_runqcount;

static int usched_dummy_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_dummy_rrinterval, CTLFLAG_RW,
	&usched_dummy_rrinterval, 0, "");
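
/*
 * A worked example of the round-robin quantum above (the exact value of
 * ESTCPUFREQ is platform-defined, so the numbers are illustrative only):
 * dummy_schedulerclock() runs ESTCPUFREQ times per second on each cpu
 * and bumps rrcount once per invocation, so (ESTCPUFREQ + 9) / 10 rounds
 * one tenth of a second up to a whole tick.  If ESTCPUFREQ were, say,
 * 50Hz, the interval would be (50 + 9) / 10 = 5 ticks, i.e. a running
 * thread is asked to reschedule roughly every 100ms of cpu time.
 */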

/*
 * Initialize the run queues at boot time, and clear cpu 0 in curprocmask
 * to allow dummy scheduling on cpu 0.
 */
static void
dummyinit(void *dummy)
{
	TAILQ_INIT(&dummy_runq);
	spin_init(&dummy_spin, "uscheddummy");
	ATOMIC_CPUMASK_NANDBIT(dummy_curprocmask, 0);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, dummyinit, NULL);

/*
 * DUMMY_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * We are expected to handle userland reschedule requests here too.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration
 * will occur, this function is called only under very controlled
 * circumstances.
 *
 * MPSAFE
 */
static void
dummy_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	thread_t td = lp->lwp_thread;

	/*
	 * Possibly select another thread
	 */
	if (user_resched_wanted())
		dummy_select_curproc(gd);

	/*
	 * If this cpu has no current thread, select ourself
	 */
	if (dd->uschedcp == lp ||
	    (dd->uschedcp == NULL && TAILQ_EMPTY(&dummy_runq))) {
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		dd->uschedcp = lp;
		return;
	}

	/*
	 * If this cpu's current user process thread is not our thread,
	 * deschedule ourselves and place us on the run queue, then
	 * switch away.
	 *
	 * We loop until we become the current process.  It's a good idea
	 * to run any passive release(s) before we mess with the scheduler
	 * so our thread is in the expected state.
	 */
	KKASSERT(dd->uschedcp != lp);
	if (td->td_release)
		td->td_release(lp->lwp_thread);
	do {
		crit_enter();
		lwkt_deschedule_self(td);
		dummy_setrunqueue(lp);
		if ((td->td_flags & TDF_RUNQ) == 0)
			++lp->lwp_ru.ru_nivcsw;
		lwkt_switch();		/* WE MAY MIGRATE TO ANOTHER CPU */
		crit_exit();
		gd = mycpu;
		dd = &dummy_pcpu[gd->gd_cpuid];
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	} while (dd->uschedcp != lp);
}

/*
 * DUMMY_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run in the kernel (at kernel
 * priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to become
 * the current designated userland thread.
 *
 * MPSAFE
 */
static void
dummy_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	if (dd->uschedcp == lp) {
		dummy_select_curproc(gd);
	}
}
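
/*
 * The pairing between the two routines above, as a minimal sketch.  The
 * block below is illustrative only and is not compiled; the suggestion
 * that the real call sites live in the kernel's userret/passive-release
 * path is an assumption based on the comments above, not something this
 * file defines.
 */
#if 0
	dummy_release_curproc(lp);	/* drop the cpu's userland slot  */
	/* ... run in the kernel at kernel (LWKT) priority ... */
	dummy_acquire_curproc(lp);	/* may block and/or migrate cpus */
	/* ... return to userland as dd->uschedcp ... */
#endif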

/*
 * DUMMY_SELECT_CURPROC
 *
 * Select a new current process for this cpu.  This satisfies a user
 * scheduler reschedule request, so clear that too.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from dummy_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * MPSAFE
 */
static
void
dummy_select_curproc(globaldata_t gd)
{
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	struct lwp *lp;

	clear_user_resched();
	spin_lock(&dummy_spin);
	if ((lp = TAILQ_FIRST(&dummy_runq)) == NULL) {
		dd->uschedcp = NULL;
		ATOMIC_CPUMASK_NANDBIT(dummy_curprocmask, gd->gd_cpuid);
		spin_unlock(&dummy_spin);
	} else {
		--dummy_runqcount;
		TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
		atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		dd->uschedcp = lp;
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		spin_unlock(&dummy_spin);
		lwkt_acquire(lp->lwp_thread);
		lwkt_schedule(lp->lwp_thread);
	}
}

/*
 * DUMMY_SETRUNQUEUE
 *
 * This routine is called to schedule a new user process after a fork.
 * The scheduler module itself might also call this routine to place
 * the current process on the userland scheduler's run queue prior
 * to calling dummy_select_curproc().
 *
 * The caller may set LWP_PASSIVE_ACQ in lwp_flags to indicate that we
 * should attempt to leave the thread on the current cpu.
 *
 * MPSAFE
 */
static void
dummy_setrunqueue(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	cpumask_t mask;
	int cpuid;

	if (dd->uschedcp == NULL) {
		dd->uschedcp = lp;
		ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, gd->gd_cpuid);
		lwkt_schedule(lp->lwp_thread);
	} else {
		/*
		 * Add to our global runq
		 */
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
		spin_lock(&dummy_spin);
		++dummy_runqcount;
		TAILQ_INSERT_TAIL(&dummy_runq, lp, lwp_procq);
		atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		lwkt_giveaway(lp->lwp_thread);

		/* lp = TAILQ_FIRST(&dummy_runq); */

		/*
		 * Notify the next available cpu.  P.S. some
		 * cpu affinity could be done here.
		 *
		 * The rdyprocmask bit records the fact that there
		 * is a process on the runq that needs service.  If the
		 * helper thread cannot find a home for it, it will
		 * forward the request to another available cpu.
		 */
		mask = dummy_rdyprocmask;
		CPUMASK_NANDMASK(mask, dummy_curprocmask);
		CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
		if (CPUMASK_TESTNZERO(mask)) {
			cpuid = BSFCPUMASK(mask);
			ATOMIC_CPUMASK_NANDBIT(dummy_rdyprocmask, cpuid);
			spin_unlock(&dummy_spin);
			lwkt_schedule(dummy_pcpu[cpuid].helper_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
	}
}
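
/*
 * A worked example of the notification mask computed above (made-up
 * values on a hypothetical 4-cpu system):  suppose we are cpu 0,
 * dummy_rdyprocmask = 1110b (the helpers on cpus 1-3 are descheduled
 * and ready) and dummy_curprocmask = 0110b (cpus 1 and 2 already own
 * a userland thread).  Then
 *
 *	mask = 1110b & ~0110b & ~0001b = 1000b
 *
 * and BSFCPUMASK(mask) selects the lowest set bit, cpu 3, whose helper
 * thread is woken to pull the new lwp off the global run queue.
 */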

/*
 * This routine is called from a systimer IPI.  It must NEVER block.
 * If a lwp compatible with this scheduler is the currently running
 * thread this function is called with a non-NULL lp, otherwise it
 * will be called with a NULL lp.
 *
 * This routine is called at ESTCPUFREQ on each cpu independently.
 *
 * This routine typically queues a reschedule request, which will cause
 * the scheduler's BLAH_select_curproc() to be called as soon as possible.
 */
static
void
dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	if (lp == NULL)
		return;

	if (++dd->rrcount >= usched_dummy_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}
}

/*
 * DUMMY_RECALCULATE_ESTCPU
 *
 * Called once a second for any process that is running or has slept
 * for less than 2 seconds.
 *
 * MPSAFE
 */
static
void
dummy_recalculate_estcpu(struct lwp *lp)
{
}

/*
 * MPSAFE
 */
static
void
dummy_yield(struct lwp *lp)
{
	need_user_resched();
}

static
void
dummy_changedcpu(struct lwp *lp __unused)
{
}

/*
 * DUMMY_RESETPRIORITY
 *
 * This routine is called after the kernel has potentially modified
 * the lwp_rtprio structure.  The target process may be running or
 * sleeping or scheduled but not yet running or owned by another cpu.
 * Basically, it can be in virtually any state.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process
 * on or off the run queue.
 *
 * MPSAFE
 */
static void
dummy_resetpriority(struct lwp *lp)
{
	/* XXX spinlock usually needed */
	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		lp->lwp_priority = PRIBASE_NORMAL + lp->lwp_rtprio.prio;
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}

	/*
	 * td_upri has normal sense (higher numbers are more desirable),
	 * so negate it.
	 */
	lp->lwp_thread->td_upri = -lp->lwp_priority;
	/* XXX spinlock usually needed */
}
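
/*
 * The priority bands computed above, worked out from the PRIBASE_*
 * defines (MAXPRI is 128; the per-type offset comes from
 * lwp_rtprio.prio and is not interpreted further here):
 *
 *	RTP_PRIO_REALTIME:	lwp_priority =   0 + prio
 *	RTP_PRIO_NORMAL:	lwp_priority = 128 + prio
 *	RTP_PRIO_IDLE:		lwp_priority = 256 + prio
 *	RTP_PRIO_THREAD:	lwp_priority = 384 + prio
 *
 * Lower lwp_priority values are more important.  Note that only the
 * RTP_PRIO_NORMAL case falls through to the td_upri assignment; the
 * other cases return early and never touch td_upri.
 */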

/*
 * DUMMY_FORKING
 *
 * Called from fork1() when a new child process is being created.  Allows
 * the scheduler to predispose the child process before it gets scheduled.
 *
 * MPSAFE
 */
static void
dummy_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = plp->lwp_estcpu;
#if 0
	++plp->lwp_estcpu;
#endif
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
dummy_exiting(struct lwp *plp, struct proc *child)
{
}

static void
dummy_uload_update(struct lwp *lp)
{
}

/*
 * SMP systems may need a scheduler helper thread.  This is how one can
 * be set up.
 *
 * We use a neat LWKT scheduling trick to interlock the helper thread.
 * It is possible to deschedule an LWKT thread and then do some work
 * before switching away.  The thread can be rescheduled at any time,
 * even before we switch away.
 *
 * MPSAFE
 */
static void
dummy_sched_thread(void *dummy)
{
	globaldata_t gd;
	dummy_pcpu_t dd;
	struct lwp *lp;
	cpumask_t cpumask;
	cpumask_t tmpmask;
	int cpuid;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;
	dd = &dummy_pcpu[cpuid];
	CPUMASK_ASSBIT(cpumask, cpuid);

	for (;;) {
		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		ATOMIC_CPUMASK_ORBIT(dummy_rdyprocmask, cpuid);
		spin_lock(&dummy_spin);
		if (dd->uschedcp) {
			/*
			 * We raced another cpu trying to schedule a thread
			 * onto us.  If the runq isn't empty, hit another
			 * free cpu.
			 */
			tmpmask = dummy_rdyprocmask;
			CPUMASK_NANDMASK(tmpmask, dummy_curprocmask);
			CPUMASK_ANDMASK(tmpmask, gd->gd_other_cpus);
			if (CPUMASK_TESTNZERO(tmpmask) && dummy_runqcount) {
				tmpid = BSFCPUMASK(tmpmask);
				KKASSERT(tmpid != cpuid);
				ATOMIC_CPUMASK_NANDBIT(dummy_rdyprocmask,
						       tmpid);
				spin_unlock(&dummy_spin);
				lwkt_schedule(dummy_pcpu[tmpid].helper_thread);
			} else {
				spin_unlock(&dummy_spin);
			}
		} else if ((lp = TAILQ_FIRST(&dummy_runq)) != NULL) {
			--dummy_runqcount;
			TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
			atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
			dd->uschedcp = lp;
			ATOMIC_CPUMASK_ORBIT(dummy_curprocmask, cpuid);
			spin_unlock(&dummy_spin);
			lwkt_acquire(lp->lwp_thread);
			lwkt_schedule(lp->lwp_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
		lwkt_switch();
	}
}

/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by dummyinit() and we should not mess with it further.
 */
static void
dummy_sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		kprintf("start dummy scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		dummy_pcpu_t dd = &dummy_pcpu[i];
		cpumask_t mask;

		CPUMASK_ASSBIT(mask, i);

		if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
			continue;

		if (bootverbose)
			kprintf(" %d", i);

		lwkt_create(dummy_sched_thread, NULL, &dd->helper_thread, NULL,
			    TDF_NOSTART, i, "dsched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in dummyinit().
		 */
		if (i)
			ATOMIC_CPUMASK_NANDMASK(dummy_curprocmask, mask);
		ATOMIC_CPUMASK_ORMASK(dummy_rdyprocmask, mask);
	}
	if (bootverbose)
		kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	dummy_sched_thread_cpu_init, NULL);
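
/*
 * Note on initialization order: both SYSINITs in this file run at
 * SI_BOOT2_USCHED, but dummyinit() runs at SI_ORDER_FIRST while
 * dummy_sched_thread_cpu_init() runs at SI_ORDER_SECOND, so the run
 * queue, its spinlock and the cpu 0 curprocmask bit are guaranteed to
 * be initialized before any helper thread is created.
 */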