/*	$NetBSD: kern_synch.c,v 1.358 2023/07/17 12:54:29 riastradh Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.358 2023/07/17 12:54:29 riastradh Exp $");

#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/syslog.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	.sobj_name	= "sleep",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t sched_syncobj = {
	.sobj_name	= "sched",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sched_unsleep,
	.sobj_changepri	= sched_changepri,
	.sobj_lendpri	= sched_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t kpause_syncobj = {
	.sobj_name	= "kpause",
	.sobj_flag	= SOBJ_SLEEPQ_NULL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t lbolt __cacheline_aligned;

u_int sched_pstats_ticks __cacheline_aligned;

/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit __cacheline_aligned;
static struct evcnt kpreempt_ev_klock __cacheline_aligned;
static struct evcnt kpreempt_ev_immed __cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call. Suspends the current LWP until a wakeup is
 * performed on the specified identifier. The LWP will then be made
 * runnable with the specified priority. Sleeps at most timo/hz seconds (0
 * means no timeout). If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked. Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the
 * system call should be interrupted by the signal.
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
	//KASSERT(KERNEL_LOCKED_P());

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	l->l_kpriority = true;
	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	return sleepq_block(timo, catch_p, &sleep_syncobj);
}
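
/*
 * Illustrative sketch (not part of this file's interfaces): the classic
 * tsleep()/wakeup() pairing sleeps on an arbitrary kernel address.  The
 * "sc"/"sc_busy" names below are hypothetical.
 *
 *	while (sc->sc_busy) {
 *		error = tsleep(&sc->sc_busy, PRIBIO | PCATCH, "scbusy", 0);
 *		if (error)
 *			return error;	(interrupted by a signal)
 *	}
 *	...
 *	sc->sc_busy = 0;
 *	wakeup(&sc->sc_busy);
 *
 * New code should use condition variables (condvar(9)) instead.
 */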

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
    kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	mutex_exit(mtx);
	error = sleepq_block(timo, catch_p, &sleep_syncobj);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
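
/*
 * Illustrative sketch (hypothetical names): mtsleep() drops the given
 * kmutex only after enqueueing on the sleep queue, so no wakeup can be
 * lost, and reacquires it on return unless PNORELOCK was passed.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy) {
 *		error = mtsleep(&sc->sc_busy, PRIBIO | PCATCH, "scbusy",
 *		    0, &sc->sc_lock);
 *		if (error)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 */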

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	int error;

	KASSERT(timo != 0 || intr);

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	lwp_lock(l);
	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
	sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
	error = sleepq_block(timo, intr, &kpause_syncobj);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
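
/*
 * Illustrative sketch: kpause() is the usual way to simply wait out a
 * time interval, e.g. polling hardware that needs time to settle.  The
 * wait channel is the LWP itself, so no wakeup() will ever hit it.
 *
 *	(void)kpause("hwsettle", false, mstohz(10), NULL);
 *
 * mstohz() converts milliseconds to clock ticks; passing a held kmutex
 * instead of NULL makes kpause() drop and reacquire it around the sleep.
 */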

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call. Puts the current LWP back on its run queue and
 * performs a context switch.
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	/* Voluntary - ditch kpriority boost. */
	l->l_kpriority = false;
	spc_lock(l->l_cpu);
	mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call. Puts the current LWP back on its run queue
 * and performs an involuntary context switch. Different from yield()
 * in that:
 *
 * - It's counted differently (involuntary vs. voluntary).
 * - Realtime threads go to the head of their runqueue vs. tail for yield().
 * - Priority boost is retained unless LWP has exceeded timeslice.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	/* Involuntary - keep kpriority boost unless a CPU hog. */
	if ((l->l_cpu->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) != 0) {
		l->l_kpriority = false;
	}
	l->l_pflag |= LP_PREEMPTING;
	mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Return true if the current LWP should yield the processor. Intended to
 * be used by long-running code in the kernel.
 */
inline bool
preempt_needed(void)
{
	lwp_t *l = curlwp;
	int needed;

	KPREEMPT_DISABLE(l);
	needed = l->l_cpu->ci_want_resched;
	KPREEMPT_ENABLE(l);

	return (needed != 0);
}

/*
 * A breathing point for long-running code in the kernel.
 */
void
preempt_point(void)
{

	if (__predict_false(preempt_needed())) {
		preempt();
	}
}
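
/*
 * Illustrative sketch (hypothetical loop): long-running kernel code can
 * place a breathing point in each iteration; preempt_point() is a cheap
 * check that only context switches when a reschedule is wanted.
 *
 *	for (i = 0; i < npages; i++) {
 *		process_page(&pgs[i]);
 *		preempt_point();
 *	}
 */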

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel. Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char kpreempt_is_disabled;
static char kernel_lock_held;
static char is_softint_lwp;
static char spl_is_raised;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			atomic_swap_uint(&l->l_dopreempt, 0);
			return true;
		}
		KASSERT((l->l_flag & LW_IDLE) == 0);
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&kpreempt_is_disabled;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			atomic_swap_uint(&l->l_dopreempt, 0);
			failed = (uintptr_t)&is_softint_lwp;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&spl_is_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		/* Involuntary - keep kpriority boost. */
		l->l_pflag |= LP_PREEMPTING;
		spc_lock(l->l_cpu);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
	    cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
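
/*
 * Illustrative sketch (hypothetical helper): code that relies on
 * staying on the current CPU, e.g. while manipulating per-CPU data,
 * brackets the critical section with kpreempt_disable()/enable():
 *
 *	kpreempt_disable();
 *	ci = curcpu();		(stable - the LWP cannot migrate)
 *	update_percpu_stats(ci);
 *	kpreempt_enable();
 */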

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{
	static bool backwards = false;

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
		char caller[128];

#ifdef DDB
		db_symstr(caller, sizeof(caller),
		    (db_expr_t)(intptr_t)__builtin_return_address(0),
		    DB_STGY_PROC);
#else
		snprintf(caller, sizeof(caller), "%p",
		    __builtin_return_address(0));
#endif
		backwards = true;
		printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
		    " timecounter went backwards"
		    " from (%jd + 0x%016"PRIx64"/2^64) sec"
		    " to (%jd + 0x%016"PRIx64"/2^64) sec"
		    " in %s\n",
		    (long)l->l_lid,
		    l->l_proc->p_comm,
		    l->l_name ? " " : "",
		    l->l_name ? l->l_name : "",
		    l->l_pflag,
		    (intmax_t)l->l_stime.sec, l->l_stime.frac,
		    (intmax_t)now->sec, now->frac,
		    caller);
	}

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}
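
/*
 * Worked example of the update above, with made-up values: if the LWP
 * went on-CPU at stime = 10.25s and it is now 10.75s, then
 *
 *	rtime += now - stime  =>  rtime += 0.50s
 *
 * struct bintime holds seconds plus a 64-bit binary fraction, and
 * bintime_add()/bintime_sub() carry/borrow between the two fields.
 */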

/*
 * Select the next LWP to run on the current CPU.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * On arrival here LWPs on a run queue are locked by spc_mutex which
	 * is currently held. Idle LWPs are always locked by spc_lwplock,
	 * which may or may not be held here. On exit from this code block,
	 * in all cases newl is locked by spc_lwplock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = lwp_eprio(newl);
		spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		/*
		 * The idle LWP does not get set to LSONPROC, because
		 * otherwise it screws up the output from top(1) etc.
		 */
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = PRI_IDLE;
		spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
		    SPCF_IDLE;
	}

	/*
	 * Only clear want_resched if there are no pending (slow) software
	 * interrupts. We can do this without an atomic, because no new
	 * LWPs can appear in the queue due to our hold on spc_mutex, and
	 * the update to ci_want_resched will become globally visible before
	 * the release of spc_mutex becomes globally visible.
	 */
	if (ci->ci_data.cpu_softints == 0)
		ci->ci_want_resched = 0;

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * NOTE: l->l_cpu is not changed in this routine, because an LWP never
 * changes its own l_cpu (that would screw up curcpu on many ports and could
 * cause all kinds of other evil stuff). l_cpu is always changed by some
 * other actor, when it's known the LWP is not running (the LP_RUNNING flag
 * is checked under lock).
 */
void
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	kmutex_t *lock;
	int oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
	KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
	ci = curcpu();
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues. If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		KASSERT((l->l_flag & LW_IDLE) == 0);
		l->l_stat = LSRUN;
		lwp_setlock(l, spc->spc_mutex);
		sched_enqueue(l);
		sched_preempted(l);

		/*
		 * Handle migration. Note that "migrating LWP" may
		 * be reset here, if interrupt/preemption happens
		 * early in idle LWP.
		 */
		if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
			KASSERT((l->l_pflag & LP_INTR) == 0);
			spc->spc_migrating = l;
		}
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Count time spent in current system call */
		SYSCALL_TIME_SLEEP(l);

		updatertime(l, &bt);

		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use ci_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt. This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_onproc = newl;
	}

	/*
	 * Preemption related tasks. Must be done holding spc_mutex. Clear
	 * l_dopreempt without an atomic - it's only ever set non-zero by
	 * sched_resched_cpu() which also holds spc_mutex, and only ever
	 * cleared by the LWP itself (us) with atomics when not under lock.
	 */
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/* We're down to only one lock, so do debug checks. */
		LOCKDEBUG_BARRIER(l->l_mutex, 1);

		/* Count the context switch. */
		CPU_COUNT(CPU_COUNT_NSWTCH, 1);
		l->l_ncsw++;
		if ((l->l_pflag & LP_PREEMPTING) != 0) {
			l->l_nivcsw++;
			l->l_pflag &= ~LP_PREEMPTING;
		}

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED after
		 * releasing the lock.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1 "
		    "(block with spin-mutex held)",
		    __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count = -2;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
			    LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
		}

		/*
		 * If curlwp is a soft interrupt LWP, there's nobody on the
		 * other side to unlock - we're returning into an assembly
		 * trampoline. Unlock now. This is safe because this is a
		 * kernel LWP and is bound to current CPU: the worst anyone
		 * else will do to it, is to put it back onto this CPU's run
		 * queue (and the CPU is busy here right now!).
		 */
		if (returning) {
			/* Keep IPL_SCHED after this; MD code will fix up. */
			l->l_pflag &= ~LP_RUNNING;
			lwp_unlock(l);
		} else {
			/* A normal LWP: save old VM context. */
			pmap_deactivate(l);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/*
		 * We must ensure not to come here from inside a read section.
		 */
		KASSERT(pserialize_not_in_read_section());

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);
		KASSERT(prevlwp != NULL);
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -2);

		/*
		 * Immediately mark the previous LWP as no longer running
		 * and unlock (to keep lock wait times as short as possible).
		 * We'll still be at IPL_SCHED afterwards. If a zombie,
		 * don't touch after clearing LP_RUNNING as it could be
		 * reaped by another CPU. Issue a memory barrier to ensure
		 * this.
		 *
		 * atomic_store_release matches atomic_load_acquire in
		 * lwp_free.
		 */
		KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
		lock = prevlwp->l_mutex;
		if (__predict_false(prevlwp->l_stat == LSZOMB)) {
			atomic_store_release(&prevlwp->l_pflag,
			    prevlwp->l_pflag & ~LP_RUNNING);
		} else {
			prevlwp->l_pflag &= ~LP_RUNNING;
		}
		mutex_spin_exit(lock);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		pcu_switchpoint(l);

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/*
		 * Normalize the spin mutex count and restore the previous
		 * SPL. Note that, unless the caller disabled preemption,
		 * we can be preempted at any time after this splx().
		 */
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -1);
		ci->ci_mtx_count = 0;
		splx(oldspl);
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		l->l_pflag &= ~LP_PREEMPTING;
		lwp_unlock(l);
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
}
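
/*
 * Caller protocol sketch, mirroring yield() and preempt() above: the
 * LWP must be locked and the CPU's run-queue lock held on entry.
 *
 *	lwp_lock(l);
 *	spc_lock(l->l_cpu);
 *	mi_switch(l);		(returns when this LWP next runs)
 */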

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked. Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	kmutex_t *oldlock;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	case LSIDL:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC. It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* SPC & LWP now unlocked. */
	mutex_spin_exit(oldlock);
}

/*
 * suspendsched:
 *
 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		if (p->p_stat != SSTOP) {
			if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
				p->p_pptr->p_nstopchild++;
				p->p_waited = 0;
			}
			p->p_stat = SSTOP;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode. We want to
			 * get as many LWPs as possible to the user / kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(&proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 *
	 * Unusually, we don't hold any other scheduler object locked, which
	 * would keep preemption off for sched_resched_cpu(), so disable it
	 * explicitly.
	 */
	kpreempt_disable();
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		sched_resched_cpu(ci, PRI_KERNEL, true);
		/* spc now unlocked */
	}
	kpreempt_enable();
}

/*
 * sched_unsleep:
 *
 * This is called when the LWP has not been awoken normally but instead
 * interrupted: for example, if the sleep timed out. Because of this,
 * it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_priority = pri;
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_priority = pri;
	}
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	}
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
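
/*
 * Worked check of the constant above: sched_pstats() runs once per
 * second, so after 60 samples the remaining weight of old history is
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.0498, i.e. ~95% decayed.
 */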

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
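
/*
 * Derivation: at one sample every 5 seconds, the 1/5/15 minute windows
 * span 12/60/180 samples, hence the decay factors exp(-1/12),
 * exp(-1/60) and exp(-1/180). sched_pstats() below then maintains each
 * load average as a fixed-point exponentially weighted moving average:
 *
 *	ldavg' = c * ldavg + (1 - c) * nrun,	where c = cexp[i] / FSCALE
 */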

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwardslwp = false;
	static bool backwardsproc = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			if (__predict_false(l->l_rtime.sec < 0) &&
			    !backwardslwp) {
				backwardslwp = true;
				printf("WARNING: lwp %ld (%s%s%s): "
				    "negative runtime: "
				    "(%jd + 0x%016"PRIx64"/2^64) sec\n",
				    (long)l->l_lid,
				    l->l_proc->p_comm,
				    l->l_name ? " " : "",
				    l->l_name ? l->l_name : "",
				    (intmax_t)l->l_rtime.sec,
				    l->l_rtime.frac);
			}
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwardsproc) {
				backwardsproc = true;
				printf("WARNING: pid %ld (%s): "
				    "negative runtime; "
				    "monotonic clock has gone backwards\n",
				    (long)p->p_pid, p->p_comm);
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE,
				    "pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);

	mutex_exit(&proc_lock);
}
1279