1 /* $NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $ */
2
3 /*
4 * Copyright (c) 2008, 2011 Mindaugas Rasiukevicius <rmind at NetBSD org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
/*
 * System calls relating to the scheduler.
 *
 * Lock order:
 *
 *	cpu_lock ->
 *	    proc_lock ->
 *		proc_t::p_lock ->
 *		    lwp_t::lwp_lock
 *
 * TODO:
 *  - Handle pthread_setschedprio() as defined by POSIX;
 */
42
43 #include <sys/cdefs.h>
44 __KERNEL_RCSID(0, "$NetBSD: sys_sched.c,v 1.50 2023/04/09 09:18:09 riastradh Exp $");
45
46 #include <sys/param.h>
47
48 #include <sys/cpu.h>
49 #include <sys/kauth.h>
50 #include <sys/kmem.h>
51 #include <sys/lwp.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/pset.h>
55 #include <sys/sched.h>
56 #include <sys/syscallargs.h>
57 #include <sys/sysctl.h>
58 #include <sys/systm.h>
59 #include <sys/types.h>
60 #include <sys/unistd.h>
61
/* Log of the sysctl nodes created by sysctl_sched_setup(). */
static struct sysctllog *sched_sysctl_log;
/* kauth(9) listener authorizing scheduler-related process actions. */
static kauth_listener_t sched_listener;
64
65 /*
66 * Convert user priority or the in-kernel priority or convert the current
67 * priority to the appropriate range according to the policy change.
68 */
69 static pri_t
convert_pri(lwp_t * l,int policy,pri_t pri)70 convert_pri(lwp_t *l, int policy, pri_t pri)
71 {
72
73 /* Convert user priority to the in-kernel */
74 if (pri != PRI_NONE) {
75 /* Only for real-time threads */
76 KASSERT(pri >= SCHED_PRI_MIN);
77 KASSERT(pri <= SCHED_PRI_MAX);
78 KASSERT(policy != SCHED_OTHER);
79 return PRI_USER_RT + pri;
80 }
81
82 /* Neither policy, nor priority change */
83 if (l->l_class == policy)
84 return l->l_priority;
85
86 /* Time-sharing -> real-time */
87 if (l->l_class == SCHED_OTHER) {
88 KASSERT(policy == SCHED_FIFO || policy == SCHED_RR);
89 return PRI_USER_RT;
90 }
91
92 /* Real-time -> time-sharing */
93 if (policy == SCHED_OTHER) {
94 KASSERT(l->l_class == SCHED_FIFO || l->l_class == SCHED_RR);
95 /*
96 * this is a bit arbitrary because the priority is dynamic
97 * for SCHED_OTHER threads and will likely be changed by
98 * the scheduler soon anyway.
99 */
100 return l->l_priority - PRI_USER_RT;
101 }
102
103 /* Real-time -> real-time */
104 return l->l_priority;
105 }
106
/*
 * do_sched_setparam:
 *
 *	Set the scheduling class and/or priority of the LWPs of process
 *	'pid' (0 means the calling process).  If 'lid' is non-zero, only
 *	that LWP is affected, otherwise all LWPs of the process are.
 *	'policy' may be SCHED_NONE to keep each LWP's current class, and
 *	params->sched_priority may be PRI_NONE to keep or derive the
 *	priority (see convert_pri()).
 *
 *	Returns 0 on success; EINVAL for an invalid class/priority
 *	combination, ESRCH if no process or LWP matched, EPERM for
 *	system processes or if the kauth(9) check fails.
 */
int
do_sched_setparam(pid_t pid, lwpid_t lid, int policy,
    const struct sched_param *params)
{
	struct proc *p;
	struct lwp *t;
	pri_t pri;
	u_int lcnt;
	int error;

	error = 0;

	pri = params->sched_priority;

	/* If no parameters specified, just return (this should not happen) */
	if (pri == PRI_NONE && policy == SCHED_NONE)
		return 0;

	/* Validate scheduling class */
	if (policy != SCHED_NONE && (policy < SCHED_OTHER || policy > SCHED_RR))
		return EINVAL;

	/* Validate priority */
	if (pri != PRI_NONE && (pri < SCHED_PRI_MIN || pri > SCHED_PRI_MAX))
		return EINVAL;

	if (pid != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(pid);
		if (p == NULL) {
			mutex_exit(&proc_lock);
			return ESRCH;
		}
		/* Take p_lock before dropping proc_lock so 'p' cannot exit. */
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			return EPERM;
		}
	} else {
		/* Use the calling process */
		p = curlwp->l_proc;
		mutex_enter(p->p_lock);
	}

	/* Find the LWP(s) */
	lcnt = 0;
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		pri_t kpri;
		int lpolicy;

		if (lid && lid != t->l_lid)
			continue;

		lcnt++;
		lwp_lock(t);
		/* SCHED_NONE means: keep this LWP's current class. */
		lpolicy = (policy == SCHED_NONE) ? t->l_class : policy;

		/* Disallow setting of priority for SCHED_OTHER threads */
		if (lpolicy == SCHED_OTHER && pri != PRI_NONE) {
			lwp_unlock(t);
			error = EINVAL;
			break;
		}

		/* Convert priority, if needed */
		kpri = convert_pri(t, lpolicy, pri);

		/* Check the permission */
		error = kauth_authorize_process(kauth_cred_get(),
		    KAUTH_PROCESS_SCHEDULER_SETPARAM, p, t, KAUTH_ARG(lpolicy),
		    KAUTH_ARG(kpri));
		if (error) {
			lwp_unlock(t);
			break;
		}

		/* Set the scheduling class, change the priority */
		t->l_class = lpolicy;
		lwp_changepri(t, kpri);
		lwp_unlock(t);
	}
	mutex_exit(p->p_lock);
	/* ESRCH if no LWP matched 'lid'; otherwise the last error, if any. */
	return (lcnt == 0) ? ESRCH : error;
}
194
195 /*
196 * Set scheduling parameters.
197 */
198 int
sys__sched_setparam(struct lwp * l,const struct sys__sched_setparam_args * uap,register_t * retval)199 sys__sched_setparam(struct lwp *l, const struct sys__sched_setparam_args *uap,
200 register_t *retval)
201 {
202 /* {
203 syscallarg(pid_t) pid;
204 syscallarg(lwpid_t) lid;
205 syscallarg(int) policy;
206 syscallarg(const struct sched_param *) params;
207 } */
208 struct sched_param params;
209 int error;
210
211 /* Get the parameters from the user-space */
212 error = copyin(SCARG(uap, params), ¶ms, sizeof(params));
213 if (error)
214 goto out;
215
216 error = do_sched_setparam(SCARG(uap, pid), SCARG(uap, lid),
217 SCARG(uap, policy), ¶ms);
218 out:
219 return error;
220 }
221
/*
 * do_sched_getparam:
 *
 *	Fetch the scheduling class (into *policy) and the user-visible
 *	priority (into *params) of the given LWP; either pointer may be
 *	NULL if the caller is not interested.
 *
 *	if lid=0, returns the parameter of the first LWP in the process.
 */
int
do_sched_getparam(pid_t pid, lwpid_t lid, int *policy,
    struct sched_param *params)
{
	struct sched_param lparams;
	struct lwp *t;
	int error, lpolicy;

	if (pid < 0 || lid < 0)
		return EINVAL;

	t = lwp_find2(pid, lid);	/* acquire p_lock */
	if (t == NULL)
		return ESRCH;

	/* Check the permission */
	error = kauth_authorize_process(kauth_cred_get(),
	    KAUTH_PROCESS_SCHEDULER_GETPARAM, t->l_proc, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(t->l_proc->p_lock);
		return error;
	}

	/* Snapshot class and priority under the LWP lock. */
	lwp_lock(t);
	lparams.sched_priority = t->l_priority;
	lpolicy = t->l_class;
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	/*
	 * convert to the user-visible priority value.
	 * it's an inversion of convert_pri().
	 *
	 * the SCHED_OTHER case is a bit arbitrary given that
	 * - we don't allow setting the priority.
	 * - the priority is dynamic.
	 */
	switch (lpolicy) {
	case SCHED_OTHER:
		lparams.sched_priority -= PRI_USER;
		break;
	case SCHED_RR:
	case SCHED_FIFO:
		lparams.sched_priority -= PRI_USER_RT;
		break;
	}

	if (policy != NULL)
		*policy = lpolicy;

	if (params != NULL)
		*params = lparams;

	return error;
}
282
283 /*
284 * Get scheduling parameters.
285 */
286 int
sys__sched_getparam(struct lwp * l,const struct sys__sched_getparam_args * uap,register_t * retval)287 sys__sched_getparam(struct lwp *l, const struct sys__sched_getparam_args *uap,
288 register_t *retval)
289 {
290 /* {
291 syscallarg(pid_t) pid;
292 syscallarg(lwpid_t) lid;
293 syscallarg(int *) policy;
294 syscallarg(struct sched_param *) params;
295 } */
296 struct sched_param params;
297 int error, policy;
298
299 error = do_sched_getparam(SCARG(uap, pid), SCARG(uap, lid), &policy,
300 ¶ms);
301 if (error)
302 goto out;
303
304 error = copyout(¶ms, SCARG(uap, params), sizeof(params));
305 if (error == 0 && SCARG(uap, policy) != NULL)
306 error = copyout(&policy, SCARG(uap, policy), sizeof(int));
307 out:
308 return error;
309 }
310
311 /*
312 * Allocate the CPU set, and get it from userspace.
313 */
314 static int
genkcpuset(kcpuset_t ** dset,const cpuset_t * sset,size_t size)315 genkcpuset(kcpuset_t **dset, const cpuset_t *sset, size_t size)
316 {
317 kcpuset_t *kset;
318 int error;
319
320 kcpuset_create(&kset, true);
321 error = kcpuset_copyin(sset, kset, size);
322 if (error) {
323 kcpuset_unuse(kset, NULL);
324 } else {
325 *dset = kset;
326 }
327 return error;
328 }
329
/*
 * Set affinity.
 *
 * Binds the LWPs of a process (or one LWP, if 'lid' is non-zero) to the
 * CPUs in the given set.  An empty set clears any existing affinity.
 */
int
sys__sched_setaffinity(struct lwp *l,
    const struct sys__sched_setaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(const cpuset_t *) cpuset;
	} */
	kcpuset_t *kcset, *kcpulst = NULL;
	struct cpu_info *ici, *ci;
	struct proc *p;
	struct lwp *t;
	CPU_INFO_ITERATOR cii;
	bool alloff;
	lwpid_t lid;
	u_int lcnt;
	int error;

	/* Copy the requested CPU set in (we hold one reference on it). */
	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/*
	 * Traverse _each_ CPU to:
	 *  - Check that CPUs in the mask have no assigned processor set.
	 *  - Check that at least one CPU from the mask is online.
	 *  - Find the first target CPU to migrate.
	 *
	 * To avoid the race with CPU online/offline calls and processor sets,
	 * cpu_lock will be locked for the entire operation.
	 */
	ci = NULL;
	alloff = false;
	mutex_enter(&cpu_lock);
	for (CPU_INFO_FOREACH(cii, ici)) {
		struct schedstate_percpu *ispc;

		if (!kcpuset_isset(kcset, cpu_index(ici))) {
			continue;
		}

		ispc = &ici->ci_schedstate;
		/* Check that CPU is not in the processor-set */
		if (ispc->spc_psid != PS_NONE) {
			error = EPERM;
			goto out;
		}
		/* Skip offline CPUs */
		if (ispc->spc_flags & SPCF_OFFLINE) {
			alloff = true;
			continue;
		}
		/* Target CPU to migrate */
		if (ci == NULL) {
			ci = ici;
		}
	}
	if (ci == NULL) {
		if (alloff) {
			/* All CPUs in the set are offline */
			error = EPERM;
			goto out;
		}
		/* Empty set: drop our reference; affinity is cleared below. */
		kcpuset_unuse(kcset, &kcpulst);
		kcset = NULL;
	}

	if (SCARG(uap, pid) != 0) {
		/* Find the process */
		mutex_enter(&proc_lock);
		p = proc_find(SCARG(uap, pid));
		if (p == NULL) {
			mutex_exit(&proc_lock);
			error = ESRCH;
			goto out;
		}
		/* Take p_lock before dropping proc_lock so 'p' cannot exit. */
		mutex_enter(p->p_lock);
		mutex_exit(&proc_lock);
		/* Disallow modification of system processes. */
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			error = EPERM;
			goto out;
		}
	} else {
		/* Use the calling process */
		p = l->l_proc;
		mutex_enter(p->p_lock);
	}

	/*
	 * Check the permission.
	 */
	error = kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_SETAFFINITY, p, NULL, NULL, NULL);
	if (error != 0) {
		mutex_exit(p->p_lock);
		goto out;
	}

	/* Iterate through LWP(s). */
	lcnt = 0;
	lid = SCARG(uap, lid);
	LIST_FOREACH(t, &p->p_lwps, l_sibling) {
		if (lid && lid != t->l_lid) {
			continue;
		}
		lwp_lock(t);
		/* No affinity for zombie LWPs. */
		if (t->l_stat == LSZOMB) {
			lwp_unlock(t);
			continue;
		}
		/* First, release existing affinity, if any. */
		if (t->l_affinity) {
			kcpuset_unuse(t->l_affinity, &kcpulst);
		}
		if (kcset) {
			/*
			 * Hold a reference on affinity mask, assign mask to
			 * LWP and migrate it to another CPU (unlocks LWP).
			 */
			kcpuset_use(kcset);
			t->l_affinity = kcset;
			lwp_migrate(t, ci);
		} else {
			/* Old affinity mask is released, just clear. */
			t->l_affinity = NULL;
			lwp_unlock(t);
		}
		lcnt++;
	}
	mutex_exit(p->p_lock);
	if (lcnt == 0) {
		/* No LWP matched the given 'lid'. */
		error = ESRCH;
	}
out:
	mutex_exit(&cpu_lock);

	/*
	 * Drop the initial reference (LWPs, if any, have the ownership now),
	 * and destroy whatever is in the G/C list, if filled.
	 */
	if (kcset) {
		kcpuset_unuse(kcset, &kcpulst);
	}
	if (kcpulst) {
		kcpuset_destroy(kcpulst);
	}
	return error;
}
487
/*
 * Get affinity.
 *
 * Copies out the affinity mask of the given LWP; an all-zero set is
 * returned if the LWP has no affinity assigned.
 */
int
sys__sched_getaffinity(struct lwp *l,
    const struct sys__sched_getaffinity_args *uap, register_t *retval)
{
	/* {
		syscallarg(pid_t) pid;
		syscallarg(lwpid_t) lid;
		syscallarg(size_t) size;
		syscallarg(cpuset_t *) cpuset;
	} */
	struct lwp *t;
	kcpuset_t *kcset;
	int error;

	if (SCARG(uap, pid) < 0 || SCARG(uap, lid) < 0)
		return EINVAL;

	/*
	 * Allocate a scratch set (also validates 'size' via the copyin);
	 * its contents are overwritten below before the copyout.
	 */
	error = genkcpuset(&kcset, SCARG(uap, cpuset), SCARG(uap, size));
	if (error)
		return error;

	/* Locks the LWP */
	t = lwp_find2(SCARG(uap, pid), SCARG(uap, lid));
	if (t == NULL) {
		error = ESRCH;
		goto out;
	}
	/* Check the permission */
	if (kauth_authorize_process(l->l_cred,
	    KAUTH_PROCESS_SCHEDULER_GETAFFINITY, t->l_proc, NULL, NULL, NULL)) {
		mutex_exit(t->l_proc->p_lock);
		error = EPERM;
		goto out;
	}
	/* Snapshot the affinity mask under the LWP lock. */
	lwp_lock(t);
	if (t->l_affinity) {
		kcpuset_copy(kcset, t->l_affinity);
	} else {
		kcpuset_zero(kcset);
	}
	lwp_unlock(t);
	mutex_exit(t->l_proc->p_lock);

	error = kcpuset_copyout(kcset, SCARG(uap, cpuset), SCARG(uap, size));
out:
	kcpuset_unuse(kcset, NULL);
	return error;
}
539
/*
 * Priority protection for PTHREAD_PRIO_PROTECT.  This is a weak
 * analogue of priority inheritance: temp raise the priority
 * of the caller when accessing a protected resource.
 *
 * priority == -1 pops one protection level; any other negative value
 * queries the current protect priority; a non-negative value pushes a
 * new protection level at that (user real-time) priority.
 */
int
sys__sched_protect(struct lwp *l,
    const struct sys__sched_protect_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) priority;
		syscallarg(int *) opriority;
	} */
	int error;
	pri_t pri;

	KASSERT(l->l_inheritedprio == -1);
	KASSERT(l->l_auxprio == -1 || l->l_auxprio == l->l_protectprio);

	pri = SCARG(uap, priority);
	error = 0;
	lwp_lock(l);
	if (pri == -1) {
		/* back out priority changes */
		switch(l->l_protectdepth) {
		case 0:
			/* Nothing to back out. */
			error = EINVAL;
			break;
		case 1:
			/* Outermost level: drop the protection entirely. */
			l->l_protectdepth = 0;
			l->l_protectprio = -1;
			l->l_auxprio = -1;
			break;
		default:
			/* Still nested: just decrement the depth. */
			l->l_protectdepth--;
			break;
		}
	} else if (pri < 0) {
		/* Just retrieve the current value, for debugging */
		if (l->l_protectprio == -1)
			error = ENOENT;
		else
			*retval = l->l_protectprio - PRI_USER_RT;
	} else if (__predict_false(pri < SCHED_PRI_MIN ||
	    pri > SCHED_PRI_MAX || l->l_priority > pri + PRI_USER_RT)) {
		/* must fail if existing priority is higher */
		error = EPERM;
	} else {
		/* play along but make no changes if not a realtime LWP. */
		l->l_protectdepth++;
		pri += PRI_USER_RT;
		if (__predict_true(l->l_class != SCHED_OTHER &&
		    pri > l->l_protectprio)) {
			l->l_protectprio = pri;
			l->l_auxprio = pri;
		}
	}
	lwp_unlock(l);

	return error;
}
601
602 /*
603 * Yield.
604 */
605 int
sys_sched_yield(struct lwp * l,const void * v,register_t * retval)606 sys_sched_yield(struct lwp *l, const void *v, register_t *retval)
607 {
608
609 yield();
610 return 0;
611 }
612
/*
 * Sysctl nodes and initialization.
 */
static void
sysctl_sched_setup(struct sysctllog **clog)
{
	const struct sysctlnode *node = NULL;

	/* kern.posix_sched: POSIX Process Scheduling option version. */
	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT|CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "posix_sched",
		SYSCTL_DESCR("Version of IEEE Std 1003.1 and its "
			     "Process Scheduling option to which the "
			     "system attempts to conform"),
		NULL, _POSIX_PRIORITY_SCHEDULING, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);
	/* kern.sched: parent node for the scheduler options below. */
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	/* kern.sched.pri_min/pri_max: the POSIX real-time priority range. */
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_min",
		SYSCTL_DESCR("Minimal POSIX real-time priority"),
		NULL, SCHED_PRI_MIN, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(clog, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
		CTLTYPE_INT, "pri_max",
		SYSCTL_DESCR("Maximal POSIX real-time priority"),
		NULL, SCHED_PRI_MAX, NULL, 0,
		CTL_CREATE, CTL_EOL);
}
652
653 static int
sched_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)654 sched_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
655 void *arg0, void *arg1, void *arg2, void *arg3)
656 {
657 struct proc *p;
658 int result;
659
660 result = KAUTH_RESULT_DEFER;
661 p = arg0;
662
663 switch (action) {
664 case KAUTH_PROCESS_SCHEDULER_GETPARAM:
665 if (kauth_cred_uidmatch(cred, p->p_cred))
666 result = KAUTH_RESULT_ALLOW;
667 break;
668
669 case KAUTH_PROCESS_SCHEDULER_SETPARAM:
670 if (kauth_cred_uidmatch(cred, p->p_cred)) {
671 struct lwp *l;
672 int policy;
673 pri_t priority;
674
675 l = arg1;
676 policy = (int)(unsigned long)arg2;
677 priority = (pri_t)(unsigned long)arg3;
678
679 if ((policy == l->l_class ||
680 (policy != SCHED_FIFO && policy != SCHED_RR)) &&
681 priority <= l->l_priority)
682 result = KAUTH_RESULT_ALLOW;
683 }
684
685 break;
686
687 case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
688 result = KAUTH_RESULT_ALLOW;
689 break;
690
691 case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
692 /* Privileged; we let the secmodel handle this. */
693 break;
694
695 default:
696 break;
697 }
698
699 return result;
700 }
701
/*
 * sched_init: set up the scheduler syscall subsystem - create the
 * sysctl nodes and register the kauth(9) process-scope listener.
 */
void
sched_init(void)
{

	sysctl_sched_setup(&sched_sysctl_log);

	sched_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    sched_listener_cb, NULL);
}
711