xref: /openbsd/sys/kern/sched_bsd.c (revision aa563902)
1 /*	$OpenBSD: sched_bsd.c,v 1.77 2023/07/11 07:02:43 claudio Exp $	*/
2 /*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/
3 
4 /*-
5  * Copyright (c) 1982, 1986, 1990, 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  * (c) UNIX System Laboratories, Inc.
8  * All or some portions of this file are derived from material licensed
9  * to the University of California by American Telephone and Telegraph
10  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11  * the permission of UNIX System Laboratories, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
38  */
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/proc.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/resourcevar.h>
46 #include <uvm/uvm_extern.h>
47 #include <sys/sched.h>
48 #include <sys/timeout.h>
49 #include <sys/smr.h>
50 #include <sys/tracepoint.h>
51 
52 #ifdef KTRACE
53 #include <sys/ktrace.h>
54 #endif
55 
56 
57 int	lbolt;			/* once a second sleep address */
58 int	rrticks_init;		/* # of hardclock ticks per roundrobin() */
59 
60 #ifdef MULTIPROCESSOR
61 struct __mp_lock sched_lock;
62 #endif
63 
64 void			schedcpu(void *);
65 uint32_t		decay_aftersleep(uint32_t, uint32_t);
66 
67 /*
68  * Force switch among equal priority processes every 100ms.
69  */
70 void
71 roundrobin(struct cpu_info *ci)
72 {
73 	struct schedstate_percpu *spc = &ci->ci_schedstate;
74 
75 	spc->spc_rrticks = rrticks_init;
76 
77 	if (ci->ci_curproc != NULL) {
78 		if (spc->spc_schedflags & SPCF_SEENRR) {
79 			/*
80 			 * The process has already been through a roundrobin
81 			 * without switching and may be hogging the CPU.
82 			 * Indicate that the process should yield.
83 			 */
84 			atomic_setbits_int(&spc->spc_schedflags,
85 			    SPCF_SHOULDYIELD);
86 		} else {
87 			atomic_setbits_int(&spc->spc_schedflags,
88 			    SPCF_SEENRR);
89 		}
90 	}
91 
92 	if (spc->spc_nrun)
93 		need_resched(ci);
94 }
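
/*
 * Worked example of the quantum, assuming the common hz value of 100:
 * rrticks_init is the number of hardclock ticks between roundrobin()
 * calls and is set to hz / 10 in scheduler_start() below, so
 *
 *	rrticks_init = 100 / 10 = 10 ticks  ~=  100 ms
 *
 * matching the "every 100ms" promise in the comment above.
 */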
95 
96 /*
97  * Constants for digital decay and forget:
98  *	90% of (p_estcpu) usage in 5 * loadav time
99  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
100  *          Note that, as ps(1) mentions, this can let percentages
101  *          total over 100% (I've seen 137.9% for 3 processes).
102  *
103  * Note that hardclock updates p_estcpu and p_cpticks independently.
104  *
105  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
106  * That is, the system wants to compute a value of decay such
107  * that the following for loop:
108  * 	for (i = 0; i < (5 * loadavg); i++)
109  * 		p_estcpu *= decay;
110  * will compute
111  * 	p_estcpu *= 0.1;
112  * for all values of loadavg:
113  *
114  * Mathematically this loop can be expressed by saying:
115  * 	decay ** (5 * loadavg) ~= .1
116  *
117  * The system computes decay as:
118  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
119  *
120  * We wish to prove that the system's computation of decay
121  * will always fulfill the equation:
122  * 	decay ** (5 * loadavg) ~= .1
123  *
124  * If we compute b as:
125  * 	b = 2 * loadavg
126  * then
127  * 	decay = b / (b + 1)
128  *
129  * We now need to prove two things:
130  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
131  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
132  *
133  * Facts:
134  *         For x close to zero, exp(x) =~ 1 + x, since
135  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
136  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
137  *         For x close to zero, ln(1+x) =~ x, since
138  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
139  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
140  *         ln(.1) =~ -2.30
141  *
142  * Proof of (1):
143  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
144  *	solving for factor,
145  *      ln(factor) =~ (-2.30/5*loadav), or
146  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
147  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
148  *
149  * Proof of (2):
150  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
151  *	solving for power,
152  *      power*ln(b/(b+1)) =~ -2.30, or
153  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
154  *
155  * Actual power values for the implemented algorithm are as follows:
156  *      loadav: 1       2       3       4
157  *      power:  5.68    10.32   14.94   19.55
158  */
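
/*
 * Worked instance of the derivation above, assuming a load average of 2
 * (numbers are purely illustrative):
 *
 *	b     = 2 * loadavg        = 4
 *	decay = b / (b + 1)        = 4/5 = 0.8
 *	power = 5 * loadavg        = 10
 *	decay ** power = 0.8 ** 10 ~= 0.107 ~= .1
 *
 * so roughly 90% of p_estcpu is forgotten after 10 seconds, consistent
 * with the "power: 10.32" entry for loadav 2 in the table above.
 */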
159 
160 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
161 #define	loadfactor(loadav)	(2 * (loadav))
162 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
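
/*
 * Note that FSCALE is the fixed-point representation of 1.0 and the
 * load average is itself a fixpt_t, so the macros above evaluate, in
 * integer arithmetic,
 *
 *	decay_cpu(loadfac, cpu) = (loadfac * cpu) / (loadfac + FSCALE)
 *	                        = cpu * b / (b + 1),  with b = 2 * loadavg
 *
 * which is exactly the decay factor derived in the comment above.
 */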
163 
164 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
165 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
166 
167 /*
168  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
169  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
170  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
171  *
172  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
173  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
174  *
175  * If you don't want to bother with the faster/more-accurate formula, you
176  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
177  * (more general) method of calculating the %age of CPU used by a process.
178  */
179 #define	CCPU_SHIFT	11
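
/*
 * Rough arithmetic behind CCPU_SHIFT == 11, assuming the usual stathz
 * of 100 (illustrative only).  The exact update in schedcpu() below is
 *
 *	p_pctcpu += ((FSCALE - ccpu) * (p_cpticks * FSCALE / stathz)) >> FSHIFT
 *
 * and since 1 - exp(-1/20) ~= 0.0488 ~= 100 / 2048 = 100 * 2**-11,
 * multiplying by (1 - exp(-1/20)) and dividing by stathz == 100 is
 * close to a plain right shift by 11 bits, which is how the
 *
 *	p_cpticks << (FSHIFT - CCPU_SHIFT)
 *
 * shortcut in the (FSHIFT >= CCPU_SHIFT) branch comes about.
 */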
180 
181 /*
182  * Recompute process priorities, every second.
183  */
184 void
185 schedcpu(void *arg)
186 {
187 	struct timeout *to = (struct timeout *)arg;
188 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
189 	struct proc *p;
190 	int s;
191 	unsigned int newcpu;
192 
193 	LIST_FOREACH(p, &allproc, p_list) {
194 		/*
195 		 * Idle threads are never placed on the runqueue,
196 		 * therefore computing their priority is pointless.
197 		 */
198 		if (p->p_cpu != NULL &&
199 		    p->p_cpu->ci_schedstate.spc_idleproc == p)
200 			continue;
201 		/*
202 		 * Increment sleep time (if sleeping). We ignore overflow.
203 		 */
204 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
205 			p->p_slptime++;
206 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
207 		/*
208 		 * If the process has slept the entire second,
209 		 * stop recalculating its priority until it wakes up.
210 		 */
211 		if (p->p_slptime > 1)
212 			continue;
213 		SCHED_LOCK(s);
214 		/*
215 		 * p_pctcpu is only for diagnostic tools such as ps.
216 		 */
217 #if	(FSHIFT >= CCPU_SHIFT)
218 		p->p_pctcpu += (stathz == 100) ?
219 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
220 			100 * (((fixpt_t) p->p_cpticks)
221 				<< (FSHIFT - CCPU_SHIFT)) / stathz;
222 #else
223 		p->p_pctcpu += ((FSCALE - ccpu) *
224 			(p->p_cpticks * FSCALE / stathz)) >> FSHIFT;
225 #endif
226 		p->p_cpticks = 0;
227 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
228 		setpriority(p, newcpu, p->p_p->ps_nice);
229 
230 		if (p->p_stat == SRUN &&
231 		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
232 			remrunqueue(p);
233 			setrunqueue(p->p_cpu, p, p->p_usrpri);
234 		}
235 		SCHED_UNLOCK(s);
236 	}
237 	uvm_meter();
238 	wakeup(&lbolt);
239 	timeout_add_sec(to, 1);
240 }
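
/*
 * The requeue test above only moves a thread between run queues when
 * its priority crosses a queue boundary.  As an illustration, assuming
 * the traditional SCHED_PPQ of 4 (128 priorities spread over 32 queues):
 * a p_usrpri change from 66 to 67 stays in the same queue
 * (66 / 4 == 67 / 4 == 16), while a change from 66 to 68 moves the
 * thread to the next queue.
 */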
241 
242 /*
243  * Recalculate the priority of a process after it has slept for a while.
244  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
245  * least six times the loadfactor will decay p_estcpu to zero.
246  */
247 uint32_t
248 decay_aftersleep(uint32_t estcpu, uint32_t slptime)
249 {
250 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
251 	uint32_t newcpu;
252 
253 	if (slptime > 5 * loadfac)
254 		newcpu = 0;
255 	else {
256 		newcpu = estcpu;
257 		slptime--;	/* the first time was done in schedcpu */
258 		while (newcpu && --slptime)
259 			newcpu = decay_cpu(loadfac, newcpu);
260 
261 	}
262 
263 	return (newcpu);
264 }
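
/*
 * Each counted second of sleep applies one decay_cpu() step.  As a
 * rough illustration, with a load average of 1 the per-step factor is
 * b / (b + 1) = 2/3, so an estcpu of 60 decays as 60 -> 40 -> 26 ->
 * 17 -> ... under integer arithmetic, and even the maximum of 255
 * reaches zero within about 14 steps.
 */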
265 
266 /*
267  * General yield call.  Puts the current process back on its run queue and
268  * performs a voluntary context switch.
269  */
270 void
271 yield(void)
272 {
273 	struct proc *p = curproc;
274 	int s;
275 
276 	SCHED_LOCK(s);
277 	setrunqueue(p->p_cpu, p, p->p_usrpri);
278 	p->p_ru.ru_nvcsw++;
279 	mi_switch();
280 	SCHED_UNLOCK(s);
281 }
282 
283 /*
284  * General preemption call.  Puts the current process back on its run queue
285  * and performs an involuntary context switch.  The next process to run
286  * is chosen by the normal process selection criteria; the caller cannot
287  * supply a specific process to switch to.
288  */
289 void
290 preempt(void)
291 {
292 	struct proc *p = curproc;
293 	int s;
294 
295 	SCHED_LOCK(s);
296 	setrunqueue(p->p_cpu, p, p->p_usrpri);
297 	p->p_ru.ru_nivcsw++;
298 	mi_switch();
299 	SCHED_UNLOCK(s);
300 }
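
/*
 * yield() and preempt() above differ only in which rusage counter they
 * bump; the split is visible to userland as the ru_nvcsw (voluntary)
 * and ru_nivcsw (involuntary) context-switch counts returned by
 * getrusage(2).
 */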
301 
302 void
303 mi_switch(void)
304 {
305 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
306 	struct proc *p = curproc;
307 	struct proc *nextproc;
308 	struct process *pr = p->p_p;
309 	struct timespec ts;
310 #ifdef MULTIPROCESSOR
311 	int hold_count;
312 	int sched_count;
313 #endif
314 
315 	assertwaitok();
316 	KASSERT(p->p_stat != SONPROC);
317 
318 	SCHED_ASSERT_LOCKED();
319 
320 #ifdef MULTIPROCESSOR
321 	/*
322 	 * Release the kernel_lock, as we are about to yield the CPU.
323 	 */
324 	sched_count = __mp_release_all_but_one(&sched_lock);
325 	if (_kernel_lock_held())
326 		hold_count = __mp_release_all(&kernel_lock);
327 	else
328 		hold_count = 0;
329 #endif
330 
331 	/*
332 	 * Compute the amount of time during which the current
333 	 * process was running, and add that to its total so far.
334 	 */
335 	nanouptime(&ts);
336 	if (timespeccmp(&ts, &spc->spc_runtime, <)) {
337 #if 0
338 		printf("uptime is not monotonic! "
339 		    "ts=%lld.%09lu, runtime=%lld.%09lu\n",
340 		    (long long)ts.tv_sec, ts.tv_nsec,
341 		    (long long)spc->spc_runtime.tv_sec,
342 		    spc->spc_runtime.tv_nsec);
343 #endif
344 	} else {
345 		timespecsub(&ts, &spc->spc_runtime, &ts);
346 		timespecadd(&p->p_rtime, &ts, &p->p_rtime);
347 	}
348 
349 	/* add the time counts for this thread to the process's total */
350 	tuagg_unlocked(pr, p);
351 
352 	/*
353 	 * Process is about to yield the CPU; clear the appropriate
354 	 * scheduling flags.
355 	 */
356 	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
357 
358 	nextproc = sched_chooseproc();
359 
360 	if (p != nextproc) {
361 		uvmexp.swtch++;
362 		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
363 		    nextproc->p_p->ps_pid);
364 		cpu_switchto(p, nextproc);
365 		TRACEPOINT(sched, on__cpu, NULL);
366 	} else {
367 		TRACEPOINT(sched, remain__cpu, NULL);
368 		p->p_stat = SONPROC;
369 	}
370 
371 	clear_resched(curcpu());
372 
373 	SCHED_ASSERT_LOCKED();
374 
375 	/*
376 	 * To preserve lock ordering, we need to release the sched lock
377 	 * and grab it after we grab the big lock.
378 	 * In the future, when the sched lock isn't recursive, we'll
379 	 * just release it here.
380 	 */
381 #ifdef MULTIPROCESSOR
382 	__mp_unlock(&sched_lock);
383 #endif
384 
385 	SCHED_ASSERT_UNLOCKED();
386 
387 	smr_idle();
388 
389 	/*
390 	 * We're running again; record our new start time.  We might
391 	 * be running on a new CPU now, so don't use the cached
392 	 * schedstate_percpu pointer.
393 	 */
394 	KASSERT(p->p_cpu == curcpu());
395 
396 	nanouptime(&p->p_cpu->ci_schedstate.spc_runtime);
397 
398 #ifdef MULTIPROCESSOR
399 	/*
400 	 * Reacquire the kernel_lock now.  We do this after we've
401 	 * released the scheduler lock to avoid deadlock, and before
402 	 * we reacquire the scheduler lock.
403 	 */
404 	if (hold_count)
405 		__mp_acquire_count(&kernel_lock, hold_count);
406 	__mp_acquire_count(&sched_lock, sched_count + 1);
407 #endif
408 }
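
/*
 * A condensed sketch of the sequence above, for reference only:
 *
 *	1. drop kernel_lock and all but one sched_lock recursion
 *	2. charge the elapsed interval to p_rtime and the process totals
 *	3. pick the next thread with sched_chooseproc() and, if it is not
 *	   the current one, cpu_switchto() to it
 *	4. once running again, restart spc_runtime on the (possibly new) CPU
 *	5. reacquire kernel_lock, then the saved sched_lock recursion count
 */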
409 
410 /*
411  * Change process state to be runnable,
412  * placing it on the run queue.
413  */
414 void
415 setrunnable(struct proc *p)
416 {
417 	struct process *pr = p->p_p;
418 	u_char prio;
419 
420 	SCHED_ASSERT_LOCKED();
421 
422 	switch (p->p_stat) {
423 	case 0:
424 	case SRUN:
425 	case SONPROC:
426 	case SDEAD:
427 	case SIDL:
428 	default:
429 		panic("setrunnable");
430 	case SSTOP:
431 		/*
432 		 * If we're being traced (possibly because someone attached us
433 		 * while we were stopped), check for a signal from the debugger.
434 		 */
435 		if ((pr->ps_flags & PS_TRACED) != 0 && pr->ps_xsig != 0)
436 			atomic_setbits_int(&p->p_siglist, sigmask(pr->ps_xsig));
437 		prio = p->p_usrpri;
438 		unsleep(p);
439 		break;
440 	case SSLEEP:
441 		prio = p->p_slppri;
442 		unsleep(p);		/* e.g. when sending signals */
443 
444 		/* if not yet asleep, don't add to runqueue */
445 		if (ISSET(p->p_flag, P_WSLEEP))
446 			return;
447 		break;
448 	}
449 	setrunqueue(NULL, p, prio);
450 	if (p->p_slptime > 1) {
451 		uint32_t newcpu;
452 
453 		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
454 		setpriority(p, newcpu, pr->ps_nice);
455 	}
456 	p->p_slptime = 0;
457 }
458 
459 /*
460  * Compute the priority of a process.
461  */
462 void
463 setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
464 {
465 	unsigned int newprio;
466 
467 	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
468 
469 	SCHED_ASSERT_LOCKED();
470 	p->p_estcpu = newcpu;
471 	p->p_usrpri = newprio;
472 }
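
/*
 * Numeric illustration of the formula above, assuming the traditional
 * constants PUSER == 50, NICE_WEIGHT == 2, NZERO == 20 and MAXPRI == 127
 * (see the scheduler headers for the real values).  A nice 0 thread
 * with newcpu == 12 gets
 *
 *	newprio = min(50 + 12 + 2 * (20 - 20), 127) = 62
 *
 * while the same thread at nice +10 (ps_nice == 30) gets
 * min(50 + 12 + 2 * 10, 127) = 82; a larger number is a worse (less
 * favoured) user priority.
 */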
473 
474 /*
475  * We adjust the priority of the current process.  The priority of a process
476  * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
477  * is increased here.  The formula for computing priorities (in setpriority() above)
478  * will compute a different value each time p_estcpu increases. This can
479  * cause a switch, but unless the priority crosses a PPQ boundary the actual
480  * queue will not change.  The cpu usage estimator ramps up quite quickly
481  * when the process is running (linearly), and decays away exponentially, at
482  * a rate which is proportionally slower when the system is busy.  The basic
483  * principle is that the system will 90% forget that the process used a lot
484  * of CPU time in 5 * loadav seconds.  This causes the system to favor
485  * processes which haven't run much recently, and to round-robin among other
486  * processes.
487  */
488 void
489 schedclock(struct proc *p)
490 {
491 	struct cpu_info *ci = curcpu();
492 	struct schedstate_percpu *spc = &ci->ci_schedstate;
493 	uint32_t newcpu;
494 	int s;
495 
496 	if (p == spc->spc_idleproc || spc->spc_spinning)
497 		return;
498 
499 	SCHED_LOCK(s);
500 	newcpu = ESTCPULIM(p->p_estcpu + 1);
501 	setpriority(p, newcpu, p->p_p->ps_nice);
502 	SCHED_UNLOCK(s);
503 }
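
/*
 * Taken together with schedcpu() above: p_estcpu climbs by one on every
 * schedclock() call while the thread is running (capped by ESTCPULIM)
 * and is multiplied by b / (b + 1) once a second, which is the linear
 * ramp-up / exponential decay described in the comment before
 * schedclock().
 */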
504 
505 void (*cpu_setperf)(int);
506 
507 #define PERFPOL_MANUAL 0
508 #define PERFPOL_AUTO 1
509 #define PERFPOL_HIGH 2
510 int perflevel = 100;
511 int perfpolicy = PERFPOL_AUTO;
512 
513 #ifndef SMALL_KERNEL
514 /*
515  * The code below handles CPU throttling.
516  */
517 #include <sys/sysctl.h>
518 
519 void setperf_auto(void *);
520 struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
521 extern int hw_power;
522 
523 void
524 setperf_auto(void *v)
525 {
526 	static uint64_t *idleticks, *totalticks;
527 	static int downbeats;
528 	int i, j = 0;
529 	int speedup = 0;
530 	CPU_INFO_ITERATOR cii;
531 	struct cpu_info *ci;
532 	uint64_t idle, total, allidle = 0, alltotal = 0;
533 
534 	if (perfpolicy != PERFPOL_AUTO)
535 		return;
536 
537 	if (cpu_setperf == NULL)
538 		return;
539 
540 	if (hw_power) {
541 		speedup = 1;
542 		goto faster;
543 	}
544 
545 	if (!idleticks)
546 		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
547 		    M_DEVBUF, M_NOWAIT | M_ZERO)))
548 			return;
549 	if (!totalticks)
550 		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
551 		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
552 			free(idleticks, M_DEVBUF,
553 			    sizeof(*idleticks) * ncpusfound);
554 			return;
555 		}
556 	CPU_INFO_FOREACH(cii, ci) {
557 		if (!cpu_is_online(ci))
558 			continue;
559 		total = 0;
560 		for (i = 0; i < CPUSTATES; i++) {
561 			total += ci->ci_schedstate.spc_cp_time[i];
562 		}
563 		total -= totalticks[j];
564 		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
565 		if (idle < total / 3)
566 			speedup = 1;
567 		alltotal += total;
568 		allidle += idle;
569 		idleticks[j] += idle;
570 		totalticks[j] += total;
571 		j++;
572 	}
573 	if (allidle < alltotal / 2)
574 		speedup = 1;
575 	if (speedup && downbeats < 5)
576 		downbeats++;
577 
578 	if (speedup && perflevel != 100) {
579 faster:
580 		perflevel = 100;
581 		cpu_setperf(perflevel);
582 	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
583 		perflevel = 0;
584 		cpu_setperf(perflevel);
585 	}
586 
587 	timeout_add_msec(&setperf_to, 100);
588 }
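
/*
 * Summary of the heuristic above: every 100 ms the hook asks for full
 * speed if any single CPU was idle for less than a third of the
 * interval, if the machine as a whole was idle for less than half of
 * it, or unconditionally while external power is present (hw_power).
 * "downbeats" acts as a hold-off in the other direction: up to five
 * consecutive idle beats (roughly 500 ms) must pass before perflevel
 * is dropped back to 0.
 */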
589 
590 int
591 sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
592 {
593 	int err;
594 
595 	if (!cpu_setperf)
596 		return EOPNOTSUPP;
597 
598 	if (perfpolicy != PERFPOL_MANUAL)
599 		return sysctl_rdint(oldp, oldlenp, newp, perflevel);
600 
601 	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
602 	    &perflevel, 0, 100);
603 	if (err)
604 		return err;
605 
606 	if (newp != NULL)
607 		cpu_setperf(perflevel);
608 
609 	return 0;
610 }
611 
612 int
613 sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
614 {
615 	char policy[32];
616 	int err;
617 
618 	if (!cpu_setperf)
619 		return EOPNOTSUPP;
620 
621 	switch (perfpolicy) {
622 	case PERFPOL_MANUAL:
623 		strlcpy(policy, "manual", sizeof(policy));
624 		break;
625 	case PERFPOL_AUTO:
626 		strlcpy(policy, "auto", sizeof(policy));
627 		break;
628 	case PERFPOL_HIGH:
629 		strlcpy(policy, "high", sizeof(policy));
630 		break;
631 	default:
632 		strlcpy(policy, "unknown", sizeof(policy));
633 		break;
634 	}
635 
636 	if (newp == NULL)
637 		return sysctl_rdstring(oldp, oldlenp, newp, policy);
638 
639 	err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
640 	if (err)
641 		return err;
642 	if (strcmp(policy, "manual") == 0)
643 		perfpolicy = PERFPOL_MANUAL;
644 	else if (strcmp(policy, "auto") == 0)
645 		perfpolicy = PERFPOL_AUTO;
646 	else if (strcmp(policy, "high") == 0)
647 		perfpolicy = PERFPOL_HIGH;
648 	else
649 		return EINVAL;
650 
651 	if (perfpolicy == PERFPOL_AUTO) {
652 		timeout_add_msec(&setperf_to, 200);
653 	} else if (perfpolicy == PERFPOL_HIGH) {
654 		perflevel = 100;
655 		cpu_setperf(perflevel);
656 	}
657 	return 0;
658 }
659 #endif
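
/*
 * The two handlers above are what sit behind the hw.setperf and
 * hw.perfpolicy sysctl variables (assuming the usual wiring in the hw
 * sysctl tree); e.g. "sysctl hw.perfpolicy=auto" re-arms the setperf_to
 * timeout, while hw.setperf only accepts new values once the policy
 * has been switched to "manual".
 */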
660 
661 void
662 scheduler_start(void)
663 {
664 	static struct timeout schedcpu_to;
665 
666 	/*
667 	 * We avoid polluting the global namespace by keeping the scheduler
668 	 * timeouts static in this function.
669 	 * We set up the timeout here and kick schedcpu once to make it do
670 	 * its job.
671 	 */
672 	timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
673 
674 	rrticks_init = hz / 10;
675 	schedcpu(&schedcpu_to);
676 
677 #ifndef SMALL_KERNEL
678 	if (perfpolicy == PERFPOL_AUTO)
679 		timeout_add_msec(&setperf_to, 200);
680 #endif
681 }
682 
683