xref: /illumos-gate/usr/src/uts/common/os/cpu_pm.c (revision 0ed5c46e)
10e751525SEric Saxe /*
20e751525SEric Saxe  * CDDL HEADER START
30e751525SEric Saxe  *
40e751525SEric Saxe  * The contents of this file are subject to the terms of the
50e751525SEric Saxe  * Common Development and Distribution License (the "License").
60e751525SEric Saxe  * You may not use this file except in compliance with the License.
70e751525SEric Saxe  *
80e751525SEric Saxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90e751525SEric Saxe  * or http://www.opensolaris.org/os/licensing.
100e751525SEric Saxe  * See the License for the specific language governing permissions
110e751525SEric Saxe  * and limitations under the License.
120e751525SEric Saxe  *
130e751525SEric Saxe  * When distributing Covered Code, include this CDDL HEADER in each
140e751525SEric Saxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150e751525SEric Saxe  * If applicable, add the following below this CDDL HEADER, with the
160e751525SEric Saxe  * fields enclosed by brackets "[]" replaced with your own identifying
170e751525SEric Saxe  * information: Portions Copyright [yyyy] [name of copyright owner]
180e751525SEric Saxe  *
190e751525SEric Saxe  * CDDL HEADER END
200e751525SEric Saxe  */
210e751525SEric Saxe /*
220e751525SEric Saxe  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
230e751525SEric Saxe  * Use is subject to license terms.
240e751525SEric Saxe  */
250e751525SEric Saxe 
260e751525SEric Saxe #include <sys/cpu_pm.h>
270e751525SEric Saxe #include <sys/cmn_err.h>
28113b131bSEric Saxe #include <sys/time.h>
290e751525SEric Saxe #include <sys/sdt.h>
300e751525SEric Saxe 
/*
 * Solaris Event Based CPU Power Manager
 *
 * This file implements platform independent event based CPU power management.
 * When CPUs are configured into the system, the CMT scheduling subsystem will
 * query the platform to determine if the CPU belongs to any power management
 * domains. That is, sets of CPUs that share power management states.
 *
 * Active Power Management domains represent a group of CPUs across which the
 * Operating System can request speed changes (which may in turn result
 * in voltage changes). This allows the operating system to trade off
 * performance for power savings.
 *
 * Idle Power Management domains can enter power savings states when they are
 * unutilized. These states allow the Operating System to trade off power
 * for performance (in the form of latency to transition from the idle state
 * to an active one).
 *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * the domain's power/performance state be changed.
 *
 * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
 * manager will request the CPUs in the domain run at their fastest (and most
 * power consuming) state. When the domain becomes idle (utilization at zero),
 * the power manager will request that the CPUs run at a speed that saves the
 * most power.
 *
 * The advantage of this scheme is that the CPU power manager working with the
 * dispatcher can be extremely responsive to changes in utilization, optimizing
 * for performance in the presence of utilization, and power savings in the
 * presence of idleness. Such close collaboration with the dispatcher has other
 * benefits that will play out in the form of more sophisticated power /
 * performance policy in the near future.
 *
 * Avoiding state thrashing in the presence of transient periods of utilization
 * and idleness while still being responsive to non-transient periods is key.
 * The power manager implements a "governor" that is used to throttle
 * state transitions when a significant amount of transient idle or transient
 * work is detected.
 *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
 *
 * Transient idleness is common when a thread briefly yields the CPU to
 * wait for an event elsewhere in the system. Where the idle period is short
 * enough, the overhead associated with making the state transition doesn't
 * justify the power savings.
 *
 * The following is the state machine for the governor implemented by
 * cpupm_utilization_event():
 *
 *         ----->---tw---->-----
 *        /                     \
 *      (I)-<-ti-<-     -<-ntw-<(W)
 *       |         \   /         |
 *       \          \ /          /
 *        >-nti/rm->(D)--->-tw->-
 * Key:
 *
 * States
 * - (D): Default (ungoverned)
 * - (W): Transient work governed
 * - (I): Transient idle governed
 * State Transitions
 * - tw: transient work
 * - ti: transient idleness
 * - ntw: non-transient work
 * - nti: non-transient idleness
 * - rm: thread remain event
 */
1080e751525SEric Saxe 
1090e751525SEric Saxe static cpupm_domain_t *cpupm_domains = NULL;
1100e751525SEric Saxe 
1110e751525SEric Saxe /*
1120e751525SEric Saxe  * Uninitialized state of CPU power management is disabled
1130e751525SEric Saxe  */
1140e751525SEric Saxe cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
1150e751525SEric Saxe 
1160e751525SEric Saxe /*
1170e751525SEric Saxe  * Periods of utilization lasting less than this time interval are characterized
1180e751525SEric Saxe  * as transient. State changes associated with transient work are considered
1190e751525SEric Saxe  * to be mispredicted. That is, it's not worth raising and lower power states
1200e751525SEric Saxe  * where the utilization lasts for less than this interval.
1210e751525SEric Saxe  */
1220e751525SEric Saxe hrtime_t cpupm_tw_predict_interval;
1230e751525SEric Saxe 
1240e751525SEric Saxe /*
1250e751525SEric Saxe  * Periods of idleness lasting less than this time interval are characterized
1260e751525SEric Saxe  * as transient. State changes associated with transient idle are considered
1270e751525SEric Saxe  * to be mispredicted. That is, it's not worth lowering and raising power
1280e751525SEric Saxe  * states where the idleness lasts for less than this interval.
1290e751525SEric Saxe  */
1300e751525SEric Saxe hrtime_t cpupm_ti_predict_interval;
1310e751525SEric Saxe 
1320e751525SEric Saxe /*
1330e751525SEric Saxe  * Number of mispredictions after which future transitions will be governed.
1340e751525SEric Saxe  */
135113b131bSEric Saxe int cpupm_mispredict_thresh = 4;
1360e751525SEric Saxe 
1370e751525SEric Saxe /*
1380e751525SEric Saxe  * Likewise, the number of mispredicted governed transitions after which the
1390e751525SEric Saxe  * governor will be removed.
1400e751525SEric Saxe  */
141113b131bSEric Saxe int cpupm_mispredict_gov_thresh = 4;
1420e751525SEric Saxe 
1430e751525SEric Saxe /*
144113b131bSEric Saxe  * The transient work and transient idle prediction intervals are specified
145113b131bSEric Saxe  * here. Tuning them higher will result in the transient work, and transient
146113b131bSEric Saxe  * idle governors being used more aggresively, which limits the frequency of
147113b131bSEric Saxe  * state transitions at the expense of performance and power savings,
148113b131bSEric Saxe  * respectively. The intervals are specified in nanoseconds.
1490e751525SEric Saxe  */
1500e751525SEric Saxe /*
151113b131bSEric Saxe  * 400 usec
1520e751525SEric Saxe  */
153113b131bSEric Saxe #define	CPUPM_DEFAULT_TI_INTERVAL	400000
154113b131bSEric Saxe /*
155113b131bSEric Saxe  * 400 usec
156113b131bSEric Saxe  */
157113b131bSEric Saxe #define	CPUPM_DEFAULT_TW_INTERVAL	400000
1580e751525SEric Saxe 
159113b131bSEric Saxe hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
160113b131bSEric Saxe hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
1610e751525SEric Saxe 
1620e751525SEric Saxe 
163113b131bSEric Saxe static void	cpupm_governor_initialize(void);
1640e751525SEric Saxe static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
1650e751525SEric Saxe 
1660e751525SEric Saxe cpupm_policy_t
cpupm_get_policy(void)1670e751525SEric Saxe cpupm_get_policy(void)
1680e751525SEric Saxe {
1690e751525SEric Saxe 	return (cpupm_policy);
1700e751525SEric Saxe }
1710e751525SEric Saxe 
1720e751525SEric Saxe int
cpupm_set_policy(cpupm_policy_t new_policy)1730e751525SEric Saxe cpupm_set_policy(cpupm_policy_t new_policy)
1740e751525SEric Saxe {
1750e751525SEric Saxe 	static int	gov_init = 0;
1760e751525SEric Saxe 	int		result = 0;
1770e751525SEric Saxe 
1780e751525SEric Saxe 	mutex_enter(&cpu_lock);
1790e751525SEric Saxe 	if (new_policy == cpupm_policy) {
1800e751525SEric Saxe 		mutex_exit(&cpu_lock);
1810e751525SEric Saxe 		return (result);
1820e751525SEric Saxe 	}
1830e751525SEric Saxe 
1840e751525SEric Saxe 	/*
1850e751525SEric Saxe 	 * Pausing CPUs causes a high priority thread to be scheduled
1860e751525SEric Saxe 	 * on all other CPUs (besides the current one). This locks out
1870e751525SEric Saxe 	 * other CPUs from making CPUPM state transitions.
1880e751525SEric Saxe 	 */
1890e751525SEric Saxe 	switch (new_policy) {
1900e751525SEric Saxe 	case CPUPM_POLICY_DISABLED:
191*0ed5c46eSJosef 'Jeff' Sipek 		pause_cpus(NULL, NULL);
1920e751525SEric Saxe 		cpupm_policy = CPUPM_POLICY_DISABLED;
1930e751525SEric Saxe 		start_cpus();
1940e751525SEric Saxe 
1950e751525SEric Saxe 		result = cmt_pad_disable(PGHW_POW_ACTIVE);
1960e751525SEric Saxe 
1970e751525SEric Saxe 		/*
1980e751525SEric Saxe 		 * Once PAD has been enabled, it should always be possible
1990e751525SEric Saxe 		 * to disable it.
2000e751525SEric Saxe 		 */
2010e751525SEric Saxe 		ASSERT(result == 0);
2020e751525SEric Saxe 
2030e751525SEric Saxe 		/*
2040e751525SEric Saxe 		 * Bring all the active power domains to the maximum
2050e751525SEric Saxe 		 * performance state.
2060e751525SEric Saxe 		 */
2070e751525SEric Saxe 		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
2080e751525SEric Saxe 		    CPUPM_STATE_MAX_PERF);
2090e751525SEric Saxe 
2100e751525SEric Saxe 		break;
2110e751525SEric Saxe 	case CPUPM_POLICY_ELASTIC:
2120e751525SEric Saxe 
2130e751525SEric Saxe 		result = cmt_pad_enable(PGHW_POW_ACTIVE);
2140e751525SEric Saxe 		if (result < 0) {
2150e751525SEric Saxe 			/*
2160e751525SEric Saxe 			 * Failed to enable PAD across the active power
2170e751525SEric Saxe 			 * domains, which may well be because none were
2180e751525SEric Saxe 			 * enumerated.
2190e751525SEric Saxe 			 */
2200e751525SEric Saxe 			break;
2210e751525SEric Saxe 		}
2220e751525SEric Saxe 
2230e751525SEric Saxe 		/*
224113b131bSEric Saxe 		 * Initialize the governor parameters the first time through.
2250e751525SEric Saxe 		 */
2260e751525SEric Saxe 		if (gov_init == 0) {
227113b131bSEric Saxe 			cpupm_governor_initialize();
2280e751525SEric Saxe 			gov_init = 1;
2290e751525SEric Saxe 		}
230113b131bSEric Saxe 
231*0ed5c46eSJosef 'Jeff' Sipek 		pause_cpus(NULL, NULL);
2320e751525SEric Saxe 		cpupm_policy = CPUPM_POLICY_ELASTIC;
2330e751525SEric Saxe 		start_cpus();
2340e751525SEric Saxe 
2350e751525SEric Saxe 		break;
2360e751525SEric Saxe 	default:
2370e751525SEric Saxe 		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
2380e751525SEric Saxe 		    new_policy);
2390e751525SEric Saxe 		ASSERT(0);
2400e751525SEric Saxe 		break;
2410e751525SEric Saxe 	}
2420e751525SEric Saxe 	mutex_exit(&cpu_lock);
2430e751525SEric Saxe 
2440e751525SEric Saxe 	return (result);
2450e751525SEric Saxe }
2460e751525SEric Saxe 
2470e751525SEric Saxe /*
2480e751525SEric Saxe  * Look for an existing power domain
2490e751525SEric Saxe  */
2500e751525SEric Saxe static cpupm_domain_t *
cpupm_domain_find(id_t id,cpupm_dtype_t type)2510e751525SEric Saxe cpupm_domain_find(id_t id, cpupm_dtype_t type)
2520e751525SEric Saxe {
2530e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
2540e751525SEric Saxe 
2550e751525SEric Saxe 	cpupm_domain_t *dom;
2560e751525SEric Saxe 
2570e751525SEric Saxe 	dom = cpupm_domains;
2580e751525SEric Saxe 	while (dom != NULL) {
2590e751525SEric Saxe 		if (id == dom->cpd_id && type == dom->cpd_type)
2600e751525SEric Saxe 			return (dom);
2610e751525SEric Saxe 		dom = dom->cpd_next;
2620e751525SEric Saxe 	}
2630e751525SEric Saxe 	return (NULL);
2640e751525SEric Saxe }
2650e751525SEric Saxe 
2660e751525SEric Saxe /*
2670e751525SEric Saxe  * Create a new domain
2680e751525SEric Saxe  */
2690e751525SEric Saxe static cpupm_domain_t *
cpupm_domain_create(id_t id,cpupm_dtype_t type)2700e751525SEric Saxe cpupm_domain_create(id_t id, cpupm_dtype_t type)
2710e751525SEric Saxe {
2720e751525SEric Saxe 	cpupm_domain_t *dom;
2730e751525SEric Saxe 
2740e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
2750e751525SEric Saxe 
2760e751525SEric Saxe 	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
2770e751525SEric Saxe 	dom->cpd_id = id;
2780e751525SEric Saxe 	dom->cpd_type = type;
2790e751525SEric Saxe 
2800e751525SEric Saxe 	/* Link into the known domain list */
2810e751525SEric Saxe 	dom->cpd_next = cpupm_domains;
2820e751525SEric Saxe 	cpupm_domains = dom;
2830e751525SEric Saxe 
2840e751525SEric Saxe 	return (dom);
2850e751525SEric Saxe }
2860e751525SEric Saxe 
2870e751525SEric Saxe static void
cpupm_domain_state_enum(struct cpu * cp,cpupm_domain_t * dom)2880e751525SEric Saxe cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
2890e751525SEric Saxe {
2900e751525SEric Saxe 	/*
2910e751525SEric Saxe 	 * In the envent we're enumerating because the domain's state
2920e751525SEric Saxe 	 * configuration has changed, toss any existing states.
2930e751525SEric Saxe 	 */
2940e751525SEric Saxe 	if (dom->cpd_nstates > 0) {
2950e751525SEric Saxe 		kmem_free(dom->cpd_states,
2960e751525SEric Saxe 		    sizeof (cpupm_state_t) * dom->cpd_nstates);
2970e751525SEric Saxe 		dom->cpd_nstates = 0;
2980e751525SEric Saxe 	}
2990e751525SEric Saxe 
3000e751525SEric Saxe 	/*
3010e751525SEric Saxe 	 * Query to determine the number of states, allocate storage
3020e751525SEric Saxe 	 * large enough to hold the state information, and pass it back
3030e751525SEric Saxe 	 * to the platform driver to complete the enumeration.
3040e751525SEric Saxe 	 */
3050e751525SEric Saxe 	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
3060e751525SEric Saxe 
3070e751525SEric Saxe 	if (dom->cpd_nstates == 0)
3080e751525SEric Saxe 		return;
3090e751525SEric Saxe 
3100e751525SEric Saxe 	dom->cpd_states =
3110e751525SEric Saxe 	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
3120e751525SEric Saxe 	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
3130e751525SEric Saxe }
3140e751525SEric Saxe 
3150e751525SEric Saxe /*
3160e751525SEric Saxe  * Initialize the specified type of power domain on behalf of the CPU
3170e751525SEric Saxe  */
3180e751525SEric Saxe cpupm_domain_t *
cpupm_domain_init(struct cpu * cp,cpupm_dtype_t type)3190e751525SEric Saxe cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
3200e751525SEric Saxe {
3210e751525SEric Saxe 	cpupm_domain_t	*dom;
3220e751525SEric Saxe 	id_t		did;
3230e751525SEric Saxe 
3240e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
3250e751525SEric Saxe 
3260e751525SEric Saxe 	/*
3270e751525SEric Saxe 	 * Instantiate the domain if it doesn't already exist
3280e751525SEric Saxe 	 * and enumerate its power states.
3290e751525SEric Saxe 	 */
3300e751525SEric Saxe 	did = cpupm_domain_id(cp, type);
3310e751525SEric Saxe 	dom = cpupm_domain_find(did, type);
3320e751525SEric Saxe 	if (dom == NULL) {
3330e751525SEric Saxe 		dom = cpupm_domain_create(did, type);
3340e751525SEric Saxe 		cpupm_domain_state_enum(cp, dom);
3350e751525SEric Saxe 	}
3360e751525SEric Saxe 
3370e751525SEric Saxe 	/*
3380e751525SEric Saxe 	 * Named state initialization
3390e751525SEric Saxe 	 */
3400e751525SEric Saxe 	if (type == CPUPM_DTYPE_ACTIVE) {
3410e751525SEric Saxe 		/*
3420e751525SEric Saxe 		 * For active power domains, the highest performance
3430e751525SEric Saxe 		 * state is defined as first state returned from
3440e751525SEric Saxe 		 * the domain enumeration.
3450e751525SEric Saxe 		 */
3460e751525SEric Saxe 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
3470e751525SEric Saxe 		    &dom->cpd_states[0];
3480e751525SEric Saxe 		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
3490e751525SEric Saxe 		    &dom->cpd_states[dom->cpd_nstates - 1];
3500e751525SEric Saxe 
3510e751525SEric Saxe 		/*
3520e751525SEric Saxe 		 * Begin by assuming CPU is running at the max perf state.
3530e751525SEric Saxe 		 */
3540e751525SEric Saxe 		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
3550e751525SEric Saxe 	}
3560e751525SEric Saxe 
3570e751525SEric Saxe 	return (dom);
3580e751525SEric Saxe }
3590e751525SEric Saxe 
3600e751525SEric Saxe /*
3610e751525SEric Saxe  * Return the id associated with the given type of domain
3620e751525SEric Saxe  * to which cp belongs
3630e751525SEric Saxe  */
3640e751525SEric Saxe id_t
cpupm_domain_id(struct cpu * cp,cpupm_dtype_t type)3650e751525SEric Saxe cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
3660e751525SEric Saxe {
3670e751525SEric Saxe 	return (cpupm_plat_domain_id(cp, type));
3680e751525SEric Saxe }
3690e751525SEric Saxe 
3700e751525SEric Saxe /*
3710e751525SEric Saxe  * Initiate a state change for the specified domain on behalf of cp
3720e751525SEric Saxe  */
3730e751525SEric Saxe int
cpupm_change_state(struct cpu * cp,cpupm_domain_t * dom,cpupm_state_t * state)3740e751525SEric Saxe cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
3750e751525SEric Saxe {
3760e751525SEric Saxe 	if (cpupm_plat_change_state(cp, state) < 0)
3770e751525SEric Saxe 		return (-1);
3780e751525SEric Saxe 
3790e751525SEric Saxe 	DTRACE_PROBE2(cpupm__change__state,
3800e751525SEric Saxe 	    cpupm_domain_t *, dom,
3810e751525SEric Saxe 	    cpupm_state_t *, state);
3820e751525SEric Saxe 
3830e751525SEric Saxe 	dom->cpd_state = state;
3840e751525SEric Saxe 	return (0);
3850e751525SEric Saxe }
3860e751525SEric Saxe 
3870e751525SEric Saxe /*
3880e751525SEric Saxe  * Interface into the CPU power manager to indicate a significant change
3890e751525SEric Saxe  * in utilization of the specified active power domain
3900e751525SEric Saxe  */
3910e751525SEric Saxe void
cpupm_utilization_event(struct cpu * cp,hrtime_t now,cpupm_domain_t * dom,cpupm_util_event_t event)3920e751525SEric Saxe cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
3930e751525SEric Saxe 			    cpupm_util_event_t event)
3940e751525SEric Saxe {
3950e751525SEric Saxe 	cpupm_state_t	*new_state = NULL;
3960e751525SEric Saxe 	hrtime_t	last;
3970e751525SEric Saxe 
3980e751525SEric Saxe 	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
3990e751525SEric Saxe 		return;
4000e751525SEric Saxe 	}
4010e751525SEric Saxe 
4020e751525SEric Saxe 	/*
4030e751525SEric Saxe 	 * What follows is a simple elastic power state management policy.
4040e751525SEric Saxe 	 *
4050e751525SEric Saxe 	 * If the utilization has become non-zero, and the domain was
4060e751525SEric Saxe 	 * previously at it's lowest power state, then transition it
4070e751525SEric Saxe 	 * to the highest state in the spirit of "race to idle".
4080e751525SEric Saxe 	 *
4090e751525SEric Saxe 	 * If the utilization has dropped to zero, then transition the
4100e751525SEric Saxe 	 * domain to its lowest power state.
4110e751525SEric Saxe 	 *
412113b131bSEric Saxe 	 * Statistics are maintained to implement a governor to reduce state
4130e751525SEric Saxe 	 * transitions resulting from either transient work, or periods of
4140e751525SEric Saxe 	 * transient idleness on the domain.
4150e751525SEric Saxe 	 */
4160e751525SEric Saxe 	switch (event) {
4170e751525SEric Saxe 	case CPUPM_DOM_REMAIN_BUSY:
4180e751525SEric Saxe 
4190e751525SEric Saxe 		/*
4200e751525SEric Saxe 		 * We've received an event that the domain is running a thread
4210e751525SEric Saxe 		 * that's made it to the end of it's time slice. If we are at
4220e751525SEric Saxe 		 * low power, then raise it. If the transient work governor
4230e751525SEric Saxe 		 * is engaged, then remove it.
4240e751525SEric Saxe 		 */
4250e751525SEric Saxe 		if (dom->cpd_state ==
4260e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
4270e751525SEric Saxe 			new_state =
4280e751525SEric Saxe 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
429113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
430113b131bSEric Saxe 				dom->cpd_governor = CPUPM_GOV_DISENGAGED;
4310e751525SEric Saxe 				dom->cpd_tw = 0;
4320e751525SEric Saxe 			}
4330e751525SEric Saxe 		}
4340e751525SEric Saxe 		break;
4350e751525SEric Saxe 
4360e751525SEric Saxe 	case CPUPM_DOM_BUSY_FROM_IDLE:
4370e751525SEric Saxe 		last = dom->cpd_last_lower;
4380e751525SEric Saxe 		dom->cpd_last_raise = now;
4390e751525SEric Saxe 
4400e751525SEric Saxe 		DTRACE_PROBE3(cpupm__raise__req,
4410e751525SEric Saxe 		    cpupm_domain_t *, dom,
4420e751525SEric Saxe 		    hrtime_t, last,
4430e751525SEric Saxe 		    hrtime_t, now);
4440e751525SEric Saxe 
4450e751525SEric Saxe 		if (dom->cpd_state ==
4460e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
4470e751525SEric Saxe 
4480e751525SEric Saxe 			/*
4490e751525SEric Saxe 			 * There's non-zero utilization, and the domain is
4500e751525SEric Saxe 			 * running in the lower power state. Before we
451113b131bSEric Saxe 			 * consider raising power, check if the preceeding
452113b131bSEric Saxe 			 * idle period was transient in duration.
453113b131bSEric Saxe 			 *
454113b131bSEric Saxe 			 * If the domain is already transient work governed,
455113b131bSEric Saxe 			 * then we don't bother maintaining transient idle
456113b131bSEric Saxe 			 * statistics, as the presence of enough transient work
457113b131bSEric Saxe 			 * can also make the domain frequently transiently idle.
458113b131bSEric Saxe 			 * In this case, we still want to remain transient work
459113b131bSEric Saxe 			 * governed.
4600e751525SEric Saxe 			 */
461113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
4620e751525SEric Saxe 				if ((now - last) < cpupm_ti_predict_interval) {
4630e751525SEric Saxe 					/*
4640e751525SEric Saxe 					 * We're raising the domain power and
4650e751525SEric Saxe 					 * we *just* lowered it. Consider
4660e751525SEric Saxe 					 * this a mispredicted power state
4670e751525SEric Saxe 					 * transition due to a transient
4680e751525SEric Saxe 					 * idle period.
4690e751525SEric Saxe 					 */
470113b131bSEric Saxe 					if (++dom->cpd_ti >=
4710e751525SEric Saxe 					    cpupm_mispredict_thresh) {
4720e751525SEric Saxe 						/*
4730e751525SEric Saxe 						 * There's enough transient
4740e751525SEric Saxe 						 * idle transitions to
4750e751525SEric Saxe 						 * justify governing future
4760e751525SEric Saxe 						 * lowering requests.
4770e751525SEric Saxe 						 */
478113b131bSEric Saxe 						dom->cpd_governor =
479113b131bSEric Saxe 						    CPUPM_GOV_TRANS_IDLE;
4800e751525SEric Saxe 						dom->cpd_ti = 0;
4810e751525SEric Saxe 						DTRACE_PROBE1(
4820e751525SEric Saxe 						    cpupm__ti__governed,
4830e751525SEric Saxe 						    cpupm_domain_t *, dom);
4840e751525SEric Saxe 					}
4850e751525SEric Saxe 				} else {
4860e751525SEric Saxe 					/*
4870e751525SEric Saxe 					 * We correctly predicted the last
4880e751525SEric Saxe 					 * lowering.
4890e751525SEric Saxe 					 */
4900e751525SEric Saxe 					dom->cpd_ti = 0;
4910e751525SEric Saxe 				}
4920e751525SEric Saxe 			}
493113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
4940e751525SEric Saxe 				/*
4950e751525SEric Saxe 				 * Raise requests are governed due to
4960e751525SEric Saxe 				 * transient work.
4970e751525SEric Saxe 				 */
4980e751525SEric Saxe 				DTRACE_PROBE1(cpupm__raise__governed,
4990e751525SEric Saxe 				    cpupm_domain_t *, dom);
5000e751525SEric Saxe 
5010e751525SEric Saxe 				return;
5020e751525SEric Saxe 			}
5030e751525SEric Saxe 			/*
5040e751525SEric Saxe 			 * Prepare to transition to the higher power state
5050e751525SEric Saxe 			 */
5060e751525SEric Saxe 			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
5070e751525SEric Saxe 
5080e751525SEric Saxe 		} else if (dom->cpd_state ==
5090e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
5100e751525SEric Saxe 
5110e751525SEric Saxe 			/*
5120e751525SEric Saxe 			 * Utilization is non-zero, and we're already running
5130e751525SEric Saxe 			 * in the higher power state. Take this opportunity to
5140e751525SEric Saxe 			 * perform some book keeping if the last lowering
5150e751525SEric Saxe 			 * request was governed.
5160e751525SEric Saxe 			 */
517113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
518113b131bSEric Saxe 
5190e751525SEric Saxe 				if ((now - last) >= cpupm_ti_predict_interval) {
5200e751525SEric Saxe 					/*
5210e751525SEric Saxe 					 * The domain is transient idle
5220e751525SEric Saxe 					 * governed, and we mispredicted
5230e751525SEric Saxe 					 * governing the last lowering request.
5240e751525SEric Saxe 					 */
5250e751525SEric Saxe 					if (++dom->cpd_ti >=
5260e751525SEric Saxe 					    cpupm_mispredict_gov_thresh) {
5270e751525SEric Saxe 						/*
5280e751525SEric Saxe 						 * There's enough non-transient
5290e751525SEric Saxe 						 * idle periods to justify
5300e751525SEric Saxe 						 * removing the governor.
5310e751525SEric Saxe 						 */
532113b131bSEric Saxe 						dom->cpd_governor =
533113b131bSEric Saxe 						    CPUPM_GOV_DISENGAGED;
5340e751525SEric Saxe 						dom->cpd_ti = 0;
5350e751525SEric Saxe 						DTRACE_PROBE1(
5360e751525SEric Saxe 						    cpupm__ti__ungoverned,
5370e751525SEric Saxe 						    cpupm_domain_t *, dom);
5380e751525SEric Saxe 					}
5390e751525SEric Saxe 				} else {
5400e751525SEric Saxe 					/*
5410e751525SEric Saxe 					 * Correctly predicted governing the
5420e751525SEric Saxe 					 * last lowering request.
5430e751525SEric Saxe 					 */
5440e751525SEric Saxe 					dom->cpd_ti = 0;
5450e751525SEric Saxe 				}
5460e751525SEric Saxe 			}
5470e751525SEric Saxe 		}
5480e751525SEric Saxe 		break;
5490e751525SEric Saxe 
5500e751525SEric Saxe 	case CPUPM_DOM_IDLE_FROM_BUSY:
5510e751525SEric Saxe 		last = dom->cpd_last_raise;
5520e751525SEric Saxe 		dom->cpd_last_lower = now;
5530e751525SEric Saxe 
5540e751525SEric Saxe 		DTRACE_PROBE3(cpupm__lower__req,
5550e751525SEric Saxe 		    cpupm_domain_t *, dom,
5560e751525SEric Saxe 		    hrtime_t, last,
5570e751525SEric Saxe 		    hrtime_t, now);
5580e751525SEric Saxe 
5590e751525SEric Saxe 		if (dom->cpd_state ==
5600e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
5610e751525SEric Saxe 
5620e751525SEric Saxe 			/*
5630e751525SEric Saxe 			 * The domain is idle, and is running in the highest
5640e751525SEric Saxe 			 * performance state. Before we consider lowering power,
5650e751525SEric Saxe 			 * perform some book keeping for the transient work
5660e751525SEric Saxe 			 * governor.
5670e751525SEric Saxe 			 */
568113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
5690e751525SEric Saxe 				if ((now - last) < cpupm_tw_predict_interval) {
5700e751525SEric Saxe 					/*
5710e751525SEric Saxe 					 * We're lowering the domain power and
5720e751525SEric Saxe 					 * we *just* raised it. Consider the
5730e751525SEric Saxe 					 * last raise mispredicted due to
5740e751525SEric Saxe 					 * transient work.
5750e751525SEric Saxe 					 */
5760e751525SEric Saxe 					if (++dom->cpd_tw >=
5770e751525SEric Saxe 					    cpupm_mispredict_thresh) {
5780e751525SEric Saxe 						/*
579113b131bSEric Saxe 						 * There's enough transient work
5800e751525SEric Saxe 						 * transitions to justify
581113b131bSEric Saxe 						 * governing future raise
5820e751525SEric Saxe 						 * requests.
5830e751525SEric Saxe 						 */
584113b131bSEric Saxe 						dom->cpd_governor =
585113b131bSEric Saxe 						    CPUPM_GOV_TRANS_WORK;
5860e751525SEric Saxe 						dom->cpd_tw = 0;
5870e751525SEric Saxe 						DTRACE_PROBE1(
5880e751525SEric Saxe 						    cpupm__tw__governed,
5890e751525SEric Saxe 						    cpupm_domain_t *, dom);
5900e751525SEric Saxe 					}
5910e751525SEric Saxe 				} else {
5920e751525SEric Saxe 					/*
5930e751525SEric Saxe 					 * We correctly predicted during the
5940e751525SEric Saxe 					 * last raise.
5950e751525SEric Saxe 					 */
5960e751525SEric Saxe 					dom->cpd_tw = 0;
5970e751525SEric Saxe 				}
5980e751525SEric Saxe 			}
599113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
6000e751525SEric Saxe 				/*
6010e751525SEric Saxe 				 * Lowering requests are governed due to
6020e751525SEric Saxe 				 * transient idleness.
6030e751525SEric Saxe 				 */
6040e751525SEric Saxe 				DTRACE_PROBE1(cpupm__lowering__governed,
6050e751525SEric Saxe 				    cpupm_domain_t *, dom);
6060e751525SEric Saxe 
6070e751525SEric Saxe 				return;
6080e751525SEric Saxe 			}
6090e751525SEric Saxe 
6100e751525SEric Saxe 			/*
6110e751525SEric Saxe 			 * Prepare to transition to a lower power state.
6120e751525SEric Saxe 			 */
6130e751525SEric Saxe 			new_state =
6140e751525SEric Saxe 			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
6150e751525SEric Saxe 
6160e751525SEric Saxe 		} else if (dom->cpd_state ==
6170e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
6180e751525SEric Saxe 
6190e751525SEric Saxe 			/*
6200e751525SEric Saxe 			 * The domain is idle, and we're already running in
6210e751525SEric Saxe 			 * the lower power state. Take this opportunity to
6220e751525SEric Saxe 			 * perform some book keeping if the last raising
6230e751525SEric Saxe 			 * request was governed.
6240e751525SEric Saxe 			 */
625113b131bSEric Saxe 			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
6260e751525SEric Saxe 				if ((now - last) >= cpupm_tw_predict_interval) {
6270e751525SEric Saxe 					/*
6280e751525SEric Saxe 					 * The domain is transient work
6290e751525SEric Saxe 					 * governed, and we mispredicted
6300e751525SEric Saxe 					 * governing the last raising request.
6310e751525SEric Saxe 					 */
6320e751525SEric Saxe 					if (++dom->cpd_tw >=
6330e751525SEric Saxe 					    cpupm_mispredict_gov_thresh) {
6340e751525SEric Saxe 						/*
6350e751525SEric Saxe 						 * There's enough non-transient
6360e751525SEric Saxe 						 * work to justify removing
6370e751525SEric Saxe 						 * the governor.
6380e751525SEric Saxe 						 */
639113b131bSEric Saxe 						dom->cpd_governor =
640113b131bSEric Saxe 						    CPUPM_GOV_DISENGAGED;
6410e751525SEric Saxe 						dom->cpd_tw = 0;
6420e751525SEric Saxe 						DTRACE_PROBE1(
6430e751525SEric Saxe 						    cpupm__tw__ungoverned,
6440e751525SEric Saxe 						    cpupm_domain_t *, dom);
6450e751525SEric Saxe 					}
6460e751525SEric Saxe 				} else {
6470e751525SEric Saxe 					/*
6480e751525SEric Saxe 					 * We correctly predicted governing
6490e751525SEric Saxe 					 * the last raise.
6500e751525SEric Saxe 					 */
6510e751525SEric Saxe 					dom->cpd_tw = 0;
6520e751525SEric Saxe 				}
6530e751525SEric Saxe 			}
6540e751525SEric Saxe 		}
6550e751525SEric Saxe 		break;
6560e751525SEric Saxe 	}
6570e751525SEric Saxe 	/*
6580e751525SEric Saxe 	 * Change the power state
6590e751525SEric Saxe 	 * Not much currently done if this doesn't succeed
6600e751525SEric Saxe 	 */
6610e751525SEric Saxe 	if (new_state)
6620e751525SEric Saxe 		(void) cpupm_change_state(cp, dom, new_state);
6630e751525SEric Saxe }
6640e751525SEric Saxe 
6650e751525SEric Saxe 
6660e751525SEric Saxe /*
6670e751525SEric Saxe  * Interface called by platforms to dynamically change the
6680e751525SEric Saxe  * MAX performance cpupm state
6690e751525SEric Saxe  */
6700e751525SEric Saxe void
cpupm_redefine_max_activepwr_state(struct cpu * cp,int max_perf_level)6710e751525SEric Saxe cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
6720e751525SEric Saxe {
6730e751525SEric Saxe 	cpupm_domain_t	*dom;
6740e751525SEric Saxe 	id_t		did;
6750e751525SEric Saxe 	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
6760e751525SEric Saxe 	boolean_t	change_state = B_FALSE;
6770e751525SEric Saxe 	cpupm_state_t	*new_state = NULL;
6780e751525SEric Saxe 
6790e751525SEric Saxe 	did = cpupm_domain_id(cp, type);
680a3114836SGerry Liu 	if (MUTEX_HELD(&cpu_lock)) {
681a3114836SGerry Liu 		dom = cpupm_domain_find(did, type);
682a3114836SGerry Liu 	} else {
6830e751525SEric Saxe 		mutex_enter(&cpu_lock);
6840e751525SEric Saxe 		dom = cpupm_domain_find(did, type);
6850e751525SEric Saxe 		mutex_exit(&cpu_lock);
686a3114836SGerry Liu 	}
6870e751525SEric Saxe 
6880e751525SEric Saxe 	/*
6890e751525SEric Saxe 	 * Can use a lock to avoid changing the power state of the cpu when
6900e751525SEric Saxe 	 * CPUPM_STATE_MAX_PERF is getting changed.
6910e751525SEric Saxe 	 * Since the occurance of events to change MAX_PERF is not frequent,
6920e751525SEric Saxe 	 * it may not be a good idea to overburden with locks. In the worst
6930e751525SEric Saxe 	 * case, for one cycle the power may not get changed to the required
6940e751525SEric Saxe 	 * level
6950e751525SEric Saxe 	 */
6960e751525SEric Saxe 	if (dom != NULL) {
6970e751525SEric Saxe 		if (dom->cpd_state ==
6980e751525SEric Saxe 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
6990e751525SEric Saxe 			change_state = B_TRUE;
7000e751525SEric Saxe 		}
7010e751525SEric Saxe 
7020e751525SEric Saxe 		/*
7030e751525SEric Saxe 		 * If an out of range level is passed, use the lowest supported
7040e751525SEric Saxe 		 * speed.
7050e751525SEric Saxe 		 */
7060e751525SEric Saxe 		if (max_perf_level >= dom->cpd_nstates &&
7070e751525SEric Saxe 		    dom->cpd_nstates > 1) {
7080e751525SEric Saxe 			max_perf_level = dom->cpd_nstates - 1;
7090e751525SEric Saxe 		}
7100e751525SEric Saxe 
7110e751525SEric Saxe 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
7120e751525SEric Saxe 		    &dom->cpd_states[max_perf_level];
7130e751525SEric Saxe 
7140e751525SEric Saxe 		/*
7150e751525SEric Saxe 		 * If the current state is MAX_PERF, change the current state
7160e751525SEric Saxe 		 * to the new MAX_PERF
7170e751525SEric Saxe 		 */
7180e751525SEric Saxe 		if (change_state) {
7190e751525SEric Saxe 			new_state =
7200e751525SEric Saxe 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
7210e751525SEric Saxe 			if (new_state) {
7220e751525SEric Saxe 				(void) cpupm_change_state(cp, dom, new_state);
7230e751525SEric Saxe 			}
7240e751525SEric Saxe 		}
7250e751525SEric Saxe 	}
7260e751525SEric Saxe }
7270e751525SEric Saxe 
7280e751525SEric Saxe /*
729113b131bSEric Saxe  * Initialize the parameters for the transience governor state machine
7300e751525SEric Saxe  */
731113b131bSEric Saxe static void
cpupm_governor_initialize(void)7320e751525SEric Saxe cpupm_governor_initialize(void)
7330e751525SEric Saxe {
7340e751525SEric Saxe 	/*
735113b131bSEric Saxe 	 * The default prediction intervals are specified in nanoseconds.
736113b131bSEric Saxe 	 * Convert these to the equivalent in unscaled hrtime, which is the
737113b131bSEric Saxe 	 * format of the timestamps passed to cpupm_utilization_event()
7380e751525SEric Saxe 	 */
739113b131bSEric Saxe 	cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
740113b131bSEric Saxe 	cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
7410e751525SEric Saxe }
7420e751525SEric Saxe 
7430e751525SEric Saxe /*
7440e751525SEric Saxe  * Initiate a state change in all CPUPM domain instances of the specified type
7450e751525SEric Saxe  */
7460e751525SEric Saxe static void
cpupm_state_change_global(cpupm_dtype_t type,cpupm_state_name_t state)7470e751525SEric Saxe cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
7480e751525SEric Saxe {
7490e751525SEric Saxe 	cpu_t		*cp;
7500e751525SEric Saxe 	pg_cmt_t	*pwr_pg;
7510e751525SEric Saxe 	cpupm_domain_t	*dom;
7520e751525SEric Saxe 	group_t		*hwset;
7530e751525SEric Saxe 	group_iter_t	giter;
7540e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
7550e751525SEric Saxe 	pghw_type_t	hw;
7560e751525SEric Saxe 
7570e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
7580e751525SEric Saxe 
7590e751525SEric Saxe 	switch (type) {
7600e751525SEric Saxe 	case CPUPM_DTYPE_ACTIVE:
7610e751525SEric Saxe 		hw = PGHW_POW_ACTIVE;
7620e751525SEric Saxe 		break;
7630e751525SEric Saxe 	default:
7640e751525SEric Saxe 		/*
7650e751525SEric Saxe 		 * Power domain types other than "active" unsupported.
7660e751525SEric Saxe 		 */
7670e751525SEric Saxe 		ASSERT(type == CPUPM_DTYPE_ACTIVE);
7680e751525SEric Saxe 		return;
7690e751525SEric Saxe 	}
7700e751525SEric Saxe 
7710e751525SEric Saxe 	if ((hwset = pghw_set_lookup(hw)) == NULL)
7720e751525SEric Saxe 		return;
7730e751525SEric Saxe 
7740e751525SEric Saxe 	/*
7750e751525SEric Saxe 	 * Iterate over the power domains
7760e751525SEric Saxe 	 */
7770e751525SEric Saxe 	group_iter_init(&giter);
7780e751525SEric Saxe 	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
7790e751525SEric Saxe 
7800e751525SEric Saxe 		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
7810e751525SEric Saxe 
7820e751525SEric Saxe 		/*
7830e751525SEric Saxe 		 * Iterate over the CPUs in each domain
7840e751525SEric Saxe 		 */
7850e751525SEric Saxe 		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
7860e751525SEric Saxe 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
7870e751525SEric Saxe 			(void) cpupm_change_state(cp, dom,
7880e751525SEric Saxe 			    dom->cpd_named_states[state]);
7890e751525SEric Saxe 		}
7900e751525SEric Saxe 	}
7910e751525SEric Saxe }
792