10e751525SEric Saxe /*
20e751525SEric Saxe * CDDL HEADER START
30e751525SEric Saxe *
40e751525SEric Saxe * The contents of this file are subject to the terms of the
50e751525SEric Saxe * Common Development and Distribution License (the "License").
60e751525SEric Saxe * You may not use this file except in compliance with the License.
70e751525SEric Saxe *
80e751525SEric Saxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90e751525SEric Saxe * or http://www.opensolaris.org/os/licensing.
100e751525SEric Saxe * See the License for the specific language governing permissions
110e751525SEric Saxe * and limitations under the License.
120e751525SEric Saxe *
130e751525SEric Saxe * When distributing Covered Code, include this CDDL HEADER in each
140e751525SEric Saxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150e751525SEric Saxe * If applicable, add the following below this CDDL HEADER, with the
160e751525SEric Saxe * fields enclosed by brackets "[]" replaced with your own identifying
170e751525SEric Saxe * information: Portions Copyright [yyyy] [name of copyright owner]
180e751525SEric Saxe *
190e751525SEric Saxe * CDDL HEADER END
200e751525SEric Saxe */
210e751525SEric Saxe /*
220e751525SEric Saxe * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
230e751525SEric Saxe * Use is subject to license terms.
240e751525SEric Saxe */
250e751525SEric Saxe
260e751525SEric Saxe #include <sys/cpu_pm.h>
270e751525SEric Saxe #include <sys/cmn_err.h>
28113b131bSEric Saxe #include <sys/time.h>
290e751525SEric Saxe #include <sys/sdt.h>
300e751525SEric Saxe
310e751525SEric Saxe /*
320e751525SEric Saxe * Solaris Event Based CPU Power Manager
330e751525SEric Saxe *
340e751525SEric Saxe * This file implements platform independent event based CPU power management.
350e751525SEric Saxe * When CPUs are configured into the system, the CMT scheduling subsystem will
360e751525SEric Saxe * query the platform to determine if the CPU belongs to any power management
370e751525SEric Saxe * domains. That is, sets of CPUs that share power management states.
380e751525SEric Saxe *
390e751525SEric Saxe * Active Power Management domains represent a group of CPUs across which the
400e751525SEric Saxe * Operating System can request speed changes (which may in turn result
410e751525SEric Saxe * in voltage changes). This allows the operating system to trade off
420e751525SEric Saxe * performance for power savings.
430e751525SEric Saxe *
440e751525SEric Saxe * Idle Power Management domains can enter power savings states when they are
450e751525SEric Saxe * unutilized. These states allow the Operating System to trade off power
460e751525SEric Saxe * for performance (in the form of latency to transition from the idle state
470e751525SEric Saxe * to an active one).
480e751525SEric Saxe *
 * For each active and idle power domain the CMT subsystem instantiates, a
 * cpupm_domain_t structure is created. As the dispatcher schedules threads
 * to run on the system's CPUs, it will also track the utilization of the
 * enumerated power domains. Significant changes in utilization will result
 * in the dispatcher sending the power manager events that relate to the
 * utilization of the power domain. The power manager receives the events,
 * and in the context of the policy objectives in force, may decide to request
 * the domain's power/performance state be changed.
570e751525SEric Saxe *
580e751525SEric Saxe * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
590e751525SEric Saxe * manager will request the CPUs in the domain run at their fastest (and most
600e751525SEric Saxe * power consuming) state. When the domain becomes idle (utilization at zero),
610e751525SEric Saxe * the power manager will request that the CPUs run at a speed that saves the
620e751525SEric Saxe * most power.
630e751525SEric Saxe *
 * The advantage of this scheme is that the CPU power manager, working with
 * the dispatcher, can be extremely responsive to changes in utilization:
 * optimizing for performance in the presence of utilization, and for power
 * savings in the presence of idleness. Such close collaboration with the
 * dispatcher has other benefits that will play out in the form of more
 * sophisticated power / performance policy in the near future.
700e751525SEric Saxe *
710e751525SEric Saxe * Avoiding state thrashing in the presence of transient periods of utilization
720e751525SEric Saxe * and idleness while still being responsive to non-transient periods is key.
73113b131bSEric Saxe * The power manager implements a "governor" that is used to throttle
740e751525SEric Saxe * state transitions when a significant amount of transient idle or transient
750e751525SEric Saxe * work is detected.
760e751525SEric Saxe *
 * Kernel background activity (e.g. taskq threads) is by far the most common
 * form of transient utilization. Ungoverned in the face of this utilization,
 * hundreds of state transitions per second would result on an idle system.
800e751525SEric Saxe *
810e751525SEric Saxe * Transient idleness is common when a thread briefly yields the CPU to
820e751525SEric Saxe * wait for an event elsewhere in the system. Where the idle period is short
830e751525SEric Saxe * enough, the overhead associated with making the state transition doesn't
840e751525SEric Saxe * justify the power savings.
85113b131bSEric Saxe *
86113b131bSEric Saxe * The following is the state machine for the governor implemented by
87113b131bSEric Saxe * cpupm_utilization_event():
88113b131bSEric Saxe *
 *		    ----->---tw---->-----
 *		   /                     \
 *		(I)-<-ti-<-     -<-ntw-<(W)
 *		 |         \   /         |
 *		 \          \ /          /
 *		  >-nti/rm->(D)--->-tw->-
95113b131bSEric Saxe * Key:
96113b131bSEric Saxe *
97113b131bSEric Saxe * States
98113b131bSEric Saxe * - (D): Default (ungoverned)
99113b131bSEric Saxe * - (W): Transient work governed
100113b131bSEric Saxe * - (I): Transient idle governed
101113b131bSEric Saxe * State Transitions
102113b131bSEric Saxe * - tw: transient work
103113b131bSEric Saxe * - ti: transient idleness
104113b131bSEric Saxe * - ntw: non-transient work
105113b131bSEric Saxe * - nti: non-transient idleness
106113b131bSEric Saxe * - rm: thread remain event
1070e751525SEric Saxe */
1080e751525SEric Saxe
1090e751525SEric Saxe static cpupm_domain_t *cpupm_domains = NULL;
1100e751525SEric Saxe
1110e751525SEric Saxe /*
1120e751525SEric Saxe * Uninitialized state of CPU power management is disabled
1130e751525SEric Saxe */
1140e751525SEric Saxe cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
1150e751525SEric Saxe
1160e751525SEric Saxe /*
1170e751525SEric Saxe * Periods of utilization lasting less than this time interval are characterized
1180e751525SEric Saxe * as transient. State changes associated with transient work are considered
1190e751525SEric Saxe * to be mispredicted. That is, it's not worth raising and lower power states
1200e751525SEric Saxe * where the utilization lasts for less than this interval.
1210e751525SEric Saxe */
1220e751525SEric Saxe hrtime_t cpupm_tw_predict_interval;
1230e751525SEric Saxe
1240e751525SEric Saxe /*
1250e751525SEric Saxe * Periods of idleness lasting less than this time interval are characterized
1260e751525SEric Saxe * as transient. State changes associated with transient idle are considered
1270e751525SEric Saxe * to be mispredicted. That is, it's not worth lowering and raising power
1280e751525SEric Saxe * states where the idleness lasts for less than this interval.
1290e751525SEric Saxe */
1300e751525SEric Saxe hrtime_t cpupm_ti_predict_interval;
1310e751525SEric Saxe
1320e751525SEric Saxe /*
1330e751525SEric Saxe * Number of mispredictions after which future transitions will be governed.
1340e751525SEric Saxe */
135113b131bSEric Saxe int cpupm_mispredict_thresh = 4;
1360e751525SEric Saxe
1370e751525SEric Saxe /*
1380e751525SEric Saxe * Likewise, the number of mispredicted governed transitions after which the
1390e751525SEric Saxe * governor will be removed.
1400e751525SEric Saxe */
141113b131bSEric Saxe int cpupm_mispredict_gov_thresh = 4;
1420e751525SEric Saxe
1430e751525SEric Saxe /*
144113b131bSEric Saxe * The transient work and transient idle prediction intervals are specified
145113b131bSEric Saxe * here. Tuning them higher will result in the transient work, and transient
146113b131bSEric Saxe * idle governors being used more aggresively, which limits the frequency of
147113b131bSEric Saxe * state transitions at the expense of performance and power savings,
148113b131bSEric Saxe * respectively. The intervals are specified in nanoseconds.
1490e751525SEric Saxe */
1500e751525SEric Saxe /*
151113b131bSEric Saxe * 400 usec
1520e751525SEric Saxe */
153113b131bSEric Saxe #define CPUPM_DEFAULT_TI_INTERVAL 400000
154113b131bSEric Saxe /*
155113b131bSEric Saxe * 400 usec
156113b131bSEric Saxe */
157113b131bSEric Saxe #define CPUPM_DEFAULT_TW_INTERVAL 400000
1580e751525SEric Saxe
159113b131bSEric Saxe hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
160113b131bSEric Saxe hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
1610e751525SEric Saxe
1620e751525SEric Saxe
163113b131bSEric Saxe static void cpupm_governor_initialize(void);
1640e751525SEric Saxe static void cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
1650e751525SEric Saxe
1660e751525SEric Saxe cpupm_policy_t
cpupm_get_policy(void)1670e751525SEric Saxe cpupm_get_policy(void)
1680e751525SEric Saxe {
1690e751525SEric Saxe return (cpupm_policy);
1700e751525SEric Saxe }
1710e751525SEric Saxe
1720e751525SEric Saxe int
cpupm_set_policy(cpupm_policy_t new_policy)1730e751525SEric Saxe cpupm_set_policy(cpupm_policy_t new_policy)
1740e751525SEric Saxe {
1750e751525SEric Saxe static int gov_init = 0;
1760e751525SEric Saxe int result = 0;
1770e751525SEric Saxe
1780e751525SEric Saxe mutex_enter(&cpu_lock);
1790e751525SEric Saxe if (new_policy == cpupm_policy) {
1800e751525SEric Saxe mutex_exit(&cpu_lock);
1810e751525SEric Saxe return (result);
1820e751525SEric Saxe }
1830e751525SEric Saxe
1840e751525SEric Saxe /*
1850e751525SEric Saxe * Pausing CPUs causes a high priority thread to be scheduled
1860e751525SEric Saxe * on all other CPUs (besides the current one). This locks out
1870e751525SEric Saxe * other CPUs from making CPUPM state transitions.
1880e751525SEric Saxe */
1890e751525SEric Saxe switch (new_policy) {
1900e751525SEric Saxe case CPUPM_POLICY_DISABLED:
191*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL);
1920e751525SEric Saxe cpupm_policy = CPUPM_POLICY_DISABLED;
1930e751525SEric Saxe start_cpus();
1940e751525SEric Saxe
1950e751525SEric Saxe result = cmt_pad_disable(PGHW_POW_ACTIVE);
1960e751525SEric Saxe
1970e751525SEric Saxe /*
1980e751525SEric Saxe * Once PAD has been enabled, it should always be possible
1990e751525SEric Saxe * to disable it.
2000e751525SEric Saxe */
2010e751525SEric Saxe ASSERT(result == 0);
2020e751525SEric Saxe
2030e751525SEric Saxe /*
2040e751525SEric Saxe * Bring all the active power domains to the maximum
2050e751525SEric Saxe * performance state.
2060e751525SEric Saxe */
2070e751525SEric Saxe cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
2080e751525SEric Saxe CPUPM_STATE_MAX_PERF);
2090e751525SEric Saxe
2100e751525SEric Saxe break;
2110e751525SEric Saxe case CPUPM_POLICY_ELASTIC:
2120e751525SEric Saxe
2130e751525SEric Saxe result = cmt_pad_enable(PGHW_POW_ACTIVE);
2140e751525SEric Saxe if (result < 0) {
2150e751525SEric Saxe /*
2160e751525SEric Saxe * Failed to enable PAD across the active power
2170e751525SEric Saxe * domains, which may well be because none were
2180e751525SEric Saxe * enumerated.
2190e751525SEric Saxe */
2200e751525SEric Saxe break;
2210e751525SEric Saxe }
2220e751525SEric Saxe
2230e751525SEric Saxe /*
224113b131bSEric Saxe * Initialize the governor parameters the first time through.
2250e751525SEric Saxe */
2260e751525SEric Saxe if (gov_init == 0) {
227113b131bSEric Saxe cpupm_governor_initialize();
2280e751525SEric Saxe gov_init = 1;
2290e751525SEric Saxe }
230113b131bSEric Saxe
231*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL);
2320e751525SEric Saxe cpupm_policy = CPUPM_POLICY_ELASTIC;
2330e751525SEric Saxe start_cpus();
2340e751525SEric Saxe
2350e751525SEric Saxe break;
2360e751525SEric Saxe default:
2370e751525SEric Saxe cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
2380e751525SEric Saxe new_policy);
2390e751525SEric Saxe ASSERT(0);
2400e751525SEric Saxe break;
2410e751525SEric Saxe }
2420e751525SEric Saxe mutex_exit(&cpu_lock);
2430e751525SEric Saxe
2440e751525SEric Saxe return (result);
2450e751525SEric Saxe }
2460e751525SEric Saxe
2470e751525SEric Saxe /*
2480e751525SEric Saxe * Look for an existing power domain
2490e751525SEric Saxe */
2500e751525SEric Saxe static cpupm_domain_t *
cpupm_domain_find(id_t id,cpupm_dtype_t type)2510e751525SEric Saxe cpupm_domain_find(id_t id, cpupm_dtype_t type)
2520e751525SEric Saxe {
2530e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
2540e751525SEric Saxe
2550e751525SEric Saxe cpupm_domain_t *dom;
2560e751525SEric Saxe
2570e751525SEric Saxe dom = cpupm_domains;
2580e751525SEric Saxe while (dom != NULL) {
2590e751525SEric Saxe if (id == dom->cpd_id && type == dom->cpd_type)
2600e751525SEric Saxe return (dom);
2610e751525SEric Saxe dom = dom->cpd_next;
2620e751525SEric Saxe }
2630e751525SEric Saxe return (NULL);
2640e751525SEric Saxe }
2650e751525SEric Saxe
2660e751525SEric Saxe /*
2670e751525SEric Saxe * Create a new domain
2680e751525SEric Saxe */
2690e751525SEric Saxe static cpupm_domain_t *
cpupm_domain_create(id_t id,cpupm_dtype_t type)2700e751525SEric Saxe cpupm_domain_create(id_t id, cpupm_dtype_t type)
2710e751525SEric Saxe {
2720e751525SEric Saxe cpupm_domain_t *dom;
2730e751525SEric Saxe
2740e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
2750e751525SEric Saxe
2760e751525SEric Saxe dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
2770e751525SEric Saxe dom->cpd_id = id;
2780e751525SEric Saxe dom->cpd_type = type;
2790e751525SEric Saxe
2800e751525SEric Saxe /* Link into the known domain list */
2810e751525SEric Saxe dom->cpd_next = cpupm_domains;
2820e751525SEric Saxe cpupm_domains = dom;
2830e751525SEric Saxe
2840e751525SEric Saxe return (dom);
2850e751525SEric Saxe }
2860e751525SEric Saxe
2870e751525SEric Saxe static void
cpupm_domain_state_enum(struct cpu * cp,cpupm_domain_t * dom)2880e751525SEric Saxe cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
2890e751525SEric Saxe {
2900e751525SEric Saxe /*
2910e751525SEric Saxe * In the envent we're enumerating because the domain's state
2920e751525SEric Saxe * configuration has changed, toss any existing states.
2930e751525SEric Saxe */
2940e751525SEric Saxe if (dom->cpd_nstates > 0) {
2950e751525SEric Saxe kmem_free(dom->cpd_states,
2960e751525SEric Saxe sizeof (cpupm_state_t) * dom->cpd_nstates);
2970e751525SEric Saxe dom->cpd_nstates = 0;
2980e751525SEric Saxe }
2990e751525SEric Saxe
3000e751525SEric Saxe /*
3010e751525SEric Saxe * Query to determine the number of states, allocate storage
3020e751525SEric Saxe * large enough to hold the state information, and pass it back
3030e751525SEric Saxe * to the platform driver to complete the enumeration.
3040e751525SEric Saxe */
3050e751525SEric Saxe dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
3060e751525SEric Saxe
3070e751525SEric Saxe if (dom->cpd_nstates == 0)
3080e751525SEric Saxe return;
3090e751525SEric Saxe
3100e751525SEric Saxe dom->cpd_states =
3110e751525SEric Saxe kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
3120e751525SEric Saxe (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
3130e751525SEric Saxe }
3140e751525SEric Saxe
3150e751525SEric Saxe /*
3160e751525SEric Saxe * Initialize the specified type of power domain on behalf of the CPU
3170e751525SEric Saxe */
3180e751525SEric Saxe cpupm_domain_t *
cpupm_domain_init(struct cpu * cp,cpupm_dtype_t type)3190e751525SEric Saxe cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
3200e751525SEric Saxe {
3210e751525SEric Saxe cpupm_domain_t *dom;
3220e751525SEric Saxe id_t did;
3230e751525SEric Saxe
3240e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
3250e751525SEric Saxe
3260e751525SEric Saxe /*
3270e751525SEric Saxe * Instantiate the domain if it doesn't already exist
3280e751525SEric Saxe * and enumerate its power states.
3290e751525SEric Saxe */
3300e751525SEric Saxe did = cpupm_domain_id(cp, type);
3310e751525SEric Saxe dom = cpupm_domain_find(did, type);
3320e751525SEric Saxe if (dom == NULL) {
3330e751525SEric Saxe dom = cpupm_domain_create(did, type);
3340e751525SEric Saxe cpupm_domain_state_enum(cp, dom);
3350e751525SEric Saxe }
3360e751525SEric Saxe
3370e751525SEric Saxe /*
3380e751525SEric Saxe * Named state initialization
3390e751525SEric Saxe */
3400e751525SEric Saxe if (type == CPUPM_DTYPE_ACTIVE) {
3410e751525SEric Saxe /*
3420e751525SEric Saxe * For active power domains, the highest performance
3430e751525SEric Saxe * state is defined as first state returned from
3440e751525SEric Saxe * the domain enumeration.
3450e751525SEric Saxe */
3460e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
3470e751525SEric Saxe &dom->cpd_states[0];
3480e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
3490e751525SEric Saxe &dom->cpd_states[dom->cpd_nstates - 1];
3500e751525SEric Saxe
3510e751525SEric Saxe /*
3520e751525SEric Saxe * Begin by assuming CPU is running at the max perf state.
3530e751525SEric Saxe */
3540e751525SEric Saxe dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
3550e751525SEric Saxe }
3560e751525SEric Saxe
3570e751525SEric Saxe return (dom);
3580e751525SEric Saxe }
3590e751525SEric Saxe
3600e751525SEric Saxe /*
3610e751525SEric Saxe * Return the id associated with the given type of domain
3620e751525SEric Saxe * to which cp belongs
3630e751525SEric Saxe */
3640e751525SEric Saxe id_t
cpupm_domain_id(struct cpu * cp,cpupm_dtype_t type)3650e751525SEric Saxe cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
3660e751525SEric Saxe {
3670e751525SEric Saxe return (cpupm_plat_domain_id(cp, type));
3680e751525SEric Saxe }
3690e751525SEric Saxe
3700e751525SEric Saxe /*
3710e751525SEric Saxe * Initiate a state change for the specified domain on behalf of cp
3720e751525SEric Saxe */
3730e751525SEric Saxe int
cpupm_change_state(struct cpu * cp,cpupm_domain_t * dom,cpupm_state_t * state)3740e751525SEric Saxe cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
3750e751525SEric Saxe {
3760e751525SEric Saxe if (cpupm_plat_change_state(cp, state) < 0)
3770e751525SEric Saxe return (-1);
3780e751525SEric Saxe
3790e751525SEric Saxe DTRACE_PROBE2(cpupm__change__state,
3800e751525SEric Saxe cpupm_domain_t *, dom,
3810e751525SEric Saxe cpupm_state_t *, state);
3820e751525SEric Saxe
3830e751525SEric Saxe dom->cpd_state = state;
3840e751525SEric Saxe return (0);
3850e751525SEric Saxe }
3860e751525SEric Saxe
3870e751525SEric Saxe /*
3880e751525SEric Saxe * Interface into the CPU power manager to indicate a significant change
3890e751525SEric Saxe * in utilization of the specified active power domain
3900e751525SEric Saxe */
3910e751525SEric Saxe void
cpupm_utilization_event(struct cpu * cp,hrtime_t now,cpupm_domain_t * dom,cpupm_util_event_t event)3920e751525SEric Saxe cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
3930e751525SEric Saxe cpupm_util_event_t event)
3940e751525SEric Saxe {
3950e751525SEric Saxe cpupm_state_t *new_state = NULL;
3960e751525SEric Saxe hrtime_t last;
3970e751525SEric Saxe
3980e751525SEric Saxe if (cpupm_policy == CPUPM_POLICY_DISABLED) {
3990e751525SEric Saxe return;
4000e751525SEric Saxe }
4010e751525SEric Saxe
4020e751525SEric Saxe /*
4030e751525SEric Saxe * What follows is a simple elastic power state management policy.
4040e751525SEric Saxe *
4050e751525SEric Saxe * If the utilization has become non-zero, and the domain was
4060e751525SEric Saxe * previously at it's lowest power state, then transition it
4070e751525SEric Saxe * to the highest state in the spirit of "race to idle".
4080e751525SEric Saxe *
4090e751525SEric Saxe * If the utilization has dropped to zero, then transition the
4100e751525SEric Saxe * domain to its lowest power state.
4110e751525SEric Saxe *
412113b131bSEric Saxe * Statistics are maintained to implement a governor to reduce state
4130e751525SEric Saxe * transitions resulting from either transient work, or periods of
4140e751525SEric Saxe * transient idleness on the domain.
4150e751525SEric Saxe */
4160e751525SEric Saxe switch (event) {
4170e751525SEric Saxe case CPUPM_DOM_REMAIN_BUSY:
4180e751525SEric Saxe
4190e751525SEric Saxe /*
4200e751525SEric Saxe * We've received an event that the domain is running a thread
4210e751525SEric Saxe * that's made it to the end of it's time slice. If we are at
4220e751525SEric Saxe * low power, then raise it. If the transient work governor
4230e751525SEric Saxe * is engaged, then remove it.
4240e751525SEric Saxe */
4250e751525SEric Saxe if (dom->cpd_state ==
4260e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
4270e751525SEric Saxe new_state =
4280e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
429113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
430113b131bSEric Saxe dom->cpd_governor = CPUPM_GOV_DISENGAGED;
4310e751525SEric Saxe dom->cpd_tw = 0;
4320e751525SEric Saxe }
4330e751525SEric Saxe }
4340e751525SEric Saxe break;
4350e751525SEric Saxe
4360e751525SEric Saxe case CPUPM_DOM_BUSY_FROM_IDLE:
4370e751525SEric Saxe last = dom->cpd_last_lower;
4380e751525SEric Saxe dom->cpd_last_raise = now;
4390e751525SEric Saxe
4400e751525SEric Saxe DTRACE_PROBE3(cpupm__raise__req,
4410e751525SEric Saxe cpupm_domain_t *, dom,
4420e751525SEric Saxe hrtime_t, last,
4430e751525SEric Saxe hrtime_t, now);
4440e751525SEric Saxe
4450e751525SEric Saxe if (dom->cpd_state ==
4460e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
4470e751525SEric Saxe
4480e751525SEric Saxe /*
4490e751525SEric Saxe * There's non-zero utilization, and the domain is
4500e751525SEric Saxe * running in the lower power state. Before we
451113b131bSEric Saxe * consider raising power, check if the preceeding
452113b131bSEric Saxe * idle period was transient in duration.
453113b131bSEric Saxe *
454113b131bSEric Saxe * If the domain is already transient work governed,
455113b131bSEric Saxe * then we don't bother maintaining transient idle
456113b131bSEric Saxe * statistics, as the presence of enough transient work
457113b131bSEric Saxe * can also make the domain frequently transiently idle.
458113b131bSEric Saxe * In this case, we still want to remain transient work
459113b131bSEric Saxe * governed.
4600e751525SEric Saxe */
461113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
4620e751525SEric Saxe if ((now - last) < cpupm_ti_predict_interval) {
4630e751525SEric Saxe /*
4640e751525SEric Saxe * We're raising the domain power and
4650e751525SEric Saxe * we *just* lowered it. Consider
4660e751525SEric Saxe * this a mispredicted power state
4670e751525SEric Saxe * transition due to a transient
4680e751525SEric Saxe * idle period.
4690e751525SEric Saxe */
470113b131bSEric Saxe if (++dom->cpd_ti >=
4710e751525SEric Saxe cpupm_mispredict_thresh) {
4720e751525SEric Saxe /*
4730e751525SEric Saxe * There's enough transient
4740e751525SEric Saxe * idle transitions to
4750e751525SEric Saxe * justify governing future
4760e751525SEric Saxe * lowering requests.
4770e751525SEric Saxe */
478113b131bSEric Saxe dom->cpd_governor =
479113b131bSEric Saxe CPUPM_GOV_TRANS_IDLE;
4800e751525SEric Saxe dom->cpd_ti = 0;
4810e751525SEric Saxe DTRACE_PROBE1(
4820e751525SEric Saxe cpupm__ti__governed,
4830e751525SEric Saxe cpupm_domain_t *, dom);
4840e751525SEric Saxe }
4850e751525SEric Saxe } else {
4860e751525SEric Saxe /*
4870e751525SEric Saxe * We correctly predicted the last
4880e751525SEric Saxe * lowering.
4890e751525SEric Saxe */
4900e751525SEric Saxe dom->cpd_ti = 0;
4910e751525SEric Saxe }
4920e751525SEric Saxe }
493113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
4940e751525SEric Saxe /*
4950e751525SEric Saxe * Raise requests are governed due to
4960e751525SEric Saxe * transient work.
4970e751525SEric Saxe */
4980e751525SEric Saxe DTRACE_PROBE1(cpupm__raise__governed,
4990e751525SEric Saxe cpupm_domain_t *, dom);
5000e751525SEric Saxe
5010e751525SEric Saxe return;
5020e751525SEric Saxe }
5030e751525SEric Saxe /*
5040e751525SEric Saxe * Prepare to transition to the higher power state
5050e751525SEric Saxe */
5060e751525SEric Saxe new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
5070e751525SEric Saxe
5080e751525SEric Saxe } else if (dom->cpd_state ==
5090e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
5100e751525SEric Saxe
5110e751525SEric Saxe /*
5120e751525SEric Saxe * Utilization is non-zero, and we're already running
5130e751525SEric Saxe * in the higher power state. Take this opportunity to
5140e751525SEric Saxe * perform some book keeping if the last lowering
5150e751525SEric Saxe * request was governed.
5160e751525SEric Saxe */
517113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
518113b131bSEric Saxe
5190e751525SEric Saxe if ((now - last) >= cpupm_ti_predict_interval) {
5200e751525SEric Saxe /*
5210e751525SEric Saxe * The domain is transient idle
5220e751525SEric Saxe * governed, and we mispredicted
5230e751525SEric Saxe * governing the last lowering request.
5240e751525SEric Saxe */
5250e751525SEric Saxe if (++dom->cpd_ti >=
5260e751525SEric Saxe cpupm_mispredict_gov_thresh) {
5270e751525SEric Saxe /*
5280e751525SEric Saxe * There's enough non-transient
5290e751525SEric Saxe * idle periods to justify
5300e751525SEric Saxe * removing the governor.
5310e751525SEric Saxe */
532113b131bSEric Saxe dom->cpd_governor =
533113b131bSEric Saxe CPUPM_GOV_DISENGAGED;
5340e751525SEric Saxe dom->cpd_ti = 0;
5350e751525SEric Saxe DTRACE_PROBE1(
5360e751525SEric Saxe cpupm__ti__ungoverned,
5370e751525SEric Saxe cpupm_domain_t *, dom);
5380e751525SEric Saxe }
5390e751525SEric Saxe } else {
5400e751525SEric Saxe /*
5410e751525SEric Saxe * Correctly predicted governing the
5420e751525SEric Saxe * last lowering request.
5430e751525SEric Saxe */
5440e751525SEric Saxe dom->cpd_ti = 0;
5450e751525SEric Saxe }
5460e751525SEric Saxe }
5470e751525SEric Saxe }
5480e751525SEric Saxe break;
5490e751525SEric Saxe
5500e751525SEric Saxe case CPUPM_DOM_IDLE_FROM_BUSY:
5510e751525SEric Saxe last = dom->cpd_last_raise;
5520e751525SEric Saxe dom->cpd_last_lower = now;
5530e751525SEric Saxe
5540e751525SEric Saxe DTRACE_PROBE3(cpupm__lower__req,
5550e751525SEric Saxe cpupm_domain_t *, dom,
5560e751525SEric Saxe hrtime_t, last,
5570e751525SEric Saxe hrtime_t, now);
5580e751525SEric Saxe
5590e751525SEric Saxe if (dom->cpd_state ==
5600e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
5610e751525SEric Saxe
5620e751525SEric Saxe /*
5630e751525SEric Saxe * The domain is idle, and is running in the highest
5640e751525SEric Saxe * performance state. Before we consider lowering power,
5650e751525SEric Saxe * perform some book keeping for the transient work
5660e751525SEric Saxe * governor.
5670e751525SEric Saxe */
568113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
5690e751525SEric Saxe if ((now - last) < cpupm_tw_predict_interval) {
5700e751525SEric Saxe /*
5710e751525SEric Saxe * We're lowering the domain power and
5720e751525SEric Saxe * we *just* raised it. Consider the
5730e751525SEric Saxe * last raise mispredicted due to
5740e751525SEric Saxe * transient work.
5750e751525SEric Saxe */
5760e751525SEric Saxe if (++dom->cpd_tw >=
5770e751525SEric Saxe cpupm_mispredict_thresh) {
5780e751525SEric Saxe /*
579113b131bSEric Saxe * There's enough transient work
5800e751525SEric Saxe * transitions to justify
581113b131bSEric Saxe * governing future raise
5820e751525SEric Saxe * requests.
5830e751525SEric Saxe */
584113b131bSEric Saxe dom->cpd_governor =
585113b131bSEric Saxe CPUPM_GOV_TRANS_WORK;
5860e751525SEric Saxe dom->cpd_tw = 0;
5870e751525SEric Saxe DTRACE_PROBE1(
5880e751525SEric Saxe cpupm__tw__governed,
5890e751525SEric Saxe cpupm_domain_t *, dom);
5900e751525SEric Saxe }
5910e751525SEric Saxe } else {
5920e751525SEric Saxe /*
5930e751525SEric Saxe * We correctly predicted during the
5940e751525SEric Saxe * last raise.
5950e751525SEric Saxe */
5960e751525SEric Saxe dom->cpd_tw = 0;
5970e751525SEric Saxe }
5980e751525SEric Saxe }
599113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
6000e751525SEric Saxe /*
6010e751525SEric Saxe * Lowering requests are governed due to
6020e751525SEric Saxe * transient idleness.
6030e751525SEric Saxe */
6040e751525SEric Saxe DTRACE_PROBE1(cpupm__lowering__governed,
6050e751525SEric Saxe cpupm_domain_t *, dom);
6060e751525SEric Saxe
6070e751525SEric Saxe return;
6080e751525SEric Saxe }
6090e751525SEric Saxe
6100e751525SEric Saxe /*
6110e751525SEric Saxe * Prepare to transition to a lower power state.
6120e751525SEric Saxe */
6130e751525SEric Saxe new_state =
6140e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
6150e751525SEric Saxe
6160e751525SEric Saxe } else if (dom->cpd_state ==
6170e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
6180e751525SEric Saxe
6190e751525SEric Saxe /*
6200e751525SEric Saxe * The domain is idle, and we're already running in
6210e751525SEric Saxe * the lower power state. Take this opportunity to
6220e751525SEric Saxe * perform some book keeping if the last raising
6230e751525SEric Saxe * request was governed.
6240e751525SEric Saxe */
625113b131bSEric Saxe if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
6260e751525SEric Saxe if ((now - last) >= cpupm_tw_predict_interval) {
6270e751525SEric Saxe /*
6280e751525SEric Saxe * The domain is transient work
6290e751525SEric Saxe * governed, and we mispredicted
6300e751525SEric Saxe * governing the last raising request.
6310e751525SEric Saxe */
6320e751525SEric Saxe if (++dom->cpd_tw >=
6330e751525SEric Saxe cpupm_mispredict_gov_thresh) {
6340e751525SEric Saxe /*
6350e751525SEric Saxe * There's enough non-transient
6360e751525SEric Saxe * work to justify removing
6370e751525SEric Saxe * the governor.
6380e751525SEric Saxe */
639113b131bSEric Saxe dom->cpd_governor =
640113b131bSEric Saxe CPUPM_GOV_DISENGAGED;
6410e751525SEric Saxe dom->cpd_tw = 0;
6420e751525SEric Saxe DTRACE_PROBE1(
6430e751525SEric Saxe cpupm__tw__ungoverned,
6440e751525SEric Saxe cpupm_domain_t *, dom);
6450e751525SEric Saxe }
6460e751525SEric Saxe } else {
6470e751525SEric Saxe /*
6480e751525SEric Saxe * We correctly predicted governing
6490e751525SEric Saxe * the last raise.
6500e751525SEric Saxe */
6510e751525SEric Saxe dom->cpd_tw = 0;
6520e751525SEric Saxe }
6530e751525SEric Saxe }
6540e751525SEric Saxe }
6550e751525SEric Saxe break;
6560e751525SEric Saxe }
6570e751525SEric Saxe /*
6580e751525SEric Saxe * Change the power state
6590e751525SEric Saxe * Not much currently done if this doesn't succeed
6600e751525SEric Saxe */
6610e751525SEric Saxe if (new_state)
6620e751525SEric Saxe (void) cpupm_change_state(cp, dom, new_state);
6630e751525SEric Saxe }
6640e751525SEric Saxe
6650e751525SEric Saxe
6660e751525SEric Saxe /*
6670e751525SEric Saxe * Interface called by platforms to dynamically change the
6680e751525SEric Saxe * MAX performance cpupm state
6690e751525SEric Saxe */
6700e751525SEric Saxe void
cpupm_redefine_max_activepwr_state(struct cpu * cp,int max_perf_level)6710e751525SEric Saxe cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
6720e751525SEric Saxe {
6730e751525SEric Saxe cpupm_domain_t *dom;
6740e751525SEric Saxe id_t did;
6750e751525SEric Saxe cpupm_dtype_t type = CPUPM_DTYPE_ACTIVE;
6760e751525SEric Saxe boolean_t change_state = B_FALSE;
6770e751525SEric Saxe cpupm_state_t *new_state = NULL;
6780e751525SEric Saxe
6790e751525SEric Saxe did = cpupm_domain_id(cp, type);
680a3114836SGerry Liu if (MUTEX_HELD(&cpu_lock)) {
681a3114836SGerry Liu dom = cpupm_domain_find(did, type);
682a3114836SGerry Liu } else {
6830e751525SEric Saxe mutex_enter(&cpu_lock);
6840e751525SEric Saxe dom = cpupm_domain_find(did, type);
6850e751525SEric Saxe mutex_exit(&cpu_lock);
686a3114836SGerry Liu }
6870e751525SEric Saxe
6880e751525SEric Saxe /*
6890e751525SEric Saxe * Can use a lock to avoid changing the power state of the cpu when
6900e751525SEric Saxe * CPUPM_STATE_MAX_PERF is getting changed.
6910e751525SEric Saxe * Since the occurance of events to change MAX_PERF is not frequent,
6920e751525SEric Saxe * it may not be a good idea to overburden with locks. In the worst
6930e751525SEric Saxe * case, for one cycle the power may not get changed to the required
6940e751525SEric Saxe * level
6950e751525SEric Saxe */
6960e751525SEric Saxe if (dom != NULL) {
6970e751525SEric Saxe if (dom->cpd_state ==
6980e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
6990e751525SEric Saxe change_state = B_TRUE;
7000e751525SEric Saxe }
7010e751525SEric Saxe
7020e751525SEric Saxe /*
7030e751525SEric Saxe * If an out of range level is passed, use the lowest supported
7040e751525SEric Saxe * speed.
7050e751525SEric Saxe */
7060e751525SEric Saxe if (max_perf_level >= dom->cpd_nstates &&
7070e751525SEric Saxe dom->cpd_nstates > 1) {
7080e751525SEric Saxe max_perf_level = dom->cpd_nstates - 1;
7090e751525SEric Saxe }
7100e751525SEric Saxe
7110e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
7120e751525SEric Saxe &dom->cpd_states[max_perf_level];
7130e751525SEric Saxe
7140e751525SEric Saxe /*
7150e751525SEric Saxe * If the current state is MAX_PERF, change the current state
7160e751525SEric Saxe * to the new MAX_PERF
7170e751525SEric Saxe */
7180e751525SEric Saxe if (change_state) {
7190e751525SEric Saxe new_state =
7200e751525SEric Saxe dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
7210e751525SEric Saxe if (new_state) {
7220e751525SEric Saxe (void) cpupm_change_state(cp, dom, new_state);
7230e751525SEric Saxe }
7240e751525SEric Saxe }
7250e751525SEric Saxe }
7260e751525SEric Saxe }
7270e751525SEric Saxe
7280e751525SEric Saxe /*
729113b131bSEric Saxe * Initialize the parameters for the transience governor state machine
7300e751525SEric Saxe */
731113b131bSEric Saxe static void
cpupm_governor_initialize(void)7320e751525SEric Saxe cpupm_governor_initialize(void)
7330e751525SEric Saxe {
7340e751525SEric Saxe /*
735113b131bSEric Saxe * The default prediction intervals are specified in nanoseconds.
736113b131bSEric Saxe * Convert these to the equivalent in unscaled hrtime, which is the
737113b131bSEric Saxe * format of the timestamps passed to cpupm_utilization_event()
7380e751525SEric Saxe */
739113b131bSEric Saxe cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
740113b131bSEric Saxe cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
7410e751525SEric Saxe }
7420e751525SEric Saxe
7430e751525SEric Saxe /*
7440e751525SEric Saxe * Initiate a state change in all CPUPM domain instances of the specified type
7450e751525SEric Saxe */
7460e751525SEric Saxe static void
cpupm_state_change_global(cpupm_dtype_t type,cpupm_state_name_t state)7470e751525SEric Saxe cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
7480e751525SEric Saxe {
7490e751525SEric Saxe cpu_t *cp;
7500e751525SEric Saxe pg_cmt_t *pwr_pg;
7510e751525SEric Saxe cpupm_domain_t *dom;
7520e751525SEric Saxe group_t *hwset;
7530e751525SEric Saxe group_iter_t giter;
7540e751525SEric Saxe pg_cpu_itr_t cpu_iter;
7550e751525SEric Saxe pghw_type_t hw;
7560e751525SEric Saxe
7570e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
7580e751525SEric Saxe
7590e751525SEric Saxe switch (type) {
7600e751525SEric Saxe case CPUPM_DTYPE_ACTIVE:
7610e751525SEric Saxe hw = PGHW_POW_ACTIVE;
7620e751525SEric Saxe break;
7630e751525SEric Saxe default:
7640e751525SEric Saxe /*
7650e751525SEric Saxe * Power domain types other than "active" unsupported.
7660e751525SEric Saxe */
7670e751525SEric Saxe ASSERT(type == CPUPM_DTYPE_ACTIVE);
7680e751525SEric Saxe return;
7690e751525SEric Saxe }
7700e751525SEric Saxe
7710e751525SEric Saxe if ((hwset = pghw_set_lookup(hw)) == NULL)
7720e751525SEric Saxe return;
7730e751525SEric Saxe
7740e751525SEric Saxe /*
7750e751525SEric Saxe * Iterate over the power domains
7760e751525SEric Saxe */
7770e751525SEric Saxe group_iter_init(&giter);
7780e751525SEric Saxe while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
7790e751525SEric Saxe
7800e751525SEric Saxe dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
7810e751525SEric Saxe
7820e751525SEric Saxe /*
7830e751525SEric Saxe * Iterate over the CPUs in each domain
7840e751525SEric Saxe */
7850e751525SEric Saxe PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
7860e751525SEric Saxe while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
7870e751525SEric Saxe (void) cpupm_change_state(cp, dom,
7880e751525SEric Saxe dom->cpd_named_states[state]);
7890e751525SEric Saxe }
7900e751525SEric Saxe }
7910e751525SEric Saxe }
792