xref: /illumos-gate/usr/src/uts/intel/io/amdzen/smntemp.c (revision 2a8bcb4e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019, Joyent, Inc.
14  * Copyright 2022 Oxide Computer Company
15  */
16 
17 /*
18  * This implements a temperature sensor for AMD Zen family products that rely
19  * upon the SMN framework for getting temperature information.
20  */
21 
22 #include <sys/modctl.h>
23 #include <sys/conf.h>
24 #include <sys/devops.h>
25 #include <sys/types.h>
26 #include <sys/cred.h>
27 #include <sys/ddi.h>
28 #include <sys/sunddi.h>
29 #include <sys/cmn_err.h>
30 #include <sys/x86_archext.h>
31 #include <sys/cpuvar.h>
32 #include <sys/sensors.h>
33 #include <sys/sysmacros.h>
34 #include <sys/amdzen/smn.h>
35 #include <amdzen_client.h>
36 
37 /*
38  * The following are register offsets and the meaning of their bits related to
39  * temperature. These addresses reside in the System Management Network which is
40  * accessed through the northbridge. They are not addresses in PCI configuration
41  * space.
42  */
#define	SMN_SMU_THERMAL_CURTEMP			SMN_MAKE_REG(0x00059800)
/* The temperature field is everything from bit 21 upwards in the register. */
#define	SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(x)	((x) >> 21)
/* When this bit is set, the reading must be shifted by the range adjustment. */
#define	SMN_SMU_THERMAL_CURTEMP_RANGE_SEL		(1 << 19)

/*
 * The range adjustment is applied to the whole-degree part of the reading.
 * The low DECIMAL_BITS bits of the temperature field hold the fractional
 * part; the mask selecting them is derived from that bit count.
 */
#define	SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ		(-49)
#define	SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS		3
#define	SMN_SMU_THERMAL_CURTEMP_BITS_MASK		\
	((1 << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS) - 1)

/*
 * The temperature sensor in Family 17 is measured in terms of 0.125 C steps,
 * i.e. there are 2^DECIMAL_BITS steps per degree.
 */
#define	SMN_THERMAL_GRANULARITY	\
	(1 << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS)
55 
/*
 * Per-sensor state flags. They record which pieces of a sensor have been set
 * up so that cleanup only tears down what was actually initialized.
 */
typedef enum {
	/* stt_mutex has been initialized and must be destroyed. */
	SMNTEMP_F_MUTEX	= (1 << 0)
} smntemp_flags_t;
59 
60 typedef struct {
61 	uint_t stt_dfno;
62 	id_t stt_ksensor;
63 	struct smntemp *stt_smn;
64 	smntemp_flags_t stt_flags;
65 	kmutex_t stt_mutex;
66 	hrtime_t stt_last_read;
67 	uint32_t stt_reg;
68 	int64_t stt_temp;
69 } smntemp_temp_t;
70 
71 typedef struct smntemp {
72 	dev_info_t *smn_dip;
73 	uint_t smn_ntemps;
74 	int smn_offset;
75 	smntemp_temp_t *smn_temps;
76 } smntemp_t;
77 
78 static smntemp_t smntemp_data;
79 
80 /*
81  * AMD processors report a control temperature (called Tctl) which may be
82  * different from the junction temperature, which is the value that is actually
83  * measured from the die (sometimes called Tdie or Tjct). This is done so that
84  * socket-based environmental monitoring can be consistent from a platform
85  * perspective, but doesn't help us. Unfortunately, these values aren't in
86  * datasheets that we can find, but have been documented partially in a series
87  * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software
88  * for Windows.
89  *
 * The brand strings below may contain partial matches, such as in the
 * Threadripper cases, so we can match the entire family of processors. The
 * offset value is
92  * the quantity in degrees that we should adjust Tctl to reach Tdie.
93  */
94 typedef struct {
95 	const char	*sto_brand;
96 	uint_t		sto_family;
97 	int		sto_off;
98 } smntemp_offset_t;
99 
100 static const smntemp_offset_t smntemp_offsets[] = {
101 	{ "AMD Ryzen 5 1600X", 0x17, -20 },
102 	{ "AMD Ryzen 7 1700X", 0x17, -20 },
103 	{ "AMD Ryzen 7 1800X", 0x17, -20 },
104 	{ "AMD Ryzen 7 2700X", 0x17, -10 },
105 	{ "AMD Ryzen Threadripper 19", 0x17, -27 },
106 	{ "AMD Ryzen Threadripper 29", 0x17, -27 },
107 	{ NULL }
108 };
109 
110 static int
111 smntemp_temp_update(smntemp_t *smn, smntemp_temp_t *stt)
112 {
113 	int ret;
114 	uint32_t reg;
115 	int64_t raw, decimal;
116 
117 	ASSERT(MUTEX_HELD((&stt->stt_mutex)));
118 
119 	if ((ret = amdzen_c_smn_read(stt->stt_dfno, SMN_SMU_THERMAL_CURTEMP,
120 	    &reg)) != 0) {
121 		return (ret);
122 	}
123 
124 	stt->stt_last_read = gethrtime();
125 	stt->stt_reg = reg;
126 	raw = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) >>
127 	    SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
128 	decimal = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) &
129 	    SMN_SMU_THERMAL_CURTEMP_BITS_MASK;
130 	if ((reg & SMN_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) {
131 		raw += SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ;
132 	}
133 	raw += smn->smn_offset;
134 	stt->stt_temp = raw << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
135 	stt->stt_temp += decimal;
136 
137 	return (0);
138 }
139 
140 static int
141 smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp)
142 {
143 	int ret;
144 	smntemp_temp_t *stt = arg;
145 	smntemp_t *smn = stt->stt_smn;
146 
147 	mutex_enter(&stt->stt_mutex);
148 	if ((ret = smntemp_temp_update(smn, stt)) != 0) {
149 		mutex_exit(&stt->stt_mutex);
150 		return (ret);
151 	}
152 
153 	temp->sis_unit = SENSOR_UNIT_CELSIUS;
154 	temp->sis_value = stt->stt_temp;
155 	temp->sis_gran = SMN_THERMAL_GRANULARITY;
156 	mutex_exit(&stt->stt_mutex);
157 
158 	return (0);
159 }
160 
161 static const ksensor_ops_t smntemp_temp_ops = {
162 	.kso_kind = ksensor_kind_temperature,
163 	.kso_scalar = smntemp_temp_read
164 };
165 
166 static void
167 smntemp_cleanup(smntemp_t *smn)
168 {
169 	if (smn->smn_temps != NULL) {
170 		uint_t i;
171 
172 		(void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS);
173 		for (i = 0; i < smn->smn_ntemps; i++) {
174 			if ((smn->smn_temps[i].stt_flags & SMNTEMP_F_MUTEX) !=
175 			    0) {
176 				mutex_destroy(&smn->smn_temps[i].stt_mutex);
177 				smn->smn_temps[i].stt_flags &= ~SMNTEMP_F_MUTEX;
178 			}
179 		}
180 		kmem_free(smn->smn_temps, sizeof (smntemp_temp_t) *
181 		    smn->smn_ntemps);
182 		smn->smn_temps = NULL;
183 		smn->smn_ntemps = 0;
184 	}
185 
186 	if (smn->smn_dip != NULL) {
187 		ddi_remove_minor_node(smn->smn_dip, NULL);
188 		ddi_set_driver_private(smn->smn_dip, NULL);
189 		smn->smn_dip = NULL;
190 	}
191 }
192 
193 static boolean_t
194 smntemp_find_offset(smntemp_t *smn)
195 {
196 	uint_t i, family;
197 	char buf[256];
198 
199 	if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) {
200 		dev_err(smn->smn_dip, CE_WARN, "!failed to read processor "
201 		    "brand string, brand larger than internal buffer");
202 		return (B_FALSE);
203 	}
204 
205 	family = cpuid_getfamily(CPU);
206 
207 	for (i = 0; i < ARRAY_SIZE(smntemp_offsets); i++) {
208 		if (family != smntemp_offsets[i].sto_family)
209 			continue;
210 		if (strncmp(buf, smntemp_offsets[i].sto_brand,
211 		    strlen(smntemp_offsets[i].sto_brand)) == 0) {
212 			smn->smn_offset = smntemp_offsets[i].sto_off;
213 			break;
214 		}
215 	}
216 
217 	return (B_TRUE);
218 }
219 
220 static int
221 smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
222 {
223 	uint_t i;
224 	smntemp_t *smntemp = &smntemp_data;
225 
226 	if (cmd == DDI_RESUME) {
227 		return (DDI_SUCCESS);
228 	} else if (cmd != DDI_ATTACH) {
229 		return (DDI_FAILURE);
230 	}
231 
232 	if (smntemp->smn_dip != NULL) {
233 		dev_err(dip, CE_WARN, "!smntemp already attached");
234 		return (DDI_FAILURE);
235 	}
236 	smntemp->smn_dip = dip;
237 	ddi_set_driver_private(dip, smntemp);
238 
239 	if (!smntemp_find_offset(smntemp)) {
240 		goto err;
241 	}
242 
243 	smntemp->smn_ntemps = amdzen_c_df_count();
244 	if (smntemp->smn_ntemps == 0) {
245 		dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp");
246 		goto err;
247 	}
248 	smntemp->smn_temps = kmem_zalloc(sizeof (smntemp_temp_t) *
249 	    smntemp->smn_ntemps, KM_SLEEP);
250 	for (i = 0; i < smntemp->smn_ntemps; i++) {
251 		int ret;
252 		char buf[128];
253 
254 		smntemp->smn_temps[i].stt_smn = smntemp;
255 		smntemp->smn_temps[i].stt_dfno = i;
256 		mutex_init(&smntemp->smn_temps[i].stt_mutex, NULL, MUTEX_DRIVER,
257 		    NULL);
258 		smntemp->smn_temps[i].stt_flags |= SMNTEMP_F_MUTEX;
259 
260 		if (snprintf(buf, sizeof (buf), "procnode.%u", i) >=
261 		    sizeof (buf)) {
262 			dev_err(dip, CE_WARN, "!unexpected buffer name overrun "
263 			    "assembling temperature minor %u", i);
264 			goto err;
265 		}
266 
267 		if ((ret = ksensor_create(dip, &smntemp_temp_ops,
268 		    &smntemp->smn_temps[i], buf, DDI_NT_SENSOR_TEMP_CPU,
269 		    &smntemp->smn_temps[i].stt_ksensor)) != 0) {
270 			dev_err(dip, CE_WARN, "!failed to create sensor %s: %d",
271 			    buf, ret);
272 			goto err;
273 		}
274 	}
275 
276 	return (DDI_SUCCESS);
277 
278 err:
279 	smntemp_cleanup(smntemp);
280 	return (DDI_FAILURE);
281 }
282 
283 static int
284 smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
285 {
286 	smntemp_t *smntemp = &smntemp_data;
287 
288 	if (cmd == DDI_SUSPEND) {
289 		return (DDI_SUCCESS);
290 	} else if (cmd != DDI_DETACH) {
291 		return (DDI_FAILURE);
292 	}
293 
294 	if (smntemp->smn_dip == NULL) {
295 		dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn "
296 		    "instance %d that was never attached",
297 		    ddi_get_instance(dip));
298 		return (DDI_FAILURE);
299 	}
300 
301 	smntemp_cleanup(smntemp);
302 	return (DDI_SUCCESS);
303 }
304 
305 static struct dev_ops smntemp_dev_ops = {
306 	.devo_rev = DEVO_REV,
307 	.devo_refcnt = 0,
308 	.devo_getinfo = nodev,
309 	.devo_identify = nulldev,
310 	.devo_probe = nulldev,
311 	.devo_attach = smntemp_attach,
312 	.devo_detach = smntemp_detach,
313 	.devo_reset = nodev,
314 	.devo_quiesce = ddi_quiesce_not_needed,
315 };
316 
317 static struct modldrv smntemp_modldrv = {
318 	.drv_modops = &mod_driverops,
319 	.drv_linkinfo = "AMD SMN Temperature Driver",
320 	.drv_dev_ops = &smntemp_dev_ops
321 };
322 
323 static struct modlinkage smntemp_modlinkage = {
324 	.ml_rev = MODREV_1,
325 	.ml_linkage = { &smntemp_modldrv, NULL }
326 };
327 
328 int
329 _init(void)
330 {
331 	return (mod_install(&smntemp_modlinkage));
332 }
333 
334 int
335 _info(struct modinfo *modinfop)
336 {
337 	return (mod_info(&smntemp_modlinkage, modinfop));
338 }
339 
340 int
341 _fini(void)
342 {
343 	return (mod_remove(&smntemp_modlinkage));
344 }
345