1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2019, Joyent, Inc. 14 * Copyright 2021 Oxide Computer Company 15 */ 16 17 /* 18 * Intel CPU Thermal sensor driver 19 * 20 * These MSRs that were used were introduced with the 'Core' family processors 21 * and have since spread beyond there, even to the Atom line. Currently, 22 * temperature sensors exist on a per-core basis and optionally on a per-package 23 * basis. The temperature sensor exposes a reading that's relative to the 24 * processor's maximum junction temperature, often referred to as Tj. We 25 * currently only support models where we can determine that junction 26 * temperature programmatically. For older processors, we would need to track 27 * down the datasheet. Unfortunately, the values here are often on a per-brand 28 * string basis. As in two CPUs with the same model and stepping, but have 29 * binned differently have different temperatures. 30 * 31 * The temperature is exposed through /dev and uses a semi-standard sensor 32 * framework. We expose one minor node per CPU core and one minor node per CPU 33 * package, if that is supported. Reads are rate-limited in the driver at 100ms 34 * by default per the global variable coretemp_cache_ms. 35 */ 36 37 #include <sys/modctl.h> 38 #include <sys/conf.h> 39 #include <sys/devops.h> 40 #include <sys/types.h> 41 #include <sys/file.h> 42 #include <sys/open.h> 43 #include <sys/stat.h> 44 #include <sys/cred.h> 45 #include <sys/ddi.h> 46 #include <sys/sunddi.h> 47 #include <sys/list.h> 48 #include <sys/stddef.h> 49 #include <sys/cmn_err.h> 50 #include <sys/x86_archext.h> 51 #include <sys/cpu_module.h> 52 #include <sys/ontrap.h> 53 #include <sys/cpuvar.h> 54 #include <sys/x_call.h> 55 #include <sys/sensors.h> 56 57 /* 58 * The Intel SDM says that the measurements we get are always in degrees 59 * Celsius. 60 */ 61 #define CORETEMP_GRANULARITY 1 62 63 typedef enum coretemp_sensor_type { 64 CORETEMP_S_CORE, 65 CORETEMP_S_SOCKET 66 } coretemp_sensor_type_t; 67 68 typedef struct coretemp_sensor { 69 list_node_t cs_link; 70 struct coretemp *cs_coretemp; 71 char cs_name[128]; 72 id_t cs_sensor; 73 coretemp_sensor_type_t cs_type; 74 enum cmi_hdl_class cs_class; 75 uint_t cs_chip; 76 uint_t cs_core; 77 uint_t cs_strand; 78 uint_t cs_tjmax; 79 uint_t cs_status_msr; 80 uint_t cs_intr_msr; 81 hrtime_t cs_last_read; 82 uint64_t cs_status; 83 uint64_t cs_intr; 84 /* The following fields are derived from above */ 85 uint_t cs_temperature; 86 uint_t cs_resolution; 87 } coretemp_sensor_t; 88 89 typedef struct coretemp { 90 dev_info_t *coretemp_dip; 91 cpuset_t *coretemp_cpuset; 92 boolean_t coretemp_pkg; 93 kmutex_t coretemp_mutex; 94 list_t coretemp_sensors; 95 } coretemp_t; 96 97 coretemp_t *coretemp; 98 99 /* 100 * This indicates a number of milliseconds that we should wait between reads. 101 * This is somewhat arbitrary, but the goal is to reduce cross call activity 102 * and reflect that the sensor may not update all the time. 103 */ 104 uint_t coretemp_cache_ms = 100; 105 106 static int 107 coretemp_rdmsr_xc(xc_arg_t arg1, xc_arg_t arg2, xc_arg_t arg3) 108 { 109 uint_t msr = (uint_t)arg1; 110 uint64_t *valp = (uint64_t *)arg2; 111 cmi_errno_t *errp = (cmi_errno_t *)arg3; 112 113 on_trap_data_t otd; 114 115 if (on_trap(&otd, OT_DATA_ACCESS) == 0) { 116 if (checked_rdmsr(msr, valp) == 0) { 117 *errp = CMI_SUCCESS; 118 } else { 119 *errp = CMIERR_NOTSUP; 120 } 121 } else { 122 *errp = CMIERR_MSRGPF; 123 } 124 no_trap(); 125 126 return (0); 127 } 128 129 /* 130 * This really should just be a call to the CMI handle to provide us the MSR. 131 * However, that routine, cmi_hdl_rdmsr(), cannot be safely used until it is 132 * fixed for use outside of a panic-like context. 133 */ 134 static int 135 coretemp_rdmsr(coretemp_t *ct, cmi_hdl_t hdl, uint_t msr, uint64_t *valp) 136 { 137 id_t cpu = cmi_hdl_logical_id(hdl); 138 int ret = CMI_SUCCESS; 139 140 ASSERT(MUTEX_HELD(&ct->coretemp_mutex)); 141 kpreempt_disable(); 142 if (CPU->cpu_id == cpu) { 143 (void) coretemp_rdmsr_xc((xc_arg_t)msr, (xc_arg_t)valp, 144 (xc_arg_t)&ret); 145 } else { 146 cpuset_only(ct->coretemp_cpuset, (uint_t)cpu); 147 xc_call((xc_arg_t)msr, (xc_arg_t)valp, (xc_arg_t)&ret, 148 (ulong_t *)ct->coretemp_cpuset, coretemp_rdmsr_xc); 149 } 150 kpreempt_enable(); 151 152 return (ret); 153 } 154 155 static int 156 coretemp_cmi_errno(cmi_errno_t e) 157 { 158 switch (e) { 159 case CMIERR_NOTSUP: 160 return (ENOTSUP); 161 default: 162 return (EIO); 163 } 164 } 165 166 /* 167 * Answer the question of whether or not the driver can support the CPU in 168 * question. Right now we have the following constraints for supporting the CPU: 169 * 170 * o The CPU is made by Intel 171 * o The CPU has the Digital Thermal Sensor 172 * o The CPU family is 6, which is usually implicit from the above 173 * o We can determine its junction temperature through an MSR 174 * 175 * If we can't determine the junction temperature programmatically, then we need 176 * to set up tables of CPUs to do so. This can be fleshed out and improved. 177 */ 178 static boolean_t 179 coretemp_supported(void) 180 { 181 uint_t model; 182 183 if (cpuid_getvendor(CPU) != X86_VENDOR_Intel) { 184 return (B_FALSE); 185 } 186 187 if (!is_x86_feature(x86_featureset, X86FSET_CORE_THERMAL)) { 188 return (B_FALSE); 189 } 190 191 if (cpuid_getfamily(CPU) != 6) { 192 return (B_FALSE); 193 } 194 195 model = cpuid_getmodel(CPU); 196 if (model <= INTC_MODEL_PENRYN || model == INTC_MODEL_SILVERTHORNE || 197 model == INTC_MODEL_LINCROFT || model == INTC_MODEL_PENWELL || 198 model == INTC_MODEL_CLOVERVIEW || model == INTC_MODEL_CEDARVIEW) { 199 return (B_FALSE); 200 } 201 202 return (B_TRUE); 203 } 204 205 /* 206 * We need to determine the value of Tj Max as all temperature sensors are 207 * derived from this value. The ease of this depends on how old the processor in 208 * question is. The Core family processors after Penryn have support for an MSR 209 * that tells us what to go for. In the Atom family, processors starting with 210 * Silvermont have support for an MSR that documents this value. For older 211 * processors, one needs to track down the datasheet for a specific processor. 212 * Two processors in the same family/model may have different values of Tj Max. 213 * At the moment, we only support this on processors that have that MSR. 214 */ 215 static int 216 coretemp_calculate_tjmax(coretemp_t *ct, cmi_hdl_t hdl, uint_t *tjmax) 217 { 218 cmi_errno_t e; 219 uint64_t val = 0; 220 221 e = coretemp_rdmsr(ct, hdl, MSR_TEMPERATURE_TARGET, &val); 222 if (e != CMI_SUCCESS) { 223 return (coretemp_cmi_errno(e)); 224 } else if (val == 0) { 225 return (EINVAL); 226 } 227 228 *tjmax = MSR_TEMPERATURE_TARGET_TARGET(val); 229 return (0); 230 } 231 232 static int 233 coretemp_update(coretemp_t *ct, coretemp_sensor_t *sensor, cmi_hdl_t hdl) 234 { 235 cmi_errno_t e; 236 int err = 0; 237 uint64_t intr, status; 238 239 if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_status_msr, &status)) != 240 CMI_SUCCESS) { 241 err = coretemp_cmi_errno(e); 242 dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal " 243 "status on %s: %d", sensor->cs_name, err); 244 return (err); 245 } 246 247 if ((e = coretemp_rdmsr(ct, hdl, sensor->cs_intr_msr, &intr)) != 248 CMI_SUCCESS) { 249 err = coretemp_cmi_errno(e); 250 dev_err(ct->coretemp_dip, CE_WARN, "!failed to get thermal " 251 "interrupt on %s: %d", sensor->cs_name, err); 252 return (err); 253 } 254 255 sensor->cs_status = status; 256 sensor->cs_intr = intr; 257 sensor->cs_last_read = gethrtime(); 258 return (0); 259 } 260 261 static int 262 coretemp_read(void *arg, sensor_ioctl_scalar_t *scalar) 263 { 264 coretemp_sensor_t *sensor = arg; 265 coretemp_t *ct = sensor->cs_coretemp; 266 hrtime_t diff; 267 uint_t reading, resolution; 268 269 mutex_enter(&ct->coretemp_mutex); 270 diff = NSEC2MSEC(gethrtime() - sensor->cs_last_read); 271 if (diff > 0 && diff > (hrtime_t)coretemp_cache_ms) { 272 int ret; 273 cmi_hdl_t hdl; 274 275 if ((hdl = cmi_hdl_lookup(sensor->cs_class, sensor->cs_chip, 276 sensor->cs_core, sensor->cs_strand)) == NULL) { 277 mutex_exit(&ct->coretemp_mutex); 278 return (ENXIO); 279 } 280 ret = coretemp_update(ct, sensor, hdl); 281 cmi_hdl_rele(hdl); 282 if (ret != 0) { 283 mutex_exit(&ct->coretemp_mutex); 284 return (ret); 285 } 286 } 287 288 switch (sensor->cs_type) { 289 case CORETEMP_S_CORE: 290 if ((sensor->cs_status & IA32_THERM_STATUS_READ_VALID) == 0) { 291 mutex_exit(&ct->coretemp_mutex); 292 return (EIO); 293 } 294 reading = IA32_THERM_STATUS_READING(sensor->cs_status); 295 resolution = IA32_THERM_STATUS_RESOLUTION(sensor->cs_status); 296 break; 297 case CORETEMP_S_SOCKET: 298 reading = IA32_PKG_THERM_STATUS_READING(sensor->cs_status); 299 resolution = 0; 300 break; 301 default: 302 mutex_exit(&ct->coretemp_mutex); 303 return (ENXIO); 304 } 305 if (reading >= sensor->cs_tjmax) { 306 dev_err(ct->coretemp_dip, CE_WARN, "!found invalid temperature " 307 "on sensor %s: readout: %u, tjmax: %u, raw: 0x%" 308 PRIx64, sensor->cs_name, reading, sensor->cs_tjmax, 309 sensor->cs_status); 310 mutex_exit(&ct->coretemp_mutex); 311 return (EIO); 312 } 313 sensor->cs_temperature = sensor->cs_tjmax - reading; 314 sensor->cs_resolution = resolution; 315 316 scalar->sis_unit = SENSOR_UNIT_CELSIUS; 317 scalar->sis_value = sensor->cs_temperature; 318 scalar->sis_gran = CORETEMP_GRANULARITY; 319 scalar->sis_prec = sensor->cs_resolution; 320 mutex_exit(&ct->coretemp_mutex); 321 322 return (0); 323 } 324 325 static const ksensor_ops_t coretemp_temp_ops = { 326 .kso_kind = ksensor_kind_temperature, 327 .kso_scalar = coretemp_read 328 }; 329 330 static void 331 coretemp_destroy(coretemp_t *ct) 332 { 333 coretemp_sensor_t *sensor; 334 335 (void) ksensor_remove(ct->coretemp_dip, KSENSOR_ALL_IDS); 336 while ((sensor = list_remove_head(&ct->coretemp_sensors)) != NULL) { 337 kmem_free(sensor, sizeof (coretemp_sensor_t)); 338 } 339 list_destroy(&ct->coretemp_sensors); 340 341 if (ct->coretemp_cpuset != NULL) { 342 cpuset_free(ct->coretemp_cpuset); 343 } 344 345 mutex_destroy(&ct->coretemp_mutex); 346 kmem_free(ct, sizeof (coretemp_t)); 347 } 348 349 static boolean_t 350 coretemp_create_sensor(coretemp_t *ct, cmi_hdl_t hdl, uint_t tjmax, 351 coretemp_sensor_type_t type) 352 { 353 int err; 354 coretemp_sensor_t *sensor; 355 356 sensor = kmem_zalloc(sizeof (coretemp_sensor_t), KM_SLEEP); 357 sensor->cs_coretemp = ct; 358 sensor->cs_type = type; 359 sensor->cs_class = cmi_hdl_class(hdl); 360 sensor->cs_chip = cmi_hdl_chipid(hdl); 361 sensor->cs_core = cmi_hdl_coreid(hdl); 362 sensor->cs_strand = 0; 363 sensor->cs_tjmax = tjmax; 364 365 switch (sensor->cs_type) { 366 case CORETEMP_S_CORE: 367 if (snprintf(sensor->cs_name, sizeof (sensor->cs_name), 368 "chip%u.core%u", sensor->cs_chip, sensor->cs_core) >= 369 sizeof (sensor->cs_name)) { 370 goto err; 371 } 372 sensor->cs_status_msr = MSR_IA32_THERM_STATUS; 373 sensor->cs_intr_msr = MSR_IA32_THERM_INTERRUPT; 374 break; 375 case CORETEMP_S_SOCKET: 376 if (snprintf(sensor->cs_name, sizeof (sensor->cs_name), 377 "chip%u", sensor->cs_chip) >= sizeof (sensor->cs_name)) { 378 goto err; 379 } 380 sensor->cs_status_msr = MSR_IA32_PACKAGE_THERM_STATUS; 381 sensor->cs_intr_msr = MSR_IA32_PACKAGE_THERM_INTERRUPT; 382 break; 383 } 384 385 if ((err = ksensor_create(ct->coretemp_dip, &coretemp_temp_ops, sensor, 386 sensor->cs_name, DDI_NT_SENSOR_TEMP_CPU, &sensor->cs_sensor)) != 387 0) { 388 dev_err(ct->coretemp_dip, CE_WARN, "failed to create ksensor " 389 "for %s: %d", sensor->cs_name, err); 390 } 391 392 ASSERT(MUTEX_HELD(&ct->coretemp_mutex)); 393 list_insert_tail(&ct->coretemp_sensors, sensor); 394 395 return (B_TRUE); 396 err: 397 kmem_free(sensor, sizeof (coretemp_sensor_t)); 398 return (B_FALSE); 399 } 400 401 static int 402 coretemp_walk(cmi_hdl_t hdl, void *arg1, void *arg2, void *arg3) 403 { 404 coretemp_t *ct = arg1; 405 boolean_t *walkerr = arg2; 406 uint_t tjmax; 407 int err; 408 409 /* 410 * The temperature sensor only exists on a per-core basis. Therefore we 411 * ignore any non-zero strand. 412 */ 413 if (cmi_hdl_strandid(hdl) != 0) { 414 return (CMI_HDL_WALK_NEXT); 415 } 416 417 if ((err = coretemp_calculate_tjmax(ct, hdl, &tjmax)) != 0) { 418 dev_err(ct->coretemp_dip, CE_WARN, 419 "failed to read Tj Max on %u/%u: %d", cmi_hdl_chipid(hdl), 420 cmi_hdl_coreid(hdl), err); 421 *walkerr = B_TRUE; 422 return (CMI_HDL_WALK_DONE); 423 } 424 425 if (!coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_CORE)) { 426 *walkerr = B_TRUE; 427 return (CMI_HDL_WALK_DONE); 428 } 429 430 if (ct->coretemp_pkg && cmi_hdl_coreid(hdl) == 0 && 431 !coretemp_create_sensor(ct, hdl, tjmax, CORETEMP_S_SOCKET)) { 432 *walkerr = B_TRUE; 433 return (CMI_HDL_WALK_DONE); 434 } 435 436 return (CMI_HDL_WALK_NEXT); 437 } 438 439 static int 440 coretemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 441 { 442 boolean_t walkerr; 443 coretemp_t *ct = NULL; 444 445 if (cmd == DDI_RESUME) { 446 return (DDI_SUCCESS); 447 } else if (cmd != DDI_ATTACH) { 448 return (DDI_FAILURE); 449 } 450 451 if (coretemp != NULL) { 452 return (DDI_FAILURE); 453 } 454 455 ct = kmem_zalloc(sizeof (coretemp_t), KM_SLEEP); 456 ct->coretemp_dip = dip; 457 ct->coretemp_pkg = is_x86_feature(x86_featureset, X86FSET_PKG_THERMAL); 458 list_create(&ct->coretemp_sensors, sizeof (coretemp_sensor_t), 459 offsetof(coretemp_sensor_t, cs_link)); 460 mutex_init(&ct->coretemp_mutex, NULL, MUTEX_DRIVER, NULL); 461 ct->coretemp_cpuset = cpuset_alloc(KM_SLEEP); 462 463 mutex_enter(&ct->coretemp_mutex); 464 walkerr = B_FALSE; 465 cmi_hdl_walk(coretemp_walk, ct, &walkerr, NULL); 466 467 if (walkerr) { 468 mutex_exit(&ct->coretemp_mutex); 469 goto fail; 470 } 471 472 coretemp = ct; 473 mutex_exit(&ct->coretemp_mutex); 474 return (DDI_SUCCESS); 475 fail: 476 coretemp = NULL; 477 coretemp_destroy(ct); 478 return (DDI_FAILURE); 479 480 } 481 482 static int 483 coretemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 484 { 485 if (cmd == DDI_SUSPEND) { 486 return (DDI_SUCCESS); 487 } else if (cmd != DDI_DETACH) { 488 return (DDI_FAILURE); 489 } 490 491 if (coretemp == NULL) { 492 return (DDI_FAILURE); 493 } 494 495 coretemp_destroy(coretemp); 496 coretemp = NULL; 497 498 return (DDI_SUCCESS); 499 } 500 501 static struct dev_ops coretemp_dev_ops = { 502 .devo_rev = DEVO_REV, 503 .devo_refcnt = 0, 504 .devo_getinfo = nodev, 505 .devo_identify = nulldev, 506 .devo_probe = nulldev, 507 .devo_attach = coretemp_attach, 508 .devo_detach = coretemp_detach, 509 .devo_reset = nodev, 510 .devo_quiesce = ddi_quiesce_not_needed 511 }; 512 513 static struct modldrv coretemp_modldrv = { 514 .drv_modops = &mod_driverops, 515 .drv_linkinfo = "Intel CPU/Package thermal sensor", 516 .drv_dev_ops = &coretemp_dev_ops 517 }; 518 519 static struct modlinkage coretemp_modlinkage = { 520 .ml_rev = MODREV_1, 521 .ml_linkage = { &coretemp_modldrv, NULL } 522 }; 523 524 int 525 _init(void) 526 { 527 if (!coretemp_supported()) { 528 return (ENOTSUP); 529 } 530 531 return (mod_install(&coretemp_modlinkage)); 532 } 533 534 int 535 _info(struct modinfo *modinfop) 536 { 537 return (mod_info(&coretemp_modlinkage, modinfop)); 538 } 539 540 int 541 _fini(void) 542 { 543 return (mod_remove(&coretemp_modlinkage)); 544 } 545