1 /*
2 * This file is part of John the Ripper password cracker.
3 *
4 * Functions common to OpenCL and other accelerators (eg. FPGA) go in this file.
5 *
6 * This software is
7 * Copyright (c) 2010-2012 Samuele Giovanni Tonon <samu at linuxasylum dot net>
8 * Copyright (c) 2010-2013 Lukas Odzioba <ukasz@openwall.net>
9 * Copyright (c) 2010-2013 magnum
10 * Copyright (c) 2012-2015 Claudio André <claudioandre.br at gmail.com>
11 * and is hereby released to the general public under the following terms:
12 * Redistribution and use in source and binary forms, with or without
13 * modifications, are permitted.
14 */
15
16 #if defined (HAVE_OPENCL)
17
18 #ifdef AC_BUILT
19 #include "autoconfig.h"
20 #endif
21
22 #include <stdio.h>
23 #include <stdlib.h>
24
25 #if HAVE_LIBDL
26 #include <dlfcn.h>
27 #elif HAVE_WINDOWS_H
28 // For mingw/VC
29 #include "Win32-dlfcn-port.h"
30 #define HAVE_LIBDL 1
31 #endif
32
33 #include <string.h>
34
35 #include "gpu_common.h"
36 #include "john.h"
37 #include "memory.h"
38 #include "params.h"
39 #include "logger.h"
40 #include "signals.h"
41 #ifndef BENCH_BUILD
42 #include "options.h"
43 #endif
44
45 int gpu_id;
46 int engaged_devices[MAX_GPU_DEVICES + 1];
47 int requested_devices[MAX_GPU_DEVICES + 1];
48 hw_bus gpu_device_bus[MAX_GPU_DEVICES];
49
50 int gpu_temp_limit, cool_gpu_down;
51 char gpu_degree_sign[8] = "";
52
/* dlopen() handle of the NVML library; stays NULL until nvidia_probe() loads it. */
void *nvml_lib;

#if __linux__ && HAVE_LIBDL
/*
 * NVML entry points. nvidia_probe() resolves each one with dlsym(); a pointer
 * stays NULL when the loaded library does not export the symbol.
 */

/* Library life cycle */
NVMLINIT nvmlInit;
NVMLSHUTDOWN nvmlShutdown;

/* Device lookup and identification */
NVMLDEVICEGETHANDLEBYINDEX nvmlDeviceGetHandleByIndex;
NVMLDEVICEGETHANDLEBYPCIBUSID nvmlDeviceGetHandleByPciBusId;
NVMLDEVICEGETINDEX nvmlDeviceGetIndex;
NVMLDEVICEGETPCIINFO nvmlDeviceGetPciInfo;
NVMLDEVICEGETNAME nvmlDeviceGetName;

/* Sensor and link queries */
NVMLDEVICEGETTEMPERATURE nvmlDeviceGetTemperature;
NVMLDEVICEGETFANSPEED nvmlDeviceGetFanSpeed;
NVMLDEVICEGETUTILIZATIONRATES nvmlDeviceGetUtilizationRates;
NVMLDEVICEGETCURRPCIELINKWIDTH nvmlDeviceGetCurrPcieLinkWidth;
NVMLDEVICEGETMAXPCIELINKWIDTH nvmlDeviceGetMaxPcieLinkWidth;
#endif /* __linux__ && HAVE_LIBDL */
68
69 void *adl_lib;
70
71 #if HAVE_LIBDL
72 static int amd = 0;
73 int amd2adl[MAX_GPU_DEVICES];
74 int adl2od[MAX_GPU_DEVICES];
75
76 ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
77 ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
78 ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
79 ADL_ADAPTER_ADAPTERINFO_GET ADL_Adapter_AdapterInfo_Get;
80 ADL_ADAPTER_ACTIVE_GET ADL_Adapter_Active_Get;
81 ADL_OVERDRIVE_CAPS ADL_Overdrive_Caps;
82
83 ADL_OVERDRIVE5_THERMALDEVICES_ENUM ADL_Overdrive5_ThermalDevices_Enum;
84 ADL_OVERDRIVE5_ODPARAMETERS_GET ADL_Overdrive5_ODParameters_Get;
85 ADL_OVERDRIVE5_TEMPERATURE_GET ADL_Overdrive5_Temperature_Get;
86 ADL_OVERDRIVE5_FANSPEED_GET ADL_Overdrive5_FanSpeed_Get;
87 ADL_OVERDRIVE5_FANSPEEDINFO_GET ADL_Overdrive5_FanSpeedInfo_Get;
88 ADL_OVERDRIVE5_CURRENTACTIVITY_GET ADL_Overdrive5_CurrentActivity_Get;
89
90 ADL_OVERDRIVE6_FANSPEED_GET ADL_Overdrive6_FanSpeed_Get;
91 ADL_OVERDRIVE6_THERMALCONTROLLER_CAPS ADL_Overdrive6_ThermalController_Caps;
92 ADL_OVERDRIVE6_TEMPERATURE_GET ADL_Overdrive6_Temperature_Get;
93 ADL_OVERDRIVE6_CURRENTSTATUS_GET ADL_Overdrive6_CurrentStatus_Get;
94 ADL_OVERDRIVE6_CAPABILITIES_GET ADL_Overdrive6_Capabilities_Get;
95
/*
 * Allocation callback handed to ADL_Main_Control_Create().
 * Returns a malloc()ed buffer of iSize bytes, or NULL if malloc() fails.
 */
static void *ADL_Main_Memory_Alloc(int iSize)
{
	return malloc(iSize);
}
102
103 #endif /* HAVE_LIBDL */
104
advance_cursor()105 void advance_cursor()
106 {
107 static int pos = 0;
108 char cursor[4] = { '/', '-', '\\', '|' };
109
110 if (john_main_process) {
111 fprintf(stderr, "%c\b", cursor[pos]);
112 pos = (pos + 1) % 4;
113 }
114 }
115
116 /* Function pointer to read temperature for device n */
117 void (*dev_get_temp[MAX_GPU_DEVICES]) (int id, int *temp, int *fanspeed,
118 int *util, int *cl, int *ml);
119
120 /* Map OpenCL device number to ADL/NVML device number */
121 unsigned int temp_dev_id[MAX_GPU_DEVICES];
122
/*
 * Load the NVIDIA Management Library (NVML) at run time and initialize it.
 *
 * Does nothing when the library is already loaded (nvml_lib is non-NULL),
 * when libnvidia-ml.so cannot be opened, or in builds where
 * "__linux__ && HAVE_LIBDL" is false.
 *
 * On success nvml_lib holds the dlopen() handle and every nvml* function
 * pointer holds the result of dlsym(); a symbol the installed library does
 * not export is left NULL, so users of these pointers must check them.
 */
void nvidia_probe(void)
{
#if __linux__ && HAVE_LIBDL
	if (nvml_lib)
		return;

	if (!(nvml_lib = dlopen("libnvidia-ml.so", RTLD_LAZY|RTLD_GLOBAL)))
		return;

	nvmlInit = (NVMLINIT) dlsym(nvml_lib, "nvmlInit");
	nvmlShutdown = (NVMLSHUTDOWN) dlsym(nvml_lib, "nvmlShutdown");
	nvmlDeviceGetHandleByIndex = (NVMLDEVICEGETHANDLEBYINDEX) dlsym(nvml_lib, "nvmlDeviceGetHandleByIndex");
	nvmlDeviceGetTemperature = (NVMLDEVICEGETTEMPERATURE) dlsym(nvml_lib, "nvmlDeviceGetTemperature");
	nvmlDeviceGetFanSpeed = (NVMLDEVICEGETFANSPEED) dlsym(nvml_lib, "nvmlDeviceGetFanSpeed");
	nvmlDeviceGetUtilizationRates = (NVMLDEVICEGETUTILIZATIONRATES) dlsym(nvml_lib, "nvmlDeviceGetUtilizationRates");
	nvmlDeviceGetPciInfo = (NVMLDEVICEGETPCIINFO) dlsym(nvml_lib, "nvmlDeviceGetPciInfo");
	nvmlDeviceGetName = (NVMLDEVICEGETNAME) dlsym(nvml_lib, "nvmlDeviceGetName");
	nvmlDeviceGetHandleByPciBusId = (NVMLDEVICEGETHANDLEBYPCIBUSID) dlsym(nvml_lib, "nvmlDeviceGetHandleByPciBusId");
	nvmlDeviceGetIndex = (NVMLDEVICEGETINDEX) dlsym(nvml_lib, "nvmlDeviceGetIndex");
	nvmlDeviceGetCurrPcieLinkWidth = (NVMLDEVICEGETCURRPCIELINKWIDTH) dlsym(nvml_lib, "nvmlDeviceGetCurrPcieLinkWidth");
	nvmlDeviceGetMaxPcieLinkWidth = (NVMLDEVICEGETMAXPCIELINKWIDTH) dlsym(nvml_lib, "nvmlDeviceGetMaxPcieLinkWidth");

	/*
	 * dlsym() returns NULL for a missing symbol; never call through it.
	 * The return code of nvmlInit() is not examined here: each later
	 * query reports its own failure.
	 */
	if (nvmlInit)
		(void) nvmlInit();
#endif
}
148
/*
 * Load the AMD Display Library (ADL) at run time and record every active
 * adapter.
 *
 * Does nothing when the library is already loaded (adl_lib is non-NULL),
 * when it cannot be opened, or in builds without HAVE_LIBDL.
 *
 * Before ADL is initialized, DISPLAY is set from the COMPUTE environment
 * variable if that is non-empty, else to ":0" if DISPLAY itself is unset or
 * empty.
 *
 * For each active adapter the function stores:
 *   amd2adl[n]        - its ADL adapter index (n counts active adapters),
 *   gpu_device_bus[n] - its PCI bus/device/function and "bb:dd.f" id string,
 *   adl2od[adl index] - the Overdrive version to use (5 or 6, else 0).
 * "amd" ends up as the number of adapters recorded.
 *
 * Probing stops at the first adapter for which ADL_Overdrive_Caps() fails or
 * reports that Overdrive is not supported; that adapter is still counted.
 * Adapters that would not fit the MAX_GPU_DEVICES-sized tables are ignored
 * rather than written out of bounds.
 */
void amd_probe(void)
{
#if HAVE_LIBDL
	LPAdapterInfo lpAdapterInfo = NULL;
	int i;
	int iNumberAdapters = 0;
	int iOverdriveSupported = 0;
	int iOverdriveEnabled = 0;
	int iOverdriveVersion = 0;
	char *env;

	if (adl_lib)
		return;

#if HAVE_WINDOWS_H
	if (!(adl_lib = dlopen("atiadlxx.dll", RTLD_LAZY|RTLD_GLOBAL)) &&
	    !(adl_lib = dlopen("atiadlxy.dll", RTLD_LAZY|RTLD_GLOBAL)))
		return;
#else
	if (!(adl_lib = dlopen("libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL)))
		return;
#endif

	env = getenv("COMPUTE");
	if (env && *env)
		setenv("DISPLAY", env, 1);
	else {
		env = getenv("DISPLAY");
		if (!env || !*env)
			setenv("DISPLAY", ":0", 1);
	}

	ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) dlsym(adl_lib, "ADL_Main_Control_Create");
	ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) dlsym(adl_lib, "ADL_Main_Control_Destroy");
	ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) dlsym(adl_lib, "ADL_Adapter_NumberOfAdapters_Get");
	ADL_Adapter_AdapterInfo_Get = (ADL_ADAPTER_ADAPTERINFO_GET) dlsym(adl_lib, "ADL_Adapter_AdapterInfo_Get");
	ADL_Adapter_Active_Get = (ADL_ADAPTER_ACTIVE_GET) dlsym(adl_lib, "ADL_Adapter_Active_Get");
	ADL_Overdrive_Caps = (ADL_OVERDRIVE_CAPS) dlsym(adl_lib, "ADL_Overdrive_Caps");

	ADL_Overdrive5_ThermalDevices_Enum = (ADL_OVERDRIVE5_THERMALDEVICES_ENUM) dlsym(adl_lib, "ADL_Overdrive5_ThermalDevices_Enum");
	ADL_Overdrive5_Temperature_Get = (ADL_OVERDRIVE5_TEMPERATURE_GET) dlsym(adl_lib, "ADL_Overdrive5_Temperature_Get");
	ADL_Overdrive5_FanSpeed_Get = (ADL_OVERDRIVE5_FANSPEED_GET) dlsym(adl_lib, "ADL_Overdrive5_FanSpeed_Get");
	ADL_Overdrive5_FanSpeedInfo_Get = (ADL_OVERDRIVE5_FANSPEEDINFO_GET) dlsym(adl_lib, "ADL_Overdrive5_FanSpeedInfo_Get");
	ADL_Overdrive5_ODParameters_Get = (ADL_OVERDRIVE5_ODPARAMETERS_GET) dlsym(adl_lib, "ADL_Overdrive5_ODParameters_Get");
	ADL_Overdrive5_CurrentActivity_Get = (ADL_OVERDRIVE5_CURRENTACTIVITY_GET) dlsym(adl_lib, "ADL_Overdrive5_CurrentActivity_Get");

	ADL_Overdrive6_FanSpeed_Get = (ADL_OVERDRIVE6_FANSPEED_GET) dlsym(adl_lib, "ADL_Overdrive6_FanSpeed_Get");
	ADL_Overdrive6_ThermalController_Caps = (ADL_OVERDRIVE6_THERMALCONTROLLER_CAPS) dlsym(adl_lib, "ADL_Overdrive6_ThermalController_Caps");
	ADL_Overdrive6_Temperature_Get = (ADL_OVERDRIVE6_TEMPERATURE_GET) dlsym(adl_lib, "ADL_Overdrive6_Temperature_Get");
	ADL_Overdrive6_CurrentStatus_Get = (ADL_OVERDRIVE6_CURRENTSTATUS_GET) dlsym(adl_lib, "ADL_Overdrive6_CurrentStatus_Get");
	ADL_Overdrive6_Capabilities_Get = (ADL_OVERDRIVE6_CAPABILITIES_GET) dlsym(adl_lib, "ADL_Overdrive6_Capabilities_Get");

	/* dlsym() yields NULL for a missing symbol; everything called below must exist. */
	if (!ADL_Main_Control_Create || !ADL_Main_Control_Destroy ||
	    !ADL_Adapter_NumberOfAdapters_Get || !ADL_Adapter_AdapterInfo_Get ||
	    !ADL_Adapter_Active_Get || !ADL_Overdrive_Caps)
		return;

	if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
		return;

	/* From here on the ADL context exists: leave through "done" only. */

	// Obtain the number of adapters for the system
	if (ADL_Adapter_NumberOfAdapters_Get(&iNumberAdapters) != ADL_OK)
		goto done;

	if (iNumberAdapters > 0) {
		lpAdapterInfo = (LPAdapterInfo)mem_alloc(sizeof(AdapterInfo) * iNumberAdapters);
		memset(lpAdapterInfo, '\0', sizeof(AdapterInfo) * iNumberAdapters);

		ADL_Adapter_AdapterInfo_Get(lpAdapterInfo, sizeof(AdapterInfo) * iNumberAdapters);
	}

	for (i = 0; i < iNumberAdapters; i++) {
		int adapterActive = 0;
		int adl_id = lpAdapterInfo[i].iAdapterIndex;

		ADL_Adapter_Active_Get(adl_id, &adapterActive);
		if (!adapterActive)
			continue;

		/* amd2adl[] and gpu_device_bus[] are full: nothing more can be recorded. */
		if (amd >= MAX_GPU_DEVICES)
			break;

		/* adl2od[] is indexed by the ADL adapter index; skip what it cannot hold. */
		if (adl_id < 0 || adl_id >= MAX_GPU_DEVICES)
			continue;

		amd2adl[amd] = adl_id;
		adl2od[adl_id] = 0;
		gpu_device_bus[amd].bus = lpAdapterInfo[i].iBusNumber;
		gpu_device_bus[amd].device = lpAdapterInfo[i].iDeviceNumber;
		gpu_device_bus[amd].function = lpAdapterInfo[i].iFunctionNumber;

#if OCL_DEBUG
		printf("amd %d adl %d hardware id %02x:%02x.%x\n", amd, adl_id, gpu_device_bus[amd].bus, gpu_device_bus[amd].device, gpu_device_bus[amd].function);
#endif
		memset(gpu_device_bus[amd].busId, '\0', sizeof(gpu_device_bus[amd].busId));
		snprintf(gpu_device_bus[amd].busId, sizeof(gpu_device_bus[amd].busId),
		         "%02x:%02x.%x", gpu_device_bus[amd].bus,
		         gpu_device_bus[amd].device, gpu_device_bus[amd].function);

		amd++;

		if (ADL_Overdrive_Caps(adl_id, &iOverdriveSupported, &iOverdriveEnabled, &iOverdriveVersion) != ADL_OK)
			goto done;

		if (!iOverdriveSupported)
			goto done;

		if (iOverdriveVersion == 5 || iOverdriveVersion == 6)
			adl2od[adl_id] = iOverdriveVersion;
		else
			adl2od[adl_id] = 0;
	}

done:
	MEM_FREE(lpAdapterInfo);
	ADL_Main_Control_Destroy();
#endif
}
262
/*
 * Read the sensors of one NVIDIA device through NVML.
 *
 * nvml_id   NVML device index.
 * temp      out: GPU temperature as reported by NVML.
 * fanspeed  out: fan speed as reported by NVML.
 * util      out: GPU utilization rate.
 * cl, ml    out: current and maximum PCIe link width. If the current width
 *           cannot be read it is taken from the maximum, and the maximum is
 *           never reported smaller than the current width.
 *
 * Every output is set to -1 when its value is not available: the query
 * failed, the NVML entry point was not resolved by nvidia_probe(), or the
 * build has no NVML support ("__linux__ && HAVE_LIBDL" is false). The
 * outputs are therefore always defined on return.
 */
void nvidia_get_temp(int nvml_id, int *temp, int *fanspeed, int *util,
                     int *cl, int *ml)
{
#if __linux__ && HAVE_LIBDL
	nvmlUtilization_t s_util;
	nvmlDevice_t dev;
	unsigned int value;
#endif

	*temp = *fanspeed = *util = *cl = *ml = -1;

#if __linux__ && HAVE_LIBDL
	if (!nvmlDeviceGetHandleByIndex ||
	    nvmlDeviceGetHandleByIndex(nvml_id, &dev) != NVML_SUCCESS)
		return;

	if (nvmlDeviceGetTemperature &&
	    nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &value) == NVML_SUCCESS)
		*temp = value;

	if (nvmlDeviceGetFanSpeed &&
	    nvmlDeviceGetFanSpeed(dev, &value) == NVML_SUCCESS)
		*fanspeed = value;

	if (nvmlDeviceGetUtilizationRates &&
	    nvmlDeviceGetUtilizationRates(dev, &s_util) == NVML_SUCCESS)
		*util = s_util.gpu;

	if (nvmlDeviceGetMaxPcieLinkWidth &&
	    nvmlDeviceGetMaxPcieLinkWidth(dev, &value) == NVML_SUCCESS)
		*ml = value;

	if (nvmlDeviceGetCurrPcieLinkWidth &&
	    nvmlDeviceGetCurrPcieLinkWidth(dev, &value) == NVML_SUCCESS)
		*cl = value;
	else
		*cl = *ml;	/* *ml is a real width or -1 here, never garbage */

	if (*ml < *cl)
		*ml = *cl;
#endif /* __linux__ && HAVE_LIBDL */
}
298
299 #if HAVE_LIBDL
get_temp_od5(int adl_id,int * temp,int * fanspeed,int * util,int * cl,int * ml)300 static void get_temp_od5(int adl_id, int *temp, int *fanspeed, int *util,
301 int *cl, int *ml)
302 {
303 int ADL_Err = ADL_ERR;
304 ADLFanSpeedInfo fanSpeedInfo = { 0 };
305 int fanSpeedReportingMethod = 0;
306 int iThermalControllerIndex;
307 ADLThermalControllerInfo termalControllerInfo = { 0 };
308 ADLODParameters overdriveParameters = { 0 };
309 ADLPMActivity activity = { 0 };
310
311 if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
312 return;
313
314 *temp = *fanspeed = *util = *cl = *ml = -1;
315
316 if (!ADL_Overdrive5_ThermalDevices_Enum ||
317 !ADL_Overdrive5_Temperature_Get ||
318 !ADL_Overdrive5_FanSpeed_Get ||
319 !ADL_Overdrive5_FanSpeedInfo_Get ||
320 !ADL_Overdrive5_ODParameters_Get ||
321 !ADL_Overdrive5_CurrentActivity_Get)
322 return;
323
324 termalControllerInfo.iSize = sizeof(ADLThermalControllerInfo);
325
326 for (iThermalControllerIndex = 0; iThermalControllerIndex < 10; iThermalControllerIndex++) {
327 ADL_Err = ADL_Overdrive5_ThermalDevices_Enum(adl_id, iThermalControllerIndex, &termalControllerInfo);
328
329 if (ADL_Err == ADL_WARNING_NO_DATA)
330 break;
331
332 if (termalControllerInfo.iThermalDomain == ADL_DL_THERMAL_DOMAIN_GPU) {
333 ADLTemperature adlTemperature = { 0 };
334 ADLFanSpeedValue fanSpeedValue = { 0 };
335
336 adlTemperature.iSize = sizeof(ADLTemperature);
337 if (ADL_Overdrive5_Temperature_Get(adl_id, iThermalControllerIndex, &adlTemperature) == ADL_OK)
338 *temp = adlTemperature.iTemperature / 1000;
339
340 fanSpeedInfo.iSize = sizeof(ADLFanSpeedInfo);
341 if (ADL_Overdrive5_FanSpeedInfo_Get(adl_id, iThermalControllerIndex, &fanSpeedInfo) == ADL_OK)
342 if ((fanSpeedReportingMethod = (fanSpeedInfo.iFlags & ADL_DL_FANCTRL_SUPPORTS_PERCENT_READ))) {
343 fanSpeedValue.iSpeedType = fanSpeedReportingMethod;
344 if (ADL_Overdrive5_FanSpeed_Get(adl_id, iThermalControllerIndex, &fanSpeedValue) == ADL_OK)
345 *fanspeed = fanSpeedValue.iFanSpeed;
346 }
347 }
348 }
349
350 overdriveParameters.iSize = sizeof(ADLODParameters);
351 if (ADL_Overdrive5_ODParameters_Get(adl_id, &overdriveParameters) == ADL_OK) {
352 activity.iSize = sizeof(ADLPMActivity);
353 if (ADL_Overdrive5_CurrentActivity_Get(adl_id, &activity) == ADL_OK)
354 if (overdriveParameters.iActivityReportingSupported) {
355 *util = activity.iActivityPercent;
356 *cl = activity.iCurrentBusLanes;
357 *ml = activity.iMaximumBusLanes;
358 }
359 }
360
361 ADL_Main_Control_Destroy();
362 return;
363 }
364
get_temp_od6(int adl_id,int * temp,int * fanspeed,int * util,int * cl,int * ml)365 static void get_temp_od6(int adl_id, int *temp, int *fanspeed, int *util,
366 int *cl, int *ml)
367 {
368 ADLOD6FanSpeedInfo fanSpeedInfo = { 0 };
369 ADLOD6ThermalControllerCaps thermalControllerCaps = { 0 };
370 ADLOD6Capabilities od6Capabilities = { 0 };
371 int temperature = 0;
372 ADLOD6CurrentStatus currentStatus = { 0 };
373
374 if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
375 return;
376
377 *temp = *fanspeed = *util = -1;
378
379 if (!ADL_Overdrive6_FanSpeed_Get ||
380 !ADL_Overdrive6_ThermalController_Caps ||
381 !ADL_Overdrive6_Temperature_Get ||
382 !ADL_Overdrive6_CurrentStatus_Get)
383 return;
384
385 if (ADL_Overdrive6_ThermalController_Caps(adl_id, &thermalControllerCaps) == ADL_OK) {
386 if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_FANSPEED_CONTROL)
387 if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_FANSPEED_PERCENT_READ)
388 if (ADL_Overdrive6_FanSpeed_Get(adl_id, &fanSpeedInfo) == ADL_OK)
389 if (fanSpeedInfo.iSpeedType & ADL_OD6_FANSPEED_TYPE_PERCENT)
390 *fanspeed = fanSpeedInfo.iFanSpeedPercent;
391
392 if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_THERMAL_CONTROLLER)
393 if (ADL_Overdrive6_Temperature_Get(adl_id, &temperature) == ADL_OK)
394 *temp = temperature / 1000;
395
396 if (ADL_Overdrive6_Capabilities_Get(adl_id, &od6Capabilities) == ADL_OK)
397 if (od6Capabilities.iCapabilities & ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR)
398 if (ADL_Overdrive6_CurrentStatus_Get(adl_id, ¤tStatus) == ADL_OK)
399 {
400 *util = currentStatus.iActivityPercent;
401 *cl = currentStatus.iCurrentBusLanes;
402 *ml = currentStatus.iMaximumBusLanes;
403 }
404 }
405
406 ADL_Main_Control_Destroy();
407 return;
408 }
409 #endif
410
/*
 * Read the sensors of one AMD adapter, using the Overdrive version that
 * amd_probe() recorded for it in adl2od[].
 *
 * amd_id    used directly as the ADL adapter index into adl2od[].
 * temp, fanspeed, util, cl, ml
 *           out: readings, as filled in by the Overdrive 5 or 6 reader.
 *
 * Every output is preset to -1 and stays -1 when amd_id is outside
 * [0, MAX_GPU_DEVICES), when no Overdrive version (5 or 6) is recorded for
 * the adapter, or in builds without HAVE_LIBDL.
 */
void amd_get_temp(int amd_id, int *temp, int *fanspeed, int *util, int *cl,
                  int *ml)
{
	*temp = *fanspeed = *util = *cl = *ml = -1;

#if HAVE_LIBDL
	/* A failed lookup yields -1; never index adl2od[] with it. */
	if (amd_id < 0 || amd_id >= MAX_GPU_DEVICES)
		return;

	if (adl2od[amd_id] == 5)
		get_temp_od5(amd_id, temp, fanspeed, util, cl, ml);
	else if (adl2od[amd_id] == 6)
		get_temp_od6(amd_id, temp, fanspeed, util, cl, ml);
#endif
}
425
/*
 * Translate a PCI bus id into an NVML device index.
 *
 * Looks the device up by busInfo.busId and asks NVML for its index.
 * Returns that index, or -1 when a needed NVML entry point is not resolved,
 * when either NVML call fails, or in builds without NVML support.
 */
int id2nvml(const hw_bus busInfo) {
#if __linux__ && HAVE_LIBDL
	nvmlDevice_t handle;
	unsigned int nvml_index;

	if (!nvmlDeviceGetHandleByPciBusId)
		return -1;

	if (nvmlDeviceGetHandleByPciBusId(busInfo.busId, &handle) != NVML_SUCCESS)
		return -1;

	if (!nvmlDeviceGetIndex)
		return -1;

	if (nvmlDeviceGetIndex(handle, &nvml_index) == NVML_SUCCESS)
		return nvml_index;
#endif
	return -1;
}
442
/*
 * Translate a PCI location into an ADL adapter index.
 *
 * Scans the adapters recorded by amd_probe() for one whose bus, device and
 * function numbers all equal those in busInfo and returns its entry from
 * amd2adl[]. Returns -1 when none matches or in builds without HAVE_LIBDL.
 */
int id2adl(const hw_bus busInfo) {
#if HAVE_LIBDL
	int slot;

	for (slot = 0; slot < amd; slot++) {
		const hw_bus *known = &gpu_device_bus[slot];

		if (known->bus != busInfo.bus)
			continue;
		if (known->device != busInfo.device)
			continue;
		if (known->function != busInfo.function)
			continue;

		return amd2adl[slot];
	}
#endif
	return -1;
}
459
/*
 * Overheat protection: check the temperature of every engaged device that
 * has a sensor reader in dev_get_temp[].
 *
 * Does nothing when gpu_temp_limit is negative or in builds without
 * HAVE_LIBDL.
 *
 * For each device:
 *  - A reading above 125 or below 10 is treated as invalid. A warning is
 *    logged and printed once per process, and the device is skipped; the
 *    remaining devices are still checked.
 *  - A reading >= gpu_temp_limit is an overheat. It is reported (unless an
 *    abort is already pending or the warning is suppressed), then:
 *      cool_gpu_down > 0: sleep that many seconds and re-check the same
 *                         device, repeating until it has cooled down;
 *      otherwise:         request a job abort by incrementing event_abort.
 *  - A reading below the limit after an overheat is reported as "waking up"
 *    when verbosity is above VERB_DEFAULT.
 */
void gpu_check_temp(void)
{
#if HAVE_LIBDL
	static int warned, warnedTemperature;
	int i, hot_gpu = 0, alerts = 0;

	if (gpu_temp_limit < 0)
		return;

	for (i = 0; i < MAX_GPU_DEVICES && engaged_devices[i] != DEV_LIST_END; i++) {
		int dev = engaged_devices[i];
		/* Preset so a reader that leaves a value untouched cannot leak garbage. */
		int fan = -1, temp = -1, util = -1, cl = -1, ml = -1;
		char s_fan[16] = "n/a";

		if (!dev_get_temp[dev])
			continue;

		dev_get_temp[dev](temp_dev_id[dev], &temp, &fan, &util, &cl, &ml);

		if (temp > 125 || temp < 10) {
			if (!warned) {
				warned = 1;
				log_event("Device %d probably invalid temp reading (%d%sC).",
				          dev + 1, temp, gpu_degree_sign);
				fprintf(stderr,
				        "Device %d probably invalid temp reading (%d%sC).\n",
				        dev + 1, temp, gpu_degree_sign);
			}
			/* One bad sensor must not disable the check for the other devices. */
			continue;
		}

		if (fan >= 0)
			snprintf(s_fan, sizeof(s_fan), "%d%%", fan);

		if (temp >= gpu_temp_limit) {

			if (!alerts++ && !event_abort && !warnedTemperature) {
				/* With a one second cool-down period, report overheating only once. */
				if (cool_gpu_down == 1)
					warnedTemperature++;

				log_event("Device %d overheat (%d%sC, fan %s), %s%s.",
				          dev + 1, temp, gpu_degree_sign, s_fan,
				          (cool_gpu_down > 0) ? "sleeping" : "aborting job",
				          (hot_gpu) ? " again" : "");
				fprintf(stderr,
				        "Device %d overheat (%d%sC, fan %s), %s%s.\n",
				        dev + 1, temp, gpu_degree_sign, s_fan,
				        (cool_gpu_down > 0) ? "sleeping" : "aborting job",
				        (hot_gpu) ? " again" : "");
			}
			hot_gpu = 1;

			/***
			 * Graceful handling of GPU overheating
			 * - sleep for a while before re-checking the temperature.
			 ***/
			if (cool_gpu_down > 0) {
				int t = cool_gpu_down;

				/* sleep() returns the unslept time when interrupted: resume unless aborting. */
				while ((t = sleep(t)) && !event_abort)
					;

				// Warn again in case things don't calm down
				if (alerts > 5)
					alerts = 0;

				/***
				 * Re-check the temperature of the same GPU.
				 * And loop indefinitely:
				 * - if the GPU doesn't cool down enough during the sleep time
				 ***/
				i--;	/* cancels the loop's i++ so this device is read again */
				continue;
			} else
				event_abort++;
		} else {

			if (hot_gpu && options.verbosity > VERB_DEFAULT &&
			    !warnedTemperature) {
				log_event("Device %d is waking up (%d%sC, fan %s).",
				          dev + 1, temp, gpu_degree_sign, s_fan);
				fprintf(stderr,
				        "Device %d is waking up (%d%sC, fan %s).\n",
				        dev + 1, temp, gpu_degree_sign, s_fan);
			}
			hot_gpu = 0;
		}
	}
#endif
}
548
/*
 * Write one log line per engaged device with its current sensor readings.
 *
 * Only devices that have a reader in dev_get_temp[] are considered. The line
 * has the form "- Device N: temp: ... util: ... fan: ..." and contains only
 * the fields that were read: temperature and fan when >= 0, utilization when
 * > 0. A device for which no field was formatted is not logged.
 * Does nothing in builds without HAVE_LIBDL.
 */
void gpu_log_temp(void)
{
#if HAVE_LIBDL
	int i;

	for (i = 0; i < MAX_GPU_DEVICES && engaged_devices[i] != DEV_LIST_END; i++) {
		char s_gpu[256] = "";
		int dev = engaged_devices[i];
		/* Preset so a reader that leaves a value untouched reads as "not available". */
		int fan = -1, temp = -1, util = -1, cl = -1, ml = -1;
		int n, prefix_len;

		if (!dev_get_temp[dev])
			continue;

		dev_get_temp[dev](temp_dev_id[dev], &temp, &fan, &util, &cl, &ml);

		n = prefix_len = snprintf(s_gpu, sizeof(s_gpu), "Device %d:", dev + 1);
		if (temp >= 0)
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " temp: %d%sC", temp, gpu_degree_sign);
		if (util > 0)
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " util: %d%%", util);
		if (fan >= 0)
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " fan: %d%%", fan);

		/* Log exactly when at least one field was appended after the prefix. */
		if (n > prefix_len)
			log_event("- %s", s_gpu);
	}
#endif
}
574
575 #endif /* defined (HAVE_OPENCL) */
576