1 /*
2  * This file is part of John the Ripper password cracker.
3  *
4  * Functions common to OpenCL and other accelerators (eg. FPGA) go in this file.
5  *
6  * This software is
7  * Copyright (c) 2010-2012 Samuele Giovanni Tonon <samu at linuxasylum dot net>
8  * Copyright (c) 2010-2013 Lukas Odzioba <ukasz@openwall.net>
9  * Copyright (c) 2010-2013 magnum
10  * Copyright (c) 2012-2015 Claudio André <claudioandre.br at gmail.com>
11  * and is hereby released to the general public under the following terms:
12  *    Redistribution and use in source and binary forms, with or without
13  *    modifications, are permitted.
14  */
15 
16 #if defined (HAVE_OPENCL)
17 
18 #ifdef AC_BUILT
19 #include "autoconfig.h"
20 #endif
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 
25 #if HAVE_LIBDL
26 #include <dlfcn.h>
27 #elif HAVE_WINDOWS_H
28 // For mingw/VC
29 #include "Win32-dlfcn-port.h"
30 #define HAVE_LIBDL 1
31 #endif
32 
33 #include <string.h>
34 
35 #include "gpu_common.h"
36 #include "john.h"
37 #include "memory.h"
38 #include "params.h"
39 #include "logger.h"
40 #include "signals.h"
41 #ifndef BENCH_BUILD
42 #include "options.h"
43 #endif
44 
45 int gpu_id;
46 int engaged_devices[MAX_GPU_DEVICES + 1];
47 int requested_devices[MAX_GPU_DEVICES + 1];
48 hw_bus gpu_device_bus[MAX_GPU_DEVICES];
49 
50 int gpu_temp_limit, cool_gpu_down;
51 char gpu_degree_sign[8] = "";
52 
/* dlopen() handle for the NVML library; set by nvidia_probe(). */
void *nvml_lib;

#if __linux__ && HAVE_LIBDL
/* NVML entry points, resolved with dlsym() in nvidia_probe(). */
NVMLINIT nvmlInit;
NVMLSHUTDOWN nvmlShutdown;
NVMLDEVICEGETHANDLEBYINDEX nvmlDeviceGetHandleByIndex;
NVMLDEVICEGETTEMPERATURE nvmlDeviceGetTemperature;
NVMLDEVICEGETFANSPEED nvmlDeviceGetFanSpeed;
NVMLDEVICEGETUTILIZATIONRATES nvmlDeviceGetUtilizationRates;
NVMLDEVICEGETPCIINFO nvmlDeviceGetPciInfo;
NVMLDEVICEGETNAME nvmlDeviceGetName;
NVMLDEVICEGETHANDLEBYPCIBUSID nvmlDeviceGetHandleByPciBusId;
NVMLDEVICEGETINDEX nvmlDeviceGetIndex;
NVMLDEVICEGETCURRPCIELINKWIDTH nvmlDeviceGetCurrPcieLinkWidth;
NVMLDEVICEGETMAXPCIELINKWIDTH nvmlDeviceGetMaxPcieLinkWidth;
#endif /* __linux__ && HAVE_LIBDL */

/* dlopen() handle for the ADL library; set by amd_probe(). */
void *adl_lib;
70 
71 #if HAVE_LIBDL
72 static int amd = 0;
73 int amd2adl[MAX_GPU_DEVICES];
74 int adl2od[MAX_GPU_DEVICES];
75 
76 ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
77 ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
78 ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
79 ADL_ADAPTER_ADAPTERINFO_GET ADL_Adapter_AdapterInfo_Get;
80 ADL_ADAPTER_ACTIVE_GET ADL_Adapter_Active_Get;
81 ADL_OVERDRIVE_CAPS ADL_Overdrive_Caps;
82 
83 ADL_OVERDRIVE5_THERMALDEVICES_ENUM ADL_Overdrive5_ThermalDevices_Enum;
84 ADL_OVERDRIVE5_ODPARAMETERS_GET ADL_Overdrive5_ODParameters_Get;
85 ADL_OVERDRIVE5_TEMPERATURE_GET ADL_Overdrive5_Temperature_Get;
86 ADL_OVERDRIVE5_FANSPEED_GET ADL_Overdrive5_FanSpeed_Get;
87 ADL_OVERDRIVE5_FANSPEEDINFO_GET ADL_Overdrive5_FanSpeedInfo_Get;
88 ADL_OVERDRIVE5_CURRENTACTIVITY_GET ADL_Overdrive5_CurrentActivity_Get;
89 
90 ADL_OVERDRIVE6_FANSPEED_GET ADL_Overdrive6_FanSpeed_Get;
91 ADL_OVERDRIVE6_THERMALCONTROLLER_CAPS ADL_Overdrive6_ThermalController_Caps;
92 ADL_OVERDRIVE6_TEMPERATURE_GET ADL_Overdrive6_Temperature_Get;
93 ADL_OVERDRIVE6_CURRENTSTATUS_GET ADL_Overdrive6_CurrentStatus_Get;
94 ADL_OVERDRIVE6_CAPABILITIES_GET ADL_Overdrive6_Capabilities_Get;
95 
/*
 * Allocation callback passed to ADL_Main_Control_Create().
 * Hands back a malloc()ed buffer of iSize bytes (NULL if malloc() fails).
 */
static void *ADL_Main_Memory_Alloc(int iSize)
{
	return malloc(iSize);
}
102 
103 #endif /* HAVE_LIBDL */
104 
advance_cursor()105 void advance_cursor()
106 {
107 	static int pos = 0;
108 	char cursor[4] = { '/', '-', '\\', '|' };
109 
110 	if (john_main_process) {
111 		fprintf(stderr, "%c\b", cursor[pos]);
112 		pos = (pos + 1) % 4;
113 	}
114 }
115 
116 /* Function pointer to read temperature for device n */
117 void (*dev_get_temp[MAX_GPU_DEVICES]) (int id, int *temp, int *fanspeed,
118                                        int *util, int *cl, int *ml);
119 
120 /* Map OpenCL device number to ADL/NVML device number */
121 unsigned int temp_dev_id[MAX_GPU_DEVICES];
122 
/*
 * Load the NVML library at run time and resolve the entry points used in
 * this file, then initialise NVML.
 *
 * Only does anything on Linux builds with dlopen support. It is a no-op when
 * the library was already loaded (nvml_lib set) or cannot be opened.
 * Individual entry points may be NULL afterwards if the installed library
 * does not export them.
 */
void nvidia_probe(void)
{
#if __linux__ && HAVE_LIBDL
	if (nvml_lib)
		return;

	if (!(nvml_lib = dlopen("libnvidia-ml.so", RTLD_LAZY|RTLD_GLOBAL)))
		return;

	nvmlInit = (NVMLINIT) dlsym(nvml_lib, "nvmlInit");
	nvmlShutdown = (NVMLSHUTDOWN) dlsym(nvml_lib, "nvmlShutdown");
	nvmlDeviceGetHandleByIndex = (NVMLDEVICEGETHANDLEBYINDEX) dlsym(nvml_lib, "nvmlDeviceGetHandleByIndex");
	nvmlDeviceGetTemperature = (NVMLDEVICEGETTEMPERATURE) dlsym(nvml_lib, "nvmlDeviceGetTemperature");
	nvmlDeviceGetFanSpeed = (NVMLDEVICEGETFANSPEED) dlsym(nvml_lib, "nvmlDeviceGetFanSpeed");
	nvmlDeviceGetUtilizationRates = (NVMLDEVICEGETUTILIZATIONRATES) dlsym(nvml_lib, "nvmlDeviceGetUtilizationRates");
	nvmlDeviceGetPciInfo = (NVMLDEVICEGETPCIINFO) dlsym(nvml_lib, "nvmlDeviceGetPciInfo");
	nvmlDeviceGetName = (NVMLDEVICEGETNAME) dlsym(nvml_lib, "nvmlDeviceGetName");
	nvmlDeviceGetHandleByPciBusId = (NVMLDEVICEGETHANDLEBYPCIBUSID) dlsym(nvml_lib, "nvmlDeviceGetHandleByPciBusId");
	nvmlDeviceGetIndex = (NVMLDEVICEGETINDEX) dlsym(nvml_lib, "nvmlDeviceGetIndex");
	nvmlDeviceGetCurrPcieLinkWidth = (NVMLDEVICEGETCURRPCIELINKWIDTH) dlsym(nvml_lib, "nvmlDeviceGetCurrPcieLinkWidth");
	nvmlDeviceGetMaxPcieLinkWidth = (NVMLDEVICEGETMAXPCIELINKWIDTH) dlsym(nvml_lib, "nvmlDeviceGetMaxPcieLinkWidth");

	/* dlsym() yields NULL for a symbol the library lacks; never call through it */
	if (nvmlInit)
		nvmlInit();
#endif
}
148 
/*
 * Load the AMD ADL library at run time, resolve the entry points used in this
 * file and enumerate the active adapters.
 *
 * For every active adapter it records, at index `amd`:
 *   - amd2adl[amd]:        the ADL adapter index
 *   - gpu_device_bus[amd]: PCI bus/device/function and the formatted busId
 * and, indexed by ADL adapter index:
 *   - adl2od[adl_id]:      5 or 6 for the reported Overdrive version, else 0
 *
 * Does nothing when ADL was already loaded (adl_lib set), when the library
 * cannot be opened, or when a required entry point is missing. Enumeration
 * stops at the first adapter for which the Overdrive capability query fails
 * or reports Overdrive as unsupported. Adapters that would not fit in the
 * MAX_GPU_DEVICES-sized tables are skipped.
 *
 * Side effect: DISPLAY is set from COMPUTE if that is non-empty, otherwise
 * defaulted to ":0" when it is unset or empty.
 */
void amd_probe(void)
{
#if HAVE_LIBDL
	LPAdapterInfo lpAdapterInfo = NULL;
	int i;
	int iNumberAdapters = 0;
	int iOverdriveSupported = 0;
	int iOverdriveEnabled = 0;
	int iOverdriveVersion = 0;
	char *env;

	if (adl_lib)
		return;

#if HAVE_WINDOWS_H
	if (!(adl_lib = dlopen("atiadlxx.dll", RTLD_LAZY|RTLD_GLOBAL)) &&
	    !(adl_lib = dlopen("atiadlxy.dll", RTLD_LAZY|RTLD_GLOBAL)))
		return;
#else
	if (!(adl_lib = dlopen("libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL)))
		return;
#endif

	env = getenv("COMPUTE");
	if (env && *env)
		setenv("DISPLAY", env, 1);
	else {
		env = getenv("DISPLAY");
		if (!env || !*env)
			setenv("DISPLAY", ":0", 1);
	}

	ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) dlsym(adl_lib,"ADL_Main_Control_Create");
	ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) dlsym(adl_lib,"ADL_Main_Control_Destroy");
	ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) dlsym(adl_lib,"ADL_Adapter_NumberOfAdapters_Get");
	ADL_Adapter_AdapterInfo_Get = (ADL_ADAPTER_ADAPTERINFO_GET) dlsym(adl_lib,"ADL_Adapter_AdapterInfo_Get");
	ADL_Adapter_Active_Get = (ADL_ADAPTER_ACTIVE_GET)dlsym(adl_lib, "ADL_Adapter_Active_Get");
	ADL_Overdrive_Caps = (ADL_OVERDRIVE_CAPS)dlsym(adl_lib, "ADL_Overdrive_Caps");

	ADL_Overdrive5_ThermalDevices_Enum = (ADL_OVERDRIVE5_THERMALDEVICES_ENUM) dlsym(adl_lib, "ADL_Overdrive5_ThermalDevices_Enum");
	ADL_Overdrive5_Temperature_Get = (ADL_OVERDRIVE5_TEMPERATURE_GET) dlsym(adl_lib, "ADL_Overdrive5_Temperature_Get");
	ADL_Overdrive5_FanSpeed_Get = (ADL_OVERDRIVE5_FANSPEED_GET) dlsym(adl_lib, "ADL_Overdrive5_FanSpeed_Get");
	ADL_Overdrive5_FanSpeedInfo_Get = (ADL_OVERDRIVE5_FANSPEEDINFO_GET) dlsym(adl_lib, "ADL_Overdrive5_FanSpeedInfo_Get");
	ADL_Overdrive5_ODParameters_Get = (ADL_OVERDRIVE5_ODPARAMETERS_GET) dlsym(adl_lib, "ADL_Overdrive5_ODParameters_Get");
	ADL_Overdrive5_CurrentActivity_Get = (ADL_OVERDRIVE5_CURRENTACTIVITY_GET) dlsym(adl_lib, "ADL_Overdrive5_CurrentActivity_Get");

	ADL_Overdrive6_FanSpeed_Get = (ADL_OVERDRIVE6_FANSPEED_GET) dlsym(adl_lib,"ADL_Overdrive6_FanSpeed_Get");
	ADL_Overdrive6_ThermalController_Caps = (ADL_OVERDRIVE6_THERMALCONTROLLER_CAPS)dlsym(adl_lib, "ADL_Overdrive6_ThermalController_Caps");
	ADL_Overdrive6_Temperature_Get = (ADL_OVERDRIVE6_TEMPERATURE_GET)dlsym(adl_lib, "ADL_Overdrive6_Temperature_Get");
	ADL_Overdrive6_CurrentStatus_Get = (ADL_OVERDRIVE6_CURRENTSTATUS_GET)dlsym(adl_lib, "ADL_Overdrive6_CurrentStatus_Get");
	ADL_Overdrive6_Capabilities_Get = (ADL_OVERDRIVE6_CAPABILITIES_GET)dlsym(adl_lib, "ADL_Overdrive6_Capabilities_Get");

	/*
	 * Everything called below is mandatory; dlsym() returns NULL for a
	 * symbol the library does not export, so bail out instead of calling
	 * through a NULL pointer.
	 */
	if (!ADL_Main_Control_Create || !ADL_Main_Control_Destroy ||
	    !ADL_Adapter_NumberOfAdapters_Get || !ADL_Adapter_AdapterInfo_Get ||
	    !ADL_Adapter_Active_Get || !ADL_Overdrive_Caps)
		return;

	if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
		return;

	// Obtain the number of adapters for the system
	if (ADL_Adapter_NumberOfAdapters_Get(&iNumberAdapters) != ADL_OK) {
		/* The ADL context was created above; release it on this path too */
		ADL_Main_Control_Destroy();
		return;
	}

	if (iNumberAdapters > 0) {
		lpAdapterInfo = (LPAdapterInfo)mem_alloc(sizeof(AdapterInfo) * iNumberAdapters);
		memset(lpAdapterInfo,'\0', sizeof(AdapterInfo) * iNumberAdapters);

		ADL_Adapter_AdapterInfo_Get(lpAdapterInfo, sizeof(AdapterInfo) * iNumberAdapters);
	}

	for (i = 0; i < iNumberAdapters; i++) {
		int adapterActive = 0;
		int adl_id = lpAdapterInfo[i].iAdapterIndex;

		ADL_Adapter_Active_Get(adl_id, &adapterActive);
		if (!adapterActive)
			continue;

		/*
		 * amd indexes amd2adl[]/gpu_device_bus[] and adl_id indexes
		 * adl2od[]; all three hold MAX_GPU_DEVICES entries, so skip
		 * anything that would land outside them.
		 */
		if (amd >= MAX_GPU_DEVICES ||
		    adl_id < 0 || adl_id >= MAX_GPU_DEVICES)
			continue;

		amd2adl[amd] = adl_id;
		adl2od[adl_id] = 0;
		gpu_device_bus[amd].bus = lpAdapterInfo[i].iBusNumber;
		gpu_device_bus[amd].device = lpAdapterInfo[i].iDeviceNumber;
		gpu_device_bus[amd].function = lpAdapterInfo[i].iFunctionNumber;

#if OCL_DEBUG
		printf("amd %u adl %u hardware id %02x:%02x.%x\n", amd, adl_id, gpu_device_bus[amd].bus, gpu_device_bus[amd].device,gpu_device_bus[amd].function);
#endif
		memset(gpu_device_bus[amd].busId, '\0', sizeof(gpu_device_bus[amd].busId));
		/* Bounded write: never run past the end of busId */
		snprintf(gpu_device_bus[amd].busId, sizeof(gpu_device_bus[amd].busId),
		         "%02x:%02x.%x", gpu_device_bus[amd].bus,
		         gpu_device_bus[amd].device, gpu_device_bus[amd].function);

		amd++;

		/* A failed or negative capability query ends the whole probe */
		if (ADL_Overdrive_Caps(adl_id, &iOverdriveSupported, &iOverdriveEnabled, &iOverdriveVersion) != ADL_OK ||
		    !iOverdriveSupported) {
			MEM_FREE(lpAdapterInfo);
			ADL_Main_Control_Destroy();
			return;
		}

		if (iOverdriveVersion == 5 || iOverdriveVersion == 6)
			adl2od[adl_id] = iOverdriveVersion;
		else
			adl2od[adl_id] = 0;
	}
	MEM_FREE(lpAdapterInfo);
	ADL_Main_Control_Destroy();
#endif
}
262 
/*
 * Query NVML for the sensors of device nvml_id.
 *
 * Outputs (each is -1 when the value could not be read, including on builds
 * without NVML support):
 *   temp      GPU temperature as reported by nvmlDeviceGetTemperature()
 *   fanspeed  value reported by nvmlDeviceGetFanSpeed()
 *   util      GPU utilisation from nvmlDeviceGetUtilizationRates()
 *   cl        current PCIe link width; falls back to the maximum width
 *   ml        maximum PCIe link width; raised to cl if it reads lower
 *
 * All pointers must be valid.
 */
void nvidia_get_temp(int nvml_id, int *temp, int *fanspeed, int *util,
                     int *cl, int *ml)
{
#if __linux__ && HAVE_LIBDL
	nvmlUtilization_t s_util;
	nvmlDevice_t dev;
	unsigned int value;
#endif

	/*
	 * Default everything to "unknown" first so callers never read an
	 * indeterminate value, whichever query fails and whatever the build.
	 */
	*temp = *fanspeed = *util = *cl = *ml = -1;

#if __linux__ && HAVE_LIBDL
	/* Entry points come from dlsym() and are NULL if the library lacks them */
	if (!nvmlDeviceGetHandleByIndex ||
	    nvmlDeviceGetHandleByIndex(nvml_id, &dev) != NVML_SUCCESS)
		return;

	if (nvmlDeviceGetTemperature &&
	    nvmlDeviceGetTemperature(dev, NVML_TEMPERATURE_GPU, &value) == NVML_SUCCESS)
		*temp = value;

	if (nvmlDeviceGetFanSpeed &&
	    nvmlDeviceGetFanSpeed(dev, &value) == NVML_SUCCESS)
		*fanspeed = value;

	if (nvmlDeviceGetUtilizationRates &&
	    nvmlDeviceGetUtilizationRates(dev, &s_util) == NVML_SUCCESS)
		*util = s_util.gpu;

	if (nvmlDeviceGetMaxPcieLinkWidth &&
	    nvmlDeviceGetMaxPcieLinkWidth(dev, &value) == NVML_SUCCESS)
		*ml = value;

	if (nvmlDeviceGetCurrPcieLinkWidth &&
	    nvmlDeviceGetCurrPcieLinkWidth(dev, &value) == NVML_SUCCESS)
		*cl = value;
	else
		*cl = *ml;

	/* Keep the pair consistent: the maximum is never below the current width */
	if (*ml < *cl)
		*ml = *cl;
#endif /* __linux__ && HAVE_LIBDL */
}
298 
299 #if HAVE_LIBDL
get_temp_od5(int adl_id,int * temp,int * fanspeed,int * util,int * cl,int * ml)300 static void get_temp_od5(int adl_id, int *temp, int *fanspeed, int *util,
301                          int *cl, int *ml)
302 {
303 	int ADL_Err = ADL_ERR;
304 	ADLFanSpeedInfo fanSpeedInfo = { 0 };
305 	int fanSpeedReportingMethod = 0;
306 	int iThermalControllerIndex;
307 	ADLThermalControllerInfo termalControllerInfo = { 0 };
308 	ADLODParameters overdriveParameters = { 0 };
309 	ADLPMActivity activity = { 0 };
310 
311 	if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
312 		return;
313 
314 	*temp = *fanspeed = *util = *cl = *ml = -1;
315 
316 	if (!ADL_Overdrive5_ThermalDevices_Enum ||
317 	    !ADL_Overdrive5_Temperature_Get ||
318 	    !ADL_Overdrive5_FanSpeed_Get ||
319 	    !ADL_Overdrive5_FanSpeedInfo_Get ||
320 	    !ADL_Overdrive5_ODParameters_Get ||
321 	    !ADL_Overdrive5_CurrentActivity_Get)
322 		return;
323 
324 	termalControllerInfo.iSize = sizeof(ADLThermalControllerInfo);
325 
326 	for (iThermalControllerIndex = 0; iThermalControllerIndex < 10; iThermalControllerIndex++) {
327 		ADL_Err = ADL_Overdrive5_ThermalDevices_Enum(adl_id, iThermalControllerIndex, &termalControllerInfo);
328 
329 		if (ADL_Err == ADL_WARNING_NO_DATA)
330 			break;
331 
332 		if (termalControllerInfo.iThermalDomain == ADL_DL_THERMAL_DOMAIN_GPU) {
333 			ADLTemperature adlTemperature = { 0 };
334 			ADLFanSpeedValue fanSpeedValue = { 0 };
335 
336 			adlTemperature.iSize = sizeof(ADLTemperature);
337 			if (ADL_Overdrive5_Temperature_Get(adl_id, iThermalControllerIndex, &adlTemperature) == ADL_OK)
338 				*temp = adlTemperature.iTemperature / 1000;
339 
340 			fanSpeedInfo.iSize = sizeof(ADLFanSpeedInfo);
341 			if (ADL_Overdrive5_FanSpeedInfo_Get(adl_id, iThermalControllerIndex, &fanSpeedInfo) == ADL_OK)
342 			if ((fanSpeedReportingMethod = (fanSpeedInfo.iFlags & ADL_DL_FANCTRL_SUPPORTS_PERCENT_READ))) {
343 				fanSpeedValue.iSpeedType = fanSpeedReportingMethod;
344 				if (ADL_Overdrive5_FanSpeed_Get(adl_id, iThermalControllerIndex, &fanSpeedValue) == ADL_OK)
345 					*fanspeed = fanSpeedValue.iFanSpeed;
346 			}
347 		}
348 	}
349 
350 	overdriveParameters.iSize = sizeof(ADLODParameters);
351 	if (ADL_Overdrive5_ODParameters_Get(adl_id, &overdriveParameters) == ADL_OK) {
352 		activity.iSize = sizeof(ADLPMActivity);
353 		if (ADL_Overdrive5_CurrentActivity_Get(adl_id, &activity) == ADL_OK)
354 		if (overdriveParameters.iActivityReportingSupported) {
355 			*util = activity.iActivityPercent;
356 			*cl = activity.iCurrentBusLanes;
357 			*ml = activity.iMaximumBusLanes;
358 		}
359 	}
360 
361 	ADL_Main_Control_Destroy();
362 	return;
363 }
364 
get_temp_od6(int adl_id,int * temp,int * fanspeed,int * util,int * cl,int * ml)365 static void get_temp_od6(int adl_id, int *temp, int *fanspeed, int *util,
366                          int *cl, int *ml)
367 {
368 	ADLOD6FanSpeedInfo fanSpeedInfo = { 0 };
369 	ADLOD6ThermalControllerCaps thermalControllerCaps = { 0 };
370 	ADLOD6Capabilities od6Capabilities = { 0 };
371 	int temperature = 0;
372 	ADLOD6CurrentStatus currentStatus = { 0 };
373 
374 	if (ADL_Main_Control_Create(ADL_Main_Memory_Alloc, 1) != ADL_OK)
375 		return;
376 
377 	*temp = *fanspeed = *util = -1;
378 
379 	if (!ADL_Overdrive6_FanSpeed_Get ||
380 	    !ADL_Overdrive6_ThermalController_Caps ||
381 	    !ADL_Overdrive6_Temperature_Get ||
382 	    !ADL_Overdrive6_CurrentStatus_Get)
383 		return;
384 
385 	if (ADL_Overdrive6_ThermalController_Caps(adl_id, &thermalControllerCaps) == ADL_OK) {
386 		if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_FANSPEED_CONTROL)
387 		if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_FANSPEED_PERCENT_READ)
388 		if (ADL_Overdrive6_FanSpeed_Get(adl_id, &fanSpeedInfo) == ADL_OK)
389 		if (fanSpeedInfo.iSpeedType & ADL_OD6_FANSPEED_TYPE_PERCENT)
390 			*fanspeed = fanSpeedInfo.iFanSpeedPercent;
391 
392 		if (thermalControllerCaps.iCapabilities & ADL_OD6_TCCAPS_THERMAL_CONTROLLER)
393 		if (ADL_Overdrive6_Temperature_Get(adl_id, &temperature) == ADL_OK)
394 			*temp = temperature / 1000;
395 
396 		if (ADL_Overdrive6_Capabilities_Get(adl_id, &od6Capabilities) == ADL_OK)
397 		if (od6Capabilities.iCapabilities & ADL_OD6_CAPABILITY_GPU_ACTIVITY_MONITOR)
398 		if (ADL_Overdrive6_CurrentStatus_Get(adl_id, &currentStatus) == ADL_OK)
399 		{
400 			*util = currentStatus.iActivityPercent;
401 			*cl = currentStatus.iCurrentBusLanes;
402 			*ml = currentStatus.iMaximumBusLanes;
403 		}
404 	}
405 
406 	ADL_Main_Control_Destroy();
407 	return;
408 }
409 #endif
410 
/*
 * Read sensors for an AMD adapter, dispatching on the Overdrive version
 * recorded in adl2od[] (5 or 6).
 *
 * amd_id is used directly as the ADL adapter index. All outputs are -1 when
 * the index is out of range, the adapter has no known Overdrive version, the
 * build lacks dlopen support, or the individual value is unavailable.
 * All pointers must be valid.
 */
void amd_get_temp(int amd_id, int *temp, int *fanspeed, int *util, int *cl,
                  int *ml)
{
	/* Default to "unknown" so the outputs are defined on every path */
	*temp = *fanspeed = *util = *cl = *ml = -1;

#if HAVE_LIBDL
	/* adl2od[] holds MAX_GPU_DEVICES entries; reject anything outside it */
	if (amd_id < 0 || amd_id >= MAX_GPU_DEVICES)
		return;

	if (adl2od[amd_id] == 5)
		get_temp_od5(amd_id, temp, fanspeed, util, cl, ml);
	else if (adl2od[amd_id] == 6)
		get_temp_od6(amd_id, temp, fanspeed, util, cl, ml);
#endif
}
425 
/*
 * Translate a PCI bus id (busInfo.busId) into an NVML device index.
 * Returns -1 when NVML support is not built in, a needed entry point is
 * missing, or either NVML lookup fails.
 */
int id2nvml(const hw_bus busInfo) {
#if __linux__ && HAVE_LIBDL
	nvmlDevice_t handle;
	unsigned int nvml_index;

	if (!nvmlDeviceGetHandleByPciBusId)
		return -1;

	if (nvmlDeviceGetHandleByPciBusId(busInfo.busId, &handle) != NVML_SUCCESS)
		return -1;

	if (!nvmlDeviceGetIndex)
		return -1;

	if (nvmlDeviceGetIndex(handle, &nvml_index) == NVML_SUCCESS)
		return nvml_index;
#endif
	return -1;
}
442 
/*
 * Find the ADL adapter index of the adapter whose PCI bus, device and
 * function numbers match busInfo, searching the entries recorded by
 * amd_probe(). Returns -1 when there is no match or no dlopen support.
 */
int id2adl(const hw_bus busInfo) {
#if HAVE_LIBDL
	int slot;

	for (slot = 0; slot < amd; slot++) {
		const hw_bus *known = &gpu_device_bus[slot];

		if (known->bus != busInfo.bus)
			continue;
		if (known->device != busInfo.device)
			continue;
		if (known->function != busInfo.function)
			continue;

		return amd2adl[slot];
	}
#endif
	return -1;
}
459 
/*
 * Check the temperature of every engaged device that has a sensor callback
 * and react to overheating.
 *
 * - Disabled entirely when gpu_temp_limit is negative.
 * - A reading outside 10..125 is treated as invalid: it is reported once
 *   (for the whole process) and that device is skipped.
 * - A reading at or above gpu_temp_limit is an overheat:
 *     cool_gpu_down > 0   sleep that many seconds, then re-check the same
 *                         device, repeating until it has cooled down
 *     otherwise           request a job abort by incrementing event_abort
 * - With cool_gpu_down == 1 the overheat message is printed only once per
 *   process and the "waking up" message is suppressed afterwards.
 */
void gpu_check_temp(void)
{
#if HAVE_LIBDL
	static int warned, warnedTemperature;
	int i, hot_gpu = 0, alerts = 0;

	if (gpu_temp_limit < 0)
		return;

	for (i = 0; i < MAX_GPU_DEVICES && engaged_devices[i] != DEV_LIST_END; i++) {
		int dev = engaged_devices[i];
		/* Pre-set to "unknown" in case the callback leaves a value untouched */
		int fan = -1, temp = -1, util = -1, cl = -1, ml = -1;

		if (!dev_get_temp[dev])
			continue;

		dev_get_temp[dev](temp_dev_id[dev], &temp, &fan, &util, &cl, &ml);

		if (temp > 125 || temp < 10) {
			if (!warned) {
				warned = 1;
				log_event("Device %d probably invalid temp reading (%d%sC).",
				          dev + 1, temp, gpu_degree_sign);
				fprintf(stderr,
				        "Device %d probably invalid temp reading (%d%sC).\n",
				        dev + 1, temp, gpu_degree_sign);
			}
			/*
			 * Skip only this device: one broken sensor must not stop
			 * the remaining devices from being checked.
			 */
			continue;
		}

		if (temp >= gpu_temp_limit) {

			if (!alerts++ && !event_abort && !warnedTemperature) {
				char s_fan[16] = "n/a";
				if (fan >= 0)
					snprintf(s_fan, sizeof(s_fan), "%d%%", fan);

				if (cool_gpu_down == 1)
					warnedTemperature++;

				log_event("Device %d overheat (%d%sC, fan %s), %s%s.",
				          dev + 1, temp, gpu_degree_sign, s_fan,
				          (cool_gpu_down > 0) ? "sleeping" : "aborting job",
				          (hot_gpu) ? " again" : "");
				fprintf(stderr,
				        "Device %d overheat (%d%sC, fan %s), %s%s.\n",
				        dev + 1, temp, gpu_degree_sign, s_fan,
				        (cool_gpu_down > 0) ? "sleeping" : "aborting job",
				        (hot_gpu) ? " again" : "");
			}
			hot_gpu = 1;
			/***
			 * Graceful handling of GPU overheating
			 * - sleep for a while before re-checking the temperature.
			 ***/
			if (cool_gpu_down > 0) {
				int t = cool_gpu_down;

				/* sleep() returns the unslept remainder; resume until done or aborted */
				while ((t = sleep(t)) && !event_abort);

				// Warn again in case things don't calm down
				if (alerts > 5)
					alerts = 0;

				/***
				 * Re-check the temperature of the same GPU.
				 * And loop indefinitely:
				 * - if the GPU doesn't cool down enough during the sleep time
				 ***/
				i--;
				continue;
			} else
				event_abort++;
		} else {

			if (hot_gpu && options.verbosity > VERB_DEFAULT &&
			    !warnedTemperature) {
				char s_fan[16] = "n/a";
				if (fan >= 0)
					snprintf(s_fan, sizeof(s_fan), "%d%%", fan);

				log_event("Device %d is waking up (%d%sC, fan %s).",
				          dev + 1, temp, gpu_degree_sign, s_fan);
				fprintf(stderr,
				        "Device %d is waking up (%d%sC, fan %s).\n",
				        dev + 1, temp, gpu_degree_sign, s_fan);
			}
			hot_gpu = 0;
		}
	}
#endif
}
548 
/*
 * Write one log line per engaged device that has a sensor callback, of the
 * form "- Device N: temp: ..C util: ..% fan: ..%".
 *
 * A field is included only when its reading is usable (temp >= 0, util > 0,
 * fan >= 0). Nothing is logged for a device with no usable field.
 */
void gpu_log_temp(void)
{
#if HAVE_LIBDL
	int i;

	for (i = 0; i < MAX_GPU_DEVICES && engaged_devices[i] != DEV_LIST_END; i++) {
		char s_gpu[256] = "";
		int dev = engaged_devices[i];
		/* Pre-set to "unknown" in case the callback leaves a value untouched */
		int fan = -1, temp = -1, util = -1, cl = -1, ml = -1;
		int n, have_data = 0;

		if (!dev_get_temp[dev])
			continue;

		dev_get_temp[dev](temp_dev_id[dev], &temp, &fan, &util, &cl, &ml);

		/*
		 * The longest possible line is far shorter than s_gpu, so the
		 * bounded writes below never truncate and n stays in range.
		 */
		n = snprintf(s_gpu, sizeof(s_gpu), "Device %d:", dev + 1);
		if (temp >= 0) {
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " temp: %d%sC",
			              temp, gpu_degree_sign);
			have_data = 1;
		}
		if (util > 0) {
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " util: %d%%", util);
			have_data = 1;
		}
		if (fan >= 0) {
			n += snprintf(s_gpu + n, sizeof(s_gpu) - n, " fan: %d%%", fan);
			have_data = 1;
		}

		/* Log exactly when at least one field was appended above */
		if (have_data)
			log_event("- %s", s_gpu);
	}
#endif
}
574 
575 #endif /* defined (HAVE_OPENCL) */
576