1 /*
2  * This file is part of John the Ripper password cracker.
3  *
4  * Common OpenCL functions.
5  *
6  * This software is
7  * Copyright (c) 2010-2012 Samuele Giovanni Tonon <samu at linuxasylum dot net>
8  * Copyright (c) 2010-2013 Lukas Odzioba <ukasz@openwall.net>
9  * Copyright (c) 2010-2015 magnum
10  * Copyright (c) 2012-2015 Claudio André <claudioandre.br at gmail.com>
11  *
12  * and is hereby released to the general public under the following terms:
13  * Redistribution and use in source and binary forms, with or without
14  * modifications, are permitted.
15  */
16 
17 #ifdef HAVE_OPENCL
18 
19 #define _BSD_SOURCE 1           // setenv()
20 #define _DEFAULT_SOURCE 1       // setenv()
21 #define NEED_OS_TIMER
22 #define NEED_OS_FLOCK
23 #define NEED_OS_FORK
24 #include "os.h"
25 
26 #include <assert.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <sys/stat.h>
30 #include <time.h>
31 #include <signal.h>
32 #include <limits.h>
33 #include <stdlib.h>
34 #if !AC_BUILT || HAVE_FCNTL_H
35 #include <fcntl.h>
36 #endif
37 #include <unistd.h>
38 
39 #ifdef NO_JOHN_BLD
40 #define JOHN_BLD "unk-build-type"
41 #else
42 #include "john_build_rule.h"
43 #endif
44 
45 #include "jumbo.h"
46 #include "options.h"
47 #include "config.h"
48 #include "common.h"
49 #include "logger.h"
50 #include "opencl_common.h"
51 #include "mask_ext.h"
52 #include "dyna_salt.h"
53 #include "signals.h"
54 #include "recovery.h"
55 #include "status.h"
56 #include "john.h"
57 #include "md5.h"
58 #include "misc.h"
59 #include "john_mpi.h"
60 
61 /* Set this to eg. 3 for some added debug and retry stuff */
62 #define RACE_CONDITION_DEBUG 0
63 
/*
 * Size, in bytes, of the scratch buffers used for OpenCL log and info strings.
 * Parenthesized so the macro stays a single operand in any expression.
 */
#define LOG_SIZE (1024*16)
65 
66 #if !defined(__CYGWIN__) && !defined(__MINGW32__)
67 // If true, use realpath(3) for translating eg. "-I./kernels" into an absolute
68 // path before submitting as JIT compile option to OpenCL.
69 #define I_REALPATH 1
70 #endif
71 
72 // If we are a release build, only output OpenCL build log if
73 // there was a fatal error (or --verbosity was increased).
74 #ifdef JTR_RELEASE_BUILD
75 #define LOG_VERB VERB_LEGACY
76 #else
77 #define LOG_VERB VERB_DEFAULT
78 #endif
79 
/* Common OpenCL variables */
/* Platform index of gpu_id; assigned by opencl_load_environment(). */
int platform_id;
/* Set to 1 when the "best" device was picked without --fork. */
int default_gpu_selected;
/* Set to 1 when the "best" device selection path was taken. */
int default_device_selected;
/* While non-zero, opencl_process_event() does nothing. */
int ocl_autotune_running;
size_t ocl_max_lws;

static char opencl_log[LOG_SIZE];
/* Non-zero once opencl_load_environment() has completed; cleared by opencl_done(). */
static int opencl_initialized;

static void load_device_info(int sequential_id);
static char* get_device_capability(int sequential_id);

// Used by auto-tuning to decide how GWS should be changed between trials.
extern int autotune_get_next_gws_size(size_t num, int step, int startup,
                                      int default_value);
extern int autotune_get_prev_gws_size(size_t num, int step);

// Settings to use for auto-tuning.
static int buffer_size;
static int default_value;
static int hash_loops;
static int duration_time = 0;
static const char **warnings;
static int *split_events;
static int main_opencl_event;
static struct fmt_main *self;
static void (*create_clobj)(size_t gws, struct fmt_main *self);
static void (*release_clobj)(void);
/* Lower-cased format name; emptied by opencl_done(). */
static char fmt_base_name[128];
static size_t gws_limit;
/* Reset to 0 by opencl_done(). */
static int printed_mask;
static struct db_main *autotune_db;
static struct db_salt *autotune_salts;
int autotune_real_db;

/* One detected OpenCL platform and the number of devices it exposes. */
typedef struct {
	cl_platform_id platform;
	int num_devices;
} cl_platform;
/* Filled by load_opencl_environment(); terminated by a NULL platform. */
static cl_platform platforms[MAX_PLATFORMS + 1];


/*
 * All detected devices, platform after platform, NULL terminated.
 * The index into this array is the "sequential id" used throughout the file.
 */
cl_device_id devices[MAX_GPU_DEVICES + 1];
/* Per-device objects, indexed by sequential id. */
cl_context context[MAX_GPU_DEVICES];
cl_program program[MAX_GPU_DEVICES];
cl_command_queue queue[MAX_GPU_DEVICES];
cl_int ret_code;
cl_kernel crypt_kernel;
size_t local_work_size;
size_t global_work_size;
size_t max_group_size;
/* Vector width in use; opencl_done() resets it to 1. */
unsigned int ocl_v_width = 1;
unsigned long long global_speed;

cl_event *profilingEvent, *firstEvent, *lastEvent;
cl_event *multi_profilingEvent[MAX_EVENTS];

/* Per-device value tested with gpu_amd(), gpu_nvidia() etc.; indexed by sequential id. */
int device_info[MAX_GPU_DEVICES];
static ocl_device_details ocl_device_list[MAX_GPU_DEVICES];
140 
opencl_process_event(void)141 void opencl_process_event(void)
142 {
143 	if (!ocl_autotune_running && !bench_or_test_running) {
144 #if !OS_TIMER
145 		sig_timer_emu_tick();
146 #endif
147 		if (event_pending) {
148 			if (event_save) {
149 				event_save = 0;
150 				rec_save();
151 			}
152 
153 			if (event_status) {
154 				event_status = 0;
155 				status_print();
156 			}
157 
158 			if (event_ticksafety) {
159 				event_ticksafety = 0;
160 				status_ticks_overflow_safety();
161 			}
162 
163 			event_pending = (event_abort || event_poll_files || event_reload);
164 		}
165 	}
166 }
167 
get_number_of_available_platforms()168 int get_number_of_available_platforms()
169 {
170 	int i = 0;
171 
172 	while (platforms[i].platform)
173 		i++;
174 
175 	return i;
176 }
177 
178 /* Get the number of available devices (all the OpenCL devices) */
get_number_of_available_devices()179 int get_number_of_available_devices()
180 {
181 	int total = 0, i = 0;
182 
183 	while (platforms[i].platform)
184 		total += platforms[i++].num_devices;
185 
186 	return total;
187 }
188 
189 /*
190  * Get the total number of devices that were requested (do not count duplicates)
191  * --device=2,2 result that "one" device is really in use;
192  */
get_number_of_devices_in_use()193 int get_number_of_devices_in_use()
194 {
195 	int i = 0;
196 
197 	while (engaged_devices[i] != DEV_LIST_END)
198 		i++;
199 
200 	return i;
201 }
202 
203 /*
204  * Get the total number of requested devices (count duplicates)
205  * --device=2,2 result that "two" devices will be used. E.g., to split tasks;
206  */
get_number_of_requested_devices()207 int get_number_of_requested_devices()
208 {
209 	int i = 0;
210 
211 	while (requested_devices[i] != DEV_LIST_END)
212 		i++;
213 
214 	return i;
215 }
216 
get_platform_id(int sequential_id)217 int get_platform_id(int sequential_id)
218 {
219 	int pos = 0, i = 0;
220 
221 	while (platforms[i].platform) {
222 		pos += platforms[i].num_devices;
223 
224 		if (sequential_id < pos)
225 			break;
226 		i++;
227 	}
228 	return (platforms[i].platform ? i : -1);
229 }
230 
get_device_id(int sequential_id)231 int get_device_id(int sequential_id)
232 {
233 	int pos = sequential_id, i = 0;
234 
235 	while (platforms[i].platform && pos >= platforms[i].num_devices) {
236 		pos -= platforms[i].num_devices;
237 		i++;
238 	}
239 	return (platforms[i].platform ? pos : -1);
240 }
241 
get_sequential_id(unsigned int dev_id,unsigned int platform_id)242 int get_sequential_id(unsigned int dev_id, unsigned int platform_id)
243 {
244 	int pos = 0, i = 0;
245 
246 	while (platforms[i].platform && i < platform_id)
247 		pos += platforms[i++].num_devices;
248 
249 	if (i == platform_id && dev_id >= platforms[i].num_devices)
250 		return -1;
251 
252 	return (platforms[i].platform ? pos + dev_id : -1);
253 }
254 
opencl_driver_value(int sequential_id,int * major,int * minor)255 void opencl_driver_value(int sequential_id, int *major, int *minor)
256 {
257 	char dname[MAX_OCLINFO_STRING_LEN];
258 	char *p;
259 
260 	*major = 0, *minor = 0;
261 
262 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DRIVER_VERSION,
263 		sizeof(dname), dname, NULL), "clGetDeviceInfo for CL_DRIVER_VERSION");
264 
265 	p = dname;
266 	while (*p && !isdigit((int)*p))
267 		p++;
268 	if (*p) {
269 		*major = atoi(p);
270 		while (*p && isdigit((int)*p))
271 			p++;
272 		while (*p && !isdigit((int)*p))
273 			p++;
274 		if (*p) {
275 			*minor = atoi(p);
276 		}
277 	}
278 }
279 
/*
 * Build the "-DDEV_VER_MAJOR=... -DDEV_VER_MINOR=..." option string for the
 * given device. The result lives in a static buffer that is overwritten by
 * the next call.
 */
static char *opencl_driver_ver(int sequential_id)
{
	static char define_opts[64];
	int ver_major, ver_minor;

	opencl_driver_value(sequential_id, &ver_major, &ver_minor);
	snprintf(define_opts, sizeof(define_opts),
	         "-DDEV_VER_MAJOR=%d -DDEV_VER_MINOR=%d", ver_major, ver_minor);

	return define_opts;
}
292 
/*
 * Delete every ' ' character from str, in place.
 * Returns str (the start of the compacted string).
 */
static char *remove_spaces(char *str)
{
	const char *src = str;
	char *dst = str;

	while (*src != '\0') {
		if (*src != ' ')
			*dst++ = *src;
		src++;
	}
	*dst = '\0';

	return str;
}
305 
opencl_driver_info(int sequential_id)306 static char *opencl_driver_info(int sequential_id)
307 {
308 	static char buf[64 + MAX_OCLINFO_STRING_LEN];
309 	char dname[MAX_OCLINFO_STRING_LEN], tmp[sizeof(buf)], set[64];
310 	static char output[sizeof(tmp) + sizeof(dname)];
311 	char *name, *recommendation = NULL;
312 	int major = 0, minor = 0, conf_major = 0, conf_minor = 0, found;
313 	struct cfg_list *list;
314 	struct cfg_line *line;
315 
316 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DRIVER_VERSION,
317 		sizeof(dname), dname, NULL), "clGetDeviceInfo for CL_DRIVER_VERSION");
318 
319 	opencl_driver_value(sequential_id, &major, &minor);
320 	name = buf;
321 
322 	if ((list = cfg_get_list("List.OpenCL:", "Drivers")))
323 	if ((line = list->head))
324 	do {
325 		char *p;
326 
327 		//Parse driver information.
328 		strncpy(set, line->data, 64);
329 		remove_spaces(set);
330 
331 		p = strtokm(set, ",");
332 		conf_major = strtoul(p, NULL, 10);
333 
334 		p = strtokm(NULL, ";");
335 		conf_minor = strtoul(p, NULL, 10);
336 
337 		name = strtokm(NULL, ";");
338 		recommendation = strtokm(NULL, ";");
339 
340 		if (gpu_amd(device_info[sequential_id]))
341 		if (conf_major == major && conf_minor == minor)
342 			break;
343 
344 		if (gpu_nvidia(device_info[sequential_id]))
345 		if (recommendation && strstr(recommendation, "N"))
346 		if (conf_major <= major && conf_minor <= minor)
347 			break;
348 
349 #ifdef OCL_DEBUG
350 		fprintf(stderr, "Driver: %i, %i -> %s , %s\n",
351 			conf_major, conf_minor, name, recommendation);
352 #endif
353 	} while ((line = line->next));
354 
355 	if (gpu_amd(device_info[sequential_id]) &&
356 	    get_platform_vendor_id(get_platform_id(sequential_id)) == DEV_AMD) {
357 
358 		if (major < 1912)
359 			snprintf(buf, sizeof(buf), "%s - Catalyst %s", dname, name);
360 		else if (major < 2500)
361 			snprintf(buf, sizeof(buf), "%s - Crimson %s", dname, name);
362 		else
363 			snprintf(buf, sizeof(buf), "%s - AMDGPU-Pro %s", dname, name);
364 		snprintf(tmp, sizeof(tmp), "%s", buf);
365 	} else
366 		snprintf(tmp, sizeof(tmp), "%s", dname);
367 
368 	snprintf(dname, sizeof(dname), " ");
369 
370 	if (recommendation) {
371 		//Check hardware
372 		found = (strstr(recommendation, "G") && amd_gcn(device_info[sequential_id]));
373 		found += (strstr(recommendation, "N") && gpu_nvidia(device_info[sequential_id]));
374 		found += (strstr(recommendation, "V") &&
375 			 (amd_vliw4(device_info[sequential_id]) ||
376 			  amd_vliw5(device_info[sequential_id])));
377 
378 		//Check OS
379 		if (found) {
380 			found = (strstr(recommendation, "*") != NULL);
381 			found += (strstr(recommendation, "L") && strstr(JOHN_BLD, "linux"));
382 			found += (strstr(recommendation, "W") && strstr(JOHN_BLD, "windows"));
383 		}
384 
385 		if (strstr(recommendation, "T"))
386 			snprintf(dname, sizeof(dname), " [known bad]");
387 		else if (found) {
388 			if (strstr(recommendation, "R"))
389 				snprintf(dname, sizeof(dname), " [recommended]");
390 			else if (strstr(recommendation, "S"))
391 				snprintf(dname, sizeof(dname), " [supported]");
392 		}
393 	}
394 	snprintf(output, sizeof(output), "%s%s", tmp, dname);
395 
396 	return output;
397 }
398 
/*
 * Format a duration given in nanoseconds using the largest non-zero unit
 * (s, ms, us or ns), with three decimals taken from the next smaller unit
 * when that part is non-zero.
 *
 * The 16-byte result buffer comes from mem_alloc_tiny().
 */
static char *ns2string(cl_ulong nanosec)
{
	enum { OUT_LEN = 16 };
	char *out = mem_alloc_tiny(OUT_LEN, MEM_ALIGN_NONE);
	int ns = nanosec % 1000;
	int us = (nanosec / 1000ULL) % 1000;
	int ms = (nanosec / 1000000ULL) % 1000;
	int s = nanosec / 1000000000ULL;

	if (s) {
		if (ms)
			snprintf(out, OUT_LEN, "%d.%03ds", s, ms);
		else
			snprintf(out, OUT_LEN, "%ds", s);
		return out;
	}

	if (ms) {
		if (us)
			snprintf(out, OUT_LEN, "%d.%03dms", ms, us);
		else
			snprintf(out, OUT_LEN, "%dms", ms);
		return out;
	}

	if (us) {
		if (ns)
			snprintf(out, OUT_LEN, "%d.%03dus", us, ns);
		else
			snprintf(out, OUT_LEN, "%dus", us);
		return out;
	}

	snprintf(out, OUT_LEN, "%dns", ns);
	return out;
}
430 
ms2string(int millisec)431 static char *ms2string(int millisec)
432 {
433 	return ns2string(millisec * 1000000ULL);
434 }
435 
get_if_device_is_in_use(int sequential_id)436 static int get_if_device_is_in_use(int sequential_id)
437 {
438 	int i = 0, found = 0;
439 	int num_devices;
440 
441 	if (sequential_id >= get_number_of_available_devices()) {
442 		return -1;
443 	}
444 
445 	num_devices = get_number_of_devices_in_use();
446 
447 	for (i = 0; i < num_devices && !found; i++) {
448 		if (sequential_id == engaged_devices[i])
449 			found = 1;
450 	}
451 	return found;
452 }
453 
454 /*
455  * Load information about all platforms and devices available in the
456  * running system
457  */
load_opencl_environment()458 static void load_opencl_environment()
459 {
460 	cl_platform_id platform_list[MAX_PLATFORMS];
461 	cl_uint num_platforms, device_pos = 0;
462 	int ret, i;
463 
464 	/* Find OpenCL enabled devices. We ignore error here, in case
465 	 * there is no platform and we'd like to run a non-OpenCL format. */
466 	ret = clGetPlatformIDs(MAX_PLATFORMS, platform_list, &num_platforms);
467 
468 	if (ret != CL_SUCCESS)
469 		num_platforms = 0;
470 
471 	if (num_platforms < 1 && options.verbosity > VERB_LEGACY)
472 		fprintf(stderr, "%u: No OpenCL platforms were found: %s\n",
473 		        NODE, get_error_name(ret));
474 
475 	for (i = 0; i < num_platforms; i++) {
476 		cl_uint num_devices;
477 
478 		// It is possible to have a platform without any devices
479 		// Ignore error here too on purpose.
480 		ret = clGetDeviceIDs(platform_list[i], CL_DEVICE_TYPE_ALL,
481 			MAX_GPU_DEVICES - device_pos, /* avoid buffer overrun */
482 			&devices[device_pos], &num_devices);
483 		if (ret != CL_SUCCESS)
484 			num_devices = 0;
485 
486 		if (num_devices < 1 && options.verbosity > VERB_LEGACY)
487 			fprintf(stderr,
488 			        "%u: No OpenCL devices were found on platform #%d: %s\n",
489 			        NODE, i, get_error_name(ret));
490 
491 		// Save platform and devices information
492 		platforms[i].platform = platform_list[i];
493 		platforms[i].num_devices = num_devices;
494 
495 		// Point to the end of the list
496 		device_pos += num_devices;
497 
498 #ifdef OCL_DEBUG
499 	{
500 		char opencl_data[LOG_SIZE];
501 
502 		SOFT_CLERROR(clGetPlatformInfo(platform_list[i],
503 			CL_PLATFORM_NAME, sizeof(opencl_data), opencl_data, NULL),
504 			"clGetPlatformInfo for CL_PLATFORM_NAME");
505 
506 		fprintf(stderr, "%u: OpenCL platform %d: %s, %d device(s).\n",
507 		        NODE, i, opencl_data, num_devices);
508 	}
509 #endif
510 	}
511 
512 	// Set NULL to the final buffer position.
513 	platforms[i].platform = NULL;
514 	devices[device_pos] = NULL;
515 }
516 
get_pci_info(int sequential_id,hw_bus * hardware_info)517 static cl_int get_pci_info(int sequential_id, hw_bus *hardware_info)
518 {
519 
520 	cl_int ret;
521 
522 	hardware_info->bus = -1;
523 	hardware_info->device = -1;
524 	hardware_info->function = -1;
525 	memset(hardware_info->busId, '\0', sizeof(hardware_info->busId));
526 
527 	if (gpu_amd(device_info[sequential_id]) ||
528 	    cpu_amd(device_info[sequential_id])) {
529 		cl_device_topology_amd topo;
530 
531 		ret = clGetDeviceInfo(devices[sequential_id],
532 			CL_DEVICE_TOPOLOGY_AMD, sizeof(topo), &topo, NULL);
533 
534 		if (ret == CL_SUCCESS) {
535 			hardware_info->bus = topo.pcie.bus & 0xff;
536 			hardware_info->device = topo.pcie.device & 0xff;
537 			hardware_info->function = topo.pcie.function & 0xff;
538 		} else
539 			return ret;
540 	} else if (gpu_nvidia(device_info[sequential_id])) {
541 		cl_uint entries;
542 
543 		ret = clGetDeviceInfo(devices[sequential_id], CL_DEVICE_PCI_BUS_ID_NV,
544 		                      sizeof(cl_uint), &entries, NULL);
545 
546 		if (ret == CL_SUCCESS)
547 			hardware_info->bus = entries;
548 		else
549 			return ret;
550 
551 		ret = clGetDeviceInfo(devices[sequential_id], CL_DEVICE_PCI_SLOT_ID_NV,
552 		                      sizeof(cl_uint), &entries, NULL);
553 
554 		if (ret == CL_SUCCESS) {
555 			hardware_info->device = entries >> 3;
556 			hardware_info->function = entries & 7;
557 		} else
558 			return ret;
559 	} else
560 		return CL_SUCCESS;
561 
562 	sprintf(hardware_info->busId, "%02x:%02x.%x", hardware_info->bus,
563 	        hardware_info->device, hardware_info->function);
564 	return CL_SUCCESS;
565 }
566 
/*
 * Initialize an OpenCL device:
 * - create context and queue;
 * - get bus and map to monitoring stuff;
 *
 * sequential_id indexes devices[], device_info[], context[], queue[],
 * temp_dev_id[] and dev_get_temp[].
 * err_type is not referenced anywhere in this function body.
 *
 * Side effects: fills ocl_device_list[sequential_id].pci_info, sets the
 * global max_group_size, and stores the new context and queue.
 *
 * Always returns 1 when it returns; a failure to create the context or the
 * queue ends in error() once the retries allowed by RACE_CONDITION_DEBUG are
 * used up (presumably error() does not return -- TODO confirm).
 */
static int start_opencl_device(int sequential_id, int *err_type)
{
	cl_context_properties properties[3];
	char opencl_data[LOG_SIZE];
	int retry = 0;

	// Get the detailed information about the device
	// (populate device_info[d] bitfield).
	load_device_info(sequential_id);

	// Get hardware bus/PCIE information.
	// The return code is not checked: on failure pci_info keeps its -1 defaults.
	get_pci_info(sequential_id, &ocl_device_list[sequential_id].pci_info);

	// Map temp monitoring function and NVML/ADL id to our device id
	if (gpu_nvidia(device_info[sequential_id])) {
		temp_dev_id[sequential_id] =
		    id2nvml(ocl_device_list[sequential_id].pci_info);
		dev_get_temp[sequential_id] = nvml_lib ? nvidia_get_temp : NULL;
	} else if (gpu_amd(device_info[sequential_id])) {
		temp_dev_id[sequential_id] =
		    id2adl(ocl_device_list[sequential_id].pci_info);
		dev_get_temp[sequential_id] = adl_lib ? amd_get_temp : NULL;

		// Two consecutive devices mapping to the same ADL id: bump the
		// bus number of this one and look it up again.
		if (sequential_id > 0 &&
		    temp_dev_id[sequential_id] == temp_dev_id[sequential_id - 1]) {
			/* Kludge for 7990 > 14.9. We hates AMD. */
			ocl_device_list[sequential_id].pci_info.bus++;
			temp_dev_id[sequential_id] =
				id2adl(ocl_device_list[sequential_id].pci_info);
		}
	} else {
		// No vendor monitoring library for this device.
		temp_dev_id[sequential_id] = sequential_id;
		dev_get_temp[sequential_id] = NULL;
	}

	// The device name is only printed when OCL_DEBUG is defined (see below).
	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
	                               sizeof(opencl_data), opencl_data, NULL),
	               "clGetDeviceInfo for DEVICE_NAME");

	max_group_size = get_device_max_lws(sequential_id);

	// Create the context, retrying up to RACE_CONDITION_DEBUG times.
	do {
		// Get the platform properties
		properties[0] = CL_CONTEXT_PLATFORM;
		properties[1] = (cl_context_properties)
			platforms[get_platform_id(sequential_id)].platform;
		properties[2] = 0;

		// Setup context and queue
		context[sequential_id] = clCreateContext(properties, 1,
			&devices[sequential_id], NULL, NULL, &ret_code);

		if (ret_code != CL_SUCCESS) {
			fprintf(stderr, "%u: Error creating context for device %d "
			        "(%d:%d): %s, %s\n",
			        NODE, sequential_id,
			        get_platform_id(sequential_id),
			        get_device_id(sequential_id), get_error_name(ret_code),
			        retry < RACE_CONDITION_DEBUG ? "retrying" : "giving up");
			if (++retry > RACE_CONDITION_DEBUG)
				error();
			// Short back-off that differs per node before the next attempt.
			usleep((retry + NODE) * 100);
		}
	} while (ret_code != CL_SUCCESS);

	// Same retry scheme for the command queue.
	retry = 0;
	do {
		queue[sequential_id] = clCreateCommandQueue(context[sequential_id],
		                       devices[sequential_id], 0, &ret_code);

		if (ret_code != CL_SUCCESS) {
			fprintf(stderr, "%u: Error creating command queue for "
			        "device %d (%d:%d): %s, %s\n", NODE,
			        sequential_id, get_platform_id(sequential_id),
			        get_device_id(sequential_id), get_error_name(ret_code),
			        retry < RACE_CONDITION_DEBUG ? "retrying" : "giving up");
			if (++retry > RACE_CONDITION_DEBUG)
				error();
			usleep((retry + NODE) * 100);
		}
	} while (ret_code != CL_SUCCESS);

#ifdef OCL_DEBUG
	fprintf(stderr, "  Device %d: %s\n", sequential_id, opencl_data);
#endif

	// Success.
	return 1;
}
661 
662 /* Add one requested OpenCL device to the list of the requested devices
663  * - it only adds a device that is working properly;
664  * - so, the device is initialized inside the routine;
665  */
add_device_to_list(int sequential_id)666 static void add_device_to_list(int sequential_id)
667 {
668 	int i = 0, found;
669 
670 	found = get_if_device_is_in_use(sequential_id);
671 
672 	if (found < 0) {
673 #if HAVE_MPI
674 		if (mpi_p > 1)
675 			fprintf(stderr, "%u@%s: ", mpi_id + 1, mpi_name);
676 #elif OS_FORK
677 		if (options.fork)
678 			fprintf(stderr, "%u: ", options.node_min);
679 #endif
680 		fprintf(stderr, "Error: --device must be between 1 and %d "
681 		        "(the number of devices available).\n",
682 		        get_number_of_available_devices());
683 		error();
684 	}
685 
686 	if (found == 0) {
687 		// Only requested and working devices should be started.
688 		if (! start_opencl_device(sequential_id, &i)) {
689 #if HAVE_MPI
690 			if (mpi_p > 1)
691 				fprintf(stderr, "%u@%s: ", mpi_id + 1, mpi_name);
692 #elif OS_FORK
693 			if (options.fork)
694 				fprintf(stderr, "%u: ", options.node_min);
695 #endif
696 			fprintf(stderr, "Device id %d not working correctly,"
697 			        " skipping.\n", sequential_id + 1);
698 			return;
699 		}
700 		engaged_devices[get_number_of_devices_in_use() + 1] = DEV_LIST_END;
701 		engaged_devices[get_number_of_devices_in_use()] = sequential_id;
702 	}
703 	// The full list of requested devices.
704 	requested_devices[get_number_of_requested_devices() + 1] = DEV_LIST_END;
705 	requested_devices[get_number_of_requested_devices()] = sequential_id;
706 }
707 
/*
 * Used below (inside add_device_type routine) to sort devices.
 * One entry per detected device.
 */
typedef struct {
	int index;          /* sequential id of the device */
	cl_device_id ID;    /* OpenCL handle of the device */
	unsigned int value; /* result of opencl_speed_index() for the device */
} speed_sort_t;
714 
715 /* Used below (inside add_device_type routine) to sort devices */
comparator(const void * p1,const void * p2)716 static int comparator(const void *p1, const void *p2)
717 {
718 	const speed_sort_t *c1 = (const speed_sort_t *)p1;
719 	const speed_sort_t *c2 = (const speed_sort_t *)p2;
720 	int diff = (int)c2->value - (int)c1->value;
721 	if (diff)
722 		return diff;
723 	return c1->index - c2->index;
724 }
725 
726 /* Add groups of devices to requested OpenCL devices list */
add_device_type(cl_ulong device_type,int top)727 static void add_device_type(cl_ulong device_type, int top)
728 {
729 	int i, j, sequence_nr = 0;
730 	int found = 0;
731 	speed_sort_t dev[MAX_GPU_DEVICES];
732 
733 	// Get all devices of requested type.
734 	for (i = 0; platforms[i].platform; i++) {
735 		cl_device_id devices[MAX_GPU_DEVICES];
736 		cl_uint device_num = 0;
737 
738 		if (clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_ALL,
739 				MAX_GPU_DEVICES, devices, &device_num) == CL_SUCCESS) {
740 			// Sort devices by speed
741 			for (j = 0; j < device_num && sequence_nr < MAX_GPU_DEVICES;
742 			     j++, sequence_nr++) {
743 				load_device_info(sequence_nr);
744 				dev[sequence_nr].index = sequence_nr;
745 				dev[sequence_nr].ID = devices[j];
746 				dev[sequence_nr].value = opencl_speed_index(sequence_nr);
747 			}
748 		}
749 	}
750 
751 	// If there is something to sort, do it.
752 	if (sequence_nr > 1)
753 		qsort(dev, sequence_nr, sizeof(dev[0]), comparator);
754 
755 	// Add the devices sorted by speed devices
756 	for (j = 0; j < sequence_nr; j++) {
757 		cl_ulong long_entries = 0;
758 
759 		if (clGetDeviceInfo(dev[j].ID, CL_DEVICE_TYPE,
760 			sizeof(cl_ulong), &long_entries, NULL) == CL_SUCCESS) {
761 			if (long_entries & device_type) {
762 				found++;
763 				add_device_to_list(dev[j].index);
764 
765 				// Only the best should be added
766 				if (top)
767 					break;
768 			}
769 		}
770 	}
771 	// If testing preferred devices, do not warn or fail
772 	if (!found && !default_device_selected)
773 		error_msg("No OpenCL device of that type found\n");
774 }
775 
776 /* Build a list of the requested OpenCL devices */
build_device_list(const char * device_list[MAX_GPU_DEVICES])777 static void build_device_list(const char *device_list[MAX_GPU_DEVICES])
778 {
779 	int n = 0;
780 
781 	while (device_list[n] && n < MAX_GPU_DEVICES) {
782 		int len = MAX(strlen(device_list[n]), 3);
783 		/* Add devices in the preferable order: gpu,
784 		 * accelerator, and cpu. */
785 		cl_device_type trial_list[] = {
786 			CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR,
787 			CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT
788 		};
789 
790 		if (!strcmp(device_list[n], "all"))
791 			add_device_type(CL_DEVICE_TYPE_ALL, 0);
792 		else if (!strcmp(device_list[n], "cpu"))
793 			add_device_type(CL_DEVICE_TYPE_CPU, 0);
794 		else if (!strcmp(device_list[n], "gpu"))
795 			add_device_type(CL_DEVICE_TYPE_GPU, 0);
796 		else if (!strncmp(device_list[n], "accelerator", len))
797 			add_device_type(CL_DEVICE_TYPE_ACCELERATOR, 0);
798 		else if (!strncmp(device_list[n], "best", len)) {
799 			int i = 0, top = (options.fork ? 0 : 1);
800 
801 			/* Set a flag that JtR has changed the value of --devices. */
802 			default_device_selected = 1;
803 			if (top)
804 				default_gpu_selected = 1;
805 
806 			do
807 				add_device_type(trial_list[i++], top);
808 			while (get_number_of_devices_in_use() == 0 &&
809 			         trial_list[i] != CL_DEVICE_TYPE_DEFAULT);
810 		}
811 		else if (!isdigit(ARCH_INDEX(device_list[n][0]))) {
812 			fprintf(stderr, "Error: --device must be numerical, "
813 			        "or one of \"all\", \"cpu\", \"gpu\" and\n"
814 			        "\"acc[elerator]\".\n");
815 			error();
816 		} else if (device_list[n][0] == '0') {
817 			fprintf(stderr, "Error: --device must be between 1 and %d "
818 			          "(the number of devices available).\n",
819 			          get_number_of_available_devices());
820 			error();
821 		} else
822 			add_device_to_list(atoi(device_list[n]) - 1);
823 		n++;
824 	}
825 }
826 
827 /*
828  * Load the OpenCL environment
829  * - fill in the "existing" devices list (devices[] variable) and;
830  * - fill in the "in use" devices list (engaged_devices[] variable);
831  *   - device was initialized;
832  *   - do not count duplicates;
833  *     --device=2,2 result that "one" device is really in use;
834  * - fill in the "all requested" devices list (requested_devices[] variable);
835  *   - device was initialized;
836  *   - count duplicates;
837  *     --device=2,2 result that "two" devices will be used, e.g., to split tasks;
838  *
839  * Warn if no device is found
840  * On MPI, hide devices from other instances
841  */
opencl_load_environment(void)842 void opencl_load_environment(void)
843 {
844 	char *env;
845 
846 	// Prefer COMPUTE over DISPLAY and lacking both, assume :0
847 	env = getenv("COMPUTE");
848 	if (env && *env)
849 		setenv("DISPLAY", env, 1);
850 	else {
851 		// We assume that 10 dot something is X11
852 		// forwarding so we override that too.
853 		env = getenv("DISPLAY");
854 		if (!env || !*env || strstr(env, ":10."))
855 			setenv("DISPLAY", ":0", 1);
856 	}
857 
858 	if (!opencl_initialized) {
859 		int i;
860 		const char *cmdline_devices[MAX_GPU_DEVICES];
861 
862 		nvidia_probe();
863 		amd_probe();
864 
865 		// Initialize OpenCL global control variables
866 		cmdline_devices[0] = NULL;
867 		engaged_devices[0] = DEV_LIST_END;
868 		requested_devices[0] = DEV_LIST_END;
869 
870 		for (i = 0; i < MAX_GPU_DEVICES; i++) {
871 			context[i] = NULL;
872 			queue[i] = NULL;
873 		}
874 
875 		// Read the GPU temperature setting to abort
876 		gpu_temp_limit = cfg_get_int(SECTION_OPTIONS, SUBSECTION_GPU,
877 		             "AbortTemperature");
878 		cool_gpu_down = cfg_get_int(SECTION_OPTIONS, SUBSECTION_GPU,
879 		             "SleepOnTemperature");
880 
881 		// Load information about available platforms and devices
882 		load_opencl_environment();
883 
884 		// Ensure that there is at least one OpenCL device available
885 		if (get_number_of_available_devices() == 0) {
886 			fprintf(stderr, "No OpenCL devices found\n");
887 			error();
888 		}
889 
890 		// Get the "--device" list requested by the user
891 		{
892 			int n = 0;
893 			struct list_entry *current;
894 
895 			if ((current = options.acc_devices->head)) {
896 				do {
897 					cmdline_devices[n++] = current->data;
898 				} while ((current = current->next) && n < MAX_GPU_DEVICES);
899 
900 				cmdline_devices[n] = NULL;
901 			} else
902 				gpu_id = NO_GPU;
903 		}
904 
905 		// If none selected, read the "--device" from the configuration file
906 		if (!options.acc_devices->head && gpu_id <= NO_GPU) {
907 			const char *devcfg;
908 
909 			if ((devcfg = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
910 			                            "Device")) && *devcfg) {
911 				cmdline_devices[0] = devcfg;
912 				cmdline_devices[1] = NULL;
913 			}
914 		}
915 
916 		// No "--device" requested. Pick the most powerful GPU as the default one.
917 		if (!cmdline_devices[0]) {
918 			cmdline_devices[0] = "best";
919 			cmdline_devices[1] = NULL;
920 		}
921 
922 		// Build the list of requested (and working) OpenCL devices
923 		build_device_list(cmdline_devices);
924 
925 		// No working OpenCL device was found
926 		if (get_number_of_devices_in_use() == 0) {
927 			fprintf(stderr, "No OpenCL devices found\n");
928 			error();
929 		}
930 #if OS_FORK
931 		// Poor man's multi-device support.
932 		if ((options.fork ? options.fork : 1) > 1 && options.acc_devices->count) {
933 			// Pick device to use for this node
934 			gpu_id = requested_devices[(options.node_min - 1) %
935 			    get_number_of_requested_devices()];
936 
937 			// Hide any other devices from list
938 			engaged_devices[0] = gpu_id;
939 			engaged_devices[1] = DEV_LIST_END;
940 		} else
941 #endif
942 
943 #ifdef HAVE_MPI
944 		// Poor man's multi-device support.
945 		if (mpi_p > 1 && mpi_p_local > 1) {
946 			// Pick device to use for this node
947 			gpu_id = engaged_devices[mpi_id % get_number_of_devices_in_use()];
948 
949 			// Hide any other devices from list
950 			engaged_devices[0] = gpu_id;
951 			engaged_devices[1] = DEV_LIST_END;
952 		} else
953 #endif
954 			gpu_id = engaged_devices[0];
955 		platform_id = get_platform_id(gpu_id);
956 
957 		opencl_initialized = 1;
958 	}
959 }
960 
961 /* Get the device preferred vector width */
opencl_get_vector_width(int sequential_id,int size)962 unsigned int opencl_get_vector_width(int sequential_id, int size)
963 {
964 	/* --force-scalar option, or john.conf ForceScalar boolean */
965 	if (options.flags & FLG_SCALAR)
966 		options.v_width = 1;
967 
968 	/* --force-vector-width=N */
969 	if (options.v_width) {
970 		ocl_v_width = options.v_width;
971 	} else {
972 		cl_uint v_width = 0;
973 
974 		// If OpenCL has not yet been loaded, load it now
975 		opencl_load_environment();
976 
977 		/* OK, we supply the real figure */
978 		switch (size) {
979 		case sizeof(cl_char):
980 			HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
981 				CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
982 				sizeof(v_width), &v_width, NULL),
983 			               "clGetDeviceInfo for char vector width");
984 			break;
985 		case sizeof(cl_short):
986 			HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
987 				CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
988 				sizeof(v_width), &v_width, NULL),
989 			               "clGetDeviceInfo for short vector width");
990 			break;
991 		case sizeof(cl_int):
992 			HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
993 				CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
994 				sizeof(v_width), &v_width, NULL),
995 			               "clGetDeviceInfo for int vector width");
996 			break;
997 		case sizeof(cl_long):
998 			HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
999 				CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
1000 				sizeof(v_width), &v_width, NULL),
1001 			               "clGetDeviceInfo for long vector width");
1002 			break;
1003 		default:
1004 			fprintf(stderr, "%s() called with unknown type\n", __FUNCTION__);
1005 			error();
1006 		}
1007 		ocl_v_width = v_width;
1008 	}
1009 	return ocl_v_width;
1010 }
1011 
1012 /* Called by core after calling format's done() */
opencl_done()1013 void opencl_done()
1014 {
1015 	int i;
1016 	int num_devices;
1017 
1018 	printed_mask = 0;
1019 
1020 	if (!opencl_initialized)
1021 		return;
1022 
1023 	num_devices = get_number_of_devices_in_use();
1024 
1025 	for (i = 0; i < num_devices; i++) {
1026 		if (queue[engaged_devices[i]])
1027 			HANDLE_CLERROR(clReleaseCommandQueue(queue[engaged_devices[i]]),
1028 			               "clReleaseCommandQueue");
1029 		queue[engaged_devices[i]] = NULL;
1030 		if (context[engaged_devices[i]])
1031 			HANDLE_CLERROR(clReleaseContext(context[engaged_devices[i]]),
1032 			               "clReleaseContext");
1033 		context[engaged_devices[i]] = NULL;
1034 		program[engaged_devices[i]] = NULL;
1035 	}
1036 
1037 	/* Reset in case we load another format after this */
1038 	local_work_size = global_work_size = duration_time = 0;
1039 	ocl_max_lws = 0;
1040 	ocl_v_width = 1;
1041 	fmt_base_name[0] = 0;
1042 	opencl_initialized = 0;
1043 	crypt_kernel = NULL;
1044 
1045 	engaged_devices[0] = engaged_devices[1] = DEV_LIST_END;
1046 }
1047 
/*
 * Build the name of a per-format configuration item by appending
 * config_name to format.
 *
 * The result lives in a single static buffer: it is overwritten by the next
 * call, and names that do not fit are truncated.
 */
static char *opencl_get_config_name(const char *format, const char *config_name)
{
	static char item_name[256];

	(void)snprintf(item_name, sizeof item_name, "%s%s", format, config_name);

	return item_name;
}
1055 
opencl_get_user_preferences(const char * format)1056 void opencl_get_user_preferences(const char *format)
1057 {
1058 	char *tmp_value;
1059 
1060 	if (format) {
1061 		snprintf(fmt_base_name, sizeof(fmt_base_name), "%s", format);
1062 		if ((tmp_value = strrchr(fmt_base_name, (int)'-')))
1063 			*tmp_value = 0;
1064 		strlwr(fmt_base_name);
1065 	} else
1066 		fmt_base_name[0] = 0;
1067 
1068 	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
1069 			opencl_get_config_name(fmt_base_name, LWS_CONFIG_NAME))))
1070 		local_work_size = atoi(tmp_value);
1071 
1072 	if (options.lws)
1073 		local_work_size = options.lws;
1074 	else if ((tmp_value = getenv("LWS")))
1075 		local_work_size = atoi(tmp_value);
1076 
1077 	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
1078 			opencl_get_config_name(fmt_base_name, GWS_CONFIG_NAME))))
1079 		global_work_size = atoi(tmp_value);
1080 
1081 	if (options.gws)
1082 		global_work_size = options.gws;
1083 	else if ((tmp_value = getenv("GWS")))
1084 		global_work_size = atoi(tmp_value);
1085 
1086 	if (local_work_size)
1087 		// Ensure a valid multiple is used.
1088 		global_work_size = GET_MULTIPLE_OR_ZERO(global_work_size,
1089 		                                        local_work_size);
1090 
1091 	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS,
1092 		SUBSECTION_OPENCL, opencl_get_config_name(fmt_base_name,
1093 		DUR_CONFIG_NAME))) && *tmp_value)
1094 		duration_time = atoi(tmp_value);
1095 	else if ((tmp_value = (char*)cfg_get_param(SECTION_OPTIONS,
1096 		SUBSECTION_OPENCL, "Global" DUR_CONFIG_NAME)) && *tmp_value)
1097 		duration_time = atoi(tmp_value);
1098 }
1099 
opencl_get_sane_lws_gws_values()1100 void opencl_get_sane_lws_gws_values()
1101 {
1102 	if (!local_work_size) {
1103 		if (cpu(device_info[gpu_id]))
1104 			local_work_size =
1105 				get_platform_vendor_id(platform_id) == DEV_INTEL ?
1106 			8 : 1;
1107 		else
1108 			local_work_size = 64;
1109 	}
1110 
1111 	if (!global_work_size)
1112 		global_work_size = 768;
1113 }
1114 
get_device_name_(int sequential_id)1115 char* get_device_name_(int sequential_id)
1116 {
1117 	static char device_name[MAX_OCLINFO_STRING_LEN];
1118 
1119 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
1120 	                               sizeof(device_name), device_name, NULL),
1121 	               "clGetDeviceInfo for DEVICE_NAME");
1122 
1123 	return device_name;
1124 }
1125 
1126 /* Print and log information about an OpenCL devide in use */
print_device_info(int sequential_id)1127 static void print_device_info(int sequential_id)
1128 {
1129 	static int printed[MAX_GPU_DEVICES];
1130 	char device_name[MAX_OCLINFO_STRING_LEN];
1131 	char board_name[LOG_SIZE] = "";
1132 	cl_int ret_code;
1133 
1134 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
1135 	                               sizeof(device_name), device_name, NULL),
1136 	               "clGetDeviceInfo for DEVICE_NAME");
1137 
1138 	ret_code = clGetDeviceInfo(devices[sequential_id],
1139 		CL_DEVICE_BOARD_NAME_AMD, sizeof(opencl_log), opencl_log, NULL);
1140 
1141 	if (ret_code == CL_SUCCESS) {
1142 		char *p = ltrim(rtrim(opencl_log));
1143 
1144 		if  (strlen(p))
1145 			sprintf(board_name, " [%s]", p);
1146 	}
1147 
1148 	if (options.verbosity > 1 && !printed[sequential_id]++)
1149 		fprintf(stderr, "Device %d%s%s: %s%s\n",
1150 		        sequential_id + 1,
1151 #if HAVE_MPI
1152 		        "@", mpi_name,
1153 #else
1154 		        "", "",
1155 #endif
1156 		        device_name, board_name);
1157 	log_event("Device %d: %s%s", sequential_id + 1, device_name, board_name);
1158 }
1159 
/*
 * Wrap a string in double quotes.
 *
 * Returns a newly allocated copy of orig with a '"' added at each end.
 * orig itself is freed, so the caller must only use the returned pointer
 * from then on.
 */
static char *quote_str(char *orig)
{
	size_t len = strlen(orig);
	/* Room for the text, the two quotes and the terminating NUL */
	char *quoted = mem_alloc(len + 3);

	quoted[0] = '"';
	memcpy(quoted + 1, orig, len);
	quoted[len + 1] = '"';
	quoted[len + 2] = 0;

	MEM_FREE(orig);

	return quoted;
}
1180 
#if defined(__CYGWIN__) || defined(__MINGW32__)
/*
 * Try to turn self_path into a directory that actually exists by treating
 * it as relative to the current or the parent directory.
 *
 * If self_path already names a directory, or is too long to be prefixed, it
 * is returned unchanged.  Otherwise self_path is freed and the function
 * returns either the first prefixed candidate that is a directory, or a
 * copy of the original path if no candidate matched.  In both of those
 * cases the returned string is newly allocated.
 */
static char *mingw_try_relative_path(char *self_path)
{
	/* Column 0 is used for paths starting with '/', column 1 otherwise */
	static const char *const prefixes[][2] = {
		{ ".",  "./"  },	/* Child */
		{ "..", "../" }		/* Root */
	};
	const int num_prefixes = (int)(sizeof(prefixes) / sizeof(prefixes[0]));
	struct stat st;
	char *origin, *fixed_path;
	int len, n;

	if (stat(self_path, &st) == 0 && S_ISDIR(st.st_mode))
		return self_path;

	/* Leave room for the longest prefix and the terminating NUL */
	len = strlen(self_path);
	if (len > PATH_BUFFER_SIZE - 4)
		return self_path;

	origin = (char *) mem_calloc(len + 1, sizeof(char));
	fixed_path = (char *) mem_calloc(PATH_BUFFER_SIZE, sizeof(char));
	strncpy(origin, self_path, len);
	MEM_FREE(self_path);

	for (n = 0; n < num_prefixes; n++) {
		const char *prefix =
			(origin[0] == '/') ? prefixes[n][0] : prefixes[n][1];

		strcpy(fixed_path, prefix);
		strncat(fixed_path, origin, len);

		if (stat(fixed_path, &st) == 0 && S_ISDIR(st.st_mode)) {
			MEM_FREE(origin);
			return fixed_path;
		}
	}

	/* Give up */
	MEM_FREE(fixed_path);
	return origin;
}
#endif
1230 
include_source(const char * pathname,int sequential_id,const char * opts)1231 static char *include_source(const char *pathname, int sequential_id, const char *opts)
1232 {
1233 	char *include, *full_path;
1234 	const char *global_opts;
1235 
1236 #if I_REALPATH
1237 	char *pex = (char*)path_expand_safe(pathname);
1238 
1239 	if (!(full_path = realpath(pex, NULL)))
1240 		pexit("realpath()");
1241 
1242 	MEM_FREE(pex);
1243 #else
1244 	full_path = (char*)path_expand_safe(pathname);
1245 #if defined(__CYGWIN__) || defined(__MINGW32__)
1246 	full_path = mingw_try_relative_path(full_path);
1247 #endif
1248 #endif
1249 
1250 	include = (char *) mem_calloc(LINE_BUFFER_SIZE, sizeof(char));
1251 
1252 	if (!(global_opts = getenv("OPENCLBUILDOPTIONS")))
1253 		if (!(global_opts = cfg_get_param(SECTION_OPTIONS,
1254 		    SUBSECTION_OPENCL, "GlobalBuildOpts")))
1255 			global_opts = OPENCLBUILDOPTIONS;
1256 
1257 	if (strchr(full_path, ' ')) {
1258 		full_path = quote_str(full_path);
1259 	}
1260 
1261 	snprintf(include, LINE_BUFFER_SIZE,
1262 	         "-I %s %s %s%s%s%s%d %s%d %s -D_OPENCL_COMPILER %s",
1263 	        full_path,
1264 	        global_opts,
1265 	        get_platform_vendor_id(get_platform_id(sequential_id)) ==
1266 	         PLATFORM_MESA ? "-D__MESA__ " :
1267 	        get_platform_vendor_id(get_platform_id(sequential_id)) ==
1268 	         PLATFORM_POCL ? "-D__POCL__ " :
1269 	        get_platform_vendor_id(get_platform_id(sequential_id)) ==
1270 	         PLATFORM_BEIGNET ?
1271 	         "-D__BEIGNET__ " :
1272 	        get_device_capability(sequential_id),
1273 #ifdef __APPLE__
1274 	        "-D__OS_X__ ",
1275 #else
1276 	        (options.verbosity >= VERB_MAX &&
1277 	         gpu_nvidia(device_info[sequential_id])) ?
1278 	         "-cl-nv-verbose " : "",
1279 #endif
1280 	        get_device_type(sequential_id) == CL_DEVICE_TYPE_CPU ? "-D__CPU__ "
1281 	        : get_device_type(sequential_id) == CL_DEVICE_TYPE_GPU ? "-D__GPU__ " : "",
1282 	        "-DDEVICE_INFO=", device_info[sequential_id],
1283 	        "-D__SIZEOF_HOST_SIZE_T__=", (int)sizeof(size_t),
1284 	        opencl_driver_ver(sequential_id),
1285 	        opts ? opts : "");
1286 
1287 	MEM_FREE(full_path);
1288 
1289 	return include;
1290 }
1291 
/*
 * Build an OpenCL program from source for one device and optionally cache
 * the resulting binary on disk.
 *
 * sequential_id       index into context[], devices[] and device_info[]
 * opts                extra build options appended to the common ones
 *                     (may be NULL)
 * save                non-zero to write the built binary to file_name
 * file_name           name of the binary cache file; replaced by
 *                     "<kernel basename>.bin" (and save forced on) when the
 *                     DUMP_BINARY environment variable is set
 * program             receives the created program object
 * kernel_source_file  path of the kernel source, used in messages and, for
 *                     MPI runs with more than one node, as the file that is
 *                     locked around the build
 * kernel_source       the kernel source text
 *
 * A failed build prints the options, the build log and DEVICE_INFO and then
 * goes through HANDLE_CLERROR.  Problems writing the cache file are only
 * reported with perror().
 */
void opencl_build(int sequential_id, const char *opts, int save, const char *file_name, cl_program *program, const char *kernel_source_file, const char *kernel_source)
{
	cl_int build_code, err_code;
	char *build_log, *build_opts;
	size_t log_size;
	const char *srcptr[] = { kernel_source };
#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
	int kludge_file = 0;
#endif

	/* This over-rides binary caching */
	if (getenv("DUMP_BINARY")) {
		char *bname = basename(kernel_source_file);
		char *ext = ".bin";
		int size = strlen(bname) + strlen(ext) + 1;
		char *name = mem_alloc_tiny(size, MEM_ALIGN_NONE);

		save = 1;
		snprintf(name, size, "%s%s", bname, ext);
		file_name = name;
	}

	*program =
	    clCreateProgramWithSource(context[sequential_id], 1, srcptr,
	                              NULL, &err_code);
	HANDLE_CLERROR(err_code, "clCreateProgramWithSource");
	// include source is thread safe.
	build_opts = include_source("$JOHN/kernels", sequential_id, opts);

	if (options.verbosity > VERB_LEGACY)
		fprintf(stderr, "Options used: %s %s\n", build_opts,
		        kernel_source_file);

	kernel_source_file = path_expand(kernel_source_file);

#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
	/*
	 * With several MPI nodes, take an exclusive lock on the kernel source
	 * file for the duration of the build.  It is dropped when the
	 * descriptor is closed at the end of this function.
	 */
	if (mpi_p > 1) {
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d %s kludge locking %s...\n",
			        NODE, __FUNCTION__, kernel_source_file);
#endif
		if ((kludge_file = open(kernel_source_file, O_RDWR | O_APPEND)) < 0) {
			pexit("Error opening kernel file");
		} else {
#if FCNTL_LOCKS
			struct flock lock;

			memset(&lock, 0, sizeof(lock));
			lock.l_type = F_WRLCK;
			/* Retry when interrupted by a signal */
			while (fcntl(kludge_file, F_SETLKW, &lock)) {
				if (errno != EINTR)
					pexit("fcntl(F_WRLCK)");
			}
#else
			while (flock(kludge_file, LOCK_EX)) {
				if (errno != EINTR)
					pexit("flock(LOCK_EX)");
			}
#endif /* FCNTL_LOCKS */
		}
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d got a kludge lock\n", NODE);
#endif
	}
#endif /* HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS) */

	build_code = clBuildProgram(*program, 0, NULL,
	                            build_opts, NULL, NULL);

	/* Ask for the log size first, then fetch the log itself */
	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, 0, NULL,
	                                     &log_size),
	               "clGetProgramBuildInfo I");
	build_log = (char *)mem_calloc(1, log_size + 1);

	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, log_size + 1,
	                                     (void *)build_log, NULL),
	               "clGetProgramBuildInfo II");

	// Report build errors and warnings
	if (build_code != CL_SUCCESS) {
		// Give us info about error and exit (through HANDLE_CLERROR)
		if (options.verbosity <= VERB_LEGACY)
			fprintf(stderr, "Options used: %s %s\n",
			        build_opts, kernel_source_file);
		if (strlen(build_log) > 1)
			fprintf(stderr, "Build log: %s\n", build_log);
		fprintf(stderr, "Error building kernel %s. DEVICE_INFO=%d\n",
		        kernel_source_file, device_info[sequential_id]);
		HANDLE_CLERROR(build_code, "clBuildProgram");
	}
	// Nvidia may return a single '\n' that we ignore
	else if (options.verbosity >= LOG_VERB && strlen(build_log) > 1)
		fprintf(stderr, "Build log: %s\n", build_log);

	MEM_FREE(build_log);
	MEM_FREE(build_opts);

	if (save) {
		FILE *file;
		size_t source_size;
		char *source, *full_path;

		HANDLE_CLERROR(clGetProgramInfo(*program,
		                                CL_PROGRAM_BINARY_SIZES,
		                                sizeof(size_t), &source_size, NULL),
		               "clGetProgramInfo for CL_PROGRAM_BINARY_SIZES");

		if (options.verbosity >= VERB_MAX)
			fprintf(stderr, "binary size "Zu"\n", source_size);

		source = mem_calloc(1, source_size);

		HANDLE_CLERROR(clGetProgramInfo(*program,
		                                CL_PROGRAM_BINARIES,
		                                sizeof(char *), &source, NULL),
		               "clGetProgramInfo for CL_PROGRAM_BINARIES");

		/*
		 * Open in binary mode: the program binary has to reach the
		 * file byte for byte, and a text-mode stream may translate
		 * newlines on platforms that distinguish the two modes.
		 */
		file = fopen(full_path = (char*)path_expand_safe(file_name), "wb");
		MEM_FREE(full_path);

		if (file == NULL)
			perror("Error creating binary cache file");
		else {
#if OS_FLOCK || FCNTL_LOCKS
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d %s locking %s...\n", NODE, __FUNCTION__, file_name);
#endif
			{
#if FCNTL_LOCKS
				struct flock lock;

				memset(&lock, 0, sizeof(lock));
				lock.l_type = F_WRLCK;
				while (fcntl(fileno(file), F_SETLKW, &lock)) {
					if (errno != EINTR)
						pexit("fcntl(F_WRLCK)");
				}
#else
				while (flock(fileno(file), LOCK_EX)) {
					if (errno != EINTR)
						pexit("flock(LOCK_EX)");
				}
#endif
			}
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d got a lock on %s\n", NODE, file_name);
#endif
#endif /* OS_FLOCK || FCNTL_LOCKS */
			if (fwrite(source, source_size, 1, file) != 1)
				perror("Error caching kernel binary");
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d closing %s\n", NODE, file_name);
#endif
			/* Buffered data is flushed here, so a failure matters */
			if (fclose(file))
				perror("Error closing binary cache file");
		}
		MEM_FREE(source);
	}

#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
#if RACE_CONDITION_DEBUG
	if (mpi_p > 1 && options.verbosity == VERB_DEBUG)
		fprintf(stderr, "Node %d releasing kludge lock\n", NODE);
#endif
	if (mpi_p > 1)
		close(kludge_file);
#endif /* HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS) */
}
1468 
/*
 * Create and build an OpenCL program from a previously cached binary.
 *
 * sequential_id  index into context[], devices[] and device_info[]
 * program        receives the created program object
 * kernel_source  the binary image (despite the name)
 * program_size   size of that image in bytes
 *
 * A failed build prints the build log and DEVICE_INFO and then goes through
 * HANDLE_CLERROR.
 */
void opencl_build_from_binary(int sequential_id, cl_program *program, const char *kernel_source, size_t program_size)
{
	cl_int build_code, err_code;
	char *build_log;
	size_t log_size;
	const char *srcptr[] = { kernel_source };

	*program =
	    clCreateProgramWithBinary(context[sequential_id], 1,
	                              &devices[sequential_id], &program_size,
	                              (const unsigned char **)srcptr,
	                              NULL, &err_code);
	HANDLE_CLERROR(err_code,
	               "clCreateProgramWithBinary (using cached binary)");

	build_code = clBuildProgram(*program, 0,
	                            NULL, NULL, NULL, NULL);

	/*
	 * Size the log buffer from what the driver reports, as opencl_build()
	 * does.  With a fixed-size buffer a longer log makes the query itself
	 * fail, which would hide the real build result behind an unrelated
	 * error.
	 */
	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, 0, NULL,
	                                     &log_size),
	               "clGetProgramBuildInfo (using cached binary)");
	build_log = (char *) mem_calloc(1, log_size + 1);

	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, log_size + 1,
	                                     (void *)build_log,
	                                     NULL),
	               "clGetProgramBuildInfo (using cached binary)");

	// Report build errors and warnings
	if (build_code != CL_SUCCESS) {
		// Give us info about error and exit (through HANDLE_CLERROR)
		if (strlen(build_log) > 1)
			fprintf(stderr, "Binary build log: %s\n", build_log);
		fprintf(stderr, "Error %d building kernel using cached binary."
		        " DEVICE_INFO=%d\n", build_code, device_info[sequential_id]);
		HANDLE_CLERROR(build_code, "clBuildProgram");
	}
	// Nvidia may return a single '\n' that we ignore
	else if (options.verbosity >= LOG_VERB && strlen(build_log) > 1)
		fprintf(stderr, "Binary Build log: %s\n", build_log);

	MEM_FREE(build_log);
}
1509 
1510 // Do the proper test using different global work sizes.
clear_profiling_events()1511 static void clear_profiling_events()
1512 {
1513 	int i;
1514 
1515 	// Release events
1516 	for (i = 0; i < MAX_EVENTS; i++) {
1517 		if (multi_profilingEvent[i] && *multi_profilingEvent[i])
1518 			HANDLE_CLERROR(clReleaseEvent(*multi_profilingEvent[i]),
1519 			               "clReleaseEvent");
1520 
1521 		if (multi_profilingEvent[i])
1522 			*multi_profilingEvent[i] = NULL;
1523 		multi_profilingEvent[i] = NULL;
1524 	}
1525 }
1526 
1527 // Fill [set_salt(), set_key()] the OpenCL device with data. Returns
1528 // salt, and fills binary pointer.
fill_opencl_device(size_t gws,void ** binary)1529 static void* fill_opencl_device(size_t gws, void **binary)
1530 {
1531 	int i;
1532 	size_t kpc = gws * ocl_v_width;
1533 	void *salt;
1534 
1535 	// Set keys - unique printable length-7 keys
1536 	self->methods.clear_keys();
1537 	{
1538 		char key[PLAINTEXT_BUFFER_SIZE];
1539 		int len = mask_add_len;
1540 
1541 		if (mask_add_len == 0 ||
1542 		    options.req_minlength != -1 || options.req_maxlength != 0) {
1543 			len = (self->params.benchmark_length & 0x7f);
1544 
1545 			if (len < options.req_minlength)
1546 				len = options.req_minlength;
1547 			if (options.req_maxlength && len > options.req_maxlength)
1548 				len = options.req_maxlength;
1549 		}
1550 		// Obey format's min and max length
1551 		len = MAX(len, self->params.plaintext_min_length);
1552 		len = MIN(len, self->params.plaintext_length);
1553 
1554 		if (options.verbosity == VERB_DEBUG)
1555 			fprintf(stderr, "Tuning to length %d\n", len);
1556 
1557 		memset(key, 0x41, sizeof(key));
1558 		key[len] = 0;
1559 
1560 		for (i = 0; i < kpc; i++) {
1561 			int l = len - 1;
1562 
1563 			self->methods.set_key(key, i);
1564 			while (l >= 0 && ++key[l] > 0x60)
1565 				key[l--] = 0x21;
1566 		}
1567 	}
1568 
1569 	// Set salt
1570 	dyna_salt_init(self);
1571 	if (self->methods.tunable_cost_value[0] && autotune_db->real) {
1572 		struct db_main *db = autotune_db->real;
1573 		struct db_salt *s = db->salts;
1574 
1575 		while (s->next && s->cost[0] < db->max_cost[0])
1576 			s = s->next;
1577 		salt = s->salt;
1578 		*binary = s->list->binary;
1579 	} else {
1580 		char *ciphertext;
1581 
1582 		if (!self->params.tests[0].fields[1])
1583 			self->params.tests[0].fields[1] = self->params.tests[0].ciphertext;
1584 		ciphertext = self->methods.prepare(self->params.tests[0].fields, self);
1585 		ciphertext = self->methods.split(ciphertext, 0, self);
1586 		salt = self->methods.salt(ciphertext);
1587 		*binary = self->methods.binary(ciphertext);
1588 		if (salt)
1589 			dyna_salt_create(salt);
1590 	}
1591 	self->methods.set_salt(salt);
1592 
1593 	return salt;
1594 }
1595 
1596 // Do a test run with a specific global work size, return total duration
1597 // (or return zero for error or limits exceeded)
gws_test(size_t gws,unsigned int rounds,int sequential_id)1598 static cl_ulong gws_test(size_t gws, unsigned int rounds, int sequential_id)
1599 {
1600 	cl_ulong startTime, endTime, runtime = 0, looptime = 0;
1601 	int i, count, total = 0;
1602 	size_t kpc = gws * ocl_v_width;
1603 	cl_event benchEvent[MAX_EVENTS];
1604 	int result, number_of_events = 0;
1605 	void *salt, *binary;
1606 	int amd_bug;
1607 
1608 	for (i = 0; i < MAX_EVENTS; i++)
1609 		benchEvent[i] = NULL;
1610 
1611 	// Ensure format knows its GWS
1612 	global_work_size = gws;
1613 
1614 	// Prepare buffers.
1615 	create_clobj(gws, self);
1616 
1617 	// Transfer data to the OpenCL device
1618 	salt = fill_opencl_device(gws, &binary);
1619 
1620 	// Activate events. Then clear them later.
1621 	for (i = 0; i < MAX_EVENTS; i++)
1622 		multi_profilingEvent[i] = &benchEvent[i];
1623 
1624 	// Timing run
1625 	count = kpc;
1626 	result = self->methods.crypt_all(&count, autotune_salts);
1627 	if (result < 0) {
1628 		runtime = looptime = 0;
1629 
1630 		if (options.verbosity > VERB_LEGACY)
1631 			fprintf(stderr, " (error occurred)");
1632 		clear_profiling_events();
1633 		release_clobj();
1634 		if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
1635 			dyna_salt_remove(salt);
1636 		return 0;
1637 	}
1638 	self->methods.cmp_all(binary, result);
1639 
1640 	for (i = 0; (*multi_profilingEvent[i]); i++)
1641 		number_of_events++;
1642 
1643 	//** Get execution time **//
1644 	for (i = 0; i < number_of_events; i++) {
1645 		char mult[32] = "";
1646 
1647 		amd_bug = 0;
1648 
1649 		HANDLE_CLERROR(clWaitForEvents(1, multi_profilingEvent[i]),
1650 		               "clWaitForEvents");
1651 		HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
1652 		                                       CL_PROFILING_COMMAND_START,
1653 		                                       sizeof(cl_ulong), &startTime,
1654 		                                       NULL),
1655 		               "clGetEventProfilingInfo start");
1656 		HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
1657 		                                       CL_PROFILING_COMMAND_END,
1658 		                                       sizeof(cl_ulong), &endTime,
1659 		                                       NULL),
1660 		               "clGetEventProfilingInfo end");
1661 
1662 		/* Work around AMD bug. It randomly claims that a kernel
1663 		   run took less than a microsecond, fooling our auto tune */
1664 		if (endTime - startTime < 1000) {
1665 			amd_bug = 1;
1666 
1667 			HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
1668 			                                       CL_PROFILING_COMMAND_SUBMIT,
1669 			                                       sizeof(cl_ulong), &startTime,
1670 			                                       NULL),
1671 			               "clGetEventProfilingInfo submit");
1672 		}
1673 
1674 		/* Work around OSX bug with HD4000 driver */
1675 		if (endTime == 0)
1676 			endTime = startTime;
1677 
1678 		if ((split_events) && (i == split_events[0] ||
1679 		                       i == split_events[1] || i == split_events[2])) {
1680 			looptime += (endTime - startTime);
1681 			total++;
1682 
1683 			if (i == split_events[0])
1684 				sprintf(mult, "%dx", rounds / hash_loops);
1685 		} else
1686 			runtime += (endTime - startTime);
1687 
1688 		if (options.verbosity >= VERB_MAX)
1689 			fprintf(stderr, "%s%s%s%s", warnings[i], mult,
1690 			        ns2string(endTime - startTime), (amd_bug) ? "*" : "");
1691 
1692 		/* Single-invocation duration limit */
1693 		if (duration_time &&
1694 		    (endTime - startTime) > 1000000ULL * duration_time) {
1695 			runtime = looptime = 0;
1696 
1697 			if (options.verbosity >= VERB_MAX)
1698 				fprintf(stderr, " (exceeds %s)", ms2string(duration_time));
1699 			break;
1700 		}
1701 	}
1702 	if (options.verbosity >= VERB_MAX)
1703 		fprintf(stderr, "\n");
1704 
1705 	if (total)
1706 		runtime += (looptime * rounds) / (hash_loops * total);
1707 
1708 	clear_profiling_events();
1709 	release_clobj();
1710 
1711 	if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
1712 		dyna_salt_remove(salt);
1713 
1714 	return runtime;
1715 }
1716 
opencl_init_auto_setup(int p_default_value,int p_hash_loops,int * p_split_events,const char ** p_warnings,int p_main_opencl_event,struct fmt_main * p_self,void (* p_create_clobj)(size_t gws,struct fmt_main * self),void (* p_release_clobj)(void),int p_buffer_size,size_t p_gws_limit,struct db_main * db)1717 void opencl_init_auto_setup(int p_default_value, int p_hash_loops,
1718                             int *p_split_events, const char **p_warnings,
1719                             int p_main_opencl_event, struct fmt_main *p_self,
1720                             void (*p_create_clobj)(size_t gws, struct fmt_main *self),
1721                             void (*p_release_clobj)(void), int p_buffer_size, size_t p_gws_limit,
1722                             struct db_main *db)
1723 {
1724 	// Initialize events
1725 	clear_profiling_events();
1726 
1727 	// Get parameters
1728 	buffer_size = p_buffer_size;
1729 	default_value = p_default_value;
1730 	hash_loops = p_hash_loops;
1731 	split_events = p_split_events;
1732 	warnings = p_warnings;
1733 	main_opencl_event = p_main_opencl_event;
1734 	self = p_self;
1735 	create_clobj = p_create_clobj;
1736 	release_clobj = p_release_clobj;
1737 	gws_limit = p_gws_limit;
1738 	autotune_db = db;
1739 	autotune_real_db = db && db->real && db->real == db;
1740 	autotune_salts = db ? db->salts : NULL;
1741 }
1742 
1743 /*
1744  * Since opencl_find_best_gws() needs more event control (even more events) to
1745  * work properly, opencl_find_best_workgroup() cannot be used by formats that
1746  * are using it.  Therefore, despite the fact that opencl_find_best_lws() does
1747  * almost the same that opencl_find_best_workgroup() can do, it also handles
1748  * the necessary event(s) and can do a proper crypt_all() execution analysis
1749  * when shared GWS detection is used.
1750  */
opencl_find_best_lws(size_t group_size_limit,int sequential_id,cl_kernel crypt_kernel)1751 void opencl_find_best_lws(size_t group_size_limit, int sequential_id,
1752                           cl_kernel crypt_kernel)
1753 {
1754 	size_t gws;
1755 	cl_int ret_code;
1756 	int i, j, numloops, count, result;
1757 	size_t my_work_group, optimal_work_group;
1758 	size_t max_group_size, wg_multiple, sumStartTime, sumEndTime;
1759 	cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
1760 	cl_event benchEvent[MAX_EVENTS];
1761 	void *salt, *binary;
1762 
1763 	for (i = 0; i < MAX_EVENTS; i++)
1764 		benchEvent[i] = NULL;
1765 
1766 	gws = global_work_size;
1767 
1768 	if (options.verbosity > VERB_LEGACY)
1769 		fprintf(stderr, "Calculating best LWS for GWS="Zu"\n", gws);
1770 
1771 	if (get_device_version(sequential_id) < 110) {
1772 		if (get_device_type(sequential_id) == CL_DEVICE_TYPE_GPU)
1773 			wg_multiple = 32;
1774 		else if (get_platform_vendor_id(get_platform_id(sequential_id))
1775 		         == DEV_INTEL)
1776 			wg_multiple = 8;
1777 		else
1778 			wg_multiple = 1;
1779 	} else
1780 		wg_multiple = get_kernel_preferred_multiple(sequential_id,
1781 		              crypt_kernel);
1782 
1783 	if (platform_apple(get_platform_id(sequential_id)) &&
1784 	    cpu(device_info[sequential_id]))
1785 		max_group_size = 1;
1786 	else
1787 		max_group_size = ocl_max_lws ?
1788 			ocl_max_lws : get_kernel_max_lws(sequential_id, crypt_kernel);
1789 
1790 	if (max_group_size > group_size_limit)
1791 		// Needed to deal (at least) with cryptsha512-opencl limits.
1792 		max_group_size = group_size_limit;
1793 
1794 	// Safety harness
1795 	if (wg_multiple > max_group_size)
1796 		wg_multiple = max_group_size;
1797 
1798 	// Change command queue to be used by crypt_all (profile needed)
1799 	clReleaseCommandQueue(queue[sequential_id]);
1800 
1801 	// Create a new queue with profiling enabled
1802 	queue[sequential_id] =
1803 	    clCreateCommandQueue(context[sequential_id],
1804 	                         devices[sequential_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
1805 	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");
1806 
1807 	// Transfer data to the OpenCL device
1808 	salt = fill_opencl_device(gws, &binary);
1809 
1810 	// Warm-up run
1811 	local_work_size = wg_multiple;
1812 	count = global_work_size * ocl_v_width;
1813 	result = self->methods.crypt_all(&count, autotune_salts);
1814 	if (result > 0)
1815 		self->methods.cmp_all(binary, result);
1816 
1817 	// Activate events. Then clear them later.
1818 	for (i = 0; i < MAX_EVENTS; i++)
1819 		multi_profilingEvent[i] = &benchEvent[i];
1820 
1821 	// Timing run
1822 	count = global_work_size * ocl_v_width;
1823 	result = self->methods.crypt_all(&count, autotune_salts);
1824 	if (result > 0)
1825 		self->methods.cmp_all(binary, result);
1826 
1827 	HANDLE_CLERROR(clWaitForEvents(1, &benchEvent[main_opencl_event]),
1828 	               "clWaitForEvents");
1829 	HANDLE_CLERROR(clFinish(queue[sequential_id]), "clFinish");
1830 	HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent[main_opencl_event],
1831 	                                       CL_PROFILING_COMMAND_START,
1832 	                                       sizeof(cl_ulong),
1833 	                                       &startTime, NULL),
1834 	               "clGetEventProfilingInfo start");
1835 
1836 	HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent[main_opencl_event],
1837 	                                       CL_PROFILING_COMMAND_END,
1838 	                                       sizeof(cl_ulong), &endTime, NULL),
1839 	               "clGetEventProfilingInfo end");
1840 	cl_ulong roundup = endTime - startTime - 1;
1841 	numloops = (int)(size_t)((200000000ULL + roundup) / (endTime - startTime));
1842 
1843 	clear_profiling_events();
1844 
1845 	if (numloops < 1)
1846 		numloops = 1;
1847 
1848 	// Find minimum time
1849 	for (optimal_work_group = my_work_group = wg_multiple;
1850 	        (int)my_work_group <= (int)max_group_size;
1851 	        my_work_group += wg_multiple) {
1852 
1853 		global_work_size = gws;
1854 		if (gws % my_work_group != 0) {
1855 
1856 			if (GET_EXACT_MULTIPLE(gws, my_work_group) > global_work_size)
1857 			    continue;
1858 			global_work_size = GET_EXACT_MULTIPLE(gws, my_work_group);
1859 		}
1860 
1861 		if (options.verbosity > VERB_LEGACY)
1862 			fprintf(stderr, "Testing LWS=" Zu " GWS=" Zu " ...", my_work_group,
1863 			        global_work_size);
1864 
1865 		sumStartTime = 0;
1866 		sumEndTime = 0;
1867 
1868 		for (i = 0; i < numloops; i++) {
1869 			advance_cursor();
1870 			local_work_size = my_work_group;
1871 
1872 			// Activate events. Then clear them later.
1873 			for (j = 0; j < MAX_EVENTS; j++)
1874 				multi_profilingEvent[j] = &benchEvent[j];
1875 
1876 			count = global_work_size * ocl_v_width;
1877 			result = self->methods.crypt_all(&count, autotune_salts);
1878 			if (result < 0) {
1879 				startTime = endTime = 0;
1880 				break;
1881 			}
1882 			self->methods.cmp_all(binary, result);
1883 
1884 			HANDLE_CLERROR(clWaitForEvents(1, &benchEvent[main_opencl_event]),
1885 			               "clWaitForEvents");
1886 			HANDLE_CLERROR(clFinish(queue[sequential_id]), "clFinish");
1887 			HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent
1888 				[main_opencl_event], CL_PROFILING_COMMAND_START,
1889 				sizeof(cl_ulong), &startTime, NULL),
1890 			               "clGetEventProfilingInfo start");
1891 			HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent
1892 				[main_opencl_event], CL_PROFILING_COMMAND_END,
1893 				sizeof(cl_ulong), &endTime, NULL),
1894 			               "clGetEventProfilingInfo end");
1895 
1896 			sumStartTime += startTime;
1897 			sumEndTime += endTime;
1898 
1899 			clear_profiling_events();
1900 		}
1901 
1902 		/* Erase the 'spinning wheel' cursor */
1903 		if (john_main_process)
1904 			fprintf(stderr, " \b");
1905 
1906 		if (!endTime)
1907 			break;
1908 		if (options.verbosity > VERB_LEGACY)
1909 			fprintf(stderr, " %s%s\n", ns2string(sumEndTime - sumStartTime),
1910 			    ((double)(sumEndTime - sumStartTime) / kernelExecTimeNs < 0.997)
1911 			        ? "+" : "");
1912 		if ((double)(sumEndTime - sumStartTime) / kernelExecTimeNs < 0.997) {
1913 			kernelExecTimeNs = sumEndTime - sumStartTime;
1914 			optimal_work_group = my_work_group;
1915 		} else {
1916 			if (my_work_group >= 256 ||
1917 			    (my_work_group >= 8 && wg_multiple < 8)) {
1918 				/* Jump to next power of 2 */
1919 				size_t x, y;
1920 				x = my_work_group;
1921 				while ((y = x & (x - 1)))
1922 					x = y;
1923 				x *= 2;
1924 				my_work_group =
1925 				    GET_NEXT_MULTIPLE(x, wg_multiple);
1926 				/* The loop logic will re-add wg_multiple */
1927 				my_work_group -= wg_multiple;
1928 			}
1929 		}
1930 	}
1931 	// Release profiling queue and create new with profiling disabled
1932 	HANDLE_CLERROR(clReleaseCommandQueue(queue[sequential_id]),
1933 	               "clReleaseCommandQueue");
1934 	queue[sequential_id] =
1935 	    clCreateCommandQueue(context[sequential_id],
1936 	                         devices[sequential_id], 0, &ret_code);
1937 	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");
1938 	local_work_size = optimal_work_group;
1939 	global_work_size = GET_EXACT_MULTIPLE(gws, local_work_size);
1940 
1941 	if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
1942 		dyna_salt_remove(salt);
1943 }
1944 
/*
 * Render a speed figure as a short string such as "2000Kc/s".
 *
 * The value is divided by 1000 (at most four times, giving the K, M, G and
 * T prefixes) for as long as it is still larger than one million.
 *
 * Returns a pointer to a static buffer that the next call overwrites.
 */
static char *human_speed(unsigned long long int speed)
{
	static const char prefixes[] = { 'K', 'M', 'G', 'T' };
	static char out[32];
	char unit = '\0';
	unsigned int step;

	for (step = 0; step < sizeof(prefixes) && speed > 1000000; step++) {
		speed /= 1000;
		unit = prefixes[step];
	}

	if (unit == '\0')
		snprintf(out, sizeof(out), "%lluc/s", speed);
	else
		snprintf(out, sizeof(out), "%llu%cc/s", speed, unit);

	return out;
}
1973 
get_bitmap_size_bits(uint32_t num_elements,int sequential_id)1974 uint32_t get_bitmap_size_bits(uint32_t num_elements, int sequential_id)
1975 {
1976 	uint32_t size, elements = num_elements;
1977 	//On super: 128MB , 1GB, 2GB
1978 	cl_ulong memory_available = get_max_mem_alloc_size(sequential_id);
1979 
1980 	get_power_of_two(elements);
1981 
1982 	size = (elements * 8);
1983 
1984 	if (num_elements < (16))
1985 		size = (16 * 1024 * 8); //Cache?
1986 	else if (num_elements < (128))
1987 		size = (1024 * 1024 * 8 * 16);
1988 	else if (num_elements < (16 * 1024))
1989 		size *= 1024 * 4;
1990 	else
1991 		size *= 256;
1992 
1993 	if (size > memory_available) {
1994 		size = memory_available;
1995 		get_power_of_two(size);
1996 
1997 	}
1998 	if (!size || size > INT_MAX)
1999 		size = (uint32_t)INT_MAX + 1U;
2000 
2001 	return size;
2002 }
2003 
/*
 * Auto-tune the global work size (GWS) for the current local_work_size.
 *
 * A forward scan tries the sizes produced by autotune_get_next_gws_size()
 * and keeps the fastest one; a backward scan then tries the sizes produced
 * by autotune_get_prev_gws_size() and keeps stepping down for as long as
 * the speed does not drop. Timing comes from gws_test(); a zero run time
 * from it ends the scan in progress.
 *
 * step          - passed through to the next/prev GWS size helpers.
 * max_duration  - cap for the global duration_time while tuning; the
 *                 previous duration_time is restored before returning.
 * sequential_id - index of the device whose command queue is used.
 * rounds        - multiplier applied to the raw speed to get "rounds/s".
 * have_lws      - non-zero: start from local_work_size (made a common
 *                 multiple of the core count when that is above 2) and
 *                 store it in default_value; zero: only set a soft limit
 *                 of local_work_size * core_count * 128.
 *
 * On return global_work_size holds the chosen size, global_speed the raw
 * speed measured for it, and the device queue has been re-created with
 * profiling disabled.
 */
void opencl_find_best_gws(int step, int max_duration,
                          int sequential_id, unsigned int rounds, int have_lws)
{
	size_t num = 0;
	size_t optimal_gws = local_work_size, soft_limit = 0;
	unsigned long long speed, best_speed = 0, raw_speed;
	cl_ulong run_time;
	int save_duration_time = duration_time;
	cl_uint core_count = get_processors_count(sequential_id);

	if (have_lws) {
		if (core_count > 2)
			optimal_gws = lcm(core_count, optimal_gws);
		default_value = optimal_gws;
	} else {
		soft_limit = local_work_size * core_count * 128;
	}

	/* conf setting may override (decrease) code's max duration */
	if (!duration_time || max_duration < duration_time)
		duration_time = max_duration;

	if (options.verbosity > VERB_DEFAULT) {
		/* printed_mask makes sure the multiplier line is shown only once */
		if (mask_int_cand.num_int_cand > 1 && !printed_mask++)
			fprintf(stderr, "Internal mask, multiplier: %u (target: %u)\n",
			        mask_int_cand.num_int_cand, mask_int_cand_target);
		else if (mask_int_cand_target > 1 && !printed_mask)
			fprintf(stderr, "Internal mask not utilized (target: %u)\n",
			        mask_int_cand_target);
	}
	if (options.verbosity > VERB_LEGACY) {
		fprintf(stderr, "Calculating best GWS for LWS="Zu"; "
		        "max. %s single kernel invocation.\n",
		        local_work_size, ms2string(duration_time));
	}

	if (options.verbosity >= VERB_MAX)
		fprintf(stderr, "Raw speed figures including buffer transfers:\n");

	// Change command queue to be used by crypt_all (profile needed)
	// NOTE(review): unlike the release at the end of this function, the
	// result of this call is not checked.
	clReleaseCommandQueue(queue[sequential_id]);    // Delete old queue

	// Create a new queue with profiling enabled
	queue[sequential_id] =
	    clCreateCommandQueue(context[sequential_id],
	                         devices[sequential_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");

	/* Forward run: the loop only ends through one of the breaks below */
	for (num = autotune_get_next_gws_size(num, step, 1, default_value);;
	        num = autotune_get_next_gws_size(num, step, 0, default_value)) {
		size_t kpc = num * ocl_v_width;

		// Check if hardware can handle the size we are going
		// to try now.
		// NOTE(review): the allocation limit is queried for gpu_id, not
		// for sequential_id - confirm these are always the same device.
		if ((soft_limit && (num > soft_limit)) ||
		    (gws_limit && (num > gws_limit)) || ((gws_limit == 0) &&
		    (buffer_size * kpc * 1.1 > get_max_mem_alloc_size(gpu_id)))) {
			if (!optimal_gws)
				optimal_gws = num;

			if (options.verbosity >= VERB_MAX)
				fprintf(stderr, "Hardware resources exhausted\n");
			break;
		}

		if (!(run_time = gws_test(num, rounds, sequential_id)))
			break;

		if (options.verbosity <= VERB_LEGACY)
			advance_cursor();

		/* run_time is divided by 1E9, i.e. treated as nanoseconds */
		raw_speed = (kpc / (run_time / 1E9)) * mask_int_cand.num_int_cand;
		speed = rounds * raw_speed;

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "gws: %9zu\t%10s%12llu "
			        "rounds/s%10s per crypt_all()",
			        num, human_speed(raw_speed), speed, ns2string(run_time));

		/*
		 * Larger GWS is very expensive for single mode, so we try to
		 * keep it reasonable low here.
		 */
		if (speed >
		    ((options.flags & FLG_SINGLE_CHK ? 1.25 : 1.01) * best_speed)) {
			/* "!" marks a more than twofold gain, "+" any other new best */
			if (options.verbosity > VERB_LEGACY)
				fprintf(stderr, (speed > 2 * best_speed) ? "!" : "+");
			best_speed = speed;
			global_speed = raw_speed;
			optimal_gws = num;
		}
		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "\n");
	}

	/* Backward run */
	for (num = autotune_get_prev_gws_size(optimal_gws, step);;
	     num = autotune_get_prev_gws_size(num, step)) {
		size_t kpc = num * ocl_v_width;

		if (!(run_time = gws_test(num, rounds, sequential_id)))
			break;

		if (options.verbosity <= VERB_LEGACY)
			advance_cursor();

		raw_speed = (kpc / (run_time / 1E9)) * mask_int_cand.num_int_cand;
		speed = rounds * raw_speed;

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "gws: %9zu\t%10s%12llu "
			        "rounds/s%10s per crypt_all()",
			        num, human_speed(raw_speed), speed, ns2string(run_time));

		/* Stop at the first smaller size that is slower than the best */
		if (speed < best_speed) {
			if (options.verbosity > VERB_LEGACY)
				fprintf(stderr, "-\n");
			break;
		}
		/* Equal or faster: prefer the smaller size and keep going down */
		best_speed = speed;
		global_speed = raw_speed;
		optimal_gws = num;
		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "!!\n");
	}

	/* Erase the 'spinning wheel' cursor */
	if (options.verbosity <= VERB_LEGACY && john_main_process)
		fprintf(stderr, " \b");

	// Release profiling queue and create new with profiling disabled
	HANDLE_CLERROR(clReleaseCommandQueue(queue[sequential_id]),
	               "clReleaseCommandQueue");
	queue[sequential_id] =
	    clCreateCommandQueue(context[sequential_id],
	                         devices[sequential_id], 0, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");
	global_work_size = optimal_gws;

	duration_time = save_duration_time;
}
2145 
/*
 * Get one device compute capability as a string of "-D" build options.
 *
 * Returns "-DSM_MAJOR=<major> -DSM_MINOR=<minor> " when
 * get_compute_capability() reports a non-zero major version, and an empty
 * string otherwise. The result lives in a static buffer that the next call
 * overwrites.
 */
static char* get_device_capability(int sequential_id)
{
	static char ret[32];
	unsigned int major = 0, minor = 0;

	ret[0] = '\0';

	get_compute_capability(sequential_id, &major, &minor);

	if (major) {
		/* Both values are unsigned, so they are printed with %u */
		snprintf(ret, sizeof(ret), "-DSM_MAJOR=%u -DSM_MINOR=%u ",
		         major, minor);
	}

	return ret;
}
2163 
2164 /* Load detailed information about a device
2165  * - fill in the details of the OpenCL device (device_info[] bitfield variable);
2166  */
load_device_info(int sequential_id)2167 static void load_device_info(int sequential_id)
2168 {
2169 	cl_device_type device;
2170 	unsigned int major = 0, minor = 0;
2171 
2172 	device = get_device_type(sequential_id);
2173 
2174 	if (device == CL_DEVICE_TYPE_CPU)
2175 		device_info[sequential_id] = DEV_CPU;
2176 	else if (device == CL_DEVICE_TYPE_GPU)
2177 		device_info[sequential_id] = DEV_GPU;
2178 	else if (device == CL_DEVICE_TYPE_ACCELERATOR)
2179 		device_info[sequential_id] = DEV_ACCELERATOR;
2180 
2181 	device_info[sequential_id] += get_vendor_id(sequential_id);
2182 	device_info[sequential_id] += get_processor_family(sequential_id);
2183 	device_info[sequential_id] += get_byte_addressable(sequential_id);
2184 
2185 	get_compute_capability(sequential_id, &major, &minor);
2186 
2187 	if (major) {
2188 		device_info[sequential_id] += (major == 2 ? DEV_NV_C2X : 0);
2189 		device_info[sequential_id] +=
2190 		    (major == 3 && minor == 0 ? DEV_NV_C30 : 0);
2191 		device_info[sequential_id] +=
2192 		    (major == 3 && minor == 2 ? DEV_NV_C32 : 0);
2193 		device_info[sequential_id] +=
2194 		    (major == 3 && minor == 5 ? DEV_NV_C35 : 0);
2195 		device_info[sequential_id] += (major == 5 ? DEV_NV_MAXWELL : 0);
2196 		device_info[sequential_id] += (major == 6 ? DEV_NV_PASCAL : 0);
2197 		device_info[sequential_id] += (major == 7 ? DEV_NV_VOLTA : 0);
2198 	}
2199 }
2200 
opencl_read_source(const char * kernel_filename,char ** kernel_source)2201 size_t opencl_read_source(const char *kernel_filename, char **kernel_source)
2202 {
2203 	FILE *fp;
2204 	char *full_path;
2205 	size_t source_size, read_size;
2206 
2207 	fp = fopen(full_path = (char*)path_expand_safe(kernel_filename), "rb");
2208 	MEM_FREE(full_path);
2209 
2210 	if (!fp)
2211 		pexit("Can't read source kernel");
2212 
2213 #if OS_FLOCK || FCNTL_LOCKS
2214 #if RACE_CONDITION_DEBUG
2215 	if (options.verbosity == VERB_DEBUG)
2216 		fprintf(stderr, "Node %d %s locking (shared) %s...\n", NODE, __FUNCTION__, kernel_filename);
2217 #endif
2218 	{
2219 #if FCNTL_LOCKS
2220 		struct flock lock;
2221 
2222 		memset(&lock, 0, sizeof(lock));
2223 		lock.l_type = F_RDLCK;
2224 		while (fcntl(fileno(fp), F_SETLKW, &lock)) {
2225 			if (errno != EINTR)
2226 				pexit("fcntl(F_RDLCK)");
2227 		}
2228 #else
2229 		while (flock(fileno(fp), LOCK_SH)) {
2230 			if (errno != EINTR)
2231 				pexit("flock(LOCK_SH)");
2232 		}
2233 #endif
2234 	}
2235 #if RACE_CONDITION_DEBUG
2236 	if (options.verbosity == VERB_DEBUG)
2237 		fprintf(stderr, "Node %d got a shared lock on %s\n", NODE, kernel_filename);
2238 #endif
2239 #endif /* OS_FLOCK || FCNTL_LOCKS */
2240 	fseek(fp, 0, SEEK_END);
2241 	source_size = ftell(fp);
2242 	fseek(fp, 0, SEEK_SET);
2243 	MEM_FREE((*kernel_source));
2244 	*kernel_source = mem_calloc(1, source_size + 1);
2245 	read_size = fread(*kernel_source, sizeof(char), source_size, fp);
2246 	if (read_size != source_size)
2247 		fprintf(stderr,
2248 		        "Error reading source: expected "Zu", got "Zu" bytes (%s).\n",
2249 		        source_size, read_size,
2250 		        feof(fp) ? "EOF" : strerror(errno));
2251 #if RACE_CONDITION_DEBUG
2252 	if (options.verbosity == VERB_DEBUG)
2253 		fprintf(stderr, "Node %d closing %s\n", NODE, kernel_filename);
2254 #endif
2255 	fclose(fp);
2256 	return source_size;
2257 }
2258 
2259 #if JOHN_SYSTEMWIDE
/*
 * Replace the first occurrence of 'from' in 'string' with 'to'.
 *
 * Returns 'string' itself when 'from' does not occur in it. Otherwise
 * returns a pointer to a static 512-byte buffer (overwritten by the next
 * call) holding the result; a result that does not fit is truncated rather
 * than written past the end of the buffer.
 */
static const char *replace_str(const char *string, char *from, char *to)
{
	static char buffer[512];
	const char *match;
	size_t prefix_len;

	if (!(match = strstr(string, from)))
		return string;

	/* Clamp so the precision below fits an int and the buffer */
	prefix_len = (size_t)(match - string);
	if (prefix_len >= sizeof(buffer))
		prefix_len = sizeof(buffer) - 1;

	/* prefix + replacement + remainder, bounded and NUL-terminated */
	snprintf(buffer, sizeof(buffer), "%.*s%s%s",
	         (int)prefix_len, string, to, match + strlen(from));

	return buffer;
}
2277 #endif
2278 
2279 
opencl_build_kernel_opt(const char * kernel_filename,int sequential_id,const char * opts)2280 void opencl_build_kernel_opt(const char *kernel_filename, int sequential_id,
2281                              const char *opts)
2282 {
2283 	char *kernel_source = NULL;
2284 	opencl_read_source(kernel_filename, &kernel_source);
2285 	opencl_build(sequential_id, opts, 0, NULL, &program[sequential_id], kernel_filename, kernel_source);
2286 	MEM_FREE(kernel_source);
2287 }
2288 
/* Feed a NUL-terminated string into the MD5 context named 'ctx' in scope */
#define md5add(string) MD5_Update(&ctx, (string), strlen(string))

/*
 * Build the kernel for one device, using a cached binary when possible.
 *
 * kernel_filename - kernel source file.
 * sequential_id   - index of the device to build for.
 * opts            - extra build options, may be NULL.
 * warn            - non-zero: at verbosity above VERB_DEFAULT, announce a
 *                   build from source and report build times over 2 seconds.
 *
 * When caching is disabled for the platform/device the kernel is simply
 * built from source. Otherwise a cache file name is derived from an MD5
 * hash over the file name, the source text, the build options, the driver
 * version, the device name and platform_id. The cached binary is used if
 * the DUMP_BINARY environment variable is unset and the binary is newer
 * than the source; otherwise the kernel is built from source and
 * opencl_build() is given the cache file name.
 *
 * With MPI and more than one node, the first call ends with a barrier.
 */
void opencl_build_kernel(const char *kernel_filename, int sequential_id, const char *opts,
                         int warn)
{
#if HAVE_MPI
	static int once;
#endif

	/*
	 * Disable binary caching for:
	 * - nvidia unless on macOS
	 * - CPU if on macOS
	 */
	if ((gpu_nvidia(device_info[sequential_id]) && !platform_apple(get_platform_id(sequential_id))) ||
	    (cpu(device_info[sequential_id]) && platform_apple(get_platform_id(sequential_id)))) {
		log_event("- Kernel binary caching disabled for this platform/device");
		opencl_build_kernel_opt(kernel_filename, sequential_id, opts);
	} else {
		struct stat source_stat, bin_stat;
		char dev_name[512], bin_name[512];
		const char *tmp_name;
		unsigned char hash[16];
		char hash_str[33];
		uint64_t startTime, runtime;
		int i;
		MD5_CTX ctx;
		char *kernel_source = NULL;
		const char *global_opts;

		/* Precedence: environment, then config file, then built-in default */
		if (!(global_opts = getenv("OPENCLBUILDOPTIONS")))
			if (!(global_opts = cfg_get_param(SECTION_OPTIONS,
			    SUBSECTION_OPENCL, "GlobalBuildOpts")))
				global_opts = OPENCLBUILDOPTIONS;

		startTime = (unsigned long)time(NULL);

		// Get device name.
		HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
		                               CL_DEVICE_NAME, sizeof(dev_name),
		                               dev_name, NULL),
		               "clGetDeviceInfo for DEVICE_NAME");

/*
 * Create a hash of kernel source and parameters, and use as cache name.
 */
		MD5_Init(&ctx);
		md5add(kernel_filename);
		opencl_read_source(kernel_filename, &kernel_source);
		md5add(kernel_source);
		md5add(global_opts);
		if (opts)
			md5add(opts);
		md5add(opencl_driver_ver(sequential_id));
		md5add(dev_name);
		MD5_Update(&ctx, (char*)&platform_id, sizeof(platform_id));
		MD5_Final(hash, &ctx);

		/* Hex-encode the 16-byte digest into a 32-character string */
		for (i = 0; i < 16; i++) {
			hash_str[2 * i + 0] = itoa16[hash[i] >> 4];
			hash_str[2 * i + 1] = itoa16[hash[i] & 0xf];
		}
		hash_str[32] = 0;

#if JOHN_SYSTEMWIDE
		/* The cache file goes under JOHN_PRIVATE_HOME instead of $JOHN */
		tmp_name = replace_str(kernel_filename, "$JOHN", JOHN_PRIVATE_HOME);
#else
		tmp_name = kernel_filename;
#endif
		snprintf(bin_name, sizeof(bin_name), "%s_%s.bin",
		         tmp_name, hash_str);

		// Select the kernel to run.
		if (!getenv("DUMP_BINARY") &&
		    !stat(path_expand(kernel_filename), &source_stat) &&
		    !stat(path_expand(bin_name), &bin_stat) &&
			(source_stat.st_mtime < bin_stat.st_mtime)) {
			/* kernel_source is reused here to hold the binary image */
			size_t program_size = opencl_read_source(bin_name, &kernel_source);
			log_event("- Building kernel from cached binary");
			opencl_build_from_binary(sequential_id, &program[sequential_id], kernel_source, program_size);
		} else {
			log_event("- Building kernel and caching binary");
			if (warn && options.verbosity > VERB_DEFAULT) {
				fprintf(stderr, "Building the kernel, this "
				        "could take a while\n");
				/* NOTE(review): message went to stderr, yet stdout is flushed */
				fflush(stdout);
			}
			/* The source is read again although it was loaded for hashing */
			opencl_read_source(kernel_filename, &kernel_source);
			opencl_build(sequential_id, opts, 1, bin_name, &program[sequential_id], kernel_filename, kernel_source);
		}
		if (warn && options.verbosity > VERB_DEFAULT) {
			/* time() has one-second resolution; report only above 2 seconds */
			if ((runtime = (unsigned long)(time(NULL) - startTime))
			        > 2UL)
				fprintf(stderr, "Build time: %lu seconds\n",
				        (unsigned long)runtime);
			fflush(stdout);
		}

		MEM_FREE(kernel_source);
	}
#if HAVE_MPI
	/* Only the first call in this process waits for the other nodes */
	if (mpi_p > 1 && !once++) {
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d reached %s() MPI build barrier\n",
			        NODE, __FUNCTION__);
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		if (mpi_id == 0 && options.verbosity >= VERB_DEFAULT)
			fprintf(stderr, "All nodes done OpenCL build\n");
	}
#endif /* HAVE_MPI */
}
2402 
opencl_prepare_dev(int sequential_id)2403 int opencl_prepare_dev(int sequential_id)
2404 {
2405 	int err_type = 0;
2406 #ifdef HAVE_MPI
2407 	static int once;
2408 #endif
2409 
2410 	// If OpenCL has not yet been loaded, load it now
2411 	opencl_load_environment();
2412 
2413 	if (sequential_id < 0)
2414 		sequential_id = gpu_id;
2415 
2416 	profilingEvent = firstEvent = lastEvent = NULL;
2417 	if (!context[sequential_id])
2418 		start_opencl_device(sequential_id, &err_type);
2419 	print_device_info(sequential_id);
2420 
2421 #if HAVE_MPI
2422 	if (mpi_p > 1 && !once++) {
2423 		// Avoid silly race conditions seen with nvidia
2424 #if RACE_CONDITION_DEBUG
2425 		if (options.verbosity == VERB_DEBUG)
2426 			fprintf(stderr, "Node %d reached MPI prep barrier\n", NODE);
2427 #endif
2428 		MPI_Barrier(MPI_COMM_WORLD);
2429 		if (mpi_id == 0 && options.verbosity == VERB_DEBUG)
2430 			fprintf(stderr, "All nodes done OpenCL prepare\n");
2431 	}
2432 #endif
2433 
2434 	return sequential_id;
2435 }
2436 
/*
 * Prepare a device and build a kernel for it in one step.
 *
 * The index returned by opencl_prepare_dev() is the one the kernel is
 * built for; the build is requested with warn set to 0.
 */
void opencl_init(const char *kernel_filename, int sequential_id, const char *opts)
{
	int dev_index = opencl_prepare_dev(sequential_id);

	opencl_build_kernel(kernel_filename, dev_index, opts, 0);
}
2442 
get_device_type(int sequential_id)2443 cl_device_type get_device_type(int sequential_id)
2444 {
2445 	cl_device_type type;
2446 
2447 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_TYPE,
2448 	                               sizeof(cl_device_type), &type, NULL),
2449 	               "clGetDeviceInfo for CL_DEVICE_TYPE");
2450 
2451 	return type;
2452 }
2453 
get_local_memory_size(int sequential_id)2454 cl_ulong get_local_memory_size(int sequential_id)
2455 {
2456 	cl_ulong size;
2457 
2458 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2459 	                               CL_DEVICE_LOCAL_MEM_SIZE,
2460 	                               sizeof(cl_ulong), &size, NULL),
2461 	               "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE");
2462 
2463 	return size;
2464 }
2465 
get_global_memory_size(int sequential_id)2466 cl_ulong get_global_memory_size(int sequential_id)
2467 {
2468 	cl_ulong size;
2469 
2470 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2471 	                               CL_DEVICE_GLOBAL_MEM_SIZE,
2472 	                               sizeof(cl_ulong), &size, NULL),
2473 	               "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE");
2474 
2475 	return size;
2476 }
2477 
get_device_max_lws(int sequential_id)2478 size_t get_device_max_lws(int sequential_id)
2479 {
2480 	size_t max_group_size;
2481 
2482 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2483 	                               CL_DEVICE_MAX_WORK_GROUP_SIZE,
2484 	                               sizeof(max_group_size),
2485 	                               &max_group_size, NULL),
2486 	               "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE");
2487 
2488 	return max_group_size;
2489 }
2490 
get_max_mem_alloc_size(int sequential_id)2491 cl_ulong get_max_mem_alloc_size(int sequential_id)
2492 {
2493 	cl_ulong max_alloc_size;
2494 
2495 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2496 	                               CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2497 	                               sizeof(max_alloc_size),
2498 	                               &max_alloc_size, NULL),
2499 	               "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE");
2500 
2501 	return max_alloc_size;
2502 }
2503 
get_kernel_max_lws(int sequential_id,cl_kernel crypt_kernel)2504 size_t get_kernel_max_lws(int sequential_id, cl_kernel crypt_kernel)
2505 {
2506 	size_t max_group_size;
2507 
2508 	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel,
2509 		devices[sequential_id],
2510 		CL_KERNEL_WORK_GROUP_SIZE,
2511 		sizeof(max_group_size),
2512 		&max_group_size, NULL),
2513 	               "clGetKernelWorkGroupInfo for CL_KERNEL_WORK_GROUP_SIZE");
2514 
2515 	return max_group_size;
2516 }
2517 
get_max_compute_units(int sequential_id)2518 cl_uint get_max_compute_units(int sequential_id)
2519 {
2520 	cl_uint size;
2521 
2522 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2523 	                               CL_DEVICE_MAX_COMPUTE_UNITS,
2524 	                               sizeof(cl_uint), &size, NULL),
2525 	               "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS");
2526 
2527 	return size;
2528 }
2529 
get_kernel_preferred_multiple(int sequential_id,cl_kernel crypt_kernel)2530 size_t get_kernel_preferred_multiple(int sequential_id, cl_kernel crypt_kernel)
2531 {
2532 	size_t size;
2533 
2534 	HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel,
2535 		devices[sequential_id],
2536 		CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
2537 		sizeof(size), &size, NULL),
2538 		"clGetKernelWorkGroupInfo for CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE");
2539 
2540 	return size;
2541 }
2542 
get_compute_capability(int sequential_id,unsigned int * major,unsigned int * minor)2543 void get_compute_capability(int sequential_id, unsigned int *major,
2544                             unsigned int *minor)
2545 {
2546 	clGetDeviceInfo(devices[sequential_id],
2547 	                CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
2548 	                sizeof(cl_uint), major, NULL);
2549 	clGetDeviceInfo(devices[sequential_id],
2550 	                CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
2551 	                sizeof(cl_uint), minor, NULL);
2552 }
2553 
get_processors_count(int sequential_id)2554 cl_uint get_processors_count(int sequential_id)
2555 {
2556 	cl_uint core_count = get_max_compute_units(sequential_id);
2557 	char dname[MAX_OCLINFO_STRING_LEN];
2558 
2559 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2560 	                               CL_DEVICE_NAME,
2561 	                               sizeof(dname), dname, NULL),
2562 	               "clGetDeviceInfo for CL_DEVICE_NAME");
2563 
2564 	ocl_device_list[sequential_id].cores_per_MP = 0;
2565 
2566 	if (gpu_nvidia(device_info[sequential_id])) {
2567 		unsigned int major = 0, minor = 0;
2568 
2569 		get_compute_capability(sequential_id, &major, &minor);
2570 		if (major == 1)         // 1.x Tesla
2571 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 8);
2572 		else if (major == 2 && minor == 0)  // 2.0 Fermi
2573 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 32);
2574 		else if (major == 2 && minor >= 1)  // 2.1 Fermi
2575 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 48);
2576 		else if (major == 3)    // 3.x Kepler
2577 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 192);
2578 		else if (major == 5)    // 5.x Maxwell
2579 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
2580 		else if (major == 6)    // 6.x Pascal
2581 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
2582 		else if (major >= 7)    // 7.x Volta, 8.x Turing?
2583 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 64);
2584 /*
2585  * Apple, VCL and some other environments don't expose get_compute_capability()
2586  * so we need this crap - which is incomplete.
2587  * http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
2588  *
2589  * This will produce a *guessed* figure
2590  */
2591 
2592 		// Volta or Turing
2593 		else if (strstr(dname, "TITAN V") || strstr(dname, "RTX 2"))
2594 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 64);
2595 		// Pascal
2596 		else if (strstr(dname, "GTX 10"))
2597 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
2598 		// Maxwell
2599 		else if (strstr(dname, "GTX 9") || strstr(dname, "GTX TITAN X"))
2600 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
2601 		// Kepler
2602 		else if (strstr(dname, "GT 6") || strstr(dname, "GTX 6") ||
2603 		         strstr(dname, "GT 7") || strstr(dname, "GTX 7") ||
2604 		         strstr(dname, "GT 8") || strstr(dname, "GTX 8") ||
2605 		         strstr(dname, "GTX TITAN"))
2606 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 192);
2607 		// Fermi
2608 		else if (strstr(dname, "GT 5") || strstr(dname, "GTX 5"))
2609 			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 48);
2610 	} else if (gpu_intel(device_info[sequential_id])) {
2611 		// It seems all current models are x 8
2612 		core_count *= ocl_device_list[sequential_id].cores_per_MP = 8;
2613 	} else if (gpu_amd(device_info[sequential_id])) {
2614 		// 16 thread proc * 5 SP
2615 		core_count *= (ocl_device_list[sequential_id].cores_per_MP = (16 *
2616 		               ((amd_gcn(device_info[sequential_id]) ||
2617 		                 amd_vliw4(device_info[sequential_id])) ? 4 : 5)));
2618 	} else {
2619 		// Nothing else known, we use half native vector width for long
2620 		cl_uint v_width;
2621 
2622 		HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2623 		                               CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
2624 		                               sizeof(v_width), &v_width, NULL),
2625 		              "clGetDeviceInfo for CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG");
2626 		core_count *= (ocl_device_list[sequential_id].cores_per_MP = v_width);
2627 	}
2628 
2629 	return core_count;
2630 }
2631 
opencl_speed_index(int sequential_id)2632 unsigned int opencl_speed_index(int sequential_id)
2633 {
2634 	cl_uint clock;
2635 
2636 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2637 	                               CL_DEVICE_MAX_CLOCK_FREQUENCY,
2638 	                               sizeof(clock), &clock, NULL),
2639 	               "clGetDeviceInfo for CL_DEVICE_MAX_CLOCK_FREQUENCY");
2640 
2641 	return clock * get_processors_count(sequential_id);
2642 }
2643 
get_processor_family(int sequential_id)2644 cl_uint get_processor_family(int sequential_id)
2645 {
2646 	char dname[MAX_OCLINFO_STRING_LEN];
2647 
2648 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
2649 	                               sizeof(dname), dname, NULL),
2650 	               "clGetDeviceInfo for CL_DEVICE_NAME");
2651 
2652 	/* Workaround for MESA. */
2653 	if (*dname)
2654 		strlwr(&dname[1]);
2655 
2656 	if gpu_amd
2657 	(device_info[sequential_id]) {
2658 
2659 		if ((strstr(dname, "Cedar") ||  //AMD Radeon VLIW5
2660 		        strstr(dname, "Redwood") || strstr(dname, "Juniper")
2661 		        || strstr(dname, "Cypress") || strstr(dname, "Hemlock")
2662 		        || strstr(dname, "Caicos") ||   //AMD Radeon VLIW5 Gen 2
2663 		        strstr(dname, "Turks") || strstr(dname, "Barts") ||
2664 		        strstr(dname, "Wrestler")
2665 		        || strstr(dname, "Ontario") || strstr(dname, "Zacate")
2666 		        || strstr(dname, "Winterpark") || strstr(dname, "Beavercreek")
2667 		        || strstr(dname, "Cayman") ||   //AMD Radeon VLIW4
2668 		        strstr(dname, "Antilles") || strstr(dname, "Devastator")
2669 		        || strstr(dname, "R7")  //AMD Radeon VLIW4
2670 		    )) {
2671 
2672 			if (strstr(dname, "Cayman") ||
2673 			        strstr(dname, "Antilles") ||
2674 			        strstr(dname, "Devastator") || strstr(dname, "R7"))
2675 				return DEV_AMD_VLIW4;
2676 			else
2677 				return DEV_AMD_VLIW5;
2678 
2679 		} else {
2680 
2681 			if (strstr(dname, "Capeverde") || strstr(dname, "Malta") ||
2682 			        strstr(dname, "Oland") || strstr(dname, "Hainan") ||
2683 			        strstr(dname, "Pitcairn") || strstr(dname, "Tahiti"))
2684 				return DEV_AMD_GCN_10; //AMD Radeon GCN 1.0
2685 
2686 			else if (strstr(dname, "Bonaire") || strstr(dname, "Hawaii") ||
2687 				strstr(dname, "Vesuvius") || strstr(dname, "Grenada"))
2688 				return DEV_AMD_GCN_11; //AMD Radeon GCN 1.1
2689 
2690 			else if (strstr(dname, "Tonga") || strstr(dname, "Antigua") ||
2691 				strstr(dname, "Fiji"))
2692 				return DEV_AMD_GCN_12; //AMD Radeon GCN 1.2
2693 			 /*
2694 			 * Graphics IP v6:
2695 			 *   - Cape Verde, Hainan, Oland, Pitcairn, Tahiti
2696 			 * Graphics IP v7:
2697 			 *   - Bonaire, Havaii, Kalindi, Mullins, Spectre, Spooky
2698 			 * Graphics IP v8:
2699 			 *   - Iceland
2700 			 */
2701 			/* All current GPUs are GCN so let's default to that */
2702 			//return DEV_UNKNOWN;
2703 			return DEV_AMD_GCN_12;
2704 		}
2705 	}
2706 	return DEV_UNKNOWN;
2707 }
2708 
get_byte_addressable(int sequential_id)2709 int get_byte_addressable(int sequential_id)
2710 {
2711 	char dname[MAX_OCLINFO_STRING_LEN];
2712 
2713 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2714 	                               CL_DEVICE_EXTENSIONS,
2715 	                               sizeof(dname), dname, NULL),
2716 	               "clGetDeviceInfo for CL_DEVICE_EXTENSIONS");
2717 
2718 	if (strstr(dname, "cl_khr_byte_addressable_store") == NULL)
2719 		return DEV_NO_BYTE_ADDRESSABLE;
2720 
2721 	return DEV_UNKNOWN;
2722 }
2723 
get_vendor_id(int sequential_id)2724 int get_vendor_id(int sequential_id)
2725 {
2726 	char dname[MAX_OCLINFO_STRING_LEN];
2727 
2728 	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_VENDOR,
2729 	                               sizeof(dname), dname, NULL),
2730 	               "clGetDeviceInfo for CL_DEVICE_VENDOR");
2731 
2732 	if (strstr(dname, "NVIDIA"))
2733 		return DEV_NVIDIA;
2734 
2735 	if (strstr(dname, "Intel"))
2736 		return DEV_INTEL;
2737 
2738 	if (strstr(dname, "Advanced Micro") ||
2739 	        strstr(dname, "AMD") || strstr(dname, "ATI"))
2740 		return DEV_AMD;
2741 
2742 	return DEV_UNKNOWN;
2743 }
2744 
get_platform_vendor_id(int platform_id)2745 int get_platform_vendor_id(int platform_id)
2746 {
2747 	char dname[MAX_OCLINFO_STRING_LEN];
2748 	cl_platform_id platform[MAX_PLATFORMS];
2749 	cl_uint num_platforms;
2750 
2751 	HANDLE_CLERROR(clGetPlatformIDs(MAX_PLATFORMS, platform,
2752 	                                &num_platforms),
2753 	               "clGetPlatformIDs");
2754 
2755 	HANDLE_CLERROR(clGetPlatformInfo(platform[platform_id], CL_PLATFORM_NAME,
2756 	                                 sizeof(dname), dname, NULL),
2757 	               "clGetPlatformInfo for CL_PLATFORM_NAME");
2758 
2759 	if (strstr(dname, "NVIDIA"))
2760 		return DEV_NVIDIA;
2761 
2762 	if (strstr(dname, "Apple"))
2763 		return PLATFORM_APPLE;
2764 
2765 	if (strstr(dname, "Intel"))
2766 		return DEV_INTEL;
2767 
2768 	if (strstr(dname, "Advanced Micro") ||
2769 	        strstr(dname, "AMD") || strstr(dname, "ATI"))
2770 		return DEV_AMD;
2771 
2772 	if ((strstr(dname, "MESA")) || (strstr(dname, "Mesa")))
2773 		return PLATFORM_MESA;
2774 
2775 	if (strstr(dname, "beignet"))
2776 		return PLATFORM_BEIGNET;
2777 
2778 	if (strstr(dname, "Portable Computing Language") || strstr(dname, "pocl"))
2779 		return PLATFORM_POCL;
2780 
2781 	/*
2782 	 * If we found nothing recognized in the device name, look at
2783 	 * device version string as well
2784 	 */
2785 	HANDLE_CLERROR(clGetPlatformInfo(platform[platform_id], CL_PLATFORM_VERSION,
2786 	                                 sizeof(dname), dname, NULL),
2787 	               "clGetPlatformInfo for CL_PLATFORM_VERSION");
2788 
2789 	if ((strstr(dname, "MESA")) || (strstr(dname, "Mesa")))
2790 		return PLATFORM_MESA;
2791 
2792 	return DEV_UNKNOWN;
2793 }
2794 
get_device_version(int sequential_id)2795 int get_device_version(int sequential_id)
2796 {
2797 	char dname[MAX_OCLINFO_STRING_LEN];
2798 	unsigned int major, minor;
2799 
2800 	if ((clGetDeviceInfo(devices[sequential_id], CL_DEVICE_VERSION,
2801 			MAX_OCLINFO_STRING_LEN, dname, NULL) == CL_SUCCESS) &&
2802 			sscanf(dname, "OpenCL %u.%u", &major, &minor) == 2)
2803 		return major * 100 + minor * 10;
2804 
2805 	return DEV_UNKNOWN;
2806 }
2807 
/*
 * Return the newest OpenCL version that the headers used for this build
 * declare, as a string literal such as "1.2".
 *
 * The result is decided at compile time from the CL_VERSION_x_y macros,
 * tested from newest to oldest because newer headers also define every
 * older macro. "Unknown" is returned when none of them is defined.
 * The returned string must not be modified or freed.
 */
char *get_opencl_header_version(void)
{
#if defined(CL_VERSION_3_0)
	return "3.0";
#elif defined(CL_VERSION_2_2)
	return "2.2";
#elif defined(CL_VERSION_2_1)
	return "2.1";
#elif defined(CL_VERSION_2_0)
	return "2.0";
#elif defined(CL_VERSION_1_2)
	return "1.2";
#elif defined(CL_VERSION_1_1)
	return "1.1";
#elif defined(CL_VERSION_1_0)
	return "1.0";
#else
	return "Unknown";
#endif
}
2826 
/*
 * Translate an OpenCL status code into a printable "NAME (code)" string.
 *
 * cl_error is a status value as returned by the OpenCL API. Codes 0..-19
 * and -30..-72 are mapped to their symbolic names; anything else is
 * reported as "UNKNOWN OPENCL ERROR".
 *
 * The result is written to a single static buffer: it is overwritten by
 * the next call and must not be freed by the caller.
 */
char *get_error_name(cl_int cl_error)
{
	static char out[128];
	/* Indexed by -code, starting at CL_SUCCESS (0). */
	static const char * const err_small[] = {
		"CL_SUCCESS", "CL_DEVICE_NOT_FOUND", "CL_DEVICE_NOT_AVAILABLE",
		"CL_COMPILER_NOT_AVAILABLE",
		"CL_MEM_OBJECT_ALLOCATION_FAILURE", "CL_OUT_OF_RESOURCES",
		"CL_OUT_OF_HOST_MEMORY",
		"CL_PROFILING_INFO_NOT_AVAILABLE", "CL_MEM_COPY_OVERLAP",
		"CL_IMAGE_FORMAT_MISMATCH",
		"CL_IMAGE_FORMAT_NOT_SUPPORTED", "CL_BUILD_PROGRAM_FAILURE",
		"CL_MAP_FAILURE", "CL_MISALIGNED_SUB_BUFFER_OFFSET",
		"CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST",
		"CL_COMPILE_PROGRAM_FAILURE", "CL_LINKER_NOT_AVAILABLE",
		"CL_LINK_PROGRAM_FAILURE", "CL_DEVICE_PARTITION_FAILED",
		"CL_KERNEL_ARG_INFO_NOT_AVAILABLE"
	};
	/* Indexed by -code - 30, starting at CL_INVALID_VALUE (-30). */
	static const char * const err_invalid[] = {
		"CL_INVALID_VALUE", "CL_INVALID_DEVICE_TYPE",
		"CL_INVALID_PLATFORM", "CL_INVALID_DEVICE",
		"CL_INVALID_CONTEXT", "CL_INVALID_QUEUE_PROPERTIES",
		"CL_INVALID_COMMAND_QUEUE", "CL_INVALID_HOST_PTR",
		"CL_INVALID_MEM_OBJECT", "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
		"CL_INVALID_IMAGE_SIZE", "CL_INVALID_SAMPLER",
		"CL_INVALID_BINARY", "CL_INVALID_BUILD_OPTIONS",
		"CL_INVALID_PROGRAM", "CL_INVALID_PROGRAM_EXECUTABLE",
		"CL_INVALID_KERNEL_NAME", "CL_INVALID_KERNEL_DEFINITION",
		"CL_INVALID_KERNEL", "CL_INVALID_ARG_INDEX",
		"CL_INVALID_ARG_VALUE", "CL_INVALID_ARG_SIZE",
		"CL_INVALID_KERNEL_ARGS", "CL_INVALID_WORK_DIMENSION",
		"CL_INVALID_WORK_GROUP_SIZE", "CL_INVALID_WORK_ITEM_SIZE",
		"CL_INVALID_GLOBAL_OFFSET", "CL_INVALID_EVENT_WAIT_LIST",
		"CL_INVALID_EVENT", "CL_INVALID_OPERATION",
		"CL_INVALID_GL_OBJECT", "CL_INVALID_BUFFER_SIZE",
		"CL_INVALID_MIP_LEVEL", "CL_INVALID_GLOBAL_WORK_SIZE",
		"CL_INVALID_PROPERTY", "CL_INVALID_IMAGE_DESCRIPTOR",
		"CL_INVALID_COMPILER_OPTIONS", "CL_INVALID_LINKER_OPTIONS",
		"CL_INVALID_DEVICE_PARTITION_COUNT",
		/* Added by OpenCL 2.0 and 2.2 */
		"CL_INVALID_PIPE_SIZE", "CL_INVALID_DEVICE_QUEUE",
		"CL_INVALID_SPEC_ID", "CL_MAX_SIZE_RESTRICTION_EXCEEDED"
	};
	/* Range limits follow the tables, so extending a table is enough. */
	const int num_small = (int)(sizeof(err_small) / sizeof(err_small[0]));
	const int num_invalid =
		(int)(sizeof(err_invalid) / sizeof(err_invalid[0]));
	const char *message = "UNKNOWN OPENCL ERROR";

	if (cl_error <= 0 && cl_error > -num_small)
		message = err_small[-cl_error];
	else if (cl_error <= -30 && cl_error > -30 - num_invalid)
		message = err_invalid[-cl_error - 30];

	snprintf(out, sizeof(out), "%s (%d)", message, cl_error);
	return out;
}
2877 
2878 /*
2879  * We currently leave all of this to single.c instead but this function
2880  * remains for future functionality.
2881  */
int opencl_calc_min_kpc(size_t lws, size_t gws, int v_width)
{
	/* One key per vector lane of every work-item; lws plays no part. */
	size_t min_keys = gws * v_width;

	return min_keys;
}
2886 
/***
 * Regardless of what the user passes as -dev=N, I will always list devices in
 * their natural order as defined by the OpenCL libraries.
 *
 * In order to be able to know everything about the device and list it obeying
 * its natural sequence (defined by hardware, PCI slot sequence, ...), it is
 * better to scan all OpenCL stuff and list only when needed. Otherwise, I
 * might need to reorder first and then list.
 ***/
opencl_list_devices(void)2896 void opencl_list_devices(void)
2897 {
2898 	char dname[MAX_OCLINFO_STRING_LEN];
2899 	size_t z_entries;
2900 	cl_uint entries;
2901 	cl_ulong long_entries;
2902 	int i, j, sequence_nr = 0, err_type = 0, platform_in_use = -1;
2903 	size_t p_size;
2904 	int available_devices = 0;
2905 	cl_int ret;
2906 	cl_platform_id platform_list[MAX_PLATFORMS];
2907 	cl_uint num_platforms, num_devices;
2908 
2909 	/* Obtain a list of available platforms */
2910 	ret = clGetPlatformIDs(MAX_PLATFORMS, platform_list, &num_platforms);
2911 
2912 	if (!num_platforms)
2913 		fprintf(stderr, "Error: No OpenCL-capable platforms were detected"
2914 		        " by the installed OpenCL driver.\n");
2915 
2916 	if (ret != CL_SUCCESS && options.verbosity > VERB_LEGACY)
2917 		fprintf(stderr, "Throw clError: clGetPlatformIDs() = %s\n",
2918 		        get_error_name(ret));
2919 
2920 	for (i = 0; i < num_platforms; i++) {
2921 		platforms[i].platform = platform_list[i];
2922 		ret = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_ALL,
2923 		                     MAX_GPU_DEVICES, &devices[available_devices],
2924 		                     &num_devices);
2925 
2926 		if ((ret != CL_SUCCESS || num_devices < 1) &&
2927 		     options.verbosity > VERB_LEGACY)
2928 			fprintf(stderr, "No OpenCL devices was found on platform #%d"
2929 			                 ", clGetDeviceIDs() = %s\n",
2930 			        i, get_error_name(ret));
2931 
2932 		available_devices += num_devices;
2933 		platforms[i].num_devices = num_devices;
2934 	}
2935 
2936 	if (!available_devices) {
2937 		fprintf(stderr, "Error: No OpenCL-capable devices were detected"
2938 		        " by the installed OpenCL driver.\n\n");
2939 		return;
2940 	}
2941 	/* Initialize OpenCL environment */
2942 	if (!getenv("_SKIP_OCL_INITIALIZATION"))
2943 		opencl_load_environment();
2944 
2945 	for (i = 0; platforms[i].platform; i++) {
2946 
2947 		/* Query devices for information */
2948 		for (j = 0; j < platforms[i].num_devices; j++, sequence_nr++) {
2949 			cl_device_local_mem_type memtype;
2950 			cl_bool boolean;
2951 			char *p;
2952 			int ret, cpu;
2953 			int fan, temp, util, cl, ml;
2954 
2955 			if (!getenv("_SKIP_OCL_INITIALIZATION") &&
2956 			   (!default_gpu_selected && !get_if_device_is_in_use(sequence_nr)))
2957 				/* Nothing to do, skipping */
2958 				continue;
2959 
2960 			if (platform_in_use != i) {
2961 				/* Now, dealing with different platform. */
2962 				/* Obtain information about platform */
2963 				clGetPlatformInfo(platforms[i].platform,
2964 				                  CL_PLATFORM_NAME, sizeof(dname), dname, NULL);
2965 				printf("Platform #%d name: %s, ", i, dname);
2966 				clGetPlatformInfo(platforms[i].platform,
2967 				                  CL_PLATFORM_VERSION, sizeof(dname), dname, NULL);
2968 				printf("version: %s\n", dname);
2969 
2970 				clGetPlatformInfo(platforms[i].platform,
2971 				                  CL_PLATFORM_EXTENSIONS, sizeof(dname), dname, NULL);
2972 				if (options.verbosity > VERB_LEGACY)
2973 					printf("    Platform extensions:    %s\n", dname);
2974 
2975 				/* Obtain a list of devices available */
2976 				if (!platforms[i].num_devices)
2977 					printf("%d devices found\n", platforms[i].num_devices);
2978 
2979 				platform_in_use = i;
2980 			}
2981 			clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_NAME,
2982 			                sizeof(dname), dname, NULL);
2983 			p = ltrim(dname);
2984 			printf("    Device #%d (%d) name:     %s\n", j, sequence_nr + 1, p);
2985 
2986 			// Check if device seems to be working.
2987 			if (!start_opencl_device(sequence_nr, &err_type)) {
2988 
2989 				if (err_type == 1)
2990 					printf("    Status:                 %s (%s)\n",
2991 					       "Context creation error", get_error_name(ret_code));
2992 				else
2993 					printf("    Status:                 %s (%s)\n",
2994 					       "Queue creation error", get_error_name(ret_code));
2995 			}
2996 
2997 			ret = clGetDeviceInfo(devices[sequence_nr],
2998 			                      CL_DEVICE_BOARD_NAME_AMD, sizeof(dname), dname, NULL);
2999 			if (ret == CL_SUCCESS && strlen(dname))
3000 				printf("    Board name:             %s\n", dname);
3001 
3002 			clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_VENDOR,
3003 			                sizeof(dname), dname, NULL);
3004 			printf("    Device vendor:          %s\n", dname);
3005 			clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_TYPE,
3006 			                sizeof(cl_ulong), &long_entries, NULL);
3007 			printf("    Device type:            ");
3008 			cpu = (long_entries & CL_DEVICE_TYPE_CPU);
3009 			if (cpu)
3010 				printf("CPU ");
3011 			if (long_entries & CL_DEVICE_TYPE_GPU)
3012 				printf("GPU ");
3013 			if (long_entries & CL_DEVICE_TYPE_ACCELERATOR)
3014 				printf("Accelerator ");
3015 			if (long_entries & CL_DEVICE_TYPE_DEFAULT)
3016 				printf("Default ");
3017 			if (long_entries & ~(CL_DEVICE_TYPE_DEFAULT |
3018 			                     CL_DEVICE_TYPE_ACCELERATOR |
3019 			                     CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU))
3020 				printf("Unknown ");
3021 			clGetDeviceInfo(devices[sequence_nr],
3022 			                CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &boolean, NULL);
3023 			printf("(%s)\n", boolean == CL_TRUE ? "LE" : "BE");
3024 			clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_VERSION,
3025 			                sizeof(dname), dname, NULL);
3026 			printf("    Device version:         %s\n", dname);
3027 			printf("    Driver version:         %s\n",
3028 			       opencl_driver_info(sequence_nr));
3029 
3030 			clGetDeviceInfo(devices[sequence_nr],
3031 			                CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR,
3032 			                sizeof(cl_uint), &entries, NULL);
3033 			printf("    Native vector widths:   char %d, ", entries);
3034 			clGetDeviceInfo(devices[sequence_nr],
3035 			                CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT,
3036 			                sizeof(cl_uint), &entries, NULL);
3037 			printf("short %d, ", entries);
3038 			clGetDeviceInfo(devices[sequence_nr],
3039 			                CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,
3040 			                sizeof(cl_uint), &entries, NULL);
3041 			printf("int %d, ", entries);
3042 			clGetDeviceInfo(devices[sequence_nr],
3043 			                CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
3044 			                sizeof(cl_uint), &entries, NULL);
3045 			printf("long %d\n", entries);
3046 
3047 			clGetDeviceInfo(devices[sequence_nr],
3048 			                CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
3049 			                sizeof(cl_uint), &entries, NULL);
3050 			printf("    Preferred vector width: char %d, ", entries);
3051 			clGetDeviceInfo(devices[sequence_nr],
3052 			                CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
3053 			                sizeof(cl_uint), &entries, NULL);
3054 			printf("short %d, ", entries);
3055 			clGetDeviceInfo(devices[sequence_nr],
3056 			                CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
3057 			                sizeof(cl_uint), &entries, NULL);
3058 			printf("int %d, ", entries);
3059 			clGetDeviceInfo(devices[sequence_nr],
3060 			                CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
3061 			                sizeof(cl_uint), &entries, NULL);
3062 			printf("long %d\n", entries);
3063 
3064 			clGetDeviceInfo(devices[sequence_nr],
3065 			                CL_DEVICE_GLOBAL_MEM_SIZE,
3066 			                sizeof(cl_ulong), &long_entries, NULL);
3067 			clGetDeviceInfo(devices[sequence_nr],
3068 			                CL_DEVICE_ERROR_CORRECTION_SUPPORT,
3069 			                sizeof(cl_bool), &boolean, NULL);
3070 			printf("    Global Memory:          %sB%s\n",
3071 			       human_prefix(long_entries),
3072 			       boolean == CL_TRUE ? " (ECC)" : "");
3073 			clGetDeviceInfo(devices[sequence_nr],
3074 			                CL_DEVICE_EXTENSIONS, sizeof(dname), dname, NULL);
3075 			if (options.verbosity > VERB_LEGACY)
3076 				printf("    Device extensions:      %s\n", dname);
3077 
3078 			clGetDeviceInfo(devices[sequence_nr],
3079 			                CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
3080 			                sizeof(cl_ulong), &long_entries, NULL);
3081 			if (long_entries)
3082 				printf("    Global Memory Cache:    %sB\n",
3083 				       human_prefix(long_entries)
3084 				      );
3085 			clGetDeviceInfo(devices[sequence_nr],
3086 			                CL_DEVICE_LOCAL_MEM_SIZE,
3087 			                sizeof(cl_ulong), &long_entries, NULL);
3088 			clGetDeviceInfo(devices[sequence_nr],
3089 			                CL_DEVICE_LOCAL_MEM_TYPE,
3090 			                sizeof(cl_device_local_mem_type), &memtype, NULL);
3091 			printf("    Local Memory:           %sB (%s)\n",
3092 			       human_prefix(long_entries),
3093 			       memtype == CL_LOCAL ? "Local" : "Global");
3094 			clGetDeviceInfo(devices[sequence_nr],
3095 			                CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
3096 			                sizeof(cl_ulong), &long_entries, NULL);
3097 			if (long_entries)
3098 				printf("    Constant Buffer size:   %sB\n",
3099 				       human_prefix(long_entries)
3100 				      );
3101 			clGetDeviceInfo(devices[sequence_nr],
3102 			                CL_DEVICE_MAX_MEM_ALLOC_SIZE,
3103 			                sizeof(long_entries), &long_entries, NULL);
3104 			printf("    Max memory alloc. size: %sB\n",
3105 			       human_prefix(long_entries));
3106 			ret = clGetDeviceInfo(devices[sequence_nr],
3107 			                      CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_int), &entries, NULL);
3108 			if (ret == CL_SUCCESS && entries)
3109 				printf("    Max clock (MHz):        %u\n", entries);
3110 			ret = clGetDeviceInfo(devices[sequence_nr],
3111 			                      CL_DEVICE_PROFILING_TIMER_RESOLUTION,
3112 			                      sizeof(size_t), &z_entries, NULL);
3113 			if (ret == CL_SUCCESS && z_entries)
3114 				printf("    Profiling timer res.:   "Zu" ns\n", z_entries);
3115 			clGetDeviceInfo(devices[sequence_nr],
3116 			                CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &p_size, NULL);
3117 			printf("    Max Work Group Size:    %d\n", (int)p_size);
3118 			clGetDeviceInfo(devices[sequence_nr],
3119 			                CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &entries, NULL);
3120 			printf("    Parallel compute cores: %d\n", entries);
3121 
3122 			long_entries = get_processors_count(sequence_nr);
3123 			if (!cpu && ocl_device_list[sequence_nr].cores_per_MP > 1)
3124 				printf("    %s      "LLu" "
3125 				       " (%d x %d)\n",
3126 					gpu_nvidia(device_info[sequence_nr]) ? "CUDA cores:       " : "Stream processors:",
3127 				       (unsigned long long)long_entries,
3128 				       entries, ocl_device_list[sequence_nr].cores_per_MP);
3129 			printf("    Speed index:            %u\n",
3130 			       opencl_speed_index(sequence_nr));
3131 
3132 			ret = clGetDeviceInfo(devices[sequence_nr],
3133 			                      CL_DEVICE_SIMD_WIDTH_AMD, sizeof(cl_uint),
3134 			                      &long_entries, NULL);
3135 			if (ret == CL_SUCCESS)
3136 				printf("    SIMD width:             "LLu"\n",
3137 				       (unsigned long long)long_entries);
3138 
3139 			ret = clGetDeviceInfo(devices[sequence_nr],
3140 			                      CL_DEVICE_WAVEFRONT_WIDTH_AMD,
3141 			                      sizeof(cl_uint), &long_entries, NULL);
3142 			if (ret == CL_SUCCESS)
3143 				printf("    Wavefront width:        "LLu"\n",
3144 				       (unsigned long long)long_entries);
3145 
3146 			ret = clGetDeviceInfo(devices[sequence_nr],
3147 			                      CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint),
3148 			                      &long_entries, NULL);
3149 			if (ret == CL_SUCCESS)
3150 				printf("    Warp size:              "LLu"\n",
3151 				       (unsigned long long)long_entries);
3152 
3153 			ret = clGetDeviceInfo(devices[sequence_nr],
3154 			                      CL_DEVICE_REGISTERS_PER_BLOCK_NV,
3155 			                      sizeof(cl_uint), &long_entries, NULL);
3156 			if (ret == CL_SUCCESS)
3157 				printf("    Max. GPRs/work-group:   "LLu"\n",
3158 				       (unsigned long long)long_entries);
3159 
3160 			if (gpu_nvidia(device_info[sequence_nr])) {
3161 				unsigned int major = 0, minor = 0;
3162 
3163 				get_compute_capability(sequence_nr, &major, &minor);
3164 				if (major && minor)
3165 					printf("    Compute capability:     %u.%u "
3166 					       "(sm_%u%u)\n", major, minor, major, minor);
3167 			}
3168 			ret = clGetDeviceInfo(devices[sequence_nr],
3169 			                      CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV,
3170 			                      sizeof(cl_bool), &boolean, NULL);
3171 			if (ret == CL_SUCCESS)
3172 				printf("    Kernel exec. timeout:   %s\n",
3173 				       boolean ? "yes" : "no");
3174 
3175 			fan = temp = util = cl = ml = -1;
3176 #if HAVE_LIBDL
3177 			if (nvml_lib && gpu_nvidia(device_info[sequence_nr]) &&
3178 			    id2nvml(ocl_device_list[sequence_nr].pci_info) >= 0) {
3179 				printf("    NVML id:                %d\n",
3180 				       id2nvml(ocl_device_list[sequence_nr].pci_info));
3181 				nvidia_get_temp(id2nvml(ocl_device_list[sequence_nr].pci_info),
3182 				                &temp, &fan, &util, &cl, &ml);
3183 			} else if (adl_lib && gpu_amd(device_info[sequence_nr])) {
3184 				printf("    ADL:                    Overdrive%d, device id %d\n",
3185 				       adl2od[id2adl(ocl_device_list[sequence_nr].pci_info)],
3186 				       id2adl(ocl_device_list[sequence_nr].pci_info));
3187 				amd_get_temp(id2adl(ocl_device_list[sequence_nr].pci_info),
3188 				             &temp, &fan, &util, &cl, &ml);
3189 			}
3190 #endif
3191 			if (ocl_device_list[sequence_nr].pci_info.bus >= 0) {
3192 				printf("    PCI device topology:    %s\n",
3193 				       ocl_device_list[sequence_nr].pci_info.busId);
3194 			}
3195 			if (cl >= 0)
3196 				printf("    PCI lanes:              %d/%d\n", cl, ml);
3197 			if (fan >= 0)
3198 				printf("    Fan speed:              %u%%\n", fan);
3199 			if (temp >= 0)
3200 				printf("    Temperature:            %u%sC\n",
3201 				       temp, gpu_degree_sign);
3202 			if (util >= 0)
3203 				printf("    Utilization:            %u%%\n", util);
3204 			else if (temp >= 0)
3205 				printf("    Utilization:            n/a\n");
3206 			puts("");
3207 		}
3208 	}
3209 	return;
3210 }
3211 
3212 #undef LOG_SIZE
3213 #undef SRC_SIZE
3214 #endif
3215