1 /*
2 * This file is part of John the Ripper password cracker.
3 *
4 * Common OpenCL functions.
5 *
6 * This software is
7 * Copyright (c) 2010-2012 Samuele Giovanni Tonon <samu at linuxasylum dot net>
8 * Copyright (c) 2010-2013 Lukas Odzioba <ukasz@openwall.net>
9 * Copyright (c) 2010-2015 magnum
10 * Copyright (c) 2012-2015 Claudio André <claudioandre.br at gmail.com>
11 *
12 * and is hereby released to the general public under the following terms:
13 * Redistribution and use in source and binary forms, with or without
14 * modifications, are permitted.
15 */
16
17 #ifdef HAVE_OPENCL
18
19 #define _BSD_SOURCE 1 // setenv()
20 #define _DEFAULT_SOURCE 1 // setenv()
21 #define NEED_OS_TIMER
22 #define NEED_OS_FLOCK
23 #define NEED_OS_FORK
24 #include "os.h"
25
26 #include <assert.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <sys/stat.h>
30 #include <time.h>
31 #include <signal.h>
32 #include <limits.h>
33 #include <stdlib.h>
34 #if !AC_BUILT || HAVE_FCNTL_H
35 #include <fcntl.h>
36 #endif
37 #include <unistd.h>
38
39 #ifdef NO_JOHN_BLD
40 #define JOHN_BLD "unk-build-type"
41 #else
42 #include "john_build_rule.h"
43 #endif
44
45 #include "jumbo.h"
46 #include "options.h"
47 #include "config.h"
48 #include "common.h"
49 #include "logger.h"
50 #include "opencl_common.h"
51 #include "mask_ext.h"
52 #include "dyna_salt.h"
53 #include "signals.h"
54 #include "recovery.h"
55 #include "status.h"
56 #include "john.h"
57 #include "md5.h"
58 #include "misc.h"
59 #include "john_mpi.h"
60
61 /* Set this to eg. 3 for some added debug and retry stuff */
62 #define RACE_CONDITION_DEBUG 0
63
64 #define LOG_SIZE 1024*16
65
66 #if !defined(__CYGWIN__) && !defined(__MINGW32__)
67 // If true, use realpath(3) for translating eg. "-I./kernels" into an absolute
68 // path before submitting as JIT compile option to OpenCL.
69 #define I_REALPATH 1
70 #endif
71
72 // If we are a release build, only output OpenCL build log if
73 // there was a fatal error (or --verbosity was increased).
74 #ifdef JTR_RELEASE_BUILD
75 #define LOG_VERB VERB_LEGACY
76 #else
77 #define LOG_VERB VERB_DEFAULT
78 #endif
79
80 /* Common OpenCL variables */
81 int platform_id;
82 int default_gpu_selected;
83 int default_device_selected;
84 int ocl_autotune_running;
85 size_t ocl_max_lws;
86
87 static char opencl_log[LOG_SIZE];
88 static int opencl_initialized;
89
90 static void load_device_info(int sequential_id);
91 static char* get_device_capability(int sequential_id);
92
93 // Used by auto-tuning to decide how GWS should changed between trials.
94 extern int autotune_get_next_gws_size(size_t num, int step, int startup,
95 int default_value);
96 extern int autotune_get_prev_gws_size(size_t num, int step);
97
98 // Settings to use for auto-tuning.
99 static int buffer_size;
100 static int default_value;
101 static int hash_loops;
102 static int duration_time = 0;
103 static const char **warnings;
104 static int *split_events;
105 static int main_opencl_event;
106 static struct fmt_main *self;
107 static void (*create_clobj)(size_t gws, struct fmt_main *self);
108 static void (*release_clobj)(void);
109 static char fmt_base_name[128];
110 static size_t gws_limit;
111 static int printed_mask;
112 static struct db_main *autotune_db;
113 static struct db_salt *autotune_salts;
114 int autotune_real_db;
115
116 typedef struct {
117 cl_platform_id platform;
118 int num_devices;
119 } cl_platform;
120 static cl_platform platforms[MAX_PLATFORMS + 1];
121
122
123 cl_device_id devices[MAX_GPU_DEVICES + 1];
124 cl_context context[MAX_GPU_DEVICES];
125 cl_program program[MAX_GPU_DEVICES];
126 cl_command_queue queue[MAX_GPU_DEVICES];
127 cl_int ret_code;
128 cl_kernel crypt_kernel;
129 size_t local_work_size;
130 size_t global_work_size;
131 size_t max_group_size;
132 unsigned int ocl_v_width = 1;
133 unsigned long long global_speed;
134
135 cl_event *profilingEvent, *firstEvent, *lastEvent;
136 cl_event *multi_profilingEvent[MAX_EVENTS];
137
138 int device_info[MAX_GPU_DEVICES];
139 static ocl_device_details ocl_device_list[MAX_GPU_DEVICES];
140
opencl_process_event(void)141 void opencl_process_event(void)
142 {
143 if (!ocl_autotune_running && !bench_or_test_running) {
144 #if !OS_TIMER
145 sig_timer_emu_tick();
146 #endif
147 if (event_pending) {
148 if (event_save) {
149 event_save = 0;
150 rec_save();
151 }
152
153 if (event_status) {
154 event_status = 0;
155 status_print();
156 }
157
158 if (event_ticksafety) {
159 event_ticksafety = 0;
160 status_ticks_overflow_safety();
161 }
162
163 event_pending = (event_abort || event_poll_files || event_reload);
164 }
165 }
166 }
167
get_number_of_available_platforms()168 int get_number_of_available_platforms()
169 {
170 int i = 0;
171
172 while (platforms[i].platform)
173 i++;
174
175 return i;
176 }
177
178 /* Get the number of available devices (all the OpenCL devices) */
get_number_of_available_devices()179 int get_number_of_available_devices()
180 {
181 int total = 0, i = 0;
182
183 while (platforms[i].platform)
184 total += platforms[i++].num_devices;
185
186 return total;
187 }
188
189 /*
190 * Get the total number of devices that were requested (do not count duplicates)
191 * --device=2,2 result that "one" device is really in use;
192 */
get_number_of_devices_in_use()193 int get_number_of_devices_in_use()
194 {
195 int i = 0;
196
197 while (engaged_devices[i] != DEV_LIST_END)
198 i++;
199
200 return i;
201 }
202
203 /*
204 * Get the total number of requested devices (count duplicates)
205 * --device=2,2 result that "two" devices will be used. E.g., to split tasks;
206 */
get_number_of_requested_devices()207 int get_number_of_requested_devices()
208 {
209 int i = 0;
210
211 while (requested_devices[i] != DEV_LIST_END)
212 i++;
213
214 return i;
215 }
216
get_platform_id(int sequential_id)217 int get_platform_id(int sequential_id)
218 {
219 int pos = 0, i = 0;
220
221 while (platforms[i].platform) {
222 pos += platforms[i].num_devices;
223
224 if (sequential_id < pos)
225 break;
226 i++;
227 }
228 return (platforms[i].platform ? i : -1);
229 }
230
get_device_id(int sequential_id)231 int get_device_id(int sequential_id)
232 {
233 int pos = sequential_id, i = 0;
234
235 while (platforms[i].platform && pos >= platforms[i].num_devices) {
236 pos -= platforms[i].num_devices;
237 i++;
238 }
239 return (platforms[i].platform ? pos : -1);
240 }
241
get_sequential_id(unsigned int dev_id,unsigned int platform_id)242 int get_sequential_id(unsigned int dev_id, unsigned int platform_id)
243 {
244 int pos = 0, i = 0;
245
246 while (platforms[i].platform && i < platform_id)
247 pos += platforms[i++].num_devices;
248
249 if (i == platform_id && dev_id >= platforms[i].num_devices)
250 return -1;
251
252 return (platforms[i].platform ? pos + dev_id : -1);
253 }
254
opencl_driver_value(int sequential_id,int * major,int * minor)255 void opencl_driver_value(int sequential_id, int *major, int *minor)
256 {
257 char dname[MAX_OCLINFO_STRING_LEN];
258 char *p;
259
260 *major = 0, *minor = 0;
261
262 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DRIVER_VERSION,
263 sizeof(dname), dname, NULL), "clGetDeviceInfo for CL_DRIVER_VERSION");
264
265 p = dname;
266 while (*p && !isdigit((int)*p))
267 p++;
268 if (*p) {
269 *major = atoi(p);
270 while (*p && isdigit((int)*p))
271 p++;
272 while (*p && !isdigit((int)*p))
273 p++;
274 if (*p) {
275 *minor = atoi(p);
276 }
277 }
278 }
279
/* Build the -DDEV_VER_* JIT options from the device's driver version.
 * Returns static storage, overwritten on each call. */
static char *opencl_driver_ver(int sequential_id)
{
	static char ret[64];
	int major = 0, minor = 0;

	opencl_driver_value(sequential_id, &major, &minor);

	snprintf(ret, sizeof(ret), "-DDEV_VER_MAJOR=%d -DDEV_VER_MINOR=%d",
	         major, minor);

	return ret;
}
292
remove_spaces(char * str)293 static char *remove_spaces(char *str) {
294
295 char *out = str, *put = str;
296
297 for (; *str; str++) {
298 if (*str != ' ')
299 *put++ = *str;
300 }
301 *put = '\0';
302
303 return out;
304 }
305
opencl_driver_info(int sequential_id)306 static char *opencl_driver_info(int sequential_id)
307 {
308 static char buf[64 + MAX_OCLINFO_STRING_LEN];
309 char dname[MAX_OCLINFO_STRING_LEN], tmp[sizeof(buf)], set[64];
310 static char output[sizeof(tmp) + sizeof(dname)];
311 char *name, *recommendation = NULL;
312 int major = 0, minor = 0, conf_major = 0, conf_minor = 0, found;
313 struct cfg_list *list;
314 struct cfg_line *line;
315
316 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DRIVER_VERSION,
317 sizeof(dname), dname, NULL), "clGetDeviceInfo for CL_DRIVER_VERSION");
318
319 opencl_driver_value(sequential_id, &major, &minor);
320 name = buf;
321
322 if ((list = cfg_get_list("List.OpenCL:", "Drivers")))
323 if ((line = list->head))
324 do {
325 char *p;
326
327 //Parse driver information.
328 strncpy(set, line->data, 64);
329 remove_spaces(set);
330
331 p = strtokm(set, ",");
332 conf_major = strtoul(p, NULL, 10);
333
334 p = strtokm(NULL, ";");
335 conf_minor = strtoul(p, NULL, 10);
336
337 name = strtokm(NULL, ";");
338 recommendation = strtokm(NULL, ";");
339
340 if (gpu_amd(device_info[sequential_id]))
341 if (conf_major == major && conf_minor == minor)
342 break;
343
344 if (gpu_nvidia(device_info[sequential_id]))
345 if (recommendation && strstr(recommendation, "N"))
346 if (conf_major <= major && conf_minor <= minor)
347 break;
348
349 #ifdef OCL_DEBUG
350 fprintf(stderr, "Driver: %i, %i -> %s , %s\n",
351 conf_major, conf_minor, name, recommendation);
352 #endif
353 } while ((line = line->next));
354
355 if (gpu_amd(device_info[sequential_id]) &&
356 get_platform_vendor_id(get_platform_id(sequential_id)) == DEV_AMD) {
357
358 if (major < 1912)
359 snprintf(buf, sizeof(buf), "%s - Catalyst %s", dname, name);
360 else if (major < 2500)
361 snprintf(buf, sizeof(buf), "%s - Crimson %s", dname, name);
362 else
363 snprintf(buf, sizeof(buf), "%s - AMDGPU-Pro %s", dname, name);
364 snprintf(tmp, sizeof(tmp), "%s", buf);
365 } else
366 snprintf(tmp, sizeof(tmp), "%s", dname);
367
368 snprintf(dname, sizeof(dname), " ");
369
370 if (recommendation) {
371 //Check hardware
372 found = (strstr(recommendation, "G") && amd_gcn(device_info[sequential_id]));
373 found += (strstr(recommendation, "N") && gpu_nvidia(device_info[sequential_id]));
374 found += (strstr(recommendation, "V") &&
375 (amd_vliw4(device_info[sequential_id]) ||
376 amd_vliw5(device_info[sequential_id])));
377
378 //Check OS
379 if (found) {
380 found = (strstr(recommendation, "*") != NULL);
381 found += (strstr(recommendation, "L") && strstr(JOHN_BLD, "linux"));
382 found += (strstr(recommendation, "W") && strstr(JOHN_BLD, "windows"));
383 }
384
385 if (strstr(recommendation, "T"))
386 snprintf(dname, sizeof(dname), " [known bad]");
387 else if (found) {
388 if (strstr(recommendation, "R"))
389 snprintf(dname, sizeof(dname), " [recommended]");
390 else if (strstr(recommendation, "S"))
391 snprintf(dname, sizeof(dname), " [supported]");
392 }
393 }
394 snprintf(output, sizeof(output), "%s%s", tmp, dname);
395
396 return output;
397 }
398
/* Render a nanosecond count as a short human-readable duration
 * ("1.234s", "56ms", "789ns", ...). Buffer comes from the tiny allocator. */
static char *ns2string(cl_ulong nanosec)
{
	char *buf = mem_alloc_tiny(16, MEM_ALIGN_NONE);
	int s, ms, us, ns;

	/* Decompose into seconds / milli / micro / nano components. */
	ns = nanosec % 1000;
	nanosec /= 1000;
	us = nanosec % 1000;
	nanosec /= 1000;
	ms = nanosec % 1000;
	s = nanosec / 1000;

	/* Print the two most significant non-zero components. */
	if (s)
		snprintf(buf, 16, ms ? "%d.%03ds" : "%ds", s, ms);
	else if (ms)
		snprintf(buf, 16, us ? "%d.%03dms" : "%dms", ms, us);
	else if (us)
		snprintf(buf, 16, ns ? "%d.%03dus" : "%dus", us, ns);
	else
		snprintf(buf, 16, "%dns", ns);

	return buf;
}
430
/* Render a millisecond count via the nanosecond pretty-printer. */
static char *ms2string(int millisec)
{
	return ns2string(millisec * 1000000ULL);
}
435
get_if_device_is_in_use(int sequential_id)436 static int get_if_device_is_in_use(int sequential_id)
437 {
438 int i = 0, found = 0;
439 int num_devices;
440
441 if (sequential_id >= get_number_of_available_devices()) {
442 return -1;
443 }
444
445 num_devices = get_number_of_devices_in_use();
446
447 for (i = 0; i < num_devices && !found; i++) {
448 if (sequential_id == engaged_devices[i])
449 found = 1;
450 }
451 return found;
452 }
453
454 /*
455 * Load information about all platforms and devices available in the
456 * running system
457 */
load_opencl_environment()458 static void load_opencl_environment()
459 {
460 cl_platform_id platform_list[MAX_PLATFORMS];
461 cl_uint num_platforms, device_pos = 0;
462 int ret, i;
463
464 /* Find OpenCL enabled devices. We ignore error here, in case
465 * there is no platform and we'd like to run a non-OpenCL format. */
466 ret = clGetPlatformIDs(MAX_PLATFORMS, platform_list, &num_platforms);
467
468 if (ret != CL_SUCCESS)
469 num_platforms = 0;
470
471 if (num_platforms < 1 && options.verbosity > VERB_LEGACY)
472 fprintf(stderr, "%u: No OpenCL platforms were found: %s\n",
473 NODE, get_error_name(ret));
474
475 for (i = 0; i < num_platforms; i++) {
476 cl_uint num_devices;
477
478 // It is possible to have a platform without any devices
479 // Ignore error here too on purpose.
480 ret = clGetDeviceIDs(platform_list[i], CL_DEVICE_TYPE_ALL,
481 MAX_GPU_DEVICES - device_pos, /* avoid buffer overrun */
482 &devices[device_pos], &num_devices);
483 if (ret != CL_SUCCESS)
484 num_devices = 0;
485
486 if (num_devices < 1 && options.verbosity > VERB_LEGACY)
487 fprintf(stderr,
488 "%u: No OpenCL devices were found on platform #%d: %s\n",
489 NODE, i, get_error_name(ret));
490
491 // Save platform and devices information
492 platforms[i].platform = platform_list[i];
493 platforms[i].num_devices = num_devices;
494
495 // Point to the end of the list
496 device_pos += num_devices;
497
498 #ifdef OCL_DEBUG
499 {
500 char opencl_data[LOG_SIZE];
501
502 SOFT_CLERROR(clGetPlatformInfo(platform_list[i],
503 CL_PLATFORM_NAME, sizeof(opencl_data), opencl_data, NULL),
504 "clGetPlatformInfo for CL_PLATFORM_NAME");
505
506 fprintf(stderr, "%u: OpenCL platform %d: %s, %d device(s).\n",
507 NODE, i, opencl_data, num_devices);
508 }
509 #endif
510 }
511
512 // Set NULL to the final buffer position.
513 platforms[i].platform = NULL;
514 devices[device_pos] = NULL;
515 }
516
/*
 * Fill in PCI bus:device.function information for a device via the
 * vendor-specific OpenCL extensions (AMD topology, NVIDIA PCI ids).
 * Devices of other vendors keep the "unknown" defaults and CL_SUCCESS
 * is returned; any failing clGetDeviceInfo() error code is passed back.
 */
static cl_int get_pci_info(int sequential_id, hw_bus *hardware_info)
{

	cl_int ret;

	/* Defaults: location unknown until a vendor query succeeds. */
	hardware_info->bus = -1;
	hardware_info->device = -1;
	hardware_info->function = -1;
	memset(hardware_info->busId, '\0', sizeof(hardware_info->busId));

	if (gpu_amd(device_info[sequential_id]) ||
	    cpu_amd(device_info[sequential_id])) {
		/* AMD: one query returns the whole PCIe topology. */
		cl_device_topology_amd topo;

		ret = clGetDeviceInfo(devices[sequential_id],
			CL_DEVICE_TOPOLOGY_AMD, sizeof(topo), &topo, NULL);

		if (ret == CL_SUCCESS) {
			hardware_info->bus = topo.pcie.bus & 0xff;
			hardware_info->device = topo.pcie.device & 0xff;
			hardware_info->function = topo.pcie.function & 0xff;
		} else
			return ret;
	} else if (gpu_nvidia(device_info[sequential_id])) {
		/* NVIDIA: bus and slot come from two separate queries. */
		cl_uint entries;

		ret = clGetDeviceInfo(devices[sequential_id], CL_DEVICE_PCI_BUS_ID_NV,
		                      sizeof(cl_uint), &entries, NULL);

		if (ret == CL_SUCCESS)
			hardware_info->bus = entries;
		else
			return ret;

		ret = clGetDeviceInfo(devices[sequential_id], CL_DEVICE_PCI_SLOT_ID_NV,
		                      sizeof(cl_uint), &entries, NULL);

		if (ret == CL_SUCCESS) {
			/* Slot id packs device (upper bits) and function (low 3). */
			hardware_info->device = entries >> 3;
			hardware_info->function = entries & 7;
		} else
			return ret;
	} else
		/* No known extension for this vendor; leave defaults. */
		return CL_SUCCESS;

	/* Canonical "bus:device.function" string, e.g. "01:00.0". */
	sprintf(hardware_info->busId, "%02x:%02x.%x", hardware_info->bus,
	        hardware_info->device, hardware_info->function);
	return CL_SUCCESS;
}
566
/*
 * Initialize an OpenCL device:
 * - create context and queue;
 * - get bus and map to monitoring stuff;
 *
 * Returns 1 on success (fatal errors abort via error() instead of
 * returning). The err_type out-parameter is currently never written.
 */
static int start_opencl_device(int sequential_id, int *err_type)
{
	cl_context_properties properties[3];
	char opencl_data[LOG_SIZE];
	int retry = 0;

	// Get the detailed information about the device
	// (populate device_info[d] bitfield).
	load_device_info(sequential_id);

	// Get hardware bus/PCIE information.
	get_pci_info(sequential_id, &ocl_device_list[sequential_id].pci_info);

	// Map temp monitoring function and NVML/ADL id to our device id
	if (gpu_nvidia(device_info[sequential_id])) {
		temp_dev_id[sequential_id] =
			id2nvml(ocl_device_list[sequential_id].pci_info);
		/* Temperature readings only when the NVML library is loaded. */
		dev_get_temp[sequential_id] = nvml_lib ? nvidia_get_temp : NULL;
	} else if (gpu_amd(device_info[sequential_id])) {
		temp_dev_id[sequential_id] =
			id2adl(ocl_device_list[sequential_id].pci_info);
		dev_get_temp[sequential_id] = adl_lib ? amd_get_temp : NULL;

		if (sequential_id > 0 &&
		    temp_dev_id[sequential_id] == temp_dev_id[sequential_id - 1]) {
			/* Kludge for 7990 > 14.9. We hates AMD. */
			/* Two consecutive devices mapped to the same ADL id:
			 * bump the bus number and re-map the second one. */
			ocl_device_list[sequential_id].pci_info.bus++;
			temp_dev_id[sequential_id] =
				id2adl(ocl_device_list[sequential_id].pci_info);
		}
	} else {
		/* No vendor-specific monitoring available. */
		temp_dev_id[sequential_id] = sequential_id;
		dev_get_temp[sequential_id] = NULL;
	}

	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
	               sizeof(opencl_data), opencl_data, NULL),
	               "clGetDeviceInfo for DEVICE_NAME");

	max_group_size = get_device_max_lws(sequential_id);

	/* Retry loop: with RACE_CONDITION_DEBUG > 0, transient failures
	 * (e.g. several forked processes hitting the driver at once) are
	 * retried with a small staggered sleep; otherwise one failure is
	 * immediately fatal. */
	do {
		// Get the platform properties
		properties[0] = CL_CONTEXT_PLATFORM;
		properties[1] = (cl_context_properties)
			platforms[get_platform_id(sequential_id)].platform;
		properties[2] = 0;

		// Setup context and queue
		context[sequential_id] = clCreateContext(properties, 1,
			&devices[sequential_id], NULL, NULL, &ret_code);

		if (ret_code != CL_SUCCESS) {
			fprintf(stderr, "%u: Error creating context for device %d "
			        "(%d:%d): %s, %s\n",
			        NODE, sequential_id,
			        get_platform_id(sequential_id),
			        get_device_id(sequential_id), get_error_name(ret_code),
			        retry < RACE_CONDITION_DEBUG ? "retrying" : "giving up");
			if (++retry > RACE_CONDITION_DEBUG)
				error();
			/* Stagger retries per node to break driver contention. */
			usleep((retry + NODE) * 100);
		}
	} while (ret_code != CL_SUCCESS);

	retry = 0;
	do {
		queue[sequential_id] = clCreateCommandQueue(context[sequential_id],
			devices[sequential_id], 0, &ret_code);

		if (ret_code != CL_SUCCESS) {
			fprintf(stderr, "%u: Error creating command queue for "
			        "device %d (%d:%d): %s, %s\n", NODE,
			        sequential_id, get_platform_id(sequential_id),
			        get_device_id(sequential_id), get_error_name(ret_code),
			        retry < RACE_CONDITION_DEBUG ? "retrying" : "giving up");
			if (++retry > RACE_CONDITION_DEBUG)
				error();
			usleep((retry + NODE) * 100);
		}
	} while (ret_code != CL_SUCCESS);

#ifdef OCL_DEBUG
	fprintf(stderr, "  Device %d: %s\n", sequential_id, opencl_data);
#endif

	// Success.
	return 1;
}
661
/* Add one requested OpenCL device to the list of the requested devices
 * - it only adds a device that is working properly;
 * - so, the device is initialized inside the routine;
 */
static void add_device_to_list(int sequential_id)
{
	int i = 0, found;

	/* -1: out of range; 0: valid but not engaged yet; 1: already engaged. */
	found = get_if_device_is_in_use(sequential_id);

	if (found < 0) {
#if HAVE_MPI
		if (mpi_p > 1)
			fprintf(stderr, "%u@%s: ", mpi_id + 1, mpi_name);
#elif OS_FORK
		if (options.fork)
			fprintf(stderr, "%u: ", options.node_min);
#endif
		fprintf(stderr, "Error: --device must be between 1 and %d "
		        "(the number of devices available).\n",
		        get_number_of_available_devices());
		error();
	}

	if (found == 0) {
		// Only requested and working devices should be started.
		if (! start_opencl_device(sequential_id, &i)) {
#if HAVE_MPI
			if (mpi_p > 1)
				fprintf(stderr, "%u@%s: ", mpi_id + 1, mpi_name);
#elif OS_FORK
			if (options.fork)
				fprintf(stderr, "%u: ", options.node_min);
#endif
			fprintf(stderr, "Device id %d not working correctly,"
			        " skipping.\n", sequential_id + 1);
			return;
		}
		/* Write order matters: get_number_of_devices_in_use() scans for
		 * the DEV_LIST_END sentinel, so the new sentinel at [n+1] must
		 * be placed before the sentinel at [n] is overwritten with the
		 * device id. */
		engaged_devices[get_number_of_devices_in_use() + 1] = DEV_LIST_END;
		engaged_devices[get_number_of_devices_in_use()] = sequential_id;
	}
	// The full list of requested devices.
	/* Same sentinel-first ordering as above. */
	requested_devices[get_number_of_requested_devices() + 1] = DEV_LIST_END;
	requested_devices[get_number_of_requested_devices()] = sequential_id;
}
707
708 /* Used below (inside add_device_type routine) to sort devices */
709 typedef struct {
710 int index;
711 cl_device_id ID;
712 unsigned int value;
713 } speed_sort_t;
714
715 /* Used below (inside add_device_type routine) to sort devices */
comparator(const void * p1,const void * p2)716 static int comparator(const void *p1, const void *p2)
717 {
718 const speed_sort_t *c1 = (const speed_sort_t *)p1;
719 const speed_sort_t *c2 = (const speed_sort_t *)p2;
720 int diff = (int)c2->value - (int)c1->value;
721 if (diff)
722 return diff;
723 return c1->index - c2->index;
724 }
725
/* Add groups of devices to requested OpenCL devices list */
static void add_device_type(cl_ulong device_type, int top)
{
	int i, j, sequence_nr = 0;
	int found = 0;
	speed_sort_t dev[MAX_GPU_DEVICES];

	// Get all devices of requested type.
	for (i = 0; platforms[i].platform; i++) {
		/* NOTE: this local array shadows the file-scope devices[]. */
		cl_device_id devices[MAX_GPU_DEVICES];
		cl_uint device_num = 0;

		if (clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_ALL,
		        MAX_GPU_DEVICES, devices, &device_num) == CL_SUCCESS) {
			// Sort devices by speed
			/* sequence_nr doubles as the global sequential device id
			 * here — assumes enumeration order matches the one used
			 * by load_opencl_environment(). */
			for (j = 0; j < device_num && sequence_nr < MAX_GPU_DEVICES;
			     j++, sequence_nr++) {
				load_device_info(sequence_nr);
				dev[sequence_nr].index = sequence_nr;
				dev[sequence_nr].ID = devices[j];
				dev[sequence_nr].value = opencl_speed_index(sequence_nr);
			}
		}
	}

	// If there is something to sort, do it.
	if (sequence_nr > 1)
		qsort(dev, sequence_nr, sizeof(dev[0]), comparator);

	// Add the devices sorted by speed devices
	for (j = 0; j < sequence_nr; j++) {
		cl_ulong long_entries = 0;

		/* Only engage devices whose type matches the request. */
		if (clGetDeviceInfo(dev[j].ID, CL_DEVICE_TYPE,
		        sizeof(cl_ulong), &long_entries, NULL) == CL_SUCCESS) {
			if (long_entries & device_type) {
				found++;
				add_device_to_list(dev[j].index);

				// Only the best should be added
				if (top)
					break;
			}
		}
	}
	// If testing preferred devices, do not warn or fail
	if (!found && !default_device_selected)
		error_msg("No OpenCL device of that type found\n");
}
775
776 /* Build a list of the requested OpenCL devices */
build_device_list(const char * device_list[MAX_GPU_DEVICES])777 static void build_device_list(const char *device_list[MAX_GPU_DEVICES])
778 {
779 int n = 0;
780
781 while (device_list[n] && n < MAX_GPU_DEVICES) {
782 int len = MAX(strlen(device_list[n]), 3);
783 /* Add devices in the preferable order: gpu,
784 * accelerator, and cpu. */
785 cl_device_type trial_list[] = {
786 CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_ACCELERATOR,
787 CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_DEFAULT
788 };
789
790 if (!strcmp(device_list[n], "all"))
791 add_device_type(CL_DEVICE_TYPE_ALL, 0);
792 else if (!strcmp(device_list[n], "cpu"))
793 add_device_type(CL_DEVICE_TYPE_CPU, 0);
794 else if (!strcmp(device_list[n], "gpu"))
795 add_device_type(CL_DEVICE_TYPE_GPU, 0);
796 else if (!strncmp(device_list[n], "accelerator", len))
797 add_device_type(CL_DEVICE_TYPE_ACCELERATOR, 0);
798 else if (!strncmp(device_list[n], "best", len)) {
799 int i = 0, top = (options.fork ? 0 : 1);
800
801 /* Set a flag that JtR has changed the value of --devices. */
802 default_device_selected = 1;
803 if (top)
804 default_gpu_selected = 1;
805
806 do
807 add_device_type(trial_list[i++], top);
808 while (get_number_of_devices_in_use() == 0 &&
809 trial_list[i] != CL_DEVICE_TYPE_DEFAULT);
810 }
811 else if (!isdigit(ARCH_INDEX(device_list[n][0]))) {
812 fprintf(stderr, "Error: --device must be numerical, "
813 "or one of \"all\", \"cpu\", \"gpu\" and\n"
814 "\"acc[elerator]\".\n");
815 error();
816 } else if (device_list[n][0] == '0') {
817 fprintf(stderr, "Error: --device must be between 1 and %d "
818 "(the number of devices available).\n",
819 get_number_of_available_devices());
820 error();
821 } else
822 add_device_to_list(atoi(device_list[n]) - 1);
823 n++;
824 }
825 }
826
827 /*
828 * Load the OpenCL environment
829 * - fill in the "existing" devices list (devices[] variable) and;
830 * - fill in the "in use" devices list (engaged_devices[] variable);
831 * - device was initialized;
832 * - do not count duplicates;
833 * --device=2,2 result that "one" device is really in use;
834 * - fill in the "all requested" devices list (requested_devices[] variable);
835 * - device was initialized;
836 * - count duplicates;
837 * --device=2,2 result that "two" devices will be used, e.g., to split tasks;
838 *
839 * Warn if no device is found
840 * On MPI, hide devices from other instances
841 */
opencl_load_environment(void)842 void opencl_load_environment(void)
843 {
844 char *env;
845
846 // Prefer COMPUTE over DISPLAY and lacking both, assume :0
847 env = getenv("COMPUTE");
848 if (env && *env)
849 setenv("DISPLAY", env, 1);
850 else {
851 // We assume that 10 dot something is X11
852 // forwarding so we override that too.
853 env = getenv("DISPLAY");
854 if (!env || !*env || strstr(env, ":10."))
855 setenv("DISPLAY", ":0", 1);
856 }
857
858 if (!opencl_initialized) {
859 int i;
860 const char *cmdline_devices[MAX_GPU_DEVICES];
861
862 nvidia_probe();
863 amd_probe();
864
865 // Initialize OpenCL global control variables
866 cmdline_devices[0] = NULL;
867 engaged_devices[0] = DEV_LIST_END;
868 requested_devices[0] = DEV_LIST_END;
869
870 for (i = 0; i < MAX_GPU_DEVICES; i++) {
871 context[i] = NULL;
872 queue[i] = NULL;
873 }
874
875 // Read the GPU temperature setting to abort
876 gpu_temp_limit = cfg_get_int(SECTION_OPTIONS, SUBSECTION_GPU,
877 "AbortTemperature");
878 cool_gpu_down = cfg_get_int(SECTION_OPTIONS, SUBSECTION_GPU,
879 "SleepOnTemperature");
880
881 // Load information about available platforms and devices
882 load_opencl_environment();
883
884 // Ensure that there is at least one OpenCL device available
885 if (get_number_of_available_devices() == 0) {
886 fprintf(stderr, "No OpenCL devices found\n");
887 error();
888 }
889
890 // Get the "--device" list requested by the user
891 {
892 int n = 0;
893 struct list_entry *current;
894
895 if ((current = options.acc_devices->head)) {
896 do {
897 cmdline_devices[n++] = current->data;
898 } while ((current = current->next) && n < MAX_GPU_DEVICES);
899
900 cmdline_devices[n] = NULL;
901 } else
902 gpu_id = NO_GPU;
903 }
904
905 // If none selected, read the "--device" from the configuration file
906 if (!options.acc_devices->head && gpu_id <= NO_GPU) {
907 const char *devcfg;
908
909 if ((devcfg = cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
910 "Device")) && *devcfg) {
911 cmdline_devices[0] = devcfg;
912 cmdline_devices[1] = NULL;
913 }
914 }
915
916 // No "--device" requested. Pick the most powerful GPU as the default one.
917 if (!cmdline_devices[0]) {
918 cmdline_devices[0] = "best";
919 cmdline_devices[1] = NULL;
920 }
921
922 // Build the list of requested (and working) OpenCL devices
923 build_device_list(cmdline_devices);
924
925 // No working OpenCL device was found
926 if (get_number_of_devices_in_use() == 0) {
927 fprintf(stderr, "No OpenCL devices found\n");
928 error();
929 }
930 #if OS_FORK
931 // Poor man's multi-device support.
932 if ((options.fork ? options.fork : 1) > 1 && options.acc_devices->count) {
933 // Pick device to use for this node
934 gpu_id = requested_devices[(options.node_min - 1) %
935 get_number_of_requested_devices()];
936
937 // Hide any other devices from list
938 engaged_devices[0] = gpu_id;
939 engaged_devices[1] = DEV_LIST_END;
940 } else
941 #endif
942
943 #ifdef HAVE_MPI
944 // Poor man's multi-device support.
945 if (mpi_p > 1 && mpi_p_local > 1) {
946 // Pick device to use for this node
947 gpu_id = engaged_devices[mpi_id % get_number_of_devices_in_use()];
948
949 // Hide any other devices from list
950 engaged_devices[0] = gpu_id;
951 engaged_devices[1] = DEV_LIST_END;
952 } else
953 #endif
954 gpu_id = engaged_devices[0];
955 platform_id = get_platform_id(gpu_id);
956
957 opencl_initialized = 1;
958 }
959 }
960
961 /* Get the device preferred vector width */
opencl_get_vector_width(int sequential_id,int size)962 unsigned int opencl_get_vector_width(int sequential_id, int size)
963 {
964 /* --force-scalar option, or john.conf ForceScalar boolean */
965 if (options.flags & FLG_SCALAR)
966 options.v_width = 1;
967
968 /* --force-vector-width=N */
969 if (options.v_width) {
970 ocl_v_width = options.v_width;
971 } else {
972 cl_uint v_width = 0;
973
974 // If OpenCL has not yet been loaded, load it now
975 opencl_load_environment();
976
977 /* OK, we supply the real figure */
978 switch (size) {
979 case sizeof(cl_char):
980 HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
981 CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
982 sizeof(v_width), &v_width, NULL),
983 "clGetDeviceInfo for char vector width");
984 break;
985 case sizeof(cl_short):
986 HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
987 CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
988 sizeof(v_width), &v_width, NULL),
989 "clGetDeviceInfo for short vector width");
990 break;
991 case sizeof(cl_int):
992 HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
993 CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
994 sizeof(v_width), &v_width, NULL),
995 "clGetDeviceInfo for int vector width");
996 break;
997 case sizeof(cl_long):
998 HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id],
999 CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
1000 sizeof(v_width), &v_width, NULL),
1001 "clGetDeviceInfo for long vector width");
1002 break;
1003 default:
1004 fprintf(stderr, "%s() called with unknown type\n", __FUNCTION__);
1005 error();
1006 }
1007 ocl_v_width = v_width;
1008 }
1009 return ocl_v_width;
1010 }
1011
1012 /* Called by core after calling format's done() */
opencl_done()1013 void opencl_done()
1014 {
1015 int i;
1016 int num_devices;
1017
1018 printed_mask = 0;
1019
1020 if (!opencl_initialized)
1021 return;
1022
1023 num_devices = get_number_of_devices_in_use();
1024
1025 for (i = 0; i < num_devices; i++) {
1026 if (queue[engaged_devices[i]])
1027 HANDLE_CLERROR(clReleaseCommandQueue(queue[engaged_devices[i]]),
1028 "clReleaseCommandQueue");
1029 queue[engaged_devices[i]] = NULL;
1030 if (context[engaged_devices[i]])
1031 HANDLE_CLERROR(clReleaseContext(context[engaged_devices[i]]),
1032 "clReleaseContext");
1033 context[engaged_devices[i]] = NULL;
1034 program[engaged_devices[i]] = NULL;
1035 }
1036
1037 /* Reset in case we load another format after this */
1038 local_work_size = global_work_size = duration_time = 0;
1039 ocl_max_lws = 0;
1040 ocl_v_width = 1;
1041 fmt_base_name[0] = 0;
1042 opencl_initialized = 0;
1043 crypt_kernel = NULL;
1044
1045 engaged_devices[0] = engaged_devices[1] = DEV_LIST_END;
1046 }
1047
/* Compose "<format><config_name>" (e.g. "rawmd5" + "LWS") in static
 * storage; each call overwrites the previous result. */
static char *opencl_get_config_name(const char *format, const char *config_name)
{
	static char item[256];

	snprintf(item, sizeof(item), "%s%s", format, config_name);

	return item;
}
1055
/*
 * Populate local_work_size, global_work_size and duration_time from,
 * in increasing priority: per-format john.conf settings, environment
 * variables (LWS/GWS), and command-line options (--lws/--gws).
 * A NULL format clears the cached format base name.
 */
void opencl_get_user_preferences(const char *format)
{
	char *tmp_value;

	if (format) {
		/* Cache the format name with any "-opencl" style suffix
		 * stripped at the last '-', lower-cased, for config lookups. */
		snprintf(fmt_base_name, sizeof(fmt_base_name), "%s", format);
		if ((tmp_value = strrchr(fmt_base_name, (int)'-')))
			*tmp_value = 0;
		strlwr(fmt_base_name);
	} else
		fmt_base_name[0] = 0;

	/* LWS: config file first... */
	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
	        opencl_get_config_name(fmt_base_name, LWS_CONFIG_NAME))))
		local_work_size = atoi(tmp_value);

	/* ...then --lws overrides, else the LWS environment variable. */
	if (options.lws)
		local_work_size = options.lws;
	else if ((tmp_value = getenv("LWS")))
		local_work_size = atoi(tmp_value);

	/* GWS: same priority order as LWS. */
	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS, SUBSECTION_OPENCL,
	        opencl_get_config_name(fmt_base_name, GWS_CONFIG_NAME))))
		global_work_size = atoi(tmp_value);

	if (options.gws)
		global_work_size = options.gws;
	else if ((tmp_value = getenv("GWS")))
		global_work_size = atoi(tmp_value);

	if (local_work_size)
		// Ensure a valid multiple is used.
		global_work_size = GET_MULTIPLE_OR_ZERO(global_work_size,
		                                        local_work_size);

	/* Auto-tune duration: per-format setting, else the global one. */
	if (format && (tmp_value = (char*)cfg_get_param(SECTION_OPTIONS,
	        SUBSECTION_OPENCL, opencl_get_config_name(fmt_base_name,
	        DUR_CONFIG_NAME))) && *tmp_value)
		duration_time = atoi(tmp_value);
	else if ((tmp_value = (char*)cfg_get_param(SECTION_OPTIONS,
	        SUBSECTION_OPENCL, "Global" DUR_CONFIG_NAME)) && *tmp_value)
		duration_time = atoi(tmp_value);
}
1099
opencl_get_sane_lws_gws_values()1100 void opencl_get_sane_lws_gws_values()
1101 {
1102 if (!local_work_size) {
1103 if (cpu(device_info[gpu_id]))
1104 local_work_size =
1105 get_platform_vendor_id(platform_id) == DEV_INTEL ?
1106 8 : 1;
1107 else
1108 local_work_size = 64;
1109 }
1110
1111 if (!global_work_size)
1112 global_work_size = 768;
1113 }
1114
/*
 * Return the CL_DEVICE_NAME string of the device at `sequential_id`.
 * The result is a static buffer, overwritten by the next call; do not
 * free it.
 */
char* get_device_name_(int sequential_id)
{
	static char device_name[MAX_OCLINFO_STRING_LEN];

	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
	                               sizeof(device_name), device_name, NULL),
	               "clGetDeviceInfo for DEVICE_NAME");

	return device_name;
}
1125
1126 /* Print and log information about an OpenCL devide in use */
/* Print and log information about an OpenCL devide in use */
static void print_device_info(int sequential_id)
{
	/* Remembers which devices were already printed, so each is shown once. */
	static int printed[MAX_GPU_DEVICES];
	char device_name[MAX_OCLINFO_STRING_LEN];
	char board_name[LOG_SIZE] = "";
	cl_int ret_code;

	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
	                               sizeof(device_name), device_name, NULL),
	               "clGetDeviceInfo for DEVICE_NAME");

	/*
	 * AMD extension: marketing board name. Queried into the shared
	 * opencl_log buffer; failure (non-AMD) is simply ignored.
	 */
	ret_code = clGetDeviceInfo(devices[sequential_id],
	    CL_DEVICE_BOARD_NAME_AMD, sizeof(opencl_log), opencl_log, NULL);

	if (ret_code == CL_SUCCESS) {
		char *p = ltrim(rtrim(opencl_log));

		if (strlen(p))
			sprintf(board_name, " [%s]", p);
	}

	/* With MPI, tag the line with this node's host name. */
	if (options.verbosity > 1 && !printed[sequential_id]++)
		fprintf(stderr, "Device %d%s%s: %s%s\n",
		        sequential_id + 1,
#if HAVE_MPI
		        "@", mpi_name,
#else
		        "", "",
#endif
		        device_name, board_name);
	log_event("Device %d: %s%s", sequential_id + 1, device_name, board_name);
}
1159
1160 /*
1161 * Given a string, return a newly allocated string that is a copy of
1162 * the original but quoted. The old string is freed.
1163 */
/*
 * Given a string, return a newly allocated string that is a copy of
 * the original but quoted. The old string is freed.
 */
static char *quote_str(char *orig)
{
	size_t len = strlen(orig);
	char *quoted = mem_alloc(len + 3);

	quoted[0] = '"';
	memcpy(quoted + 1, orig, len);
	quoted[len + 1] = '"';
	quoted[len + 2] = 0;

	MEM_FREE(orig);

	return quoted;
}
1180
#if defined(__CYGWIN__) || defined(__MINGW32__)
/*
 * Cygwin/MinGW helper: if `self_path` does not name an existing
 * directory, try prefixing "./" (or "." for absolute-looking paths)
 * and then "../" until a directory is found. Returns either the input
 * pointer unchanged, or a newly allocated string (the input is freed
 * in that case). If nothing matches, a copy of the original path is
 * returned.
 */
static char *mingw_try_relative_path(char *self_path)
{
	int len;
	struct stat file_stat;
	struct path {
		char *prefix1, *prefix2;
	};

	/* Path already resolves to a directory: nothing to do. */
	if (!stat(self_path, &file_stat) && S_ISDIR(file_stat.st_mode))
		return self_path;

	len = strlen(self_path);
	/* Too long to prepend a prefix safely; give up early. */
	if (len > PATH_BUFFER_SIZE - 4)
		return self_path;

	{
		int i = 0;
		char *origin = (char *) mem_calloc(len + 1, sizeof(char));
		char *fixed_path = (char *) mem_calloc(PATH_BUFFER_SIZE, sizeof(char));
		struct path prefixes[] = {
			{".", "./" /* Child */ },
			{"..", "../" /* Root */ },
			{NULL, NULL}
		};
		strncpy(origin, self_path, len);
		MEM_FREE(self_path);

		/* Try each prefix; leading '/' means no extra slash needed. */
		while (prefixes[i].prefix1) {
			if (origin[0] == '/')
				strcpy(fixed_path, prefixes[i].prefix1);
			else
				strcpy(fixed_path, prefixes[i].prefix2);
			strncat(fixed_path, origin, len);

			if (!stat(fixed_path, &file_stat) && S_ISDIR(file_stat.st_mode))
				goto found;
			i++;
		}
		/* Give up */
		MEM_FREE(fixed_path);
		return origin;

found:
		MEM_FREE(origin);
		return fixed_path;
	}
}
#endif
1230
/*
 * Build the JIT compiler option string for clBuildProgram(): "-I <path>"
 * plus global options, platform/device-type defines, DEVICE_INFO and
 * host size_t width, the driver-version defines and any format-specific
 * `opts`. Returns a mem_calloc'd string the caller must free.
 */
static char *include_source(const char *pathname, int sequential_id, const char *opts)
{
	char *include, *full_path;
	const char *global_opts;

#if I_REALPATH
	/* Resolve eg. "$JOHN/kernels" to an absolute path. */
	char *pex = (char*)path_expand_safe(pathname);

	if (!(full_path = realpath(pex, NULL)))
		pexit("realpath()");

	MEM_FREE(pex);
#else
	full_path = (char*)path_expand_safe(pathname);
#if defined(__CYGWIN__) || defined(__MINGW32__)
	full_path = mingw_try_relative_path(full_path);
#endif
#endif

	include = (char *) mem_calloc(LINE_BUFFER_SIZE, sizeof(char));

	/* Global options: environment overrides config, which overrides default. */
	if (!(global_opts = getenv("OPENCLBUILDOPTIONS")))
		if (!(global_opts = cfg_get_param(SECTION_OPTIONS,
		                                  SUBSECTION_OPENCL, "GlobalBuildOpts")))
			global_opts = OPENCLBUILDOPTIONS;

	/* Paths with spaces must be quoted for the OpenCL compiler. */
	if (strchr(full_path, ' ')) {
		full_path = quote_str(full_path);
	}

	snprintf(include, LINE_BUFFER_SIZE,
	         "-I %s %s %s%s%s%s%d %s%d %s -D_OPENCL_COMPILER %s",
	         full_path,
	         global_opts,
	         /* Platform define, or device capability string otherwise. */
	         get_platform_vendor_id(get_platform_id(sequential_id)) ==
	         PLATFORM_MESA ? "-D__MESA__ " :
	         get_platform_vendor_id(get_platform_id(sequential_id)) ==
	         PLATFORM_POCL ? "-D__POCL__ " :
	         get_platform_vendor_id(get_platform_id(sequential_id)) ==
	         PLATFORM_BEIGNET ?
	         "-D__BEIGNET__ " :
	         get_device_capability(sequential_id),
#ifdef __APPLE__
	         "-D__OS_X__ ",
#else
	         /* On max verbosity, ask nvidia's compiler for its build log. */
	         (options.verbosity >= VERB_MAX &&
	          gpu_nvidia(device_info[sequential_id])) ?
	         "-cl-nv-verbose " : "",
#endif
	         get_device_type(sequential_id) == CL_DEVICE_TYPE_CPU ? "-D__CPU__ "
	         : get_device_type(sequential_id) == CL_DEVICE_TYPE_GPU ? "-D__GPU__ " : "",
	         "-DDEVICE_INFO=", device_info[sequential_id],
	         "-D__SIZEOF_HOST_SIZE_T__=", (int)sizeof(size_t),
	         opencl_driver_ver(sequential_id),
	         opts ? opts : "");

	MEM_FREE(full_path);

	return include;
}
1291
/*
 * JIT-compile `kernel_source` for one device.
 *
 * If `save` is set (or DUMP_BINARY is in the environment), the resulting
 * device binary is written to `file_name` as a cache. Under MPI, the
 * whole build is serialized across nodes by locking the kernel source
 * file ("kludge lock") — presumably to keep nodes from racing on the
 * vendor's own on-disk compile cache; TODO confirm.
 *
 * Any build failure prints the build log and exits via HANDLE_CLERROR.
 */
void opencl_build(int sequential_id, const char *opts, int save, const char *file_name, cl_program *program, const char *kernel_source_file, const char *kernel_source)
{
	cl_int build_code, err_code;
	char *build_log, *build_opts;
	size_t log_size;
	const char *srcptr[] = { kernel_source };
#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
	int kludge_file = 0;
#endif

	/* This over-rides binary caching */
	if (getenv("DUMP_BINARY")) {
		char *bname = basename(kernel_source_file);
		char *ext = ".bin";
		int size = strlen(bname) + strlen(ext) + 1;
		char *name = mem_alloc_tiny(size, MEM_ALIGN_NONE);

		save = 1;
		snprintf(name, size, "%s%s", bname, ext);
		file_name = name;
	}

	*program =
		clCreateProgramWithSource(context[sequential_id], 1, srcptr,
		                          NULL, &err_code);
	HANDLE_CLERROR(err_code, "clCreateProgramWithSource");
	// include source is thread safe.
	build_opts = include_source("$JOHN/kernels", sequential_id, opts);

	if (options.verbosity > VERB_LEGACY)
		fprintf(stderr, "Options used: %s %s\n", build_opts,
		        kernel_source_file);

	kernel_source_file = path_expand(kernel_source_file);

#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
	/* Serialize the build across MPI nodes using a lock on the source file. */
	if (mpi_p > 1) {
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d %s kludge locking %s...\n",
			        NODE, __FUNCTION__, kernel_source_file);
#endif
		if ((kludge_file = open(kernel_source_file, O_RDWR | O_APPEND)) < 0) {
			pexit("Error opening kernel file");
		} else {
#if FCNTL_LOCKS
			struct flock lock;

			memset(&lock, 0, sizeof(lock));
			lock.l_type = F_WRLCK;
			/* Retry on signal interruption; die on any other error. */
			while (fcntl(kludge_file, F_SETLKW, &lock)) {
				if (errno != EINTR)
					pexit("fcntl(F_WRLCK)");
			}
#else
			while (flock(kludge_file, LOCK_EX)) {
				if (errno != EINTR)
					pexit("flock(LOCK_EX)");
			}
#endif /* FCNTL_LOCKS */
		}
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d got a kludge lock\n", NODE);
#endif
	}
#endif /* HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS) */

	build_code = clBuildProgram(*program, 0, NULL,
	                            build_opts, NULL, NULL);

	/* Fetch the build log in two steps: query size, then contents. */
	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, 0, NULL,
	                                     &log_size),
	               "clGetProgramBuildInfo I");
	build_log = (char *)mem_calloc(1, log_size + 1);

	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, log_size + 1,
	                                     (void *)build_log, NULL),
	               "clGetProgramBuildInfo II");

	// Report build errors and warnings
	if (build_code != CL_SUCCESS) {
		// Give us info about error and exit (through HANDLE_CLERROR)
		if (options.verbosity <= VERB_LEGACY)
			fprintf(stderr, "Options used: %s %s\n",
			        build_opts, kernel_source_file);
		if (strlen(build_log) > 1)
			fprintf(stderr, "Build log: %s\n", build_log);
		fprintf(stderr, "Error building kernel %s. DEVICE_INFO=%d\n",
		        kernel_source_file, device_info[sequential_id]);
		HANDLE_CLERROR(build_code, "clBuildProgram");
	}
	// Nvidia may return a single '\n' that we ignore
	else if (options.verbosity >= LOG_VERB && strlen(build_log) > 1)
		fprintf(stderr, "Build log: %s\n", build_log);

	MEM_FREE(build_log);
	MEM_FREE(build_opts);

	/* Optionally cache the compiled device binary to disk. */
	if (save) {
		FILE *file;
		size_t source_size;
		char *source, *full_path;

		HANDLE_CLERROR(clGetProgramInfo(*program,
		                                CL_PROGRAM_BINARY_SIZES,
		                                sizeof(size_t), &source_size, NULL),
		               "clGetProgramInfo for CL_PROGRAM_BINARY_SIZES");

		if (options.verbosity >= VERB_MAX)
			fprintf(stderr, "binary size "Zu"\n", source_size);

		source = mem_calloc(1, source_size);

		/* CL_PROGRAM_BINARIES takes an array of buffer pointers. */
		HANDLE_CLERROR(clGetProgramInfo(*program,
		                                CL_PROGRAM_BINARIES,
		                                sizeof(char *), &source, NULL),
		               "clGetProgramInfo for CL_PROGRAM_BINARIES");

		file = fopen(full_path = (char*)path_expand_safe(file_name), "w");
		MEM_FREE(full_path);

		if (file == NULL)
			perror("Error creating binary cache file");
		else {
#if OS_FLOCK || FCNTL_LOCKS
			/* Lock the cache file so concurrent writers do not interleave. */
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d %s locking %s...\n", NODE, __FUNCTION__, file_name);
#endif
			{
#if FCNTL_LOCKS
				struct flock lock;

				memset(&lock, 0, sizeof(lock));
				lock.l_type = F_WRLCK;
				while (fcntl(fileno(file), F_SETLKW, &lock)) {
					if (errno != EINTR)
						pexit("fcntl(F_WRLCK)");
				}
#else
				while (flock(fileno(file), LOCK_EX)) {
					if (errno != EINTR)
						pexit("flock(LOCK_EX)");
				}
#endif
			}
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d got a lock on %s\n", NODE, file_name);
#endif
#endif /* OS_FLOCK || FCNTL_LOCKS */
			if (fwrite(source, source_size, 1, file) != 1)
				perror("Error caching kernel binary");
#if RACE_CONDITION_DEBUG
			if (options.verbosity == VERB_DEBUG)
				fprintf(stderr, "Node %d closing %s\n", NODE, file_name);
#endif
			/* fclose() also releases the lock taken above. */
			fclose(file);
		}
		MEM_FREE(source);
	}

#if HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS)
#if RACE_CONDITION_DEBUG
	if (mpi_p > 1 && options.verbosity == VERB_DEBUG)
		fprintf(stderr, "Node %d releasing kludge lock\n", NODE);
#endif
	/* Closing the descriptor drops the kludge lock. */
	if (mpi_p > 1)
		close(kludge_file);
#endif /* HAVE_MPI && (OS_FLOCK || FCNTL_LOCKS) */
}
1468
/*
 * Create and "build" a program from a previously cached device binary
 * (`kernel_source` holds `program_size` bytes of binary data). Exits
 * via HANDLE_CLERROR on failure, printing the build log first.
 */
void opencl_build_from_binary(int sequential_id, cl_program *program, const char *kernel_source, size_t program_size)
{
	cl_int build_code, err_code;
	char *build_log;
	const char *srcptr[] = { kernel_source };

	build_log = (char *) mem_calloc(LOG_SIZE, sizeof(char));
	*program =
		clCreateProgramWithBinary(context[sequential_id], 1,
		                          &devices[sequential_id], &program_size,
		                          (const unsigned char **)srcptr,
		                          NULL, &err_code);
	HANDLE_CLERROR(err_code,
	               "clCreateProgramWithBinary (using cached binary)");

	/* Still required by the spec, even for a binary program. */
	build_code = clBuildProgram(*program, 0,
	                            NULL, NULL, NULL, NULL);

	HANDLE_CLERROR(clGetProgramBuildInfo(*program,
	                                     devices[sequential_id],
	                                     CL_PROGRAM_BUILD_LOG, LOG_SIZE,
	                                     (void *)build_log,
	                                     NULL),
	               "clGetProgramBuildInfo (using cached binary)");

	// Report build errors and warnings
	if (build_code != CL_SUCCESS) {
		// Give us info about error and exit (through HANDLE_CLERROR)
		if (strlen(build_log) > 1)
			fprintf(stderr, "Binary build log: %s\n", build_log);
		fprintf(stderr, "Error %d building kernel using cached binary."
		        " DEVICE_INFO=%d\n", build_code, device_info[sequential_id]);
		HANDLE_CLERROR(build_code, "clBuildProgram");
	}
	// Nvidia may return a single '\n' that we ignore
	else if (options.verbosity >= LOG_VERB && strlen(build_log) > 1)
		fprintf(stderr, "Binary Build log: %s\n", build_log);

	MEM_FREE(build_log);
}
1509
1510 // Do the proper test using different global work sizes.
clear_profiling_events()1511 static void clear_profiling_events()
1512 {
1513 int i;
1514
1515 // Release events
1516 for (i = 0; i < MAX_EVENTS; i++) {
1517 if (multi_profilingEvent[i] && *multi_profilingEvent[i])
1518 HANDLE_CLERROR(clReleaseEvent(*multi_profilingEvent[i]),
1519 "clReleaseEvent");
1520
1521 if (multi_profilingEvent[i])
1522 *multi_profilingEvent[i] = NULL;
1523 multi_profilingEvent[i] = NULL;
1524 }
1525 }
1526
1527 // Fill [set_salt(), set_key()] the OpenCL device with data. Returns
1528 // salt, and fills binary pointer.
// Fill [set_salt(), set_key()] the OpenCL device with data. Returns
// salt, and fills binary pointer.
static void* fill_opencl_device(size_t gws, void **binary)
{
	int i;
	size_t kpc = gws * ocl_v_width;
	void *salt;

	// Set keys - unique printable length-7 keys
	self->methods.clear_keys();
	{
		char key[PLAINTEXT_BUFFER_SIZE];
		int len = mask_add_len;

		/* Without a mask (or with explicit length limits), derive the
		   tuning length from the format's benchmark length clamped to
		   the user's requested min/max. */
		if (mask_add_len == 0 ||
		    options.req_minlength != -1 || options.req_maxlength != 0) {
			len = (self->params.benchmark_length & 0x7f);

			if (len < options.req_minlength)
				len = options.req_minlength;
			if (options.req_maxlength && len > options.req_maxlength)
				len = options.req_maxlength;
		}
		// Obey format's min and max length
		len = MAX(len, self->params.plaintext_min_length);
		len = MIN(len, self->params.plaintext_length);

		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Tuning to length %d\n", len);

		/* Start from "AAA...A", then count through printable chars
		   0x21..0x60 with carry, so every key is unique. */
		memset(key, 0x41, sizeof(key));
		key[len] = 0;

		for (i = 0; i < kpc; i++) {
			int l = len - 1;

			self->methods.set_key(key, i);
			while (l >= 0 && ++key[l] > 0x60)
				key[l--] = 0x21;
		}
	}

	// Set salt
	dyna_salt_init(self);
	if (self->methods.tunable_cost_value[0] && autotune_db->real) {
		/* Tune against the most costly salt of the real database. */
		struct db_main *db = autotune_db->real;
		struct db_salt *s = db->salts;

		while (s->next && s->cost[0] < db->max_cost[0])
			s = s->next;
		salt = s->salt;
		*binary = s->list->binary;
	} else {
		/* Otherwise derive salt and binary from the first test vector. */
		char *ciphertext;

		if (!self->params.tests[0].fields[1])
			self->params.tests[0].fields[1] = self->params.tests[0].ciphertext;
		ciphertext = self->methods.prepare(self->params.tests[0].fields, self);
		ciphertext = self->methods.split(ciphertext, 0, self);
		salt = self->methods.salt(ciphertext);
		*binary = self->methods.binary(ciphertext);
		if (salt)
			dyna_salt_create(salt);
	}
	self->methods.set_salt(salt);

	return salt;
}
1595
1596 // Do a test run with a specific global work size, return total duration
1597 // (or return zero for error or limits exceeded)
// Do a test run with a specific global work size, return total duration
// (or return zero for error or limits exceeded)
static cl_ulong gws_test(size_t gws, unsigned int rounds, int sequential_id)
{
	cl_ulong startTime, endTime, runtime = 0, looptime = 0;
	int i, count, total = 0;
	size_t kpc = gws * ocl_v_width;
	cl_event benchEvent[MAX_EVENTS];
	int result, number_of_events = 0;
	void *salt, *binary;
	int amd_bug;

	for (i = 0; i < MAX_EVENTS; i++)
		benchEvent[i] = NULL;

	// Ensure format knows its GWS
	global_work_size = gws;

	// Prepare buffers.
	create_clobj(gws, self);

	// Transfer data to the OpenCL device
	salt = fill_opencl_device(gws, &binary);

	// Activate events. Then clear them later.
	for (i = 0; i < MAX_EVENTS; i++)
		multi_profilingEvent[i] = &benchEvent[i];

	// Timing run
	count = kpc;
	result = self->methods.crypt_all(&count, autotune_salts);
	if (result < 0) {
		/* crypt_all failed at this GWS: clean up and signal "unusable". */
		runtime = looptime = 0;

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, " (error occurred)");
		clear_profiling_events();
		release_clobj();
		/* Test-vector salts were dyna_salt_create'd; remove them. */
		if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
			dyna_salt_remove(salt);
		return 0;
	}
	self->methods.cmp_all(binary, result);

	/* Count events actually recorded by the format's crypt_all(). */
	for (i = 0; (*multi_profilingEvent[i]); i++)
		number_of_events++;

	//** Get execution time **//
	for (i = 0; i < number_of_events; i++) {
		char mult[32] = "";

		amd_bug = 0;

		HANDLE_CLERROR(clWaitForEvents(1, multi_profilingEvent[i]),
		               "clWaitForEvents");
		HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
		                                       CL_PROFILING_COMMAND_START,
		                                       sizeof(cl_ulong), &startTime,
		                                       NULL),
		               "clGetEventProfilingInfo start");
		HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
		                                       CL_PROFILING_COMMAND_END,
		                                       sizeof(cl_ulong), &endTime,
		                                       NULL),
		               "clGetEventProfilingInfo end");

		/* Work around AMD bug. It randomly claims that a kernel
		   run took less than a microsecond, fooling our auto tune */
		if (endTime - startTime < 1000) {
			amd_bug = 1;

			/* Fall back to SUBMIT->END, which includes queueing delay. */
			HANDLE_CLERROR(clGetEventProfilingInfo(*multi_profilingEvent[i],
			                                       CL_PROFILING_COMMAND_SUBMIT,
			                                       sizeof(cl_ulong), &startTime,
			                                       NULL),
			               "clGetEventProfilingInfo submit");
		}

		/* Work around OSX bug with HD4000 driver */
		if (endTime == 0)
			endTime = startTime;

		/* split_events lists the loop kernels, whose time is later
		   extrapolated to the full `rounds` iteration count. */
		if ((split_events) && (i == split_events[0] ||
		                       i == split_events[1] || i == split_events[2])) {
			looptime += (endTime - startTime);
			total++;

			if (i == split_events[0])
				sprintf(mult, "%dx", rounds / hash_loops);
		} else
			runtime += (endTime - startTime);

		if (options.verbosity >= VERB_MAX)
			fprintf(stderr, "%s%s%s%s", warnings[i], mult,
			        ns2string(endTime - startTime), (amd_bug) ? "*" : "");

		/* Single-invocation duration limit */
		if (duration_time &&
		    (endTime - startTime) > 1000000ULL * duration_time) {
			runtime = looptime = 0;

			if (options.verbosity >= VERB_MAX)
				fprintf(stderr, " (exceeds %s)", ms2string(duration_time));
			break;
		}
	}
	if (options.verbosity >= VERB_MAX)
		fprintf(stderr, "\n");

	/* Scale sampled loop-kernel time up to the full round count. */
	if (total)
		runtime += (looptime * rounds) / (hash_loops * total);

	clear_profiling_events();
	release_clobj();

	if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
		dyna_salt_remove(salt);

	return runtime;
}
1716
/*
 * Capture a format's autotune parameters into this module's file-scope
 * state, ahead of opencl_find_best_lws()/opencl_find_best_gws().
 */
void opencl_init_auto_setup(int p_default_value, int p_hash_loops,
                            int *p_split_events, const char **p_warnings,
                            int p_main_opencl_event, struct fmt_main *p_self,
                            void (*p_create_clobj)(size_t gws, struct fmt_main *self),
                            void (*p_release_clobj)(void), int p_buffer_size, size_t p_gws_limit,
                            struct db_main *db)
{
	/* Start from a clean profiling-event table. */
	clear_profiling_events();

	/* Format parameters and callbacks. */
	self = p_self;
	create_clobj = p_create_clobj;
	release_clobj = p_release_clobj;
	default_value = p_default_value;
	hash_loops = p_hash_loops;
	split_events = p_split_events;
	warnings = p_warnings;
	main_opencl_event = p_main_opencl_event;
	buffer_size = p_buffer_size;
	gws_limit = p_gws_limit;

	/* Database context: salts to tune against, and whether it's real. */
	autotune_db = db;
	autotune_salts = db ? db->salts : NULL;
	autotune_real_db = db && db->real && db->real == db;
}
1742
1743 /*
1744 * Since opencl_find_best_gws() needs more event control (even more events) to
1745 * work properly, opencl_find_best_workgroup() cannot be used by formats that
1746 * are using it. Therefore, despite the fact that opencl_find_best_lws() does
1747 * almost the same that opencl_find_best_workgroup() can do, it also handles
1748 * the necessary event(s) and can do a proper crypt_all() execution analysis
1749 * when shared GWS detection is used.
1750 */
/*
 * Since opencl_find_best_gws() needs more event control (even more events) to
 * work properly, opencl_find_best_workgroup() cannot be used by formats that
 * are using it. Therefore, despite the fact that opencl_find_best_lws() does
 * almost the same that opencl_find_best_workgroup() can do, it also handles
 * the necessary event(s) and can do a proper crypt_all() execution analysis
 * when shared GWS detection is used.
 */
void opencl_find_best_lws(size_t group_size_limit, int sequential_id,
                          cl_kernel crypt_kernel)
{
	size_t gws;
	cl_int ret_code;
	int i, j, numloops, count, result;
	size_t my_work_group, optimal_work_group;
	size_t max_group_size, wg_multiple, sumStartTime, sumEndTime;
	cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
	cl_event benchEvent[MAX_EVENTS];
	void *salt, *binary;

	for (i = 0; i < MAX_EVENTS; i++)
		benchEvent[i] = NULL;

	gws = global_work_size;

	if (options.verbosity > VERB_LEGACY)
		fprintf(stderr, "Calculating best LWS for GWS="Zu"\n", gws);

	/* Pre-OpenCL-1.1 devices can't report a preferred multiple; guess. */
	if (get_device_version(sequential_id) < 110) {
		if (get_device_type(sequential_id) == CL_DEVICE_TYPE_GPU)
			wg_multiple = 32;
		else if (get_platform_vendor_id(get_platform_id(sequential_id))
		         == DEV_INTEL)
			wg_multiple = 8;
		else
			wg_multiple = 1;
	} else
		wg_multiple = get_kernel_preferred_multiple(sequential_id,
		                                            crypt_kernel);

	/* Apple's CPU driver is restricted to LWS 1. */
	if (platform_apple(get_platform_id(sequential_id)) &&
	    cpu(device_info[sequential_id]))
		max_group_size = 1;
	else
		max_group_size = ocl_max_lws ?
			ocl_max_lws : get_kernel_max_lws(sequential_id, crypt_kernel);

	if (max_group_size > group_size_limit)
		// Needed to deal (at least) with cryptsha512-opencl limits.
		max_group_size = group_size_limit;

	// Safety harness
	if (wg_multiple > max_group_size)
		wg_multiple = max_group_size;

	// Change command queue to be used by crypt_all (profile needed)
	clReleaseCommandQueue(queue[sequential_id]);

	// Create a new queue with profiling enabled
	queue[sequential_id] =
		clCreateCommandQueue(context[sequential_id],
		                     devices[sequential_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");

	// Transfer data to the OpenCL device
	salt = fill_opencl_device(gws, &binary);

	// Warm-up run
	local_work_size = wg_multiple;
	count = global_work_size * ocl_v_width;
	result = self->methods.crypt_all(&count, autotune_salts);
	if (result > 0)
		self->methods.cmp_all(binary, result);

	// Activate events. Then clear them later.
	for (i = 0; i < MAX_EVENTS; i++)
		multi_profilingEvent[i] = &benchEvent[i];

	// Timing run
	count = global_work_size * ocl_v_width;
	result = self->methods.crypt_all(&count, autotune_salts);
	if (result > 0)
		self->methods.cmp_all(binary, result);

	HANDLE_CLERROR(clWaitForEvents(1, &benchEvent[main_opencl_event]),
	               "clWaitForEvents");
	HANDLE_CLERROR(clFinish(queue[sequential_id]), "clFinish");
	HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent[main_opencl_event],
	                                       CL_PROFILING_COMMAND_START,
	                                       sizeof(cl_ulong),
	                                       &startTime, NULL),
	               "clGetEventProfilingInfo start");

	HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent[main_opencl_event],
	                                       CL_PROFILING_COMMAND_END,
	                                       sizeof(cl_ulong), &endTime, NULL),
	               "clGetEventProfilingInfo end");
	/* Pick a loop count so each candidate LWS is sampled for ~200 ms
	   total (ceiling division of 200 ms by one run's duration). */
	cl_ulong roundup = endTime - startTime - 1;
	numloops = (int)(size_t)((200000000ULL + roundup) / (endTime - startTime));

	clear_profiling_events();

	if (numloops < 1)
		numloops = 1;

	// Find minimum time
	for (optimal_work_group = my_work_group = wg_multiple;
	     (int)my_work_group <= (int)max_group_size;
	     my_work_group += wg_multiple) {

		/* GWS must be a multiple of the candidate LWS. */
		global_work_size = gws;
		if (gws % my_work_group != 0) {

			if (GET_EXACT_MULTIPLE(gws, my_work_group) > global_work_size)
				continue;
			global_work_size = GET_EXACT_MULTIPLE(gws, my_work_group);
		}

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "Testing LWS=" Zu " GWS=" Zu " ...", my_work_group,
			        global_work_size);

		sumStartTime = 0;
		sumEndTime = 0;

		for (i = 0; i < numloops; i++) {
			advance_cursor();
			local_work_size = my_work_group;

			// Activate events. Then clear them later.
			for (j = 0; j < MAX_EVENTS; j++)
				multi_profilingEvent[j] = &benchEvent[j];

			count = global_work_size * ocl_v_width;
			result = self->methods.crypt_all(&count, autotune_salts);
			if (result < 0) {
				startTime = endTime = 0;
				break;
			}
			self->methods.cmp_all(binary, result);

			HANDLE_CLERROR(clWaitForEvents(1, &benchEvent[main_opencl_event]),
			               "clWaitForEvents");
			HANDLE_CLERROR(clFinish(queue[sequential_id]), "clFinish");
			HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent
			               [main_opencl_event], CL_PROFILING_COMMAND_START,
			               sizeof(cl_ulong), &startTime, NULL),
			               "clGetEventProfilingInfo start");
			HANDLE_CLERROR(clGetEventProfilingInfo(benchEvent
			               [main_opencl_event], CL_PROFILING_COMMAND_END,
			               sizeof(cl_ulong), &endTime, NULL),
			               "clGetEventProfilingInfo end");

			sumStartTime += startTime;
			sumEndTime += endTime;

			clear_profiling_events();
		}

		/* Erase the 'spinning wheel' cursor */
		if (john_main_process)
			fprintf(stderr, " \b");

		/* crypt_all failed: stop scanning larger work groups. */
		if (!endTime)
			break;
		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, " %s%s\n", ns2string(sumEndTime - sumStartTime),
			        ((double)(sumEndTime - sumStartTime) / kernelExecTimeNs < 0.997)
			        ? "+" : "");
		/* Require >0.3% improvement before adopting a larger LWS. */
		if ((double)(sumEndTime - sumStartTime) / kernelExecTimeNs < 0.997) {
			kernelExecTimeNs = sumEndTime - sumStartTime;
			optimal_work_group = my_work_group;
		} else {
			if (my_work_group >= 256 ||
			    (my_work_group >= 8 && wg_multiple < 8)) {
				/* Jump to next power of 2 */
				size_t x, y;
				x = my_work_group;
				while ((y = x & (x - 1)))
					x = y;
				x *= 2;
				my_work_group =
					GET_NEXT_MULTIPLE(x, wg_multiple);
				/* The loop logic will re-add wg_multiple */
				my_work_group -= wg_multiple;
			}
		}
	}
	// Release profiling queue and create new with profiling disabled
	HANDLE_CLERROR(clReleaseCommandQueue(queue[sequential_id]),
	               "clReleaseCommandQueue");
	queue[sequential_id] =
		clCreateCommandQueue(context[sequential_id],
		                     devices[sequential_id], 0, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");
	local_work_size = optimal_work_group;
	global_work_size = GET_EXACT_MULTIPLE(gws, local_work_size);

	/* Test-vector salts were dyna_salt_create'd; remove them. */
	if (!self->methods.tunable_cost_value[0] || !autotune_db->real)
		dyna_salt_remove(salt);
}
1944
/*
 * Format a crypts-per-second figure with a K/M/G/T suffix, e.g.
 * "2500Kc/s". Each suffix step divides by 1000 and is only applied
 * while the value still exceeds 1000000. Returns a static buffer,
 * overwritten by the next call.
 */
static char *human_speed(unsigned long long int speed)
{
	static char out[32];
	static const char prefixes[] = { 'K', 'M', 'G', 'T' /* you wish */ };
	char p = '\0';
	int step;

	for (step = 0; step < 4 && speed > 1000000; step++) {
		speed /= 1000;
		p = prefixes[step];
	}

	if (p)
		snprintf(out, sizeof(out), "%llu%cc/s", speed, p);
	else
		snprintf(out, sizeof(out), "%lluc/s", speed);

	return out;
}
1973
/*
 * Heuristically size a bitmap (in bits) for `num_elements` hashes on
 * the given device, scaled by element count and clamped to the device's
 * maximum allocation size. The result is always a power of two
 * (get_power_of_two rounds up — presumably; TODO confirm macro
 * semantics), capped at 2^31 bits.
 */
uint32_t get_bitmap_size_bits(uint32_t num_elements, int sequential_id)
{
	uint32_t size, elements = num_elements;
	//On super: 128MB , 1GB, 2GB
	cl_ulong memory_available = get_max_mem_alloc_size(sequential_id);

	get_power_of_two(elements);

	size = (elements * 8);

	/* Tiered over-allocation: small counts get disproportionally
	   larger bitmaps to keep the false-positive rate down. */
	if (num_elements < (16))
		size = (16 * 1024 * 8); //Cache?
	else if (num_elements < (128))
		size = (1024 * 1024 * 8 * 16);
	else if (num_elements < (16 * 1024))
		size *= 1024 * 4;
	else
		size *= 256;

	/* Clamp to what the device can allocate, as a power of two. */
	if (size > memory_available) {
		size = memory_available;
		get_power_of_two(size);

	}
	/* Zero (overflow) or beyond INT_MAX: pin to 2^31 bits. */
	if (!size || size > INT_MAX)
		size = (uint32_t)INT_MAX + 1U;

	return size;
}
2003
/*
 * Auto-tune the global work size (GWS) for the current device and format.
 *
 * step          increment used by autotune_get_next_gws_size()
 * max_duration  cap (ms) for a single kernel invocation during tuning
 * sequential_id device being tuned
 * rounds        iteration count of the underlying hash (scales speed figures)
 * have_lws      non-zero if local_work_size was already tuned/fixed
 *
 * Result is stored in the global global_work_size.  Temporarily replaces
 * the device's command queue with a profiling-enabled one, then restores a
 * plain queue before returning.
 */
void opencl_find_best_gws(int step, int max_duration,
	int sequential_id, unsigned int rounds, int have_lws)
{
	size_t num = 0;
	size_t optimal_gws = local_work_size, soft_limit = 0;
	unsigned long long speed, best_speed = 0, raw_speed;
	cl_ulong run_time;
	int save_duration_time = duration_time;
	cl_uint core_count = get_processors_count(sequential_id);

	if (have_lws) {
		/* Start from a GWS that is a multiple of both LWS and core count */
		if (core_count > 2)
			optimal_gws = lcm(core_count, optimal_gws);
		default_value = optimal_gws;
	} else {
		/* No trusted LWS yet: impose a soft upper bound on the search */
		soft_limit = local_work_size * core_count * 128;
	}

	/* conf setting may override (decrease) code's max duration */
	if (!duration_time || max_duration < duration_time)
		duration_time = max_duration;

	if (options.verbosity > VERB_DEFAULT) {
		if (mask_int_cand.num_int_cand > 1 && !printed_mask++)
			fprintf(stderr, "Internal mask, multiplier: %u (target: %u)\n",
			        mask_int_cand.num_int_cand, mask_int_cand_target);
		else if (mask_int_cand_target > 1 && !printed_mask)
			fprintf(stderr, "Internal mask not utilized (target: %u)\n",
			        mask_int_cand_target);
	}
	if (options.verbosity > VERB_LEGACY) {
		fprintf(stderr, "Calculating best GWS for LWS="Zu"; "
		        "max. %s single kernel invocation.\n",
		        local_work_size, ms2string(duration_time));
	}

	if (options.verbosity >= VERB_MAX)
		fprintf(stderr, "Raw speed figures including buffer transfers:\n");

	// Change command queue to be used by crypt_all (profile needed)
	clReleaseCommandQueue(queue[sequential_id]); // Delete old queue

	// Create a new queue with profiling enabled
	queue[sequential_id] =
		clCreateCommandQueue(context[sequential_id],
		                     devices[sequential_id], CL_QUEUE_PROFILING_ENABLE, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");

	/* Forward run: grow GWS while the speed keeps improving */
	for (num = autotune_get_next_gws_size(num, step, 1, default_value);;
	     num = autotune_get_next_gws_size(num, step, 0, default_value)) {
		size_t kpc = num * ocl_v_width;

		// Check if hardware can handle the size we are going
		// to try now.
		if ((soft_limit && (num > soft_limit)) ||
		    (gws_limit && (num > gws_limit)) || ((gws_limit == 0) &&
		    (buffer_size * kpc * 1.1 > get_max_mem_alloc_size(gpu_id)))) {
			if (!optimal_gws)
				optimal_gws = num;

			if (options.verbosity >= VERB_MAX)
				fprintf(stderr, "Hardware resources exhausted\n");
			break;
		}

		/* gws_test() returns 0 on failure or when over duration_time */
		if (!(run_time = gws_test(num, rounds, sequential_id)))
			break;

		if (options.verbosity <= VERB_LEGACY)
			advance_cursor();

		raw_speed = (kpc / (run_time / 1E9)) * mask_int_cand.num_int_cand;
		speed = rounds * raw_speed;

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "gws: %9zu\t%10s%12llu "
			        "rounds/s%10s per crypt_all()",
			        num, human_speed(raw_speed), speed, ns2string(run_time));

		/*
		 * Larger GWS is very expensive for single mode, so we try to
		 * keep it reasonable low here.
		 */
		if (speed >
		    ((options.flags & FLG_SINGLE_CHK ? 1.25 : 1.01) * best_speed)) {
			if (options.verbosity > VERB_LEGACY)
				fprintf(stderr, (speed > 2 * best_speed) ? "!" : "+");
			best_speed = speed;
			global_speed = raw_speed;
			optimal_gws = num;
		}
		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "\n");
	}

	/* Backward run: probe smaller sizes until speed drops below the best */
	for (num = autotune_get_prev_gws_size(optimal_gws, step);;
	     num = autotune_get_prev_gws_size(num, step)) {
		size_t kpc = num * ocl_v_width;

		if (!(run_time = gws_test(num, rounds, sequential_id)))
			break;

		if (options.verbosity <= VERB_LEGACY)
			advance_cursor();

		raw_speed = (kpc / (run_time / 1E9)) * mask_int_cand.num_int_cand;
		speed = rounds * raw_speed;

		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "gws: %9zu\t%10s%12llu "
			        "rounds/s%10s per crypt_all()",
			        num, human_speed(raw_speed), speed, ns2string(run_time));

		if (speed < best_speed) {
			if (options.verbosity > VERB_LEGACY)
				fprintf(stderr, "-\n");
			break;
		}
		best_speed = speed;
		global_speed = raw_speed;
		optimal_gws = num;
		if (options.verbosity > VERB_LEGACY)
			fprintf(stderr, "!!\n");
	}

	/* Erase the 'spinning wheel' cursor */
	if (options.verbosity <= VERB_LEGACY && john_main_process)
		fprintf(stderr, " \b");

	// Release profiling queue and create new with profiling disabled
	HANDLE_CLERROR(clReleaseCommandQueue(queue[sequential_id]),
	               "clReleaseCommandQueue");
	queue[sequential_id] =
		clCreateCommandQueue(context[sequential_id],
		                     devices[sequential_id], 0, &ret_code);
	HANDLE_CLERROR(ret_code, "clCreateCommandQueue");
	global_work_size = optimal_gws;

	/* Restore the (possibly larger) configured duration for later callers */
	duration_time = save_duration_time;
}
2145
2146 /* Get one device compute capability as a string */
/*
 * Return the device's nvidia compute capability formatted as JIT build
 * options ("-DSM_MAJOR=M -DSM_MINOR=N "), or an empty string when the
 * capability is not available.  Points to a static buffer.
 */
static char* get_device_capability(int sequential_id)
{
	static char buf[32];
	unsigned int cc_major = 0, cc_minor = 0;

	get_compute_capability(sequential_id, &cc_major, &cc_minor);

	if (cc_major)
		snprintf(buf, sizeof(buf), "-DSM_MAJOR=%d -DSM_MINOR=%d ",
		         cc_major, cc_minor);
	else
		buf[0] = '\0';

	return buf;
}
2163
2164 /* Load detailed information about a device
2165 * - fill in the details of the OpenCL device (device_info[] bitfield variable);
2166 */
load_device_info(int sequential_id)2167 static void load_device_info(int sequential_id)
2168 {
2169 cl_device_type device;
2170 unsigned int major = 0, minor = 0;
2171
2172 device = get_device_type(sequential_id);
2173
2174 if (device == CL_DEVICE_TYPE_CPU)
2175 device_info[sequential_id] = DEV_CPU;
2176 else if (device == CL_DEVICE_TYPE_GPU)
2177 device_info[sequential_id] = DEV_GPU;
2178 else if (device == CL_DEVICE_TYPE_ACCELERATOR)
2179 device_info[sequential_id] = DEV_ACCELERATOR;
2180
2181 device_info[sequential_id] += get_vendor_id(sequential_id);
2182 device_info[sequential_id] += get_processor_family(sequential_id);
2183 device_info[sequential_id] += get_byte_addressable(sequential_id);
2184
2185 get_compute_capability(sequential_id, &major, &minor);
2186
2187 if (major) {
2188 device_info[sequential_id] += (major == 2 ? DEV_NV_C2X : 0);
2189 device_info[sequential_id] +=
2190 (major == 3 && minor == 0 ? DEV_NV_C30 : 0);
2191 device_info[sequential_id] +=
2192 (major == 3 && minor == 2 ? DEV_NV_C32 : 0);
2193 device_info[sequential_id] +=
2194 (major == 3 && minor == 5 ? DEV_NV_C35 : 0);
2195 device_info[sequential_id] += (major == 5 ? DEV_NV_MAXWELL : 0);
2196 device_info[sequential_id] += (major == 6 ? DEV_NV_PASCAL : 0);
2197 device_info[sequential_id] += (major == 7 ? DEV_NV_VOLTA : 0);
2198 }
2199 }
2200
opencl_read_source(const char * kernel_filename,char ** kernel_source)2201 size_t opencl_read_source(const char *kernel_filename, char **kernel_source)
2202 {
2203 FILE *fp;
2204 char *full_path;
2205 size_t source_size, read_size;
2206
2207 fp = fopen(full_path = (char*)path_expand_safe(kernel_filename), "rb");
2208 MEM_FREE(full_path);
2209
2210 if (!fp)
2211 pexit("Can't read source kernel");
2212
2213 #if OS_FLOCK || FCNTL_LOCKS
2214 #if RACE_CONDITION_DEBUG
2215 if (options.verbosity == VERB_DEBUG)
2216 fprintf(stderr, "Node %d %s locking (shared) %s...\n", NODE, __FUNCTION__, kernel_filename);
2217 #endif
2218 {
2219 #if FCNTL_LOCKS
2220 struct flock lock;
2221
2222 memset(&lock, 0, sizeof(lock));
2223 lock.l_type = F_RDLCK;
2224 while (fcntl(fileno(fp), F_SETLKW, &lock)) {
2225 if (errno != EINTR)
2226 pexit("fcntl(F_RDLCK)");
2227 }
2228 #else
2229 while (flock(fileno(fp), LOCK_SH)) {
2230 if (errno != EINTR)
2231 pexit("flock(LOCK_SH)");
2232 }
2233 #endif
2234 }
2235 #if RACE_CONDITION_DEBUG
2236 if (options.verbosity == VERB_DEBUG)
2237 fprintf(stderr, "Node %d got a shared lock on %s\n", NODE, kernel_filename);
2238 #endif
2239 #endif /* OS_FLOCK || FCNTL_LOCKS */
2240 fseek(fp, 0, SEEK_END);
2241 source_size = ftell(fp);
2242 fseek(fp, 0, SEEK_SET);
2243 MEM_FREE((*kernel_source));
2244 *kernel_source = mem_calloc(1, source_size + 1);
2245 read_size = fread(*kernel_source, sizeof(char), source_size, fp);
2246 if (read_size != source_size)
2247 fprintf(stderr,
2248 "Error reading source: expected "Zu", got "Zu" bytes (%s).\n",
2249 source_size, read_size,
2250 feof(fp) ? "EOF" : strerror(errno));
2251 #if RACE_CONDITION_DEBUG
2252 if (options.verbosity == VERB_DEBUG)
2253 fprintf(stderr, "Node %d closing %s\n", NODE, kernel_filename);
2254 #endif
2255 fclose(fp);
2256 return source_size;
2257 }
2258
2259 #if JOHN_SYSTEMWIDE
/*
 * Replace the first occurrence of 'from' in 'string' with 'to'.
 *
 * Returns 'string' unchanged if 'from' does not occur; otherwise returns a
 * pointer to a static buffer (not thread-safe, valid until the next call).
 *
 * Fix: the old strncpy()+sprintf() pair wrote into the 512-byte static
 * buffer without any bounds check; a single bounded snprintf() now builds
 * the result (truncating instead of overflowing on oversized input).
 */
static const char *replace_str(const char *string, char *from, char *to)
{
	static char buffer[512];
	char *p;
	int len;

	if (!(p = strstr(string, from)))
		return string;

	len = p - string;
	snprintf(buffer, sizeof(buffer), "%.*s%s%s",
	         len, string, to, p + strlen(from));

	return buffer;
}
2277 #endif
2278
2279
opencl_build_kernel_opt(const char * kernel_filename,int sequential_id,const char * opts)2280 void opencl_build_kernel_opt(const char *kernel_filename, int sequential_id,
2281 const char *opts)
2282 {
2283 char *kernel_source = NULL;
2284 opencl_read_source(kernel_filename, &kernel_source);
2285 opencl_build(sequential_id, opts, 0, NULL, &program[sequential_id], kernel_filename, kernel_source);
2286 MEM_FREE(kernel_source);
2287 }
2288
2289 #define md5add(string) MD5_Update(&ctx, (string), strlen(string))
2290
/*
 * Build (or load from cache) the OpenCL kernel for one device.
 *
 * A cache file name is derived from an MD5 over the kernel path, its
 * source, the build options, driver version, device name and platform id;
 * if a cached binary newer than the source exists it is used, otherwise
 * the kernel is compiled and the binary written out.  When 'warn' is set
 * and verbosity allows, progress/build-time messages go to stderr.
 */
void opencl_build_kernel(const char *kernel_filename, int sequential_id, const char *opts,
                         int warn)
{
#if HAVE_MPI
	static int once;
#endif

	/*
	 * Disable binary caching for:
	 * - nvidia unless on macOS
	 * - CPU if on macOS
	 */
	if ((gpu_nvidia(device_info[sequential_id]) && !platform_apple(get_platform_id(sequential_id))) ||
	    (cpu(device_info[sequential_id]) && platform_apple(get_platform_id(sequential_id)))) {
		log_event("- Kernel binary caching disabled for this platform/device");
		opencl_build_kernel_opt(kernel_filename, sequential_id, opts);
	} else {
		struct stat source_stat, bin_stat;
		char dev_name[512], bin_name[512];
		const char *tmp_name;
		unsigned char hash[16];
		char hash_str[33];
		uint64_t startTime, runtime;
		int i;
		MD5_CTX ctx;
		char *kernel_source = NULL;
		const char *global_opts;

		/* Build options: environment overrides config, config overrides
		   the compiled-in default */
		if (!(global_opts = getenv("OPENCLBUILDOPTIONS")))
			if (!(global_opts = cfg_get_param(SECTION_OPTIONS,
			    SUBSECTION_OPENCL, "GlobalBuildOpts")))
				global_opts = OPENCLBUILDOPTIONS;

		startTime = (unsigned long)time(NULL);

		// Get device name.
		HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
		                               CL_DEVICE_NAME, sizeof(dev_name),
		                               dev_name, NULL),
		               "clGetDeviceInfo for DEVICE_NAME");

		/*
		 * Create a hash of kernel source and parameters, and use as cache name.
		 */
		MD5_Init(&ctx);
		md5add(kernel_filename);
		opencl_read_source(kernel_filename, &kernel_source);
		md5add(kernel_source);
		md5add(global_opts);
		if (opts)
			md5add(opts);
		md5add(opencl_driver_ver(sequential_id));
		md5add(dev_name);
		MD5_Update(&ctx, (char*)&platform_id, sizeof(platform_id));
		MD5_Final(hash, &ctx);

		/* Hex-encode the digest for use in the cache file name */
		for (i = 0; i < 16; i++) {
			hash_str[2 * i + 0] = itoa16[hash[i] >> 4];
			hash_str[2 * i + 1] = itoa16[hash[i] & 0xf];
		}
		hash_str[32] = 0;

#if JOHN_SYSTEMWIDE
		/* Cached binaries live in the per-user directory, not $JOHN */
		tmp_name = replace_str(kernel_filename, "$JOHN", JOHN_PRIVATE_HOME);
#else
		tmp_name = kernel_filename;
#endif
		snprintf(bin_name, sizeof(bin_name), "%s_%s.bin",
		         tmp_name, hash_str);

		// Select the kernel to run.
		if (!getenv("DUMP_BINARY") &&
		    !stat(path_expand(kernel_filename), &source_stat) &&
		    !stat(path_expand(bin_name), &bin_stat) &&
		    (source_stat.st_mtime < bin_stat.st_mtime)) {
			/* Cache hit: binary exists and is newer than the source */
			size_t program_size = opencl_read_source(bin_name, &kernel_source);
			log_event("- Building kernel from cached binary");
			opencl_build_from_binary(sequential_id, &program[sequential_id], kernel_source, program_size);
		} else {
			log_event("- Building kernel and caching binary");
			if (warn && options.verbosity > VERB_DEFAULT) {
				fprintf(stderr, "Building the kernel, this "
				        "could take a while\n");
				fflush(stdout);
			}
			opencl_read_source(kernel_filename, &kernel_source);
			opencl_build(sequential_id, opts, 1, bin_name, &program[sequential_id], kernel_filename, kernel_source);
		}
		if (warn && options.verbosity > VERB_DEFAULT) {
			/* Only mention build time if it was noticeable (> 2 s) */
			if ((runtime = (unsigned long)(time(NULL) - startTime))
			    > 2UL)
				fprintf(stderr, "Build time: %lu seconds\n",
				        (unsigned long)runtime);
			fflush(stdout);
		}

		MEM_FREE(kernel_source);
	}
#if HAVE_MPI
	/* Sync all MPI nodes after build to avoid driver races (once only) */
	if (mpi_p > 1 && !once++) {
#if RACE_CONDITION_DEBUG
		if (options.verbosity == VERB_DEBUG)
			fprintf(stderr, "Node %d reached %s() MPI build barrier\n",
			        NODE, __FUNCTION__);
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		if (mpi_id == 0 && options.verbosity >= VERB_DEFAULT)
			fprintf(stderr, "All nodes done OpenCL build\n");
	}
#endif /* HAVE_MPI */
}
2402
opencl_prepare_dev(int sequential_id)2403 int opencl_prepare_dev(int sequential_id)
2404 {
2405 int err_type = 0;
2406 #ifdef HAVE_MPI
2407 static int once;
2408 #endif
2409
2410 // If OpenCL has not yet been loaded, load it now
2411 opencl_load_environment();
2412
2413 if (sequential_id < 0)
2414 sequential_id = gpu_id;
2415
2416 profilingEvent = firstEvent = lastEvent = NULL;
2417 if (!context[sequential_id])
2418 start_opencl_device(sequential_id, &err_type);
2419 print_device_info(sequential_id);
2420
2421 #if HAVE_MPI
2422 if (mpi_p > 1 && !once++) {
2423 // Avoid silly race conditions seen with nvidia
2424 #if RACE_CONDITION_DEBUG
2425 if (options.verbosity == VERB_DEBUG)
2426 fprintf(stderr, "Node %d reached MPI prep barrier\n", NODE);
2427 #endif
2428 MPI_Barrier(MPI_COMM_WORLD);
2429 if (mpi_id == 0 && options.verbosity == VERB_DEBUG)
2430 fprintf(stderr, "All nodes done OpenCL prepare\n");
2431 }
2432 #endif
2433
2434 return sequential_id;
2435 }
2436
/* Convenience wrapper: prepare the device, then build its kernel
   (binary cache allowed, no build warnings). */
void opencl_init(const char *kernel_filename, int sequential_id, const char *opts)
{
	int dev_id = opencl_prepare_dev(sequential_id);

	opencl_build_kernel(kernel_filename, dev_id, opts, 0);
}
2442
get_device_type(int sequential_id)2443 cl_device_type get_device_type(int sequential_id)
2444 {
2445 cl_device_type type;
2446
2447 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_TYPE,
2448 sizeof(cl_device_type), &type, NULL),
2449 "clGetDeviceInfo for CL_DEVICE_TYPE");
2450
2451 return type;
2452 }
2453
get_local_memory_size(int sequential_id)2454 cl_ulong get_local_memory_size(int sequential_id)
2455 {
2456 cl_ulong size;
2457
2458 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2459 CL_DEVICE_LOCAL_MEM_SIZE,
2460 sizeof(cl_ulong), &size, NULL),
2461 "clGetDeviceInfo for CL_DEVICE_LOCAL_MEM_SIZE");
2462
2463 return size;
2464 }
2465
get_global_memory_size(int sequential_id)2466 cl_ulong get_global_memory_size(int sequential_id)
2467 {
2468 cl_ulong size;
2469
2470 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2471 CL_DEVICE_GLOBAL_MEM_SIZE,
2472 sizeof(cl_ulong), &size, NULL),
2473 "clGetDeviceInfo for CL_DEVICE_GLOBAL_MEM_SIZE");
2474
2475 return size;
2476 }
2477
get_device_max_lws(int sequential_id)2478 size_t get_device_max_lws(int sequential_id)
2479 {
2480 size_t max_group_size;
2481
2482 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2483 CL_DEVICE_MAX_WORK_GROUP_SIZE,
2484 sizeof(max_group_size),
2485 &max_group_size, NULL),
2486 "clGetDeviceInfo for CL_DEVICE_MAX_WORK_GROUP_SIZE");
2487
2488 return max_group_size;
2489 }
2490
get_max_mem_alloc_size(int sequential_id)2491 cl_ulong get_max_mem_alloc_size(int sequential_id)
2492 {
2493 cl_ulong max_alloc_size;
2494
2495 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2496 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2497 sizeof(max_alloc_size),
2498 &max_alloc_size, NULL),
2499 "clGetDeviceInfo for CL_DEVICE_MAX_MEM_ALLOC_SIZE");
2500
2501 return max_alloc_size;
2502 }
2503
get_kernel_max_lws(int sequential_id,cl_kernel crypt_kernel)2504 size_t get_kernel_max_lws(int sequential_id, cl_kernel crypt_kernel)
2505 {
2506 size_t max_group_size;
2507
2508 HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel,
2509 devices[sequential_id],
2510 CL_KERNEL_WORK_GROUP_SIZE,
2511 sizeof(max_group_size),
2512 &max_group_size, NULL),
2513 "clGetKernelWorkGroupInfo for CL_KERNEL_WORK_GROUP_SIZE");
2514
2515 return max_group_size;
2516 }
2517
get_max_compute_units(int sequential_id)2518 cl_uint get_max_compute_units(int sequential_id)
2519 {
2520 cl_uint size;
2521
2522 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2523 CL_DEVICE_MAX_COMPUTE_UNITS,
2524 sizeof(cl_uint), &size, NULL),
2525 "clGetDeviceInfo for CL_DEVICE_MAX_COMPUTE_UNITS");
2526
2527 return size;
2528 }
2529
get_kernel_preferred_multiple(int sequential_id,cl_kernel crypt_kernel)2530 size_t get_kernel_preferred_multiple(int sequential_id, cl_kernel crypt_kernel)
2531 {
2532 size_t size;
2533
2534 HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel,
2535 devices[sequential_id],
2536 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
2537 sizeof(size), &size, NULL),
2538 "clGetKernelWorkGroupInfo for CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE");
2539
2540 return size;
2541 }
2542
/*
 * Query nvidia compute capability via the cl_nv_device_attribute_query
 * extension.  Return values are deliberately ignored: on non-NVIDIA
 * devices (or drivers lacking the extension) the calls fail and *major /
 * *minor are left untouched, so callers must pre-initialize them to 0.
 */
void get_compute_capability(int sequential_id, unsigned int *major,
                            unsigned int *minor)
{
	clGetDeviceInfo(devices[sequential_id],
	                CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
	                sizeof(cl_uint), major, NULL);
	clGetDeviceInfo(devices[sequential_id],
	                CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
	                sizeof(cl_uint), minor, NULL);
}
2553
/*
 * Estimate the total number of shader cores/ALUs on a device: compute
 * units times a per-architecture cores-per-multiprocessor figure.  Also
 * records cores_per_MP in ocl_device_list[sequential_id].
 *
 * For NVIDIA the figure comes from the compute capability when available,
 * otherwise from device-name heuristics; AMD/Intel use fixed per-family
 * factors; anything else falls back to the native long vector width.
 * The result is a rough estimate used for autotuning, not an exact count.
 */
cl_uint get_processors_count(int sequential_id)
{
	cl_uint core_count = get_max_compute_units(sequential_id);
	char dname[MAX_OCLINFO_STRING_LEN];

	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
	                               CL_DEVICE_NAME,
	                               sizeof(dname), dname, NULL),
	               "clGetDeviceInfo for CL_DEVICE_NAME");

	ocl_device_list[sequential_id].cores_per_MP = 0;

	if (gpu_nvidia(device_info[sequential_id])) {
		unsigned int major = 0, minor = 0;

		get_compute_capability(sequential_id, &major, &minor);
		if (major == 1)      // 1.x Tesla
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 8);
		else if (major == 2 && minor == 0)  // 2.0 Fermi
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 32);
		else if (major == 2 && minor >= 1)  // 2.1 Fermi
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 48);
		else if (major == 3) // 3.x Kepler
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 192);
		else if (major == 5) // 5.x Maxwell
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
		else if (major == 6) // 6.x Pascal
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
		else if (major >= 7) // 7.x Volta, 8.x Turing?
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 64);
		/*
		 * Apple, VCL and some other environments don't expose get_compute_capability()
		 * so we need this crap - which is incomplete.
		 * http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
		 *
		 * This will produce a *guessed* figure
		 */

		// Volta or Turing
		else if (strstr(dname, "TITAN V") || strstr(dname, "RTX 2"))
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 64);
		// Pascal
		else if (strstr(dname, "GTX 10"))
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
		// Maxwell
		else if (strstr(dname, "GTX 9") || strstr(dname, "GTX TITAN X"))
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 128);
		// Kepler
		else if (strstr(dname, "GT 6") || strstr(dname, "GTX 6") ||
		         strstr(dname, "GT 7") || strstr(dname, "GTX 7") ||
		         strstr(dname, "GT 8") || strstr(dname, "GTX 8") ||
		         strstr(dname, "GTX TITAN"))
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 192);
		// Fermi
		else if (strstr(dname, "GT 5") || strstr(dname, "GTX 5"))
			core_count *= (ocl_device_list[sequential_id].cores_per_MP = 48);
	} else if (gpu_intel(device_info[sequential_id])) {
		// It seems all current models are x 8
		core_count *= ocl_device_list[sequential_id].cores_per_MP = 8;
	} else if (gpu_amd(device_info[sequential_id])) {
		// 16 thread proc * 5 SP (4 for GCN/VLIW4)
		core_count *= (ocl_device_list[sequential_id].cores_per_MP = (16 *
		               ((amd_gcn(device_info[sequential_id]) ||
		                 amd_vliw4(device_info[sequential_id])) ? 4 : 5)));
	} else {
		// Nothing else known, we use half native vector width for long
		cl_uint v_width;

		HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
		                               CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
		                               sizeof(v_width), &v_width, NULL),
		               "clGetDeviceInfo for CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG");
		core_count *= (ocl_device_list[sequential_id].cores_per_MP = v_width);
	}

	return core_count;
}
2631
opencl_speed_index(int sequential_id)2632 unsigned int opencl_speed_index(int sequential_id)
2633 {
2634 cl_uint clock;
2635
2636 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2637 CL_DEVICE_MAX_CLOCK_FREQUENCY,
2638 sizeof(clock), &clock, NULL),
2639 "clGetDeviceInfo for CL_DEVICE_MAX_CLOCK_FREQUENCY");
2640
2641 return clock * get_processors_count(sequential_id);
2642 }
2643
/*
 * Classify an AMD device into a processor-family flag (DEV_AMD_VLIW4/5,
 * DEV_AMD_GCN_10/11/12) based on its code name in CL_DEVICE_NAME.
 * Non-AMD devices return DEV_UNKNOWN.  Unrecognized AMD devices default
 * to DEV_AMD_GCN_12 since all current GPUs are GCN.
 */
cl_uint get_processor_family(int sequential_id)
{
	char dname[MAX_OCLINFO_STRING_LEN];

	HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_NAME,
	                               sizeof(dname), dname, NULL),
	               "clGetDeviceInfo for CL_DEVICE_NAME");

	/* Workaround for MESA: lower-case everything after the first char so
	   the mixed-case code-name matches below work on MESA's names. */
	if (*dname)
		strlwr(&dname[1]);

	/* NOTE: gpu_amd is a macro, hence the unusual layout */
	if gpu_amd
	(device_info[sequential_id]) {

		if ((strstr(dname, "Cedar") || //AMD Radeon VLIW5
		     strstr(dname, "Redwood") || strstr(dname, "Juniper")
		     || strstr(dname, "Cypress") || strstr(dname, "Hemlock")
		     || strstr(dname, "Caicos") || //AMD Radeon VLIW5 Gen 2
		     strstr(dname, "Turks") || strstr(dname, "Barts") ||
		     strstr(dname, "Wrestler")
		     || strstr(dname, "Ontario") || strstr(dname, "Zacate")
		     || strstr(dname, "Winterpark") || strstr(dname, "Beavercreek")
		     || strstr(dname, "Cayman") || //AMD Radeon VLIW4
		     strstr(dname, "Antilles") || strstr(dname, "Devastator")
		     || strstr(dname, "R7") //AMD Radeon VLIW4
		    )) {

			/* Split the pre-GCN set into VLIW4 vs VLIW5 */
			if (strstr(dname, "Cayman") ||
			    strstr(dname, "Antilles") ||
			    strstr(dname, "Devastator") || strstr(dname, "R7"))
				return DEV_AMD_VLIW4;
			else
				return DEV_AMD_VLIW5;

		} else {

			if (strstr(dname, "Capeverde") || strstr(dname, "Malta") ||
			    strstr(dname, "Oland") || strstr(dname, "Hainan") ||
			    strstr(dname, "Pitcairn") || strstr(dname, "Tahiti"))
				return DEV_AMD_GCN_10; //AMD Radeon GCN 1.0

			else if (strstr(dname, "Bonaire") || strstr(dname, "Hawaii") ||
			         strstr(dname, "Vesuvius") || strstr(dname, "Grenada"))
				return DEV_AMD_GCN_11; //AMD Radeon GCN 1.1

			else if (strstr(dname, "Tonga") || strstr(dname, "Antigua") ||
			         strstr(dname, "Fiji"))
				return DEV_AMD_GCN_12; //AMD Radeon GCN 1.2
			/*
			 * Graphics IP v6:
			 * - Cape Verde, Hainan, Oland, Pitcairn, Tahiti
			 * Graphics IP v7:
			 * - Bonaire, Havaii, Kalindi, Mullins, Spectre, Spooky
			 * Graphics IP v8:
			 * - Iceland
			 */
			/* All current GPUs are GCN so let's default to that */
			//return DEV_UNKNOWN;
			return DEV_AMD_GCN_12;
		}
	}
	return DEV_UNKNOWN;
}
2708
get_byte_addressable(int sequential_id)2709 int get_byte_addressable(int sequential_id)
2710 {
2711 char dname[MAX_OCLINFO_STRING_LEN];
2712
2713 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id],
2714 CL_DEVICE_EXTENSIONS,
2715 sizeof(dname), dname, NULL),
2716 "clGetDeviceInfo for CL_DEVICE_EXTENSIONS");
2717
2718 if (strstr(dname, "cl_khr_byte_addressable_store") == NULL)
2719 return DEV_NO_BYTE_ADDRESSABLE;
2720
2721 return DEV_UNKNOWN;
2722 }
2723
get_vendor_id(int sequential_id)2724 int get_vendor_id(int sequential_id)
2725 {
2726 char dname[MAX_OCLINFO_STRING_LEN];
2727
2728 HANDLE_CLERROR(clGetDeviceInfo(devices[sequential_id], CL_DEVICE_VENDOR,
2729 sizeof(dname), dname, NULL),
2730 "clGetDeviceInfo for CL_DEVICE_VENDOR");
2731
2732 if (strstr(dname, "NVIDIA"))
2733 return DEV_NVIDIA;
2734
2735 if (strstr(dname, "Intel"))
2736 return DEV_INTEL;
2737
2738 if (strstr(dname, "Advanced Micro") ||
2739 strstr(dname, "AMD") || strstr(dname, "ATI"))
2740 return DEV_AMD;
2741
2742 return DEV_UNKNOWN;
2743 }
2744
get_platform_vendor_id(int platform_id)2745 int get_platform_vendor_id(int platform_id)
2746 {
2747 char dname[MAX_OCLINFO_STRING_LEN];
2748 cl_platform_id platform[MAX_PLATFORMS];
2749 cl_uint num_platforms;
2750
2751 HANDLE_CLERROR(clGetPlatformIDs(MAX_PLATFORMS, platform,
2752 &num_platforms),
2753 "clGetPlatformIDs");
2754
2755 HANDLE_CLERROR(clGetPlatformInfo(platform[platform_id], CL_PLATFORM_NAME,
2756 sizeof(dname), dname, NULL),
2757 "clGetPlatformInfo for CL_PLATFORM_NAME");
2758
2759 if (strstr(dname, "NVIDIA"))
2760 return DEV_NVIDIA;
2761
2762 if (strstr(dname, "Apple"))
2763 return PLATFORM_APPLE;
2764
2765 if (strstr(dname, "Intel"))
2766 return DEV_INTEL;
2767
2768 if (strstr(dname, "Advanced Micro") ||
2769 strstr(dname, "AMD") || strstr(dname, "ATI"))
2770 return DEV_AMD;
2771
2772 if ((strstr(dname, "MESA")) || (strstr(dname, "Mesa")))
2773 return PLATFORM_MESA;
2774
2775 if (strstr(dname, "beignet"))
2776 return PLATFORM_BEIGNET;
2777
2778 if (strstr(dname, "Portable Computing Language") || strstr(dname, "pocl"))
2779 return PLATFORM_POCL;
2780
2781 /*
2782 * If we found nothing recognized in the device name, look at
2783 * device version string as well
2784 */
2785 HANDLE_CLERROR(clGetPlatformInfo(platform[platform_id], CL_PLATFORM_VERSION,
2786 sizeof(dname), dname, NULL),
2787 "clGetPlatformInfo for CL_PLATFORM_VERSION");
2788
2789 if ((strstr(dname, "MESA")) || (strstr(dname, "Mesa")))
2790 return PLATFORM_MESA;
2791
2792 return DEV_UNKNOWN;
2793 }
2794
get_device_version(int sequential_id)2795 int get_device_version(int sequential_id)
2796 {
2797 char dname[MAX_OCLINFO_STRING_LEN];
2798 unsigned int major, minor;
2799
2800 if ((clGetDeviceInfo(devices[sequential_id], CL_DEVICE_VERSION,
2801 MAX_OCLINFO_STRING_LEN, dname, NULL) == CL_SUCCESS) &&
2802 sscanf(dname, "OpenCL %u.%u", &major, &minor) == 2)
2803 return major * 100 + minor * 10;
2804
2805 return DEV_UNKNOWN;
2806 }
2807
/*
 * Return the highest OpenCL version declared by the headers we were
 * compiled against, as a string.  Relies on undefined CL_VERSION_* macros
 * evaluating to 0 in #elif, so the first defined (highest) version wins.
 */
char *get_opencl_header_version()
{
#ifdef CL_VERSION_2_2
	return "2.2";
#elif CL_VERSION_2_1
	return "2.1";
#elif CL_VERSION_2_0
	return "2.0";
#elif CL_VERSION_1_2
	return "1.2";
#elif CL_VERSION_1_1
	return "1.1";
#elif CL_VERSION_1_0
	return "1.0";
#else
	return "Unknown";
#endif
}
2826
/*
 * Map an OpenCL error code to a human-readable "NAME (code)" string.
 *
 * Codes 0..-19 index err_small[], -30..-68 index err_invalid[]; anything
 * else (including the unused -20..-29 gap) yields "UNKNOWN OPENCL ERROR".
 * Returns a pointer to a static buffer (not reentrant, as before).
 *
 * Fix: the final sprintf() was unbounded; snprintf() now guards out[]
 * against any future over-long message string.
 */
char *get_error_name(cl_int cl_error)
{
	const char *message;
	static char out[128];
	static const char *err_small[] = {
		"CL_SUCCESS", "CL_DEVICE_NOT_FOUND", "CL_DEVICE_NOT_AVAILABLE",
		"CL_COMPILER_NOT_AVAILABLE",
		"CL_MEM_OBJECT_ALLOCATION_FAILURE", "CL_OUT_OF_RESOURCES",
		"CL_OUT_OF_HOST_MEMORY",
		"CL_PROFILING_INFO_NOT_AVAILABLE", "CL_MEM_COPY_OVERLAP",
		"CL_IMAGE_FORMAT_MISMATCH",
		"CL_IMAGE_FORMAT_NOT_SUPPORTED", "CL_BUILD_PROGRAM_FAILURE",
		"CL_MAP_FAILURE", "CL_MISALIGNED_SUB_BUFFER_OFFSET",
		"CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST",
		"CL_COMPILE_PROGRAM_FAILURE", "CL_LINKER_NOT_AVAILABLE",
		"CL_LINK_PROGRAM_FAILURE", "CL_DEVICE_PARTITION_FAILED",
		"CL_KERNEL_ARG_INFO_NOT_AVAILABLE"
	};
	static const char *err_invalid[] = {
		"CL_INVALID_VALUE", "CL_INVALID_DEVICE_TYPE",
		"CL_INVALID_PLATFORM", "CL_INVALID_DEVICE",
		"CL_INVALID_CONTEXT", "CL_INVALID_QUEUE_PROPERTIES",
		"CL_INVALID_COMMAND_QUEUE", "CL_INVALID_HOST_PTR",
		"CL_INVALID_MEM_OBJECT", "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
		"CL_INVALID_IMAGE_SIZE", "CL_INVALID_SAMPLER",
		"CL_INVALID_BINARY", "CL_INVALID_BUILD_OPTIONS",
		"CL_INVALID_PROGRAM", "CL_INVALID_PROGRAM_EXECUTABLE",
		"CL_INVALID_KERNEL_NAME", "CL_INVALID_KERNEL_DEFINITION",
		"CL_INVALID_KERNEL", "CL_INVALID_ARG_INDEX",
		"CL_INVALID_ARG_VALUE", "CL_INVALID_ARG_SIZE",
		"CL_INVALID_KERNEL_ARGS", "CL_INVALID_WORK_DIMENSION",
		"CL_INVALID_WORK_GROUP_SIZE", "CL_INVALID_WORK_ITEM_SIZE",
		"CL_INVALID_GLOBAL_OFFSET", "CL_INVALID_EVENT_WAIT_LIST",
		"CL_INVALID_EVENT", "CL_INVALID_OPERATION",
		"CL_INVALID_GL_OBJECT", "CL_INVALID_BUFFER_SIZE",
		"CL_INVALID_MIP_LEVEL", "CL_INVALID_GLOBAL_WORK_SIZE",
		"CL_INVALID_PROPERTY", "CL_INVALID_IMAGE_DESCRIPTOR",
		"CL_INVALID_COMPILER_OPTIONS", "CL_INVALID_LINKER_OPTIONS",
		"CL_INVALID_DEVICE_PARTITION_COUNT"
	};

	if (cl_error <= 0 && cl_error >= -19)
		message = err_small[-cl_error];
	else if (cl_error <= -30 && cl_error >= -68)
		message = err_invalid[-cl_error - 30];
	else
		message = "UNKNOWN OPENCL ERROR";

	snprintf(out, sizeof(out), "%s (%d)", message, cl_error);
	return out;
}
2877
2878 /*
2879 * We currently leave all of this to single.c instead but this function
2880 * remains for future functionality.
2881 */
int opencl_calc_min_kpc(size_t lws, size_t gws, int v_width)
{
	/* lws is intentionally unused; min keys-per-crypt is GWS times the
	   vector width */
	size_t min_kpc = gws * (size_t)v_width;

	return (int)min_kpc;
}
2886
2887 /***
2888 * Despite of whatever the user uses as -dev=N, I will always list devices in
2889 * their natural order as defined by the OpenCL libraries.
2890 *
2891 * In order to be able to know everything about the device and list it obeying
2892 * its natural sequence (defined by hardware, PCI slots sequence, ...) is better
2893 * to scan all OpenCL stuff and list only when needed. Otherwise, I might need
2894 * to reorder first and then list.
2895 ***/
opencl_list_devices(void)2896 void opencl_list_devices(void)
2897 {
2898 char dname[MAX_OCLINFO_STRING_LEN];
2899 size_t z_entries;
2900 cl_uint entries;
2901 cl_ulong long_entries;
2902 int i, j, sequence_nr = 0, err_type = 0, platform_in_use = -1;
2903 size_t p_size;
2904 int available_devices = 0;
2905 cl_int ret;
2906 cl_platform_id platform_list[MAX_PLATFORMS];
2907 cl_uint num_platforms, num_devices;
2908
2909 /* Obtain a list of available platforms */
2910 ret = clGetPlatformIDs(MAX_PLATFORMS, platform_list, &num_platforms);
2911
2912 if (!num_platforms)
2913 fprintf(stderr, "Error: No OpenCL-capable platforms were detected"
2914 " by the installed OpenCL driver.\n");
2915
2916 if (ret != CL_SUCCESS && options.verbosity > VERB_LEGACY)
2917 fprintf(stderr, "Throw clError: clGetPlatformIDs() = %s\n",
2918 get_error_name(ret));
2919
2920 for (i = 0; i < num_platforms; i++) {
2921 platforms[i].platform = platform_list[i];
2922 ret = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_ALL,
2923 MAX_GPU_DEVICES, &devices[available_devices],
2924 &num_devices);
2925
2926 if ((ret != CL_SUCCESS || num_devices < 1) &&
2927 options.verbosity > VERB_LEGACY)
2928 fprintf(stderr, "No OpenCL devices was found on platform #%d"
2929 ", clGetDeviceIDs() = %s\n",
2930 i, get_error_name(ret));
2931
2932 available_devices += num_devices;
2933 platforms[i].num_devices = num_devices;
2934 }
2935
2936 if (!available_devices) {
2937 fprintf(stderr, "Error: No OpenCL-capable devices were detected"
2938 " by the installed OpenCL driver.\n\n");
2939 return;
2940 }
2941 /* Initialize OpenCL environment */
2942 if (!getenv("_SKIP_OCL_INITIALIZATION"))
2943 opencl_load_environment();
2944
2945 for (i = 0; platforms[i].platform; i++) {
2946
2947 /* Query devices for information */
2948 for (j = 0; j < platforms[i].num_devices; j++, sequence_nr++) {
2949 cl_device_local_mem_type memtype;
2950 cl_bool boolean;
2951 char *p;
2952 int ret, cpu;
2953 int fan, temp, util, cl, ml;
2954
2955 if (!getenv("_SKIP_OCL_INITIALIZATION") &&
2956 (!default_gpu_selected && !get_if_device_is_in_use(sequence_nr)))
2957 /* Nothing to do, skipping */
2958 continue;
2959
2960 if (platform_in_use != i) {
2961 /* Now, dealing with different platform. */
2962 /* Obtain information about platform */
2963 clGetPlatformInfo(platforms[i].platform,
2964 CL_PLATFORM_NAME, sizeof(dname), dname, NULL);
2965 printf("Platform #%d name: %s, ", i, dname);
2966 clGetPlatformInfo(platforms[i].platform,
2967 CL_PLATFORM_VERSION, sizeof(dname), dname, NULL);
2968 printf("version: %s\n", dname);
2969
2970 clGetPlatformInfo(platforms[i].platform,
2971 CL_PLATFORM_EXTENSIONS, sizeof(dname), dname, NULL);
2972 if (options.verbosity > VERB_LEGACY)
2973 printf(" Platform extensions: %s\n", dname);
2974
2975 /* Obtain a list of devices available */
2976 if (!platforms[i].num_devices)
2977 printf("%d devices found\n", platforms[i].num_devices);
2978
2979 platform_in_use = i;
2980 }
2981 clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_NAME,
2982 sizeof(dname), dname, NULL);
2983 p = ltrim(dname);
2984 printf(" Device #%d (%d) name: %s\n", j, sequence_nr + 1, p);
2985
2986 // Check if device seems to be working.
2987 if (!start_opencl_device(sequence_nr, &err_type)) {
2988
2989 if (err_type == 1)
2990 printf(" Status: %s (%s)\n",
2991 "Context creation error", get_error_name(ret_code));
2992 else
2993 printf(" Status: %s (%s)\n",
2994 "Queue creation error", get_error_name(ret_code));
2995 }
2996
2997 ret = clGetDeviceInfo(devices[sequence_nr],
2998 CL_DEVICE_BOARD_NAME_AMD, sizeof(dname), dname, NULL);
2999 if (ret == CL_SUCCESS && strlen(dname))
3000 printf(" Board name: %s\n", dname);
3001
3002 clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_VENDOR,
3003 sizeof(dname), dname, NULL);
3004 printf(" Device vendor: %s\n", dname);
3005 clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_TYPE,
3006 sizeof(cl_ulong), &long_entries, NULL);
3007 printf(" Device type: ");
3008 cpu = (long_entries & CL_DEVICE_TYPE_CPU);
3009 if (cpu)
3010 printf("CPU ");
3011 if (long_entries & CL_DEVICE_TYPE_GPU)
3012 printf("GPU ");
3013 if (long_entries & CL_DEVICE_TYPE_ACCELERATOR)
3014 printf("Accelerator ");
3015 if (long_entries & CL_DEVICE_TYPE_DEFAULT)
3016 printf("Default ");
3017 if (long_entries & ~(CL_DEVICE_TYPE_DEFAULT |
3018 CL_DEVICE_TYPE_ACCELERATOR |
3019 CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_CPU))
3020 printf("Unknown ");
3021 clGetDeviceInfo(devices[sequence_nr],
3022 CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &boolean, NULL);
3023 printf("(%s)\n", boolean == CL_TRUE ? "LE" : "BE");
3024 clGetDeviceInfo(devices[sequence_nr], CL_DEVICE_VERSION,
3025 sizeof(dname), dname, NULL);
3026 printf(" Device version: %s\n", dname);
3027 printf(" Driver version: %s\n",
3028 opencl_driver_info(sequence_nr));
3029
3030 clGetDeviceInfo(devices[sequence_nr],
3031 CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR,
3032 sizeof(cl_uint), &entries, NULL);
3033 printf(" Native vector widths: char %d, ", entries);
3034 clGetDeviceInfo(devices[sequence_nr],
3035 CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT,
3036 sizeof(cl_uint), &entries, NULL);
3037 printf("short %d, ", entries);
3038 clGetDeviceInfo(devices[sequence_nr],
3039 CL_DEVICE_NATIVE_VECTOR_WIDTH_INT,
3040 sizeof(cl_uint), &entries, NULL);
3041 printf("int %d, ", entries);
3042 clGetDeviceInfo(devices[sequence_nr],
3043 CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG,
3044 sizeof(cl_uint), &entries, NULL);
3045 printf("long %d\n", entries);
3046
3047 clGetDeviceInfo(devices[sequence_nr],
3048 CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,
3049 sizeof(cl_uint), &entries, NULL);
3050 printf(" Preferred vector width: char %d, ", entries);
3051 clGetDeviceInfo(devices[sequence_nr],
3052 CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,
3053 sizeof(cl_uint), &entries, NULL);
3054 printf("short %d, ", entries);
3055 clGetDeviceInfo(devices[sequence_nr],
3056 CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,
3057 sizeof(cl_uint), &entries, NULL);
3058 printf("int %d, ", entries);
3059 clGetDeviceInfo(devices[sequence_nr],
3060 CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,
3061 sizeof(cl_uint), &entries, NULL);
3062 printf("long %d\n", entries);
3063
3064 clGetDeviceInfo(devices[sequence_nr],
3065 CL_DEVICE_GLOBAL_MEM_SIZE,
3066 sizeof(cl_ulong), &long_entries, NULL);
3067 clGetDeviceInfo(devices[sequence_nr],
3068 CL_DEVICE_ERROR_CORRECTION_SUPPORT,
3069 sizeof(cl_bool), &boolean, NULL);
3070 printf(" Global Memory: %sB%s\n",
3071 human_prefix(long_entries),
3072 boolean == CL_TRUE ? " (ECC)" : "");
3073 clGetDeviceInfo(devices[sequence_nr],
3074 CL_DEVICE_EXTENSIONS, sizeof(dname), dname, NULL);
3075 if (options.verbosity > VERB_LEGACY)
3076 printf(" Device extensions: %s\n", dname);
3077
3078 clGetDeviceInfo(devices[sequence_nr],
3079 CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
3080 sizeof(cl_ulong), &long_entries, NULL);
3081 if (long_entries)
3082 printf(" Global Memory Cache: %sB\n",
3083 human_prefix(long_entries)
3084 );
3085 clGetDeviceInfo(devices[sequence_nr],
3086 CL_DEVICE_LOCAL_MEM_SIZE,
3087 sizeof(cl_ulong), &long_entries, NULL);
3088 clGetDeviceInfo(devices[sequence_nr],
3089 CL_DEVICE_LOCAL_MEM_TYPE,
3090 sizeof(cl_device_local_mem_type), &memtype, NULL);
3091 printf(" Local Memory: %sB (%s)\n",
3092 human_prefix(long_entries),
3093 memtype == CL_LOCAL ? "Local" : "Global");
3094 clGetDeviceInfo(devices[sequence_nr],
3095 CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
3096 sizeof(cl_ulong), &long_entries, NULL);
3097 if (long_entries)
3098 printf(" Constant Buffer size: %sB\n",
3099 human_prefix(long_entries)
3100 );
3101 clGetDeviceInfo(devices[sequence_nr],
3102 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
3103 sizeof(long_entries), &long_entries, NULL);
3104 printf(" Max memory alloc. size: %sB\n",
3105 human_prefix(long_entries));
3106 ret = clGetDeviceInfo(devices[sequence_nr],
3107 CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_int), &entries, NULL);
3108 if (ret == CL_SUCCESS && entries)
3109 printf(" Max clock (MHz): %u\n", entries);
3110 ret = clGetDeviceInfo(devices[sequence_nr],
3111 CL_DEVICE_PROFILING_TIMER_RESOLUTION,
3112 sizeof(size_t), &z_entries, NULL);
3113 if (ret == CL_SUCCESS && z_entries)
3114 printf(" Profiling timer res.: "Zu" ns\n", z_entries);
3115 clGetDeviceInfo(devices[sequence_nr],
3116 CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &p_size, NULL);
3117 printf(" Max Work Group Size: %d\n", (int)p_size);
3118 clGetDeviceInfo(devices[sequence_nr],
3119 CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &entries, NULL);
3120 printf(" Parallel compute cores: %d\n", entries);
3121
3122 long_entries = get_processors_count(sequence_nr);
3123 if (!cpu && ocl_device_list[sequence_nr].cores_per_MP > 1)
3124 printf(" %s "LLu" "
3125 " (%d x %d)\n",
3126 gpu_nvidia(device_info[sequence_nr]) ? "CUDA cores: " : "Stream processors:",
3127 (unsigned long long)long_entries,
3128 entries, ocl_device_list[sequence_nr].cores_per_MP);
3129 printf(" Speed index: %u\n",
3130 opencl_speed_index(sequence_nr));
3131
3132 ret = clGetDeviceInfo(devices[sequence_nr],
3133 CL_DEVICE_SIMD_WIDTH_AMD, sizeof(cl_uint),
3134 &long_entries, NULL);
3135 if (ret == CL_SUCCESS)
3136 printf(" SIMD width: "LLu"\n",
3137 (unsigned long long)long_entries);
3138
3139 ret = clGetDeviceInfo(devices[sequence_nr],
3140 CL_DEVICE_WAVEFRONT_WIDTH_AMD,
3141 sizeof(cl_uint), &long_entries, NULL);
3142 if (ret == CL_SUCCESS)
3143 printf(" Wavefront width: "LLu"\n",
3144 (unsigned long long)long_entries);
3145
3146 ret = clGetDeviceInfo(devices[sequence_nr],
3147 CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint),
3148 &long_entries, NULL);
3149 if (ret == CL_SUCCESS)
3150 printf(" Warp size: "LLu"\n",
3151 (unsigned long long)long_entries);
3152
3153 ret = clGetDeviceInfo(devices[sequence_nr],
3154 CL_DEVICE_REGISTERS_PER_BLOCK_NV,
3155 sizeof(cl_uint), &long_entries, NULL);
3156 if (ret == CL_SUCCESS)
3157 printf(" Max. GPRs/work-group: "LLu"\n",
3158 (unsigned long long)long_entries);
3159
3160 if (gpu_nvidia(device_info[sequence_nr])) {
3161 unsigned int major = 0, minor = 0;
3162
3163 get_compute_capability(sequence_nr, &major, &minor);
3164 if (major && minor)
3165 printf(" Compute capability: %u.%u "
3166 "(sm_%u%u)\n", major, minor, major, minor);
3167 }
3168 ret = clGetDeviceInfo(devices[sequence_nr],
3169 CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV,
3170 sizeof(cl_bool), &boolean, NULL);
3171 if (ret == CL_SUCCESS)
3172 printf(" Kernel exec. timeout: %s\n",
3173 boolean ? "yes" : "no");
3174
3175 fan = temp = util = cl = ml = -1;
3176 #if HAVE_LIBDL
3177 if (nvml_lib && gpu_nvidia(device_info[sequence_nr]) &&
3178 id2nvml(ocl_device_list[sequence_nr].pci_info) >= 0) {
3179 printf(" NVML id: %d\n",
3180 id2nvml(ocl_device_list[sequence_nr].pci_info));
3181 nvidia_get_temp(id2nvml(ocl_device_list[sequence_nr].pci_info),
3182 &temp, &fan, &util, &cl, &ml);
3183 } else if (adl_lib && gpu_amd(device_info[sequence_nr])) {
3184 printf(" ADL: Overdrive%d, device id %d\n",
3185 adl2od[id2adl(ocl_device_list[sequence_nr].pci_info)],
3186 id2adl(ocl_device_list[sequence_nr].pci_info));
3187 amd_get_temp(id2adl(ocl_device_list[sequence_nr].pci_info),
3188 &temp, &fan, &util, &cl, &ml);
3189 }
3190 #endif
3191 if (ocl_device_list[sequence_nr].pci_info.bus >= 0) {
3192 printf(" PCI device topology: %s\n",
3193 ocl_device_list[sequence_nr].pci_info.busId);
3194 }
3195 if (cl >= 0)
3196 printf(" PCI lanes: %d/%d\n", cl, ml);
3197 if (fan >= 0)
3198 printf(" Fan speed: %u%%\n", fan);
3199 if (temp >= 0)
3200 printf(" Temperature: %u%sC\n",
3201 temp, gpu_degree_sign);
3202 if (util >= 0)
3203 printf(" Utilization: %u%%\n", util);
3204 else if (temp >= 0)
3205 printf(" Utilization: n/a\n");
3206 puts("");
3207 }
3208 }
3209 return;
3210 }
3211
3212 #undef LOG_SIZE
3213 #undef SRC_SIZE
3214 #endif
3215