1 /*
2 * This software is Copyright (c) 2015 Sayantan Datta <stdatta at openwall dot com>
3 * and it is hereby released to the general public under the following terms:
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted.
6 */
7 #ifdef HAVE_OPENCL
8
9 #include <sys/time.h>
10
11 #include "opencl_mscash2_helper_plug.h"
12 #include "options.h"
13
14 #define PADDING 1024
15
/*
 * Per-candidate scratch state the PBKDF2 kernels keep resident on the
 * device between the preprocess / iterate / postprocess passes
 * (one instance per work item in bufferIterTemp).
 * NOTE(review): field semantics inferred from typical PBKDF2-HMAC-SHA1
 * layouts -- confirm against pbkdf2_kernel.cl.
 */
typedef struct {
	unsigned int istate[5];	/* presumably HMAC-SHA1 inner (ipad) state */
	unsigned int ostate[5];	/* presumably HMAC-SHA1 outer (opad) state */
	unsigned int buf[5];	/* presumably the current U_i block being iterated */
	unsigned int out[4];	/* presumably the accumulated 128-bit DCC2 result */
} devIterTempSz;
22
/*
 * Device-side memory objects for one OpenCL device.  Sizes are set in
 * createDevObjGws()/createDevObj(); bufferIterTemp doubles as the
 * "GWS buffers allocated" sentinel in releaseDevObjGws().
 */
typedef struct {
	cl_mem bufferDccHashes;		/* input: 4 cl_uint per candidate (DCC hash) */
	cl_mem bufferSha1Hashes;	/* input: 5 cl_uint per candidate, used when saltlen > 22 */
	cl_mem bufferSalt;		/* input: SALT_BUFFER_SIZE bytes of salt */
	cl_mem bufferDcc2Hashes;	/* output: 4 cl_uint per candidate (DCC2 hash) */
	cl_mem bufferIterTemp;		/* scratch: one devIterTempSz per candidate */
} deviceBuffer;
30
/*
 * Kernel handles and tuned work sizes for one OpenCL device.
 * devKernel[0] is also used as the "kernels created" sentinel in
 * releaseAll().
 */
typedef struct {
	cl_kernel devKernel[4];	/* [0] pbkdf2_preprocess_short, [1] pbkdf2_preprocess_long,
				 * [2] pbkdf2_iter, [3] pbkdf2_postprocess */
	size_t devLws;		/* tuned local work size; 0 lets the runtime choose */
	size_t devGws;		/* tuned global work size */
	unsigned int devInUse;	/* NOTE(review): not referenced anywhere in this file */
} deviceParam;
37
/* Per-device buffer handles, indexed by JtR's unique device id. */
static deviceBuffer *devBuffer = NULL;
/* Per-device kernels and tuned work sizes, same indexing. */
static deviceParam *devParam = NULL;
/* Completion event per enqueued batch; signaled by the postprocess kernel. */
static cl_event *events = NULL;
/* Next free slot in events[]; reset before each multi-device dispatch. */
static unsigned int eventCtr = 0;
/* Number of devices initialized via selectDevice(). */
static unsigned int maxActiveDevices = 0;
43
initNumDevices(void)44 void initNumDevices(void)
45 {
46 devBuffer = (deviceBuffer *) mem_calloc(MAX_GPU_DEVICES, sizeof(deviceBuffer));
47 devParam = (deviceParam *) mem_calloc(MAX_GPU_DEVICES, sizeof(deviceParam));
48 events = (cl_event *) mem_alloc(MAX_GPU_DEVICES * sizeof(cl_event));
49 }
50
/*
 * Allocate the GWS-dependent device buffers for 'gws' work items (plus
 * PADDING slack) and bind them to the kernel arguments that take them.
 */
static void createDevObjGws(size_t gws, int jtrUniqDevId)
{
	deviceBuffer *dbuf = &devBuffer[jtrUniqDevId];
	cl_kernel *krnl = devParam[jtrUniqDevId].devKernel;
	size_t nItems = gws + PADDING;

	dbuf->bufferDccHashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_ONLY, 4 * nItems * sizeof(cl_uint), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed allocating bufferDccHashes.");

	dbuf->bufferDcc2Hashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_WRITE_ONLY, 4 * nItems * sizeof(cl_uint), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed allocating bufferDcc2Hashes.");

	dbuf->bufferIterTemp = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_WRITE, nItems * sizeof(devIterTempSz), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed allocating bufferIterTemp.");

	dbuf->bufferSha1Hashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_WRITE, 5 * nItems * sizeof(cl_uint), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed allocating bufferSha1Hashes.");

	/* Bind buffers to every kernel argument that consumes them. */
	HANDLE_CLERROR(clSetKernelArg(krnl[0], 0, sizeof(cl_mem), &dbuf->bufferDccHashes), "Set Kernel 0 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[0], 3, sizeof(cl_mem), &dbuf->bufferIterTemp), "Set Kernel 0 Arg 3 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[1], 0, sizeof(cl_mem), &dbuf->bufferDccHashes), "Set Kernel 1 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[1], 1, sizeof(cl_mem), &dbuf->bufferIterTemp), "Set Kernel 1 Arg 1 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[1], 2, sizeof(cl_mem), &dbuf->bufferSha1Hashes), "Set Kernel 1 Arg 2 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[2], 0, sizeof(cl_mem), &dbuf->bufferIterTemp), "Set Kernel 2 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[3], 0, sizeof(cl_mem), &dbuf->bufferIterTemp), "Set Kernel 3 Arg 0 :FAILED");
	HANDLE_CLERROR(clSetKernelArg(krnl[3], 1, sizeof(cl_mem), &dbuf->bufferDcc2Hashes), "Set Kernel 3 Arg 1 :FAILED");
}
74
releaseDevObjGws(int jtrUniqDevId)75 static void releaseDevObjGws(int jtrUniqDevId)
76 {
77 if (devBuffer[jtrUniqDevId].bufferIterTemp) {
78 HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferDccHashes), "Failed releasing bufferDccHashes.");
79 HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferDcc2Hashes), "Failed releasing bufferDcc2Hashes.");
80 HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferIterTemp), "Failed releasing bufferIterTemp.");
81 HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferSha1Hashes), "Failed releasing bufferSha1Hashes.");
82 devBuffer[jtrUniqDevId].bufferIterTemp = 0;
83 }
84 }
85
createDevObj(int jtrUniqDevId)86 static void createDevObj(int jtrUniqDevId)
87 {
88 devBuffer[jtrUniqDevId].bufferSalt = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_ONLY, SALT_BUFFER_SIZE, NULL, &ret_code);
89 HANDLE_CLERROR(ret_code, "Failed allocating bufferSalt.");
90
91 HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 1, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferSalt), "Set Kernel 0 Arg 1 :FAILED");
92 }
93
releaseDevObj(int jtrUniqDevId)94 static void releaseDevObj(int jtrUniqDevId)
95 {
96 if (devBuffer[jtrUniqDevId].bufferSalt) {
97 HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferSalt), "Failed releasing bufferSalt.");
98 devBuffer[jtrUniqDevId].bufferSalt = 0;
99 }
100 }
101
releaseAll()102 void releaseAll()
103 {
104 int i;
105
106 for (i = 0; i < get_number_of_devices_in_use(); i++) {
107 releaseDevObjGws(engaged_devices[i]);
108 releaseDevObj(engaged_devices[i]);
109 if (devParam[engaged_devices[i]].devKernel[0]) {
110 HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[0]), "Error releasing kernel pbkdf2_preprocess_short");
111 HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[1]), "Error releasing kernel pbkdf2_preprocess_long");
112 HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[2]), "Error releasing kernel pbkdf2_iter");
113 HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[3]), "Error releasing kernel pbkdf2_postprocess");
114 HANDLE_CLERROR(clReleaseProgram(program[engaged_devices[i]]), "Error releasing Program");
115 devParam[engaged_devices[i]].devKernel[0] = 0;
116 }
117 }
118
119 MEM_FREE(events);
120 MEM_FREE(devBuffer);
121 MEM_FREE(devParam);
122 }
123
findLwsLimit(int jtrUniqDevId)124 static size_t findLwsLimit(int jtrUniqDevId)
125 {
126 size_t minLws[4] = { 0 };
127
128 minLws[0] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[0]);
129 minLws[1] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[1]);
130 minLws[2] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[2]);
131 minLws[3] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[3]);
132
133 if (minLws[0] > minLws[1])
134 minLws[0] = minLws[1];
135 if (minLws[2] > minLws[3])
136 minLws[2] = minLws[3];
137 if (minLws[0] > minLws[2])
138 minLws[0] = minLws[2];
139
140 return minLws[0];
141 }
142
preferredLwsSize(int jtrUniqDevId)143 static size_t preferredLwsSize(int jtrUniqDevId)
144 {
145 size_t minLws[4] = { 0 };
146
147 minLws[0] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[0]);
148 minLws[1] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[1]);
149 minLws[2] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[2]);
150 minLws[3] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[3]);
151
152 if (minLws[0] > minLws[1])
153 minLws[0] = minLws[1];
154 if (minLws[2] > minLws[3])
155 minLws[2] = minLws[3];
156 if (minLws[0] > minLws[2])
157 minLws[0] = minLws[2];
158
159 return minLws[0];
160 }
161
/*
 * Enqueue one full DCC2 PBKDF2-HMAC-SHA1 computation for keyCount
 * candidates on one device (asynchronous; caller flushes/waits).
 *
 * hostDccHashes   4 cl_uint per candidate, uploaded to the device.
 * hostSha1Hashes  5 cl_uint per candidate; only uploaded when
 *                 saltlen > 22 (long-salt path via devKernel[1]).
 * hostSalt        SALT_BUFFER_SIZE bytes, always uploaded.
 * iterCount       total PBKDF2 iterations; the preprocess kernel does
 *                 the first one, devKernel[2] runs the remaining
 *                 iterCount-1 in chunks of ITERATION_COUNT_PER_CALL.
 * hostDcc2Hashes  not read here; results are fetched later by the
 *                 caller from bufferDcc2Hashes.
 *
 * The final postprocess enqueue records events[eventCtr] (then bumps
 * eventCtr) so dcc2Execute() can poll for completion per device.
 *
 * Fix: the last HANDLE_CLERROR message said devKernel[2] although it
 * reports on the devKernel[3] enqueue.
 */
static void execKernel(cl_uint *hostDccHashes, cl_uint *hostSha1Hashes, cl_uint *hostSalt, cl_uint saltlen, unsigned int iterCount, cl_uint *hostDcc2Hashes, cl_uint keyCount, int jtrUniqDevId, cl_command_queue cmdQueue)
{
	size_t N, *M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
	unsigned int i, itrCntKrnl = ITERATION_COUNT_PER_CALL;

	/* Round the global size up to a multiple of the local size. */
	N = devParam[jtrUniqDevId].devLws ? (keyCount + devParam[jtrUniqDevId].devLws - 1) / devParam[jtrUniqDevId].devLws * devParam[jtrUniqDevId].devLws : keyCount;

	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferDccHashes, CL_FALSE, 0, 4 * keyCount * sizeof(cl_uint), hostDccHashes, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferDccHashes.");
	if (saltlen > 22)
		HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferSha1Hashes, CL_FALSE, 0, 5 * keyCount * sizeof(cl_uint), hostSha1Hashes, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferSha1Hashes.");
	else
		HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 2, sizeof(cl_uint), &saltlen), "Set Kernel 0 Arg 2 :FAILED");

	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferSalt, CL_FALSE, 0, SALT_BUFFER_SIZE, hostSalt, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferSalt.");

	/* Short salts take the fast preprocess kernel, long ones the
	   kernel that consumes the precomputed SHA-1 states. */
	if (saltlen < 23)
		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[0], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[0].");
	else
		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[1], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[1].");

	/* Run the remaining iterCount-1 iterations in bounded chunks so a
	   single enqueue never runs long enough to trip GPU watchdogs. */
	for (i = 0; i < iterCount - 1; i += itrCntKrnl ) {
		if (i + itrCntKrnl >= iterCount)
			itrCntKrnl = iterCount - i - 1;	/* final, shorter chunk */

		HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[2], 1, sizeof(cl_uint), &itrCntKrnl), "Set Kernel 1 Arg 1 :FAILED");

		M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[2], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[2].");

		opencl_process_event();
	}

	M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
	HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[3], 1, NULL, &N, M, 0, NULL, &events[eventCtr]), "Failed in clEnqueueNDRangeKernel devKernel[3].");

	eventCtr++;
}
199
/*
 * Benchmark one device and choose GWS/LWS so that one full 10240-iteration
 * batch takes roughly kernelRunMs milliseconds.  User-forced sizes read by
 * opencl_get_user_preferences() disable the corresponding tuning phase.
 * Leaves the GWS-sized device buffers allocated for the chosen size and
 * returns the final GWS.
 */
static size_t autoTune(int jtrUniqDevId, long double kernelRunMs)
{
	size_t gwsLimit, gwsInit, gwsRound;
	size_t lwsLimit, lwsInit;

	struct timeval startc, endc;
	long double timeMs = 0, minTimeMs = 0;

	size_t pcount, count;

	int tuneGws, tuneLws;

	cl_uint *hostDccHashes, *hostSalt, *hostDcc2Hashes;

	unsigned int i;
	/* Arbitrary constants used to synthesize dummy candidate hashes. */
	unsigned int a = 0xffaabbcc;
	unsigned int b = 0xbbccaaee;
	unsigned int c = 0xccffbbdd;
	unsigned int d = 0xff123456;

	/* Cap GWS so bufferIterTemp fits in a single allocation; round down
	   to a power of two, and halve if the PADDING slack would overflow
	   the device's max allocation. */
	gwsLimit = get_max_mem_alloc_size
		(jtrUniqDevId) / sizeof(devIterTempSz);
	get_power_of_two(gwsLimit);
	if (gwsLimit + PADDING >
	    get_max_mem_alloc_size
	    (jtrUniqDevId) / sizeof(devIterTempSz))
		gwsLimit >>= 1;

	lwsLimit = findLwsLimit(jtrUniqDevId);
	lwsInit = preferredLwsSize(jtrUniqDevId);

	/* CPU devices get much smaller starting/rounding sizes. */
	gwsInit = 1024;
	gwsRound = 8192;
	if (cpu(device_info[jtrUniqDevId])) {
		gwsInit = 256;
		gwsRound = 64;
		if (lwsLimit > 8)
			lwsLimit = 8;
		if (lwsInit > 8)
			lwsInit = 8;
	}

	if (gwsInit > gwsLimit)
		gwsInit = gwsLimit;
	if (gwsInit < lwsInit)
		lwsInit = gwsInit;

	/* A user-supplied LWS/GWS disables the respective tuning phase. */
	local_work_size = 0;
	global_work_size = 0;
	tuneGws = 1;
	tuneLws = 1;
	opencl_get_user_preferences(FORMAT_LABEL);
	if (local_work_size) {
		tuneLws = 0;
		if (local_work_size & (local_work_size - 1))
			get_power_of_two(local_work_size);
		if (local_work_size > lwsLimit)
			local_work_size = lwsLimit;
	}
	if (global_work_size)
		tuneGws = 0;

	devParam[jtrUniqDevId].devLws = local_work_size;
	devParam[jtrUniqDevId].devGws = global_work_size;

#if 0
	fprintf(stderr, "lwsInit:"Zu" lwsLimit:"Zu""
		" gwsInit:"Zu" gwsLimit:"Zu"\n",
		lwsInit, lwsLimit, gwsInit,
		gwsLimit);
#endif
	/* Auto tune start.*/
	pcount = gwsInit;
	count = 0;
	/* Wall-clock delta between two timevals, in milliseconds. */
#define calcMs(start, end)	\
	((long double)(end.tv_sec - start.tv_sec) * 1000.000 + \
	 (long double)(end.tv_usec - start.tv_usec) / 1000.000)
	if (tuneGws) {
		/* First pass: time a batch at gwsInit, then scale GWS
		   linearly to hit kernelRunMs. */
		createDevObjGws(pcount, jtrUniqDevId);
		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
		for (i = 0; i < pcount; i++) {
			hostDccHashes[i * 4] = a++;
			hostDccHashes[i * 4 + 1] = a + b++;
			hostDccHashes[i * 4 + 2] = c++;
			hostDccHashes[i * 4 + 3] = c + d++;
		}
		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);

		gettimeofday(&startc, NULL);
		eventCtr = 0;
		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
		gettimeofday(&endc, NULL);

		timeMs = calcMs(startc, endc);
		count = (size_t)((kernelRunMs / timeMs) * (long double)gwsInit);
		count = GET_NEXT_MULTIPLE(count, gwsRound);

		MEM_FREE(hostDccHashes);
		MEM_FREE(hostDcc2Hashes);
		MEM_FREE(hostSalt);
		releaseDevObjGws(jtrUniqDevId);

		/* Second pass: re-time at the scaled size and refine once
		   more (the first estimate is noisy at small GWS). */
		pcount = count;
		createDevObjGws(pcount, jtrUniqDevId);
		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
		for (i = 0; i < pcount; i++) {
			hostDccHashes[i * 4] = a++;
			hostDccHashes[i * 4 + 1] = a + b++;
			hostDccHashes[i * 4 + 2] = c++;
			hostDccHashes[i * 4 + 3] = c + d++;
		}
		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);

		gettimeofday(&startc, NULL);
		eventCtr = 0;
		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
		gettimeofday(&endc, NULL);

		timeMs = calcMs(startc, endc);
		count = (size_t)((kernelRunMs / timeMs) * (long double)count);
		count = GET_NEXT_MULTIPLE(count, gwsRound);

		MEM_FREE(hostDccHashes);
		MEM_FREE(hostDcc2Hashes);
		MEM_FREE(hostSalt);
	}

	if (tuneGws && tuneLws)
		releaseDevObjGws(jtrUniqDevId);

	if (tuneLws) {
		/* Try LWS = lwsInit, 2*lwsInit, 4*lwsInit, ... up to
		   lwsLimit, keeping the fastest. */
		size_t bestLws;
		count = tuneGws ? count : devParam[jtrUniqDevId].devGws;

		createDevObjGws(count, jtrUniqDevId);
		pcount = count;
		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
		for (i = 0; i < pcount; i++) {
			hostDccHashes[i * 4] = a++;
			hostDccHashes[i * 4 + 1] = a + b++;
			hostDccHashes[i * 4 + 2] = c++;
			hostDccHashes[i * 4 + 3] = c + d++;
		}
		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);

		devParam[jtrUniqDevId].devLws = lwsInit;

		gettimeofday(&startc, NULL);
		eventCtr = 0;
		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
		gettimeofday(&endc, NULL);

		timeMs = calcMs(startc, endc);

		minTimeMs = timeMs;
		bestLws = devParam[jtrUniqDevId].devLws;

		devParam[jtrUniqDevId].devLws = 2 * lwsInit;

		while (devParam[jtrUniqDevId].devLws <= lwsLimit) {
			for (i = 0; i < pcount; i++) {
				hostDccHashes[i * 4] = a++;
				hostDccHashes[i * 4 + 1] = a + b++;
				hostDccHashes[i * 4 + 2] = c++;
				hostDccHashes[i * 4 + 3] = c + d++;
			}
			gettimeofday(&startc, NULL);
			pcount = count;
			eventCtr = 0;
			execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
			HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
			gettimeofday(&endc, NULL);

			timeMs = calcMs(startc, endc);

			if (minTimeMs > timeMs) {
				minTimeMs = timeMs;
				bestLws = devParam[jtrUniqDevId].devLws;
			}

			devParam[jtrUniqDevId].devLws *= 2;
		}

		devParam[jtrUniqDevId].devLws = bestLws;

		if (devParam[jtrUniqDevId].devLws > lwsLimit)
			devParam[jtrUniqDevId].devLws = lwsLimit;

		MEM_FREE(hostDccHashes);
		MEM_FREE(hostDcc2Hashes);
		MEM_FREE(hostSalt);
	}

	/* With the best LWS known, rescale GWS once more from the best
	   observed time. */
	if (tuneGws && tuneLws) {
		count = (size_t)((kernelRunMs / minTimeMs) * (long double)count);
		count = GET_NEXT_MULTIPLE(count, gwsRound);
	}

	if (tuneGws) {
		if (count > gwsLimit)
			count = gwsLimit;
		releaseDevObjGws(jtrUniqDevId);
		createDevObjGws(count, jtrUniqDevId);
		devParam[jtrUniqDevId].devGws = count;
	}

	if (!tuneGws && !tuneLws)
		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
	/* Auto tune finish.*/

	/* A user-forced GWS may not be a gwsRound multiple; round it up
	   (capped at gwsLimit) and reallocate. */
	if (devParam[jtrUniqDevId].devGws % gwsRound) {
		devParam[jtrUniqDevId].devGws = GET_NEXT_MULTIPLE(devParam[jtrUniqDevId].devGws, gwsRound);
		releaseDevObjGws(jtrUniqDevId);
		if (devParam[jtrUniqDevId].devGws > gwsLimit)
			devParam[jtrUniqDevId].devGws = gwsLimit;
		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
	}

	if (devParam[jtrUniqDevId].devGws > gwsLimit) {
		releaseDevObjGws(jtrUniqDevId);
		devParam[jtrUniqDevId].devGws = gwsLimit;
		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
	}

	if (options.verbosity > VERB_LEGACY)
		fprintf(stdout, "Device %d GWS: "Zu", LWS: "Zu"\n", jtrUniqDevId,
			devParam[jtrUniqDevId].devGws, devParam[jtrUniqDevId].devLws);

#undef calcMs
	return devParam[jtrUniqDevId].devGws;
}
440
selectDevice(int jtrUniqDevId,struct fmt_main * self)441 size_t selectDevice(int jtrUniqDevId, struct fmt_main *self)
442 {
443 char buildOpts[300];
444
445 sprintf(buildOpts, "-D SALT_BUFFER_SIZE=" Zu, SALT_BUFFER_SIZE);
446 opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevId, buildOpts);
447
448 devParam[jtrUniqDevId].devKernel[0] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_preprocess_short", &ret_code);
449 HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_preprocess_short.");
450
451 devParam[jtrUniqDevId].devKernel[1] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_preprocess_long", &ret_code);
452 HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_preprocess_long.");
453
454 devParam[jtrUniqDevId].devKernel[2] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_iter", &ret_code);
455 HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_iter.");
456
457 devParam[jtrUniqDevId].devKernel[3] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_postprocess", &ret_code);
458 HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_postprocess.");
459
460 createDevObj(jtrUniqDevId);
461
462 maxActiveDevices++;
463
464 return autoTune(jtrUniqDevId, 1000);
465 }
466
/*
 * Compute DCC2 hashes for numKeys candidates, splitting the work across
 * all active devices: each device gets its tuned GWS worth of keys and
 * the last device takes the remainder.  Phases: enqueue on every device,
 * flush all queues, poll each device's completion event, then read the
 * results back into hostDcc2Hashes.
 *
 * Fixes: the _DEBUG fprintf referenced the undefined identifier
 * event_ctr (the global is eventCtr), breaking _DEBUG builds; the
 * clEnqueueReadBuffer error message said "Write :FAILED".
 */
void dcc2Execute(cl_uint *hostDccHashes, cl_uint *hostSha1Hashes, cl_uint *hostSalt, cl_uint saltlen, cl_uint iterCount, cl_uint *hostDcc2Hashes, cl_uint numKeys)
{
	int i;
	unsigned int workPart, workOffset = 0;
	cl_int ret;

#ifdef _DEBUG
	struct timeval startc, endc;
#endif

	eventCtr = 0;
	memset(hostDcc2Hashes, 0, numKeys * sizeof(cl_uint));

	///Divide memory and work
	for (i = 0; i < maxActiveDevices; ++i) {
		/* Last device absorbs the remainder. */
		if (i == maxActiveDevices - 1)
			workPart = numKeys - workOffset;
		else
			workPart = devParam[engaged_devices[i]].devGws;

		/* The remainder can underflow to "negative" when earlier
		   devices already covered numKeys; fall back to one LWS. */
		if ((int)workPart <= 0)
			workPart = devParam[engaged_devices[i]].devLws;
#ifdef _DEBUG
		gettimeofday(&startc, NULL) ;
		fprintf(stderr, "Work Offset:%d Work Part Size:%d Event No:%d",workOffset,workPart,eventCtr);

		if (workPart != devParam[engaged_devices[i]].devGws)
			fprintf(stderr, "Deficit: %d "Zu"\n", engaged_devices[i], devParam[engaged_devices[i]].devGws - workPart);
#endif

		///call to execKernel()
		execKernel(hostDccHashes + 4 * workOffset, hostSha1Hashes + 5 * workOffset, hostSalt, saltlen, iterCount, hostDcc2Hashes + 4 * workOffset, workPart, engaged_devices[i], queue[engaged_devices[i]]);
		workOffset += workPart;

#ifdef _DEBUG
		gettimeofday(&endc, NULL);
		fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
	}

	///Synchronize all kernels
	for (i = maxActiveDevices - 1; i >= 0; --i)
		HANDLE_CLERROR(clFlush(queue[engaged_devices[i]]), "Flush Error");

	/* Busy-wait on each device's postprocess event until complete. */
	for (i = 0; i < maxActiveDevices; ++i) {
		while (1) {
			HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
			if ((ret) == CL_COMPLETE)
				break;
#ifdef _DEBUG
			printf("%d%d ", ret, i);
#endif
		}
	}

	eventCtr = workPart = workOffset = 0;

	///Read results back from all kernels
	for (i = 0; i < maxActiveDevices; ++i) {
		/* Mirror the split used when enqueuing above. */
		if (i == maxActiveDevices - 1)
			workPart = numKeys - workOffset;
		else
			workPart = devParam[engaged_devices[i]].devGws;

		if ((int)workPart <= 0)
			workPart = devParam[engaged_devices[i]].devLws;

#ifdef _DEBUG
		gettimeofday(&startc, NULL) ;
		fprintf(stderr, "Work Offset:%d Work Part Size:%d Event No:%d",workOffset,workPart,eventCtr);
#endif

		///Read results back from device
		HANDLE_CLERROR(clEnqueueReadBuffer(queue[engaged_devices[i]],
			devBuffer[engaged_devices[i]].bufferDcc2Hashes,
			CL_FALSE, 0,
			4 * workPart * sizeof(cl_uint),
			hostDcc2Hashes + 4 * workOffset,
			0,
			NULL,
			NULL), "Read :FAILED");
		workOffset += workPart;

#ifdef _DEBUG
		gettimeofday(&endc, NULL);
		fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
#endif
		HANDLE_CLERROR(clReleaseEvent(events[i]), "Error releasing events[i].");
	}

	for (i = 0; i < maxActiveDevices; ++i)
		HANDLE_CLERROR(clFinish(queue[engaged_devices[i]]), "Finish Error");

}
562
563 #endif
564