1 /*
2  * This software is Copyright (c) 2015 Sayantan Datta <stdatta at openwall dot com>
3  * and it is hereby released to the general public under the following terms:
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted.
6  */
7 #ifdef HAVE_OPENCL
8 
9 #include <sys/time.h>
10 
11 #include "opencl_mscash2_helper_plug.h"
12 #include "options.h"
13 
14 #define PADDING				1024
15 
16 typedef struct {
17 	unsigned int 	istate[5];
18 	unsigned int 	ostate[5];
19 	unsigned int 	buf[5];
20 	unsigned int 	out[4];
21 } devIterTempSz;
22 
23 typedef struct {
24 	cl_mem bufferDccHashes;
25 	cl_mem bufferSha1Hashes;
26 	cl_mem bufferSalt;
27 	cl_mem bufferDcc2Hashes;
28 	cl_mem bufferIterTemp;
29 } deviceBuffer;
30 
31 typedef struct {
32 	cl_kernel devKernel[4];
33 	size_t devLws;
34 	size_t devGws;
35 	unsigned int devInUse;
36 } deviceParam;
37 
38 static deviceBuffer *devBuffer = NULL;
39 static deviceParam *devParam = NULL;
40 static cl_event *events = NULL;
41 static unsigned int eventCtr = 0;
42 static unsigned int maxActiveDevices = 0;
43 
initNumDevices(void)44 void initNumDevices(void)
45 {
46 	devBuffer = (deviceBuffer *) mem_calloc(MAX_GPU_DEVICES, sizeof(deviceBuffer));
47 	devParam = (deviceParam *) mem_calloc(MAX_GPU_DEVICES, sizeof(deviceParam));
48 	events = (cl_event *) mem_alloc(MAX_GPU_DEVICES * sizeof(cl_event));
49 }
50 
createDevObjGws(size_t gws,int jtrUniqDevId)51 static void createDevObjGws(size_t gws, int jtrUniqDevId)
52 {
53 	devBuffer[jtrUniqDevId].bufferDccHashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_ONLY, 4 * (gws + PADDING) * sizeof(cl_uint), NULL, &ret_code);
54 	HANDLE_CLERROR(ret_code, "Failed allocating bufferDccHashes.");
55 
56 	devBuffer[jtrUniqDevId].bufferDcc2Hashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_WRITE_ONLY, 4 * (gws + PADDING) * sizeof(cl_uint), NULL, &ret_code);
57 	HANDLE_CLERROR(ret_code, "Failed allocating bufferDcc2Hashes.");
58 
59 	devBuffer[jtrUniqDevId].bufferIterTemp = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_WRITE, (gws + PADDING) * sizeof(devIterTempSz), NULL, &ret_code);
60 	HANDLE_CLERROR(ret_code, "Failed allocating bufferIterTemp.");
61 
62 	devBuffer[jtrUniqDevId].bufferSha1Hashes = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_WRITE, 5 * (gws + PADDING) * sizeof(cl_uint), NULL, &ret_code);
63 	HANDLE_CLERROR(ret_code, "Failed allocating bufferSha1Hashes.");
64 
65 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 0, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferDccHashes), "Set Kernel 0 Arg 0 :FAILED");
66 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 3, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferIterTemp), "Set Kernel 0 Arg 3 :FAILED");
67 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[1], 0, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferDccHashes), "Set Kernel 1 Arg 0 :FAILED");
68 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[1], 1, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferIterTemp), "Set Kernel 1 Arg 1 :FAILED");
69 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[1], 2, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferSha1Hashes), "Set Kernel 1 Arg 2 :FAILED");
70 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[2], 0, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferIterTemp), "Set Kernel 2 Arg 0 :FAILED");
71 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[3], 0, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferIterTemp), "Set Kernel 3 Arg 0 :FAILED");
72 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[3], 1, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferDcc2Hashes), "Set Kernel 3 Arg 1 :FAILED");
73 }
74 
releaseDevObjGws(int jtrUniqDevId)75 static void releaseDevObjGws(int jtrUniqDevId)
76 {
77 	if (devBuffer[jtrUniqDevId].bufferIterTemp) {
78 		HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferDccHashes), "Failed releasing bufferDccHashes.");
79 		HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferDcc2Hashes), "Failed releasing bufferDcc2Hashes.");
80 		HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferIterTemp), "Failed releasing bufferIterTemp.");
81 		HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferSha1Hashes), "Failed releasing bufferSha1Hashes.");
82 		devBuffer[jtrUniqDevId].bufferIterTemp = 0;
83 	}
84 }
85 
createDevObj(int jtrUniqDevId)86 static void createDevObj(int jtrUniqDevId)
87 {
88 	devBuffer[jtrUniqDevId].bufferSalt = clCreateBuffer(context[jtrUniqDevId], CL_MEM_READ_ONLY, SALT_BUFFER_SIZE, NULL, &ret_code);
89 	HANDLE_CLERROR(ret_code, "Failed allocating bufferSalt.");
90 
91 	HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 1, sizeof(cl_mem), &devBuffer[jtrUniqDevId].bufferSalt), "Set Kernel 0 Arg 1 :FAILED");
92 }
93 
releaseDevObj(int jtrUniqDevId)94 static void releaseDevObj(int jtrUniqDevId)
95 {
96 	if (devBuffer[jtrUniqDevId].bufferSalt) {
97 		HANDLE_CLERROR(clReleaseMemObject(devBuffer[jtrUniqDevId].bufferSalt), "Failed releasing bufferSalt.");
98 		devBuffer[jtrUniqDevId].bufferSalt = 0;
99 	}
100 }
101 
releaseAll()102 void releaseAll()
103 {
104 	int 	i;
105 
106 	for (i = 0; i < get_number_of_devices_in_use(); i++) {
107 	releaseDevObjGws(engaged_devices[i]);
108 	releaseDevObj(engaged_devices[i]);
109 	if (devParam[engaged_devices[i]].devKernel[0]) {
110 		HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[0]), "Error releasing kernel pbkdf2_preprocess_short");
111 		HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[1]), "Error releasing kernel pbkdf2_preprocess_long");
112 		HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[2]), "Error releasing kernel pbkdf2_iter");
113 		HANDLE_CLERROR(clReleaseKernel(devParam[engaged_devices[i]].devKernel[3]), "Error releasing kernel pbkdf2_postprocess");
114 		HANDLE_CLERROR(clReleaseProgram(program[engaged_devices[i]]), "Error releasing Program");
115 		devParam[engaged_devices[i]].devKernel[0] = 0;
116 		}
117 	 }
118 
119 	 MEM_FREE(events);
120 	 MEM_FREE(devBuffer);
121 	 MEM_FREE(devParam);
122 }
123 
findLwsLimit(int jtrUniqDevId)124 static size_t findLwsLimit(int jtrUniqDevId)
125 {
126 	size_t minLws[4] = { 0 };
127 
128 	minLws[0] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[0]);
129 	minLws[1] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[1]);
130 	minLws[2] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[2]);
131 	minLws[3] = get_kernel_max_lws(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[3]);
132 
133 	if (minLws[0] > minLws[1])
134 		minLws[0] = minLws[1];
135 	if (minLws[2] > minLws[3])
136 		minLws[2] = minLws[3];
137 	if (minLws[0] > minLws[2])
138 		minLws[0] = minLws[2];
139 
140 	return minLws[0];
141 }
142 
preferredLwsSize(int jtrUniqDevId)143 static size_t preferredLwsSize(int jtrUniqDevId)
144 {
145 	size_t minLws[4] = { 0 };
146 
147 	minLws[0] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[0]);
148 	minLws[1] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[1]);
149 	minLws[2] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[2]);
150 	minLws[3] = get_kernel_preferred_multiple(jtrUniqDevId, devParam[jtrUniqDevId].devKernel[3]);
151 
152 	if (minLws[0] > minLws[1])
153 		minLws[0] = minLws[1];
154 	if (minLws[2] > minLws[3])
155 		minLws[2] = minLws[3];
156 	if (minLws[0] > minLws[2])
157 		minLws[0] = minLws[2];
158 
159 	return minLws[0];
160 }
161 
execKernel(cl_uint * hostDccHashes,cl_uint * hostSha1Hashes,cl_uint * hostSalt,cl_uint saltlen,unsigned int iterCount,cl_uint * hostDcc2Hashes,cl_uint keyCount,int jtrUniqDevId,cl_command_queue cmdQueue)162 static void execKernel(cl_uint *hostDccHashes, cl_uint *hostSha1Hashes, cl_uint *hostSalt, cl_uint saltlen, unsigned int iterCount, cl_uint *hostDcc2Hashes, cl_uint keyCount, int jtrUniqDevId, cl_command_queue cmdQueue)
163 {
164 	size_t 		N = keyCount, *M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
165 	unsigned int 	i, itrCntKrnl = ITERATION_COUNT_PER_CALL;
166 
167 	N = devParam[jtrUniqDevId].devLws ? (keyCount + devParam[jtrUniqDevId].devLws - 1) / devParam[jtrUniqDevId].devLws * devParam[jtrUniqDevId].devLws : keyCount;
168 
169 	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferDccHashes, CL_FALSE, 0, 4 * keyCount * sizeof(cl_uint), hostDccHashes, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferDccHashes.");
170 	if (saltlen > 22)
171 		HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferSha1Hashes, CL_FALSE, 0, 5 * keyCount * sizeof(cl_uint), hostSha1Hashes, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferSha1Hashes.");
172 	else
173 	      HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[0], 2, sizeof(cl_uint), &saltlen), "Set Kernel 0 Arg 2 :FAILED");
174 
175 	HANDLE_CLERROR(clEnqueueWriteBuffer(cmdQueue, devBuffer[jtrUniqDevId].bufferSalt, CL_FALSE, 0, SALT_BUFFER_SIZE, hostSalt, 0, NULL, NULL ), "Failed in clEnqueueWriteBuffer bufferSalt.");
176 
177 	if (saltlen < 23)
178 		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[0], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[0].");
179 	else
180 		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[1], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[1].");
181 
182 	for (i = 0; i < iterCount - 1; i += itrCntKrnl ) {
183 		if (i + itrCntKrnl >= iterCount)
184 			itrCntKrnl = iterCount - i - 1;
185 
186 		HANDLE_CLERROR(clSetKernelArg(devParam[jtrUniqDevId].devKernel[2], 1, sizeof(cl_uint), &itrCntKrnl), "Set Kernel 1 Arg 1 :FAILED");
187 
188 		M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
189 		HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[2], 1, NULL, &N, M, 0, NULL, NULL), "Failed in clEnqueueNDRangeKernel devKernel[2].");
190 
191 		opencl_process_event();
192 	}
193 
194 	M = devParam[jtrUniqDevId].devLws ? &devParam[jtrUniqDevId].devLws : NULL;
195 	HANDLE_CLERROR(clEnqueueNDRangeKernel(cmdQueue, devParam[jtrUniqDevId].devKernel[3], 1, NULL, &N, M, 0, NULL, &events[eventCtr]), "Failed in clEnqueueNDRangeKernel devKernel[2].");
196 
197         eventCtr++;
198 }
199 
autoTune(int jtrUniqDevId,long double kernelRunMs)200 static size_t autoTune(int jtrUniqDevId, long double kernelRunMs)
201 {
202 	size_t gwsLimit, gwsInit, gwsRound;
203 	size_t lwsLimit, lwsInit;
204 
205 	struct timeval startc, endc;
206 	long double timeMs = 0, minTimeMs = 0;
207 
208 	size_t pcount, count;
209 
210 	int tuneGws, tuneLws;
211 
212 	cl_uint *hostDccHashes, *hostSalt, *hostDcc2Hashes;
213 
214 	unsigned int i;
215 	unsigned int a = 0xffaabbcc;
216 	unsigned int b = 0xbbccaaee;
217 	unsigned int c = 0xccffbbdd;
218 	unsigned int d = 0xff123456;
219 
220 	gwsLimit = get_max_mem_alloc_size
221 		   (jtrUniqDevId) / sizeof(devIterTempSz);
222 	get_power_of_two(gwsLimit);
223 	if (gwsLimit + PADDING >
224 		get_max_mem_alloc_size
225 		(jtrUniqDevId) / sizeof(devIterTempSz))
226 		gwsLimit >>= 1;
227 
228 	lwsLimit = findLwsLimit(jtrUniqDevId);
229 	lwsInit = preferredLwsSize(jtrUniqDevId);
230 
231 	gwsInit = 1024;
232 	gwsRound = 8192;
233 	if (cpu(device_info[jtrUniqDevId])) {
234 		gwsInit = 256;
235 		gwsRound = 64;
236 		if (lwsLimit > 8)
237 			lwsLimit = 8;
238 		if (lwsInit > 8)
239 			lwsInit = 8;
240 	}
241 
242 	if (gwsInit > gwsLimit)
243 		gwsInit = gwsLimit;
244 	if (gwsInit < lwsInit)
245 		lwsInit = gwsInit;
246 
247 	local_work_size = 0;
248 	global_work_size = 0;
249 	tuneGws = 1;
250 	tuneLws = 1;
251 	opencl_get_user_preferences(FORMAT_LABEL);
252 	if (local_work_size) {
253 		tuneLws = 0;
254 		if (local_work_size & (local_work_size - 1))
255 			get_power_of_two(local_work_size);
256 		if (local_work_size > lwsLimit)
257 			local_work_size = lwsLimit;
258 	}
259 	if (global_work_size)
260 		tuneGws = 0;
261 
262 	devParam[jtrUniqDevId].devLws = local_work_size;
263 	devParam[jtrUniqDevId].devGws = global_work_size;
264 
265 #if 0
266 	 fprintf(stderr, "lwsInit:"Zu" lwsLimit:"Zu""
267 			 " gwsInit:"Zu" gwsLimit:"Zu"\n",
268 			  lwsInit, lwsLimit, gwsInit,
269 			  gwsLimit);
270 #endif
271 	/* Auto tune start.*/
272 	pcount = gwsInit;
273 	count = 0;
274 #define calcMs(start, end)	\
275 		((long double)(end.tv_sec - start.tv_sec) * 1000.000 + \
276 			(long double)(end.tv_usec - start.tv_usec) / 1000.000)
277 	if (tuneGws) {
278 		createDevObjGws(pcount, jtrUniqDevId);
279 		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
280 		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
281 		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
282 		for (i = 0; i < pcount; i++) {
283 			hostDccHashes[i * 4] = a++;
284 			hostDccHashes[i * 4 + 1] = a + b++;
285 			hostDccHashes[i * 4 + 2] = c++;
286 			hostDccHashes[i * 4 + 3] = c + d++;
287 		}
288 		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);
289 
290 		gettimeofday(&startc, NULL);
291 		eventCtr = 0;
292 		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
293 		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
294 		gettimeofday(&endc, NULL);
295 
296 		timeMs = calcMs(startc, endc);
297 		count = (size_t)((kernelRunMs / timeMs) * (long double)gwsInit);
298 		count = GET_NEXT_MULTIPLE(count, gwsRound);
299 
300 		MEM_FREE(hostDccHashes);
301 		MEM_FREE(hostDcc2Hashes);
302 		MEM_FREE(hostSalt);
303 		releaseDevObjGws(jtrUniqDevId);
304 
305 		pcount = count;
306 		createDevObjGws(pcount, jtrUniqDevId);
307 		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
308 		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
309 		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
310 		for (i = 0; i < pcount; i++) {
311 			hostDccHashes[i * 4] = a++;
312 			hostDccHashes[i * 4 + 1] = a + b++;
313 			hostDccHashes[i * 4 + 2] = c++;
314 			hostDccHashes[i * 4 + 3] = c + d++;
315 		}
316 		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);
317 
318 		gettimeofday(&startc, NULL);
319 		eventCtr = 0;
320 		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
321 		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
322 		gettimeofday(&endc, NULL);
323 
324 		timeMs = calcMs(startc, endc);
325 		count = (size_t)((kernelRunMs / timeMs) * (long double)count);
326 		count = GET_NEXT_MULTIPLE(count, gwsRound);
327 
328 		MEM_FREE(hostDccHashes);
329 		MEM_FREE(hostDcc2Hashes);
330 		MEM_FREE(hostSalt);
331 	}
332 
333 	if (tuneGws && tuneLws)
334 		releaseDevObjGws(jtrUniqDevId);
335 
336 	if (tuneLws) {
337 		size_t bestLws;
338 		count = tuneGws ? count : devParam[jtrUniqDevId].devGws;
339 
340 		createDevObjGws(count, jtrUniqDevId);
341 		pcount = count;
342 		hostDccHashes = (cl_uint *) mem_alloc(pcount * sizeof(cl_uint) * 4);
343 		hostDcc2Hashes = (cl_uint *) mem_calloc(pcount * 4, sizeof(cl_uint));
344 		hostSalt = (cl_uint *) mem_alloc(SALT_BUFFER_SIZE);
345 		for (i = 0; i < pcount; i++) {
346 			hostDccHashes[i * 4] = a++;
347 			hostDccHashes[i * 4 + 1] = a + b++;
348 			hostDccHashes[i * 4 + 2] = c++;
349 			hostDccHashes[i * 4 + 3] = c + d++;
350 		}
351 		memset(hostSalt, 0x2B, SALT_BUFFER_SIZE);
352 
353 		devParam[jtrUniqDevId].devLws = lwsInit;
354 
355 		gettimeofday(&startc, NULL);
356 		eventCtr = 0;
357 		execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
358 		HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
359 		gettimeofday(&endc, NULL);
360 
361 		timeMs = calcMs(startc, endc);
362 
363 		minTimeMs = timeMs;
364 		bestLws = devParam[jtrUniqDevId].devLws;
365 
366 		devParam[jtrUniqDevId].devLws = 2 * lwsInit;
367 
368 		while (devParam[jtrUniqDevId].devLws <= lwsLimit) {
369 			for (i = 0; i < pcount; i++) {
370 				hostDccHashes[i * 4] = a++;
371 				hostDccHashes[i * 4 + 1] = a + b++;
372 				hostDccHashes[i * 4 + 2] = c++;
373 				hostDccHashes[i * 4 + 3] = c + d++;
374 			}
375 			gettimeofday(&startc, NULL);
376 			pcount = count;
377 			eventCtr = 0;
378 			execKernel(hostDccHashes, NULL, hostSalt, 20, 10240, hostDcc2Hashes, pcount, jtrUniqDevId, queue[jtrUniqDevId]);
379 			HANDLE_CLERROR(clFinish(queue[jtrUniqDevId]), "Finish Error");
380 			gettimeofday(&endc, NULL);
381 
382 			timeMs = calcMs(startc, endc);
383 
384 			if (minTimeMs > timeMs) {
385 				minTimeMs = timeMs;
386 				bestLws = devParam[jtrUniqDevId].devLws;
387 			}
388 
389 			devParam[jtrUniqDevId].devLws *= 2;
390 		}
391 
392 		devParam[jtrUniqDevId].devLws = bestLws;
393 
394 		if (devParam[jtrUniqDevId].devLws > lwsLimit)
395 			devParam[jtrUniqDevId].devLws = lwsLimit;
396 
397 		MEM_FREE(hostDccHashes);
398 		MEM_FREE(hostDcc2Hashes);
399 		MEM_FREE(hostSalt);
400 	}
401 
402 	if (tuneGws && tuneLws) {
403 		count = (size_t)((kernelRunMs / minTimeMs) * (long double)count);
404 		count = GET_NEXT_MULTIPLE(count, gwsRound);
405 	}
406 
407 	if (tuneGws) {
408 		if (count > gwsLimit)
409 			count = gwsLimit;
410 		releaseDevObjGws(jtrUniqDevId);
411 		createDevObjGws(count, jtrUniqDevId);
412 		devParam[jtrUniqDevId].devGws = count;
413 	}
414 
415 	if (!tuneGws && !tuneLws)
416 		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
417 	/* Auto tune finish.*/
418 
419 	if (devParam[jtrUniqDevId].devGws % gwsRound) {
420 		devParam[jtrUniqDevId].devGws = GET_NEXT_MULTIPLE(devParam[jtrUniqDevId].devGws, gwsRound);
421 		releaseDevObjGws(jtrUniqDevId);
422 		if (devParam[jtrUniqDevId].devGws > gwsLimit)
423 			devParam[jtrUniqDevId].devGws = gwsLimit;
424 		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
425 	}
426 
427 	if (devParam[jtrUniqDevId].devGws > gwsLimit) {
428 		releaseDevObjGws(jtrUniqDevId);
429 		devParam[jtrUniqDevId].devGws = gwsLimit;
430 		createDevObjGws(devParam[jtrUniqDevId].devGws, jtrUniqDevId);
431 	}
432 
433 	if (options.verbosity > VERB_LEGACY)
434 	fprintf(stdout, "Device %d  GWS: "Zu", LWS: "Zu"\n", jtrUniqDevId,
435 			devParam[jtrUniqDevId].devGws, devParam[jtrUniqDevId].devLws);
436 
437 #undef calcMs
438 	return devParam[jtrUniqDevId].devGws;
439 }
440 
selectDevice(int jtrUniqDevId,struct fmt_main * self)441 size_t selectDevice(int jtrUniqDevId, struct fmt_main *self)
442 {
443 	char buildOpts[300];
444 
445 	sprintf(buildOpts, "-D SALT_BUFFER_SIZE=" Zu, SALT_BUFFER_SIZE);
446 	opencl_init("$JOHN/kernels/pbkdf2_kernel.cl", jtrUniqDevId, buildOpts);
447 
448 	devParam[jtrUniqDevId].devKernel[0] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_preprocess_short", &ret_code);
449 	HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_preprocess_short.");
450 
451 	devParam[jtrUniqDevId].devKernel[1] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_preprocess_long", &ret_code);
452 	HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_preprocess_long.");
453 
454 	devParam[jtrUniqDevId].devKernel[2] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_iter", &ret_code);
455 	HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_iter.");
456 
457 	devParam[jtrUniqDevId].devKernel[3] = clCreateKernel(program[jtrUniqDevId], "pbkdf2_postprocess", &ret_code);
458 	HANDLE_CLERROR(ret_code, "Error creating kernel pbkdf2_postprocess.");
459 
460 	createDevObj(jtrUniqDevId);
461 
462 	maxActiveDevices++;
463 
464 	return autoTune(jtrUniqDevId, 1000);
465 }
466 
dcc2Execute(cl_uint * hostDccHashes,cl_uint * hostSha1Hashes,cl_uint * hostSalt,cl_uint saltlen,cl_uint iterCount,cl_uint * hostDcc2Hashes,cl_uint numKeys)467 void dcc2Execute(cl_uint *hostDccHashes, cl_uint *hostSha1Hashes, cl_uint *hostSalt, cl_uint saltlen, cl_uint iterCount, cl_uint *hostDcc2Hashes, cl_uint numKeys)
468 {
469 	int 		i;
470 	unsigned int 	workPart, workOffset = 0;
471 	cl_int 		ret;
472 
473 #ifdef  _DEBUG
474 	struct timeval startc, endc;
475 #endif
476 
477 	eventCtr = 0;
478 	memset(hostDcc2Hashes, 0, numKeys * sizeof(cl_uint));
479 
480 	///Divide memory and work
481 	for (i = 0; i < maxActiveDevices; ++i) {
482 		if (i == maxActiveDevices - 1)
483 			workPart = numKeys - workOffset;
484 		else
485 			workPart = devParam[engaged_devices[i]].devGws;
486 
487 		if ((int)workPart <= 0)
488 			workPart = devParam[engaged_devices[i]].devLws;
489 #ifdef  _DEBUG
490 		gettimeofday(&startc, NULL) ;
491 		fprintf(stderr, "Work Offset:%d  Work Part Size:%d Event No:%d",workOffset,workPart,event_ctr);
492 
493 		if (workPart != devParam[engaged_devices[i]].devGws)
494 			fprintf(stderr, "Deficit: %d "Zu"\n",  engaged_devices[i], devParam[engaged_devices[i]].devGws - workPart);
495 #endif
496 
497 		///call to execKernel()
498 		execKernel(hostDccHashes + 4 * workOffset, hostSha1Hashes + 5 * workOffset, hostSalt, saltlen, iterCount, hostDcc2Hashes + 4 * workOffset, workPart, engaged_devices[i], queue[engaged_devices[i]]);
499 		workOffset += workPart;
500 
501 #ifdef  _DEBUG
502 		gettimeofday(&endc, NULL);
503 		fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
504 #endif
505 	}
506 
507 	///Synchronize all kernels
508 	for (i = maxActiveDevices - 1; i >= 0; --i)
509 		HANDLE_CLERROR(clFlush(queue[engaged_devices[i]]), "Flush Error");
510 
511 	for (i = 0; i < maxActiveDevices; ++i) {
512 		while (1) {
513 			HANDLE_CLERROR(clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL), "Error in Get Event Info");
514 			if ((ret) == CL_COMPLETE)
515 				break;
516 #ifdef  _DEBUG
517 			 printf("%d%d ", ret, i);
518 #endif
519 		}
520 	}
521 
522 	eventCtr = workPart = workOffset = 0;
523 
524 	///Read results back from all kernels
525 	for (i = 0; i < maxActiveDevices; ++i) {
526 		if (i == maxActiveDevices - 1)
527 			workPart = numKeys - workOffset;
528 
529 		else
530 			workPart = devParam[engaged_devices[i]].devGws;
531 
532 		if ((int)workPart <= 0)
533 			workPart = devParam[engaged_devices[i]].devLws;
534 
535 #ifdef  _DEBUG
536 		gettimeofday(&startc, NULL) ;
537 		fprintf(stderr, "Work Offset:%d  Work Part Size:%d Event No:%d",workOffset,workPart,eventCtr);
538 #endif
539 
540 		///Read results back from device
541 		HANDLE_CLERROR(clEnqueueReadBuffer(queue[engaged_devices[i]],
542 					devBuffer[engaged_devices[i]].bufferDcc2Hashes,
543 						CL_FALSE, 0,
544 						4 * workPart * sizeof(cl_uint),
545 						hostDcc2Hashes + 4 * workOffset,
546 						0,
547 						NULL,
548 						NULL), "Write :FAILED");
549 			workOffset += workPart;
550 
551 #ifdef  _DEBUG
552 			gettimeofday(&endc, NULL);
553 			fprintf(stderr, "GPU enqueue time:%f\n",(endc.tv_sec - startc.tv_sec) + (double)(endc.tv_usec - startc.tv_usec) / 1000000.000) ;
554 #endif
555 			HANDLE_CLERROR(clReleaseEvent(events[i]), "Error releasing events[i].");
556 		}
557 
558 	for (i = 0; i < maxActiveDevices; ++i)
559 		HANDLE_CLERROR(clFinish(queue[engaged_devices[i]]), "Finish Error");
560 
561 }
562 
563 #endif
564