/*
 * This software is Copyright (c) 2015 Sayantan Datta <std2048 at gmail dot com>
 * and it is hereby released to the general public under the following terms:
 * Redistribution and use in source and binary forms, with or without modification, are permitted.
 * Based on Solar Designer's implementation of DES_bs_b.c in jtr-v1.7.9
 */

#if HAVE_OPENCL

#include <string.h>
#include <sys/time.h>

#include "opencl_lm.h"
#include "options.h"
#include "opencl_lm_hst_dev_shared.h"
#include "bt_interface.h"
#include "mask_ext.h"

#define PADDING		2048

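/*
 * Computes r = log2(v) for a power-of-two v, using branchless parallel bit
 * tests; used below to derive REQ_BITMAP_BITS from the selected bitmap size.
 */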
#define get_num_bits(r, v)			\
{						\
	r = (v & 0xAAAAAAAA) != 0;		\
	r |= ((v & 0xFFFF0000) != 0) << 4;	\
	r |= ((v & 0xFF00FF00) != 0) << 3;	\
	r |= ((v & 0xF0F0F0F0) != 0) << 2;	\
	r |= ((v & 0xCCCCCCCC) != 0) << 1;	\
}

static cl_mem buffer_lm_key_idx, buffer_raw_keys, buffer_lm_keys, buffer_int_lm_keys, buffer_int_key_loc, buffer_hash_ids, buffer_bitmap_dupe, buffer_offset_table, buffer_hash_table, buffer_bitmaps;
static unsigned int num_loaded_hashes, *hash_ids = NULL, *zero_buffer = NULL;
static size_t current_gws;
static unsigned int mask_mode;
static unsigned int static_gpu_locations[MASK_FMT_INT_PLHDR];

static unsigned int hash_table_size, offset_table_size;

static int lm_crypt(int *pcount, struct db_salt *salt);

typedef union {
	unsigned char c[8][sizeof(lm_vector)];
	lm_vector v[8];
} key_page;

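/*
 * Bitwise building blocks over lm_vector words, after the DES_bs_b.c
 * bitslice implementation this file is based on.
 */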
#define vxorf(a, b) 					\
	((a) ^ (b))
#define vnot(dst, a) 					\
	(dst) = ~(a)
#define vand(dst, a, b) 				\
	(dst) = (a) & (b)
#define vor(dst, a, b) 					\
	(dst) = (a) | (b)
#define vandn(dst, a, b) 				\
	(dst) = (a) & ~(b)
#define vxor(dst, a, b) 				\
	(dst) = vxorf((a), (b))
#define vshl(dst, src, shift) 				\
	(dst) = (src) << (shift)
#define vshr(dst, src, shift) 				\
	(dst) = (src) >> (shift)
#define vshl1(dst, src) 				\
	vshl((dst), (src), 1)

#define kvtype vtype
#define kvand vand
#define kvor vor
#define kvshl1 vshl1
#define kvshl vshl
#define kvshr vshr

#define mask01 0x01010101
#define mask02 0x02020202
#define mask04 0x04040404
#define mask08 0x08080808
#define mask10 0x10101010
#define mask20 0x20202020
#define mask40 0x40404040
#define mask80 0x80808080

#define kvand_shl1_or(dst, src, mask) 			\
	kvand(tmp, src, mask); 				\
	kvshl1(tmp, tmp); 				\
	kvor(dst, dst, tmp)

#define kvand_shl_or(dst, src, mask, shift) 		\
	kvand(tmp, src, mask); 				\
	kvshl(tmp, tmp, shift); 			\
	kvor(dst, dst, tmp)

#define kvand_shl1(dst, src, mask) 			\
	kvand(tmp, src, mask);				\
	kvshl1(dst, tmp)

#define kvand_or(dst, src, mask) 			\
	kvand(tmp, src, mask); 				\
	kvor(dst, dst, tmp)

#define kvand_shr_or(dst, src, mask, shift)		\
	kvand(tmp, src, mask); 				\
	kvshr(tmp, tmp, shift); 			\
	kvor(dst, dst, tmp)

#define kvand_shr(dst, src, mask, shift) 		\
	kvand(tmp, src, mask); 				\
	kvshr(dst, tmp, shift)

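/* Loads the eight vector words of one key page into v0..v7. */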
#define LOAD_V 						\
	kvtype v0 = *(kvtype *)&vp[0]; 	\
	kvtype v1 = *(kvtype *)&vp[1]; 	\
	kvtype v2 = *(kvtype *)&vp[2]; 	\
	kvtype v3 = *(kvtype *)&vp[3]; 	\
	kvtype v4 = *(kvtype *)&vp[4]; 	\
	kvtype v5 = *(kvtype *)&vp[5]; 	\
	kvtype v6 = *(kvtype *)&vp[6]; 	\
	kvtype v7 = *(kvtype *)&vp[7];

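/*
 * Each FINALIZE_NEXT_KEY_BIT_ng macro gathers bit n of v0..v7 (per byte
 * lane, via the repeated-byte masks above) into a single word appended to
 * kp; the eight macros together transpose an 8x8 bit matrix in every byte
 * lane, converting key bytes to bitsliced form.
 */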
#define FINALIZE_NEXT_KEY_BIT_0g { 			\
	kvtype m = mask01, va, vb, tmp; 		\
	kvand(va, v0, m); 				\
	kvand_shl1(vb, v1, m); 				\
	kvand_shl_or(va, v2, m, 2); 			\
	kvand_shl_or(vb, v3, m, 3); 			\
	kvand_shl_or(va, v4, m, 4); 			\
	kvand_shl_or(vb, v5, m, 5); 			\
	kvand_shl_or(va, v6, m, 6); 			\
	kvand_shl_or(vb, v7, m, 7); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_1g { 			\
	kvtype m = mask02, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 1); 			\
	kvand(vb, v1, m); 				\
	kvand_shl1_or(va, v2, m); 			\
	kvand_shl_or(vb, v3, m, 2); 			\
	kvand_shl_or(va, v4, m, 3); 			\
	kvand_shl_or(vb, v5, m, 4); 			\
	kvand_shl_or(va, v6, m, 5); 			\
	kvand_shl_or(vb, v7, m, 6); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_2g { 			\
	kvtype m = mask04, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 2); 			\
	kvand_shr(vb, v1, m, 1); 			\
	kvand_or(va, v2, m); 				\
	kvand_shl1_or(vb, v3, m); 			\
	kvand_shl_or(va, v4, m, 2); 			\
	kvand_shl_or(vb, v5, m, 3); 			\
	kvand_shl_or(va, v6, m, 4); 			\
	kvand_shl_or(vb, v7, m, 5); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_3g { 			\
	kvtype m = mask08, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 3); 			\
	kvand_shr(vb, v1, m, 2); 			\
	kvand_shr_or(va, v2, m, 1); 			\
	kvand_or(vb, v3, m); 				\
	kvand_shl1_or(va, v4, m); 			\
	kvand_shl_or(vb, v5, m, 2); 			\
	kvand_shl_or(va, v6, m, 3); 			\
	kvand_shl_or(vb, v7, m, 4); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_4g { 			\
	kvtype m = mask10, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 4); 			\
	kvand_shr(vb, v1, m, 3); 			\
	kvand_shr_or(va, v2, m, 2); 			\
	kvand_shr_or(vb, v3, m, 1); 			\
	kvand_or(va, v4, m); 				\
	kvand_shl1_or(vb, v5, m); 			\
	kvand_shl_or(va, v6, m, 2); 			\
	kvand_shl_or(vb, v7, m, 3); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_5g { 			\
	kvtype m = mask20, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 5); 			\
	kvand_shr(vb, v1, m, 4); 			\
	kvand_shr_or(va, v2, m, 3); 			\
	kvand_shr_or(vb, v3, m, 2); 			\
	kvand_shr_or(va, v4, m, 1); 			\
	kvand_or(vb, v5, m); 				\
	kvand_shl1_or(va, v6, m); 			\
	kvand_shl_or(vb, v7, m, 2); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_6g { 			\
	kvtype m = mask40, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 6); 			\
	kvand_shr(vb, v1, m, 5); 			\
	kvand_shr_or(va, v2, m, 4); 			\
	kvand_shr_or(vb, v3, m, 3); 			\
	kvand_shr_or(va, v4, m, 2); 			\
	kvand_shr_or(vb, v5, m, 1); 			\
	kvand_or(va, v6, m); 				\
	kvand_shl1_or(vb, v7, m); 			\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

#define FINALIZE_NEXT_KEY_BIT_7g { 			\
	kvtype m = mask80, va, vb, tmp; 		\
	kvand_shr(va, v0, m, 7); 			\
	kvand_shr(vb, v1, m, 6); 			\
	kvand_shr_or(va, v2, m, 5); 			\
	kvand_shr_or(vb, v3, m, 4); 			\
	kvand_shr_or(va, v4, m, 3); 			\
	kvand_shr_or(vb, v5, m, 2); 			\
	kvand_shr_or(va, v6, m, 1); 			\
	kvand_or(vb, v7, m); 				\
	kvor(kp[0], va, vb); 				\
	kp += 1;					\
}

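/*
 * Transposes the bytes of the GPU-side mask's integer candidates into
 * bitsliced pages (LM_DEPTH keys deep) and uploads one page per active mask
 * placeholder to buffer_int_lm_keys.
 */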
static void lm_finalize_int_keys(void)
{
	key_page *int_key_page[MASK_FMT_INT_PLHDR];
	unsigned int *final_key_pages[MASK_FMT_INT_PLHDR], i, j;

	for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
		int_key_page[i] = (key_page *) mem_alloc(((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(key_page));
		final_key_pages[i] = (unsigned int *) mem_alloc(8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int));
		memset(int_key_page[i], 0x7f, ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(key_page));
		memset(final_key_pages[i], 0xff, 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int));
	}

	for (i = 0; i < mask_int_cand.num_int_cand && mask_int_cand.int_cand; i++) {
		j = i >> LM_LOG_DEPTH;
		int_key_page[0][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[0] & 0xFF];
#if MASK_FMT_INT_PLHDR > 1
		if (mask_skip_ranges[1] != -1)
			int_key_page[1][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[1] & 0xFF];
#endif
#if MASK_FMT_INT_PLHDR > 2
		if (mask_skip_ranges[2] != -1)
			int_key_page[2][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[2] & 0xFF];
#endif
#if MASK_FMT_INT_PLHDR > 3
		if (mask_skip_ranges[3] != -1)
			int_key_page[3][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[3] & 0xFF];
#endif
	}

	for (j = 0; j < MASK_FMT_INT_PLHDR; j++) {
		if (mask_skip_ranges == NULL || mask_skip_ranges[j] == -1)
			continue;
		for (i = 0; i < ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH); i++) {
			lm_vector *kp = (lm_vector *)&final_key_pages[j][8 * i];
			lm_vector *vp = (lm_vector *)&int_key_page[j][i].v[0];
			LOAD_V
			FINALIZE_NEXT_KEY_BIT_0g
			FINALIZE_NEXT_KEY_BIT_1g
			FINALIZE_NEXT_KEY_BIT_2g
			FINALIZE_NEXT_KEY_BIT_3g
			FINALIZE_NEXT_KEY_BIT_4g
			FINALIZE_NEXT_KEY_BIT_5g
			FINALIZE_NEXT_KEY_BIT_6g
			FINALIZE_NEXT_KEY_BIT_7g
		}

		HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_lm_keys, CL_TRUE, j * 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int),
				8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int), final_key_pages[j], 0, NULL, NULL), "Failed to copy data to GPU.");
	}

	for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
		MEM_FREE(int_key_page[i]);
		MEM_FREE(final_key_pages[i]);
	}
}

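/*
 * Allocates the per-GWS host arrays and device buffers. Everything is
 * oversized by PADDING entries, presumably to leave headroom for rounding
 * the global work size up to a multiple of the local work size.
 */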
static void create_buffer_gws(size_t gws)
{
	unsigned int i;

	opencl_lm_all = (opencl_lm_combined*) mem_alloc((gws + PADDING) * sizeof(opencl_lm_combined));
	opencl_lm_keys = (opencl_lm_transfer*) mem_alloc((gws + PADDING) * sizeof(opencl_lm_transfer));
	opencl_lm_int_key_loc = (unsigned int*) mem_calloc((gws + PADDING), sizeof(unsigned int));

	memset(opencl_lm_keys, 0x6f, (gws + PADDING) * sizeof(opencl_lm_transfer));

	buffer_raw_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, (gws + PADDING) * sizeof(opencl_lm_transfer), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_raw_keys.");

	buffer_lm_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (gws + PADDING) * sizeof(lm_vector) * 56, NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_lm_keys.");

	buffer_int_key_loc = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (gws + PADDING) * sizeof(unsigned int), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_int_key_loc.");

	for (i = 0; i < (gws + PADDING); i++)
		opencl_lm_init(i);
}

static void set_kernel_args_gws(void)
{
	size_t static_param_size = 101;
	char *kernel_name = (char *) mem_calloc(static_param_size, sizeof(char));
	cl_uint num_args;

	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(cl_mem), &buffer_raw_keys), "Failed setting kernel argument buffer_raw_keys, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(cl_mem), &buffer_int_key_loc), "Failed setting kernel argument buffer_int_key_loc, kernel 0.");

	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_FUNCTION_NAME, static_param_size - 1, kernel_name, NULL), "Failed to query kernel name.");
	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &num_args, NULL), "Failed to query kernel num args.");

	if (!strncmp(kernel_name, "lm_bs_b", 7) && num_args == 10)
		HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem), &buffer_lm_keys), "Failed setting kernel argument buffer_lm_keys, kernel lm_bs_b.");

	if (!strncmp(kernel_name, "lm_bs_f", 7) && num_args == 9)
		HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem), &buffer_lm_keys), "Failed setting kernel argument buffer_lm_keys, kernel lm_bs_f.");

	MEM_FREE(kernel_name);
}

static void release_buffer_gws(void)
{
	if (opencl_lm_all) {
		MEM_FREE(opencl_lm_all);
		MEM_FREE(opencl_lm_keys);
		MEM_FREE(opencl_lm_int_key_loc);
		HANDLE_CLERROR(clReleaseMemObject(buffer_raw_keys), "Error releasing buffer_raw_keys.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_lm_keys), "Error releasing buffer_lm_keys.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_int_key_loc), "Error releasing buffer_int_key_loc.");
	}
}

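/*
 * Creates the comparison-related device buffers: the key index table, the
 * bitsliced mask keys, the perfect-hash offset and hash tables, the bitmaps,
 * the cracked-hash id list and its duplicate-suppression bitmap; finally
 * uploads the finalized mask keys.
 */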
static void create_buffer(unsigned int num_loaded_hashes, OFFSET_TABLE_WORD *offset_table, unsigned int ot_size, unsigned int ht_size, unsigned int *bitmaps, unsigned int bmp_size_bits)
{
	unsigned int active_placeholders, i;

	hash_ids = (unsigned int *) mem_calloc(3 * num_loaded_hashes + 1, sizeof(unsigned int));
	zero_buffer = (unsigned int *) mem_calloc(((ht_size - 1) / 32 + 1), sizeof(unsigned int));

	opencl_lm_init_index();

	active_placeholders = 1;
	if (mask_skip_ranges)
		for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
			if (mask_skip_ranges[i] != -1)
				active_placeholders++;
		}

	buffer_lm_key_idx = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 768 * sizeof(unsigned int), opencl_lm_index768, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_lm_key_idx.");

	buffer_int_lm_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, active_placeholders * 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_int_lm_keys.");

	buffer_offset_table = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ot_size * sizeof(OFFSET_TABLE_WORD), offset_table, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_offset_table.");

	buffer_hash_table = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ht_size * sizeof(unsigned int) * 2, hash_table_64, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_hash_table.");

	buffer_bitmaps = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bmp_size_bits >> 3, bitmaps, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_bitmaps.");

	buffer_hash_ids = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (3 * num_loaded_hashes + 1) * sizeof(unsigned int), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_hash_ids.");

	buffer_bitmap_dupe = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, ((ht_size - 1) / 32 + 1) * sizeof(unsigned int), zero_buffer, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_bitmap_dupe.");

	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_hash_ids.");

	lm_finalize_int_keys();
}

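/*
 * Sets the remaining kernel arguments. The starting index depends on the
 * kernel variant: lm_bs_b with 10 arguments and lm_bs_f with 9 take an extra
 * buffer_lm_keys argument (set in set_kernel_args_gws()), which shifts the
 * positions used below by one.
 */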
static void set_kernel_args(void)
{
	size_t static_param_size = 101;
	unsigned int ctr = 2;
	char *kernel_name = (char *) mem_calloc(static_param_size, sizeof(char));
	cl_uint num_args;

	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_FUNCTION_NAME, static_param_size - 1, kernel_name, NULL), "Failed to query kernel name.");
	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &num_args, NULL), "Failed to query kernel num args.");

	if (!strncmp(kernel_name, "lm_bs_b", 7)) {
		if (num_args == 10)
			ctr++;
		HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_lm_key_idx), "Failed setting kernel argument buffer_lm_key_idx, kernel 0.");
	}
	if (!strncmp(kernel_name, "lm_bs_f", 7) && num_args == 9)
		ctr++;

	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_int_lm_keys), "Failed setting kernel argument buffer_int_lm_keys, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_offset_table), "Failed setting kernel argument buffer_offset_table, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_hash_table), "Failed setting kernel argument buffer_hash_table, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_bitmaps), "Failed setting kernel argument buffer_bitmaps, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_hash_ids), "Failed setting kernel argument buffer_hash_ids, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_bitmap_dupe), "Failed setting kernel argument buffer_bitmap_dupe, kernel 0.");

	MEM_FREE(kernel_name);
}

static void release_buffer(void)
{
	if (buffer_bitmap_dupe) {
		MEM_FREE(hash_ids);
		MEM_FREE(zero_buffer);
		HANDLE_CLERROR(clReleaseMemObject(buffer_lm_key_idx), "Error releasing buffer_lm_key_idx.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_hash_ids), "Error releasing buffer_hash_ids.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_offset_table), "Error releasing buffer_offset_table.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_hash_table), "Error releasing buffer_hash_table.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_bitmaps), "Error releasing buffer_bitmaps.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_bitmap_dupe), "Error releasing buffer_bitmap_dupe.");
		HANDLE_CLERROR(clReleaseMemObject(buffer_int_lm_keys), "Error releasing buffer_int_lm_keys.");
		buffer_bitmap_dupe = 0;
	}
}

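/*
 * Builds the fully unrolled (lm_bs_f) or basic (lm_bs_b) kernel with the
 * given options; when use_last_build_opt is set, the options remembered from
 * the previous build are reused instead.
 */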
static void init_kernels(char *bitmap_params, unsigned int full_unroll, size_t s_mem_lws, unsigned int use_local_mem, unsigned int use_last_build_opt)
{
	static unsigned int last_build_opts[3];
	char build_opts[500];
	cl_ulong const_cache_size;
	unsigned int i;

	for (i = 0; i < MASK_FMT_INT_PLHDR; i++)
		if (mask_skip_ranges && mask_skip_ranges[i] != -1)
			static_gpu_locations[i] = mask_int_cand.int_cpu_mask_ctx->
				ranges[mask_skip_ranges[i]].pos;
		else
			static_gpu_locations[i] = -1;

	HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &const_cache_size, 0), "failed to get CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE.");

	if (!use_last_build_opt) {
		sprintf(build_opts, "-D FULL_UNROLL=%u -D USE_LOCAL_MEM=%u -D WORK_GROUP_SIZE="Zu""
		" -D OFFSET_TABLE_SIZE=%u -D HASH_TABLE_SIZE=%u -D MASK_ENABLE=%u -D ITER_COUNT=%u -D LOC_0=%d"
#if MASK_FMT_INT_PLHDR > 1
		" -D LOC_1=%d "
#endif
#if MASK_FMT_INT_PLHDR > 2
		"-D LOC_2=%d "
#endif
#if MASK_FMT_INT_PLHDR > 3
		"-D LOC_3=%d"
#endif
		" -D IS_STATIC_GPU_MASK=%d -D CONST_CACHE_SIZE=%llu %s",
		full_unroll, use_local_mem, s_mem_lws, offset_table_size, hash_table_size, mask_mode,
		((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH), static_gpu_locations[0]
#if MASK_FMT_INT_PLHDR > 1
		, static_gpu_locations[1]
#endif
#if MASK_FMT_INT_PLHDR > 2
		, static_gpu_locations[2]
#endif
#if MASK_FMT_INT_PLHDR > 3
		, static_gpu_locations[3]
#endif
		, mask_gpu_is_static, (unsigned long long)const_cache_size, bitmap_params);

		last_build_opts[0] = full_unroll;
		last_build_opts[1] = use_local_mem;
		last_build_opts[2] = s_mem_lws;
	}
	else {
		sprintf(build_opts, "-cl-kernel-arg-info -D FULL_UNROLL=%u -D USE_LOCAL_MEM=%u -D WORK_GROUP_SIZE="Zu""
		" -D OFFSET_TABLE_SIZE=%u -D HASH_TABLE_SIZE=%u -D MASK_ENABLE=%u -D ITER_COUNT=%u -D LOC_0=%d"
#if MASK_FMT_INT_PLHDR > 1
		" -D LOC_1=%d "
#endif
#if MASK_FMT_INT_PLHDR > 2
		"-D LOC_2=%d "
#endif
#if MASK_FMT_INT_PLHDR > 3
		"-D LOC_3=%d"
#endif
		" -D IS_STATIC_GPU_MASK=%d -D CONST_CACHE_SIZE=%llu %s",
		last_build_opts[0], last_build_opts[1], (size_t)last_build_opts[2], offset_table_size, hash_table_size, mask_mode,
		((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH), static_gpu_locations[0]
#if MASK_FMT_INT_PLHDR > 1
		, static_gpu_locations[1]
#endif
#if MASK_FMT_INT_PLHDR > 2
		, static_gpu_locations[2]
#endif
#if MASK_FMT_INT_PLHDR > 3
		, static_gpu_locations[3]
#endif
		, mask_gpu_is_static, (unsigned long long)const_cache_size, bitmap_params);
	}

	if (use_last_build_opt ? last_build_opts[0] : full_unroll)
		opencl_build_kernel("$JOHN/kernels/lm_kernel_f.cl",
		                    gpu_id, build_opts, 0);
	else
		opencl_build_kernel("$JOHN/kernels/lm_kernel_b.cl",
		                    gpu_id, build_opts, 0);

	if (use_last_build_opt ? last_build_opts[0] : full_unroll) {
		crypt_kernel = clCreateKernel(program[gpu_id], "lm_bs_f", &ret_code);
		HANDLE_CLERROR(ret_code, "Failed creating kernel lm_bs_f.");
	}
	else {
		crypt_kernel = clCreateKernel(program[gpu_id], "lm_bs_b", &ret_code);
		HANDLE_CLERROR(ret_code, "Failed creating kernel lm_bs_b.");
	}
}

static void release_kernels(void)
{
	if (crypt_kernel) {
		HANDLE_CLERROR(clReleaseKernel(crypt_kernel), "Error releasing kernel 0");
		crypt_kernel = 0;
	}
}

static void clean_all_buffers(void)
{
	release_buffer_gws();
	release_buffer();
	release_kernels();
	MEM_FREE(hash_table_64);
	if (program[gpu_id]) {
		HANDLE_CLERROR(clReleaseProgram(program[gpu_id]),
			"Error releasing Program");
		program[gpu_id] = 0;
	}
}

/* A return value of 0x800000 means there is no restriction on LWS due to local memory limitations. */
/* A return value of 0 means local memory shouldn't be allocated. */
static size_t find_smem_lws_limit(unsigned int full_unroll, unsigned int use_local_mem, unsigned int force_global_keys)
{
	cl_ulong s_mem_sz = get_local_memory_size(gpu_id);
	size_t expected_lws_limit;
	cl_uint warp_size;

	if (force_global_keys) {
		if (s_mem_sz > 768 * sizeof(cl_short) || full_unroll)
			return 0x800000;
		else
			return 0;
	}

	if (!s_mem_sz)
		return 0;

	if (gpu_amd(device_info[gpu_id])) {
		if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WAVEFRONT_WIDTH_AMD,
		                    sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
			warp_size = 64;
	}
	else if (gpu_nvidia(device_info[gpu_id])) {
		if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WARP_SIZE_NV,
		                    sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
			warp_size = 32;
	}
	else
		return 0;

	if (full_unroll || !use_local_mem) {
		expected_lws_limit = s_mem_sz /
				(sizeof(lm_vector) * 56);
		if (!expected_lws_limit)
			return 0;
		expected_lws_limit = GET_MULTIPLE_OR_ZERO(
				expected_lws_limit, warp_size);
	}
	else {
		if (s_mem_sz > 768 * sizeof(cl_short)) {
			s_mem_sz -= 768 * sizeof(cl_short);
			expected_lws_limit = s_mem_sz /
					(sizeof(lm_vector) * 56);
			if (!expected_lws_limit)
				return 0x800000;
			expected_lws_limit = GET_MULTIPLE_OR_ZERO(
				expected_lws_limit, warp_size);
		}
		else
			return 0;
	}

	if (warp_size == 1 && expected_lws_limit & (expected_lws_limit - 1)) {
		get_power_of_two(expected_lws_limit);
		expected_lws_limit >>= 1;
	}
	return expected_lws_limit;
}

#define calc_ms(start, end)	\
		((long double)(end.tv_sec - start.tv_sec) * 1000.000 + \
			(long double)(end.tv_usec - start.tv_usec) / 1000.000)

/* Sets global_work_size and max_keys_per_crypt. */
static void gws_tune(size_t gws_init, long double kernel_run_ms, int gws_tune_flag, struct fmt_main *format, int mask_mode)
{
	unsigned int i;
	char key[PLAINTEXT_LENGTH + 1] = "alterit";

	struct timeval startc, endc;
	long double time_ms = 0;
	int pcount;
	unsigned int lm_log_depth = mask_mode ? 0 : LM_LOG_DEPTH;

	size_t gws_limit = get_max_mem_alloc_size(gpu_id) / sizeof(opencl_lm_transfer);
	if (gws_limit > PADDING)
		gws_limit -= PADDING;

	if (gws_limit & (gws_limit - 1)) {
		get_power_of_two(gws_limit);
		gws_limit >>= 1;
	}

#if SIZEOF_SIZE_T > 4
	/* We can't process more than 4G keys per crypt() */
	while (gws_limit * mask_int_cand.num_int_cand > 0xffffffffUL)
		gws_limit >>= 1;
#endif

	if (gws_tune_flag)
		global_work_size = gws_init;

	if (global_work_size > gws_limit)
		global_work_size = gws_limit;

	if (gws_tune_flag) {
		release_buffer_gws();
		create_buffer_gws(global_work_size);
		set_kernel_args_gws();

		format->methods.clear_keys();
		for (i = 0; i < (global_work_size << lm_log_depth); i++) {
			key[i & 3] = i & 255;
			key[(i & 3) + 3] = i ^ 0x3E;
			format->methods.set_key(key, i);
		}

		gettimeofday(&startc, NULL);
		pcount = (int)(global_work_size << lm_log_depth);
		lm_crypt((int *)&pcount, NULL);
		gettimeofday(&endc, NULL);

		time_ms = calc_ms(startc, endc);
		global_work_size = (size_t)((kernel_run_ms / time_ms) * (long double)global_work_size);
	}

	if (global_work_size < local_work_size)
		global_work_size = local_work_size;

	get_power_of_two(global_work_size);

	if (global_work_size > gws_limit)
		global_work_size = gws_limit;

	release_buffer_gws();
	create_buffer_gws(global_work_size);
	set_kernel_args_gws();

	/* In hash_ids[3*x + 1], 27 bits store the gid and 5 bits the bitslice depth. */
	//assert(global_work_size <= ((1U << 28) - 1));
	fmt_opencl_lm.params.max_keys_per_crypt = global_work_size << lm_log_depth;

	fmt_opencl_lm.params.min_keys_per_crypt =
		opencl_calc_min_kpc(local_work_size, global_work_size,
		                    1 << lm_log_depth);
}

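/*
 * Auto-tunes LWS and GWS. When local memory does not constrain the
 * work-group size (or keys are kept global), the kernel is built once and
 * candidate LWS values are timed directly; otherwise the kernel must be
 * rebuilt for every candidate, since WORK_GROUP_SIZE is baked into the build
 * options.
 */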
static void auto_tune_all(char *bitmap_params, unsigned int num_loaded_hashes, long double kernel_run_ms, struct fmt_main *format, int mask_mode)
{
	unsigned int full_unroll = 0;
	unsigned int use_local_mem = 1;
	unsigned int force_global_keys = 1;
	unsigned int gws_tune_flag = 1;
	unsigned int lws_tune_flag = 1;

	size_t s_mem_limited_lws;

	struct timeval startc, endc;
	long double time_ms = 0;

	char key[PLAINTEXT_LENGTH + 1] = "alterit";

	unsigned int lm_log_depth = mask_mode ? 0 : LM_LOG_DEPTH;

	if (cpu(device_info[gpu_id])) {
		force_global_keys = 1;
		use_local_mem = 0;
		full_unroll = 1;
		kernel_run_ms = 5;
	}
	else if (amd_vliw4(device_info[gpu_id]) || amd_vliw5(device_info[gpu_id]) || gpu_intel(device_info[gpu_id])) {
		force_global_keys = 0;
		use_local_mem = 1;
		full_unroll = 0;
	}
	else if (platform_apple(platform_id) && gpu_nvidia(device_info[gpu_id])) {
		force_global_keys = 1;
		use_local_mem = 0;
		full_unroll = 1;
	}
	else if (gpu(device_info[gpu_id])) {
		force_global_keys = 0;
		use_local_mem = 1;
		full_unroll = 1;
	}
	else {
		force_global_keys = 1;
		use_local_mem = 0;
		full_unroll = 0;
		kernel_run_ms = 40;
	}

	local_work_size = 0;
	global_work_size = 0;
	gws_tune_flag = 1;
	lws_tune_flag = 1;
	opencl_get_user_preferences(FORMAT_LABEL);
	if (global_work_size)
		gws_tune_flag = 0;
	if (local_work_size) {
		lws_tune_flag = 0;
		if (local_work_size & (local_work_size - 1)) {
			get_power_of_two(local_work_size);
		}
	}

	s_mem_limited_lws = find_smem_lws_limit(
			full_unroll, use_local_mem, force_global_keys);
#if 0
	fprintf(stdout, "%s() Limit_smem:"Zu", Full_unroll_flag:%u,"
		"Use_local_mem:%u, Force_global_keys:%u\n",
	        __FUNCTION__,
	        s_mem_limited_lws, full_unroll, use_local_mem,
	        force_global_keys);
#endif

	if (s_mem_limited_lws == 0x800000 || !s_mem_limited_lws) {
		long double best_time_ms;
		size_t best_lws, lws_limit;

		release_kernels();
		init_kernels(bitmap_params, full_unroll, 0, use_local_mem && s_mem_limited_lws, 0);
		set_kernel_args();

		gws_tune(1024, 2 * kernel_run_ms, gws_tune_flag, format, mask_mode);
		gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);

		lws_limit = get_kernel_max_lws(gpu_id, crypt_kernel);

		if (lws_limit > global_work_size)
			lws_limit = global_work_size;

		if (lws_tune_flag) {
			if (gpu(device_info[gpu_id]) && lws_limit >= 32)
				local_work_size = 32;
			else
				local_work_size = get_kernel_preferred_multiple(gpu_id, crypt_kernel);
		}
		if (local_work_size > lws_limit)
			local_work_size = lws_limit;

		if (lws_tune_flag) {
			time_ms = 0;
			best_time_ms = 999999.00;
			best_lws = local_work_size;
			while (local_work_size <= lws_limit &&
				local_work_size <= PADDING) {
				int pcount, i;
				format->methods.clear_keys();
				for (i = 0; i < (global_work_size << lm_log_depth); i++) {
					key[i & 3] = i & 255;
					key[(i & 3) + 3] = i ^ 0x3F;
					format->methods.set_key(key, i);
				}
				gettimeofday(&startc, NULL);
				pcount = (int)(global_work_size << lm_log_depth);
				lm_crypt((int *)&pcount, NULL);
				gettimeofday(&endc, NULL);

				time_ms = calc_ms(startc, endc);

				if (time_ms < best_time_ms) {
					best_lws = local_work_size;
					best_time_ms = time_ms;
				}
#if 0
	fprintf(stdout, "GWS: "Zu", LWS: "Zu", Limit_smem:"Zu", Limit_kernel:"Zu","
		"Current time:%Lf, Best time:%Lf\n",
		global_work_size, local_work_size, s_mem_limited_lws,
		get_kernel_max_lws(gpu_id, crypt_kernel), time_ms,
		best_time_ms);
#endif
				local_work_size *= 2;
			}
			local_work_size = best_lws;
			gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
		}
	}
	else {
		long double best_time_ms;
		size_t best_lws;
		cl_uint warp_size;

		if (gpu_amd(device_info[gpu_id])) {
			if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WAVEFRONT_WIDTH_AMD,
			                    sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
				warp_size = 64;
		}
		else if (gpu_nvidia(device_info[gpu_id])) {
			if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WARP_SIZE_NV,
			                    sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
				warp_size = 32;
		}
		else {
			warp_size = 1;
			fprintf(stderr, "Possible auto_tune failure!\n");
		}
		if (lws_tune_flag)
			local_work_size = warp_size;
		if (local_work_size > s_mem_limited_lws)
			local_work_size = s_mem_limited_lws;

		release_kernels();
		init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);

		if (local_work_size > get_kernel_max_lws(gpu_id, crypt_kernel)) {
			local_work_size = get_kernel_max_lws(gpu_id, crypt_kernel);
			release_kernels();
			init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
		}

		set_kernel_args();
		gws_tune(1024, 2 * kernel_run_ms, gws_tune_flag, format, mask_mode);
		gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);

		if (global_work_size < s_mem_limited_lws) {
			s_mem_limited_lws = global_work_size;
			if (local_work_size > s_mem_limited_lws)
				local_work_size = s_mem_limited_lws;
		}

		if (lws_tune_flag) {
			best_time_ms = 999999.00;
			best_lws = local_work_size;
			while (local_work_size <= s_mem_limited_lws &&
				local_work_size <= PADDING) {
				int pcount, i;

				release_kernels();
				init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
				set_kernel_args();
				set_kernel_args_gws();

				format->methods.clear_keys();
				for (i = 0; i < (global_work_size << lm_log_depth); i++) {
					key[i & 3] = i & 255;
					key[(i & 3) + 3] = i ^ 0x3E;
					format->methods.set_key(key, i);
				}

				gettimeofday(&startc, NULL);
				pcount = (int)(global_work_size << lm_log_depth);
				lm_crypt((int *)&pcount, NULL);
				gettimeofday(&endc, NULL);
				time_ms = calc_ms(startc, endc);

				if (time_ms < best_time_ms &&
				  local_work_size <= get_kernel_max_lws(
				    gpu_id, crypt_kernel)) {
					best_lws = local_work_size;
					best_time_ms = time_ms;
				}
#if 0
	fprintf(stdout, "GWS: "Zu", LWS: "Zu", Limit_smem:"Zu", Limit_kernel:"Zu","
		"Current time:%Lf, Best time:%Lf\n",
		global_work_size, local_work_size, s_mem_limited_lws,
		get_kernel_max_lws(gpu_id, crypt_kernel), time_ms,
		best_time_ms);
#endif
				if (gpu(device_info[gpu_id])) {
					if (local_work_size < 16)
						local_work_size = 16;
					else if (local_work_size < 32)
						local_work_size = 32;
					else if (local_work_size < 64)
						local_work_size = 64;
					else if (local_work_size < 96)
						local_work_size = 96;
					else if (local_work_size < 128)
						local_work_size = 128;
					else
						local_work_size += warp_size;
				}
				else
					local_work_size *= 2;
			}
			local_work_size = best_lws;
			release_kernels();
			init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
			set_kernel_args();
			gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
		}
	}
	if (options.verbosity > VERB_LEGACY)
		fprintf(stdout, "GWS: "Zu", LWS: "Zu"\n",
		        global_work_size, local_work_size);
}

/* Use only for smaller bitmaps, < 16 MB. */
static void prepare_bitmap_2(cl_ulong bmp_sz_bits, cl_uint **bitmaps_ptr, int *loaded_hashes)
{
	unsigned int i;
	MEM_FREE(*bitmaps_ptr);
	*bitmaps_ptr = (cl_uint*) mem_calloc((bmp_sz_bits >> 4), sizeof(cl_uint));

	for (i = 0; i < num_loaded_hashes; i++) {
		unsigned int bmp_idx = loaded_hashes[2 * i + 1] & (bmp_sz_bits - 1);
		(*bitmaps_ptr)[bmp_idx >> 5] |= (1U << (bmp_idx & 31));

		bmp_idx = loaded_hashes[2 * i] & (bmp_sz_bits - 1);
		(*bitmaps_ptr)[(bmp_sz_bits >> 5) + (bmp_idx >> 5)] |=
			(1U << (bmp_idx & 31));
	}
}

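/* Single-step variant: one bitmap, indexed by the second hash word only. */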
static void prepare_bitmap_1(cl_ulong bmp_sz_bits, cl_uint **bitmaps_ptr, int *loaded_hashes)
{
	unsigned int i;
	MEM_FREE(*bitmaps_ptr);
	*bitmaps_ptr = (cl_uint*) mem_calloc((bmp_sz_bits >> 5), sizeof(cl_uint));

	for (i = 0; i < num_loaded_hashes; i++) {
		unsigned int bmp_idx = loaded_hashes[2 * i + 1] & (bmp_sz_bits - 1);
		(*bitmaps_ptr)[bmp_idx >> 5] |= (1U << (bmp_idx & 31));
	}
}

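/*
 * Picks a bitmap size and the number of comparison steps based on the
 * loaded-hash count and the GPU family, fills the bitmap(s), and returns the
 * matching -D defines for the kernel build (in a static buffer).
 */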
static char* select_bitmap(unsigned int num_ld_hashes, int *loaded_hashes, unsigned int *bitmap_size_bits, unsigned int **bitmaps_ptr)
{
	static char kernel_params[200];
	unsigned int cmp_steps = 2, bits_req = 32;

	if (num_ld_hashes <= 5100) {
		if (amd_gcn_10(device_info[gpu_id]) ||
			amd_vliw4(device_info[gpu_id]))
			*bitmap_size_bits = 512 * 1024;
		else
			*bitmap_size_bits = 256 * 1024;
	}
	else if (num_ld_hashes <= 10100) {
		if (amd_gcn_10(device_info[gpu_id]) ||
			amd_vliw4(device_info[gpu_id]))
			*bitmap_size_bits = 512 * 1024;
		else
			*bitmap_size_bits = 256 * 1024;
	}
	else if (num_ld_hashes <= 20100) {
		if (amd_gcn_10(device_info[gpu_id]))
			*bitmap_size_bits = 1024 * 1024;
		else
			*bitmap_size_bits = 512 * 1024;
	}
	else if (num_ld_hashes <= 250100)
		*bitmap_size_bits = 2048 * 1024;
	else if (num_ld_hashes <= 1100100) {
		if (!amd_gcn_11(device_info[gpu_id]))
			*bitmap_size_bits = 4096 * 1024;
		else
			*bitmap_size_bits = 2048 * 1024;
	}
	else if (num_ld_hashes <= 1500100) {
		*bitmap_size_bits = 4096 * 1024 * 2;
		cmp_steps = 1;
	}
	else if (num_ld_hashes <= 2700100) {
		*bitmap_size_bits = 4096 * 1024 * 2 * 2;
		cmp_steps = 1;
	}
	else {
		cl_ulong mult = num_ld_hashes / 2700100;
		cl_ulong buf_sz;
		*bitmap_size_bits = 4096 * 4096;
		get_power_of_two(mult);
		*bitmap_size_bits *= mult;
		buf_sz = get_max_mem_alloc_size(gpu_id);
		if (buf_sz & (buf_sz - 1)) {
			get_power_of_two(buf_sz);
			buf_sz >>= 1;
		}
		if (buf_sz >= 536870912)
			buf_sz = 536870912;
		if (((*bitmap_size_bits) >> 3) > buf_sz)
			*bitmap_size_bits = buf_sz << 3;
		cmp_steps = 1;
	}

	if (cmp_steps == 1)
		prepare_bitmap_1(*bitmap_size_bits, bitmaps_ptr, loaded_hashes);
	else
		prepare_bitmap_2(*bitmap_size_bits, bitmaps_ptr, loaded_hashes);

	get_num_bits(bits_req, (*bitmap_size_bits));

	sprintf(kernel_params,
		"-D SELECT_CMP_STEPS=%u"
		" -D BITMAP_SIZE_BITS_LESS_ONE="LLu" -D REQ_BITMAP_BITS=%u",
		cmp_steps, (unsigned long long)(*bitmap_size_bits) - 1, bits_req);

	*bitmap_size_bits *= cmp_steps;

	return kernel_params;
}

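/*
 * Walks salt->list, unlinking entries whose binary has been removed,
 * collects the remaining 64-bit binaries, and builds the perfect hash table
 * and bitmap parameters for them.
 */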
static char* prepare_table(struct db_salt *salt, OFFSET_TABLE_WORD **offset_table_ptr, unsigned int *bitmap_size_bits, unsigned **bitmaps_ptr)
{
	int *bin, i;
	struct db_password *pw, *last;
	char *bitmap_params;
	int *loaded_hashes;

	num_loaded_hashes = salt->count;
	loaded_hashes = (int *)mem_alloc(num_loaded_hashes * sizeof(int) * 2);

	last = pw = salt->list;
	i = 0;
	do {
		bin = (int *)pw->binary;
		if (bin == NULL) {
			if (last == pw)
				salt->list = pw->next;
			else
				last->next = pw->next;
		} else {
			last = pw;
			loaded_hashes[2 * i] = bin[0];
			loaded_hashes[2 * i + 1] = bin[1];
			i++;
		}
	} while ((pw = pw->next));

	if (i > (salt->count)) {
		fprintf(stderr,
			"Something went wrong while preparing hashes (%d, %d). Exiting.\n", i, salt->count);
		error();
	}

	num_loaded_hashes = create_perfect_hash_table(64, (void *)loaded_hashes,
				num_loaded_hashes,
			        offset_table_ptr,
			        &offset_table_size,
			        &hash_table_size, 0);

	if (!num_loaded_hashes) {
		MEM_FREE(hash_table_64);
		MEM_FREE((*offset_table_ptr));
		fprintf(stderr, "Failed to create hash table for cracking.\n");
		error();
	}

	bitmap_params = select_bitmap(num_loaded_hashes, loaded_hashes, bitmap_size_bits, bitmaps_ptr);
	MEM_FREE(loaded_hashes);

	return bitmap_params;
}

static char *get_key(int index)
{
	get_key_body();
}

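/*
 * Mask-mode get_key(): decodes the candidate's work-item (section),
 * bitslice depth and mask iteration from hash_ids, re-applies the integer
 * candidate's bytes to the stored key, then reads the plaintext back out of
 * the bitsliced key layout.
 */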
static char *get_key_mm(int index)
{
	static char out[PLAINTEXT_LENGTH + 1];
	unsigned int section, depth, iter;
	unsigned char *src, i;
	char *dst;

	if (hash_ids == NULL || hash_ids[0] == 0 ||
	    index > hash_ids[0] || hash_ids[0] > num_loaded_hashes) {
		section = 0;
		depth = 0;
		iter = 0;
	}
	else {
		section = hash_ids[3 * index + 1] / 32;
		depth = hash_ids[3 * index + 1] & 31;
		iter = hash_ids[3 * index + 2];
	}

	if (section > global_work_size) {
		//fprintf(stderr, "Get key error! %u "Zu"\n", section, global_work_size);
		section = 0;
		depth = 0;
		iter = 0;
	}

	if (mask_skip_ranges && mask_int_cand.num_int_cand > 1) {
		for (i = 0; i < MASK_FMT_INT_PLHDR && mask_skip_ranges[i] != -1; i++)
			if (mask_gpu_is_static)
				opencl_lm_keys[section].xkeys.c[static_gpu_locations[i]][depth & 7][depth >> 3] = opencl_lm_u[mask_int_cand.int_cand[iter * 32 + depth].x[i]];
			else
				opencl_lm_keys[section].xkeys.c[(opencl_lm_int_key_loc[section] & (0xff << (i * 8))) >> (i * 8)][depth & 7][depth >> 3] = opencl_lm_u[mask_int_cand.int_cand[iter * 32 + depth].x[i]];
	}

	src = opencl_lm_all[section].pxkeys[depth];
	dst = out;
	while (dst < &out[PLAINTEXT_LENGTH] && (*dst = *src)) {
		src += sizeof(lm_vector) * 8;
		dst++;
	}
	*dst = 0;

	return out;
}

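/*
 * reset() runs in two modes: with a real database it builds the tables for
 * the loaded hashes and enables mask mode if requested; otherwise it does
 * the same for the format's self-test vectors.
 */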
static void reset(struct db_main *db)
{
	if (db->real && db == db->real) {
		struct db_salt *salt;
		unsigned int *bitmaps = NULL;
		OFFSET_TABLE_WORD *offset_table = NULL;
		char *bitmap_params;
		unsigned int bitmap_size_bits = 0;

		release_buffer();
		release_buffer_gws();
		release_kernels();
		MEM_FREE(hash_table_64);

		salt = db->salts;
		bitmap_params = prepare_table(salt, &offset_table, &bitmap_size_bits, &bitmaps);
		create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);

		if (options.flags & FLG_MASK_CHK) {
			mask_mode = 1;
			fmt_opencl_lm.methods.set_key = opencl_lm_set_key_mm;
			fmt_opencl_lm.methods.get_key = get_key_mm;
		}

		auto_tune_all(bitmap_params, num_loaded_hashes, 100, &fmt_opencl_lm, mask_mode);
		MEM_FREE(offset_table);
		MEM_FREE(bitmaps);
	}
	else {
		int i, *binary;
		char *ciphertext, *bitmap_params;
		unsigned int *bitmaps = NULL;
		unsigned int bitmap_size_bits = 0;
		OFFSET_TABLE_WORD *offset_table = NULL;
		int *loaded_hashes;

		num_loaded_hashes = 0;
		while (fmt_opencl_lm.params.tests[num_loaded_hashes].ciphertext)
			num_loaded_hashes++;

		loaded_hashes = (int *) mem_alloc(num_loaded_hashes * sizeof(int) * 2);

		i = 0;
		while (fmt_opencl_lm.params.tests[i].ciphertext) {
			char **fields = fmt_opencl_lm.params.tests[i].fields;
			if (!fields[1])
				fields[1] = fmt_opencl_lm.params.tests[i].ciphertext;
			ciphertext = fmt_opencl_lm.methods.split(fmt_opencl_lm.methods.prepare(fields, &fmt_opencl_lm), 0, &fmt_opencl_lm);
			binary = (int *)fmt_opencl_lm.methods.binary(ciphertext);
			loaded_hashes[2 * i] = binary[0];
			loaded_hashes[2 * i + 1] = binary[1];
			i++;
			//fprintf(stderr, "C:%s B:%d %d %d\n", ciphertext, binary[0], binary[1], i == num_loaded_hashes);
		}

		num_loaded_hashes = create_perfect_hash_table(64, (void *)loaded_hashes,
				num_loaded_hashes,
			        &offset_table,
			        &offset_table_size,
			        &hash_table_size, 0);

		if (!num_loaded_hashes) {
			MEM_FREE(hash_table_64);
			MEM_FREE(offset_table);
			fprintf(stderr, "Failed to create hash table for self-test.\n");
			error();
		}
		bitmap_params = select_bitmap(num_loaded_hashes, loaded_hashes, &bitmap_size_bits, &bitmaps);
		create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);
		auto_tune_all(bitmap_params, num_loaded_hashes, 100, &fmt_opencl_lm, 0);

		MEM_FREE(offset_table);
		MEM_FREE(bitmaps);
		MEM_FREE(loaded_hashes);
		hash_ids[0] = 0;
	}
}

static void init_global_variables(void)
{
	mask_int_cand_target = opencl_speed_index(gpu_id) / 300;
}

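/*
 * crypt_all(): uploads the keys (and, for a dynamic GPU mask, the key
 * locations), runs the kernel and reads back hash_ids. hash_ids[0] holds the
 * number of cracked candidates; each following triplet stores the packed
 * gid*32+depth, the mask iteration, and the hash-table index of the match.
 * If the current salt has noticeably fewer hashes than the tables were built
 * for, the tables are rebuilt first.
 */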
static int lm_crypt(int *pcount, struct db_salt *salt)
{
	const int count = mask_mode ?
		*pcount : (*pcount + LM_DEPTH - 1) >> LM_LOG_DEPTH;
	size_t *lws = local_work_size ? &local_work_size : NULL;
	current_gws = GET_NEXT_MULTIPLE(count, local_work_size);

#if 0
	fprintf(stderr, "pcount %d count %d lws "Zu" gws "Zu" cur_gws "Zu" static: %d\n", *pcount, count, local_work_size, global_work_size, current_gws, mask_gpu_is_static);
#endif
	if (salt != NULL && salt->count > 4500 &&
		(num_loaded_hashes - num_loaded_hashes / 10) > salt->count) {
		char *bitmap_params;
		unsigned int *bitmaps = NULL;
		unsigned int bitmap_size_bits = 0;
		OFFSET_TABLE_WORD *offset_table = NULL;

		release_buffer();
		release_kernels();
		MEM_FREE(hash_table_64);

		bitmap_params = prepare_table(salt, &offset_table, &bitmap_size_bits, &bitmaps);
		create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);

		init_kernels(bitmap_params, 0, 0, 0, 1);

		set_kernel_args();
		set_kernel_args_gws();

		MEM_FREE(offset_table);
		MEM_FREE(bitmaps);
	}

	BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_raw_keys, CL_FALSE, 0, current_gws * sizeof(opencl_lm_transfer), opencl_lm_keys, 0, NULL, NULL), "Failed to copy data to GPU.");

	if (!mask_gpu_is_static)
		BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_key_loc, CL_FALSE, 0, current_gws * sizeof(unsigned int), opencl_lm_int_key_loc, 0, NULL, NULL), "Failed to copy data to GPU.");

	BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL, &current_gws, lws, 0, NULL, NULL), "Failed enqueue kernel lm_bs_*.");
	BENCH_CLERROR(clFinish(queue[gpu_id]), "Kernel failed");

	BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(unsigned int), hash_ids, 0, NULL, NULL), "Failed to read buffer_hash_ids.");

	if (hash_ids[0] > num_loaded_hashes) {
		fprintf(stderr, "Error in crypt_all kernel.\n");
		error();
	}

	if (hash_ids[0]) {
		BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, (3 * hash_ids[0] + 1) * sizeof(unsigned int), hash_ids, 0, NULL, NULL), "Failed to read buffer_hash_ids.");
		BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_bitmap_dupe, CL_TRUE, 0, ((hash_table_size - 1) / 32 + 1) * sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_bitmap_dupe.");
		BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_hash_ids.");
	}

	*pcount *= mask_int_cand.num_int_cand;

	return hash_ids[0];
}

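/*
 * The get_hash_*() methods look up the low 32 bits of the stored hash via
 * the hash-table index recorded in the candidate's hash_ids triplet.
 */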
int opencl_lm_get_hash_0(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_0;
}

int opencl_lm_get_hash_1(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_1;
}

int opencl_lm_get_hash_2(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_2;
}

int opencl_lm_get_hash_3(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_3;
}

int opencl_lm_get_hash_4(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_4;
}

int opencl_lm_get_hash_5(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_5;
}

int opencl_lm_get_hash_6(int index)
{
	return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_6;
}

static int cmp_one(void *binary, int index)
{
	if (((int *)binary)[0] == hash_table_64[hash_ids[3 + 3 * index]])
		return 1;
	return 0;
}

static int cmp_exact(char *source, int index)
{
	int *binary = opencl_lm_get_binary(source + 4);

	if (binary[1] == hash_table_64[hash_ids[3 + 3 * index] + hash_table_size])
		return 1;
	return 0;
}

void opencl_lm_b_register_functions(struct fmt_main *fmt)
{
	fmt->methods.done = &clean_all_buffers;
	fmt->methods.reset = &reset;
	fmt->methods.get_key = &get_key;
	fmt->methods.crypt_all = &lm_crypt;
	fmt->methods.cmp_exact = cmp_exact;
	fmt->methods.cmp_one = cmp_one;
	opencl_lm_init_global_variables = &init_global_variables;
}

#endif /* #if HAVE_OPENCL */