1 /*
2 * This software is Copyright (c) 2015 Sayantan Datta <std2048 at gmail dot com>
3 * and it is hereby released to the general public under the following terms:
4 * Redistribution and use in source and binary forms, with or without modification, are permitted.
5 * Based on Solar Designer implementation of DES_bs_b.c in jtr-v1.7.9
6 */
7
8 #if HAVE_OPENCL
9
10 #include <string.h>
11 #include <sys/time.h>
12
13 #include "opencl_lm.h"
14 #include "options.h"
15 #include "opencl_lm_hst_dev_shared.h"
16 #include "bt_interface.h"
17 #include "mask_ext.h"
18
19 #define PADDING 2048
20
/*
 * get_num_bits(r, v): set r to the bit position of v, i.e. log2(v), when v
 * is a non-zero power of two (each mask test contributes one bit of the
 * result: 0xAAAAAAAA -> bit 0, 0xCCCCCCCC -> bit 1, 0xF0F0F0F0 -> bit 2,
 * 0xFF00FF00 -> bit 3, 0xFFFF0000 -> bit 4).  For a general v the result
 * is not floor(log2(v)) — presumably all call sites pass powers of two;
 * TODO confirm.  NOTE(review): v is evaluated five times and the macro is
 * a bare brace block rather than do { } while (0) — keep arguments
 * side-effect free and don't use it as the body of an unbraced if/else.
 */
#define get_num_bits(r, v)	\
{	\
	r = (v & 0xAAAAAAAA) != 0;	\
	r |= ((v & 0xFFFF0000) != 0) << 4;	\
	r |= ((v & 0xFF00FF00) != 0) << 3;	\
	r |= ((v & 0xF0F0F0F0) != 0) << 2;	\
	r |= ((v & 0xCCCCCCCC) != 0) << 1;	\
}
29
30 static cl_mem buffer_lm_key_idx, buffer_raw_keys, buffer_lm_keys, buffer_int_lm_keys, buffer_int_key_loc, buffer_hash_ids, buffer_bitmap_dupe, buffer_offset_table, buffer_hash_table, buffer_bitmaps;
31 static unsigned int num_loaded_hashes, *hash_ids = NULL, *zero_buffer = NULL;
32 static size_t current_gws;
33 static unsigned int mask_mode;
34 static unsigned int static_gpu_locations[MASK_FMT_INT_PLHDR];
35
36 static unsigned int hash_table_size, offset_table_size;
37
38 static int lm_crypt(int *pcount, struct db_salt *salt);
39
40 typedef union {
41 unsigned char c[8][sizeof(lm_vector)];
42 lm_vector v[8];
43 } key_page;
44
45 #define vxorf(a, b) \
46 ((a) ^ (b))
47 #define vnot(dst, a) \
48 (dst) = ~(a)
49 #define vand(dst, a, b) \
50 (dst) = (a) & (b)
51 #define vor(dst, a, b) \
52 (dst) = (a) | (b)
53 #define vandn(dst, a, b) \
54 (dst) = (a) & ~(b)
55 #define vxor(dst, a, b) \
56 (dst) = vxorf((a), (b))
57 #define vshl(dst, src, shift) \
58 (dst) = (src) << (shift)
59 #define vshr(dst, src, shift) \
60 (dst) = (src) >> (shift)
61 #define vshl1(dst, src) \
62 vshl((dst), (src), 1)
63
64 #define kvtype vtype
65 #define kvand vand
66 #define kvor vor
67 #define kvshl1 vshl1
68 #define kvshl vshl
69 #define kvshr vshr
70
71 #define mask01 0x01010101
72 #define mask02 0x02020202
73 #define mask04 0x04040404
74 #define mask08 0x08080808
75 #define mask10 0x10101010
76 #define mask20 0x20202020
77 #define mask40 0x40404040
78 #define mask80 0x80808080
79
80 #define kvand_shl1_or(dst, src, mask) \
81 kvand(tmp, src, mask); \
82 kvshl1(tmp, tmp); \
83 kvor(dst, dst, tmp)
84
85 #define kvand_shl_or(dst, src, mask, shift) \
86 kvand(tmp, src, mask); \
87 kvshl(tmp, tmp, shift); \
88 kvor(dst, dst, tmp)
89
90 #define kvand_shl1(dst, src, mask) \
91 kvand(tmp, src, mask) ; \
92 kvshl1(dst, tmp)
93
94 #define kvand_or(dst, src, mask) \
95 kvand(tmp, src, mask); \
96 kvor(dst, dst, tmp)
97
98 #define kvand_shr_or(dst, src, mask, shift) \
99 kvand(tmp, src, mask); \
100 kvshr(tmp, tmp, shift); \
101 kvor(dst, dst, tmp)
102
103 #define kvand_shr(dst, src, mask, shift) \
104 kvand(tmp, src, mask); \
105 kvshr(dst, tmp, shift)
106
107 #define LOAD_V \
108 kvtype v0 = *(kvtype *)&vp[0]; \
109 kvtype v1 = *(kvtype *)&vp[1]; \
110 kvtype v2 = *(kvtype *)&vp[2]; \
111 kvtype v3 = *(kvtype *)&vp[3]; \
112 kvtype v4 = *(kvtype *)&vp[4]; \
113 kvtype v5 = *(kvtype *)&vp[5]; \
114 kvtype v6 = *(kvtype *)&vp[6]; \
115 kvtype v7 = *(kvtype *)&vp[7];
116
117 #define FINALIZE_NEXT_KEY_BIT_0g { \
118 kvtype m = mask01, va, vb, tmp; \
119 kvand(va, v0, m); \
120 kvand_shl1(vb, v1, m); \
121 kvand_shl_or(va, v2, m, 2); \
122 kvand_shl_or(vb, v3, m, 3); \
123 kvand_shl_or(va, v4, m, 4); \
124 kvand_shl_or(vb, v5, m, 5); \
125 kvand_shl_or(va, v6, m, 6); \
126 kvand_shl_or(vb, v7, m, 7); \
127 kvor(kp[0], va, vb); \
128 kp += 1; \
129 }
130
131 #define FINALIZE_NEXT_KEY_BIT_1g { \
132 kvtype m = mask02, va, vb, tmp; \
133 kvand_shr(va, v0, m, 1); \
134 kvand(vb, v1, m); \
135 kvand_shl1_or(va, v2, m); \
136 kvand_shl_or(vb, v3, m, 2); \
137 kvand_shl_or(va, v4, m, 3); \
138 kvand_shl_or(vb, v5, m, 4); \
139 kvand_shl_or(va, v6, m, 5); \
140 kvand_shl_or(vb, v7, m, 6); \
141 kvor(kp[0], va, vb); \
142 kp += 1; \
143 }
144
145 #define FINALIZE_NEXT_KEY_BIT_2g { \
146 kvtype m = mask04, va, vb, tmp; \
147 kvand_shr(va, v0, m, 2); \
148 kvand_shr(vb, v1, m, 1); \
149 kvand_or(va, v2, m); \
150 kvand_shl1_or(vb, v3, m); \
151 kvand_shl_or(va, v4, m, 2); \
152 kvand_shl_or(vb, v5, m, 3); \
153 kvand_shl_or(va, v6, m, 4); \
154 kvand_shl_or(vb, v7, m, 5); \
155 kvor(kp[0], va, vb); \
156 kp += 1; \
157 }
158
159 #define FINALIZE_NEXT_KEY_BIT_3g { \
160 kvtype m = mask08, va, vb, tmp; \
161 kvand_shr(va, v0, m, 3); \
162 kvand_shr(vb, v1, m, 2); \
163 kvand_shr_or(va, v2, m, 1); \
164 kvand_or(vb, v3, m); \
165 kvand_shl1_or(va, v4, m); \
166 kvand_shl_or(vb, v5, m, 2); \
167 kvand_shl_or(va, v6, m, 3); \
168 kvand_shl_or(vb, v7, m, 4); \
169 kvor(kp[0], va, vb); \
170 kp += 1; \
171 }
172
173 #define FINALIZE_NEXT_KEY_BIT_4g { \
174 kvtype m = mask10, va, vb, tmp; \
175 kvand_shr(va, v0, m, 4); \
176 kvand_shr(vb, v1, m, 3); \
177 kvand_shr_or(va, v2, m, 2); \
178 kvand_shr_or(vb, v3, m, 1); \
179 kvand_or(va, v4, m); \
180 kvand_shl1_or(vb, v5, m); \
181 kvand_shl_or(va, v6, m, 2); \
182 kvand_shl_or(vb, v7, m, 3); \
183 kvor(kp[0], va, vb); \
184 kp += 1; \
185 }
186
187 #define FINALIZE_NEXT_KEY_BIT_5g { \
188 kvtype m = mask20, va, vb, tmp; \
189 kvand_shr(va, v0, m, 5); \
190 kvand_shr(vb, v1, m, 4); \
191 kvand_shr_or(va, v2, m, 3); \
192 kvand_shr_or(vb, v3, m, 2); \
193 kvand_shr_or(va, v4, m, 1); \
194 kvand_or(vb, v5, m); \
195 kvand_shl1_or(va, v6, m); \
196 kvand_shl_or(vb, v7, m, 2); \
197 kvor(kp[0], va, vb); \
198 kp += 1; \
199 }
200
201 #define FINALIZE_NEXT_KEY_BIT_6g { \
202 kvtype m = mask40, va, vb, tmp; \
203 kvand_shr(va, v0, m, 6); \
204 kvand_shr(vb, v1, m, 5); \
205 kvand_shr_or(va, v2, m, 4); \
206 kvand_shr_or(vb, v3, m, 3); \
207 kvand_shr_or(va, v4, m, 2); \
208 kvand_shr_or(vb, v5, m, 1); \
209 kvand_or(va, v6, m); \
210 kvand_shl1_or(vb, v7, m); \
211 kvor(kp[0], va, vb); \
212 kp += 1; \
213 }
214
215 #define FINALIZE_NEXT_KEY_BIT_7g { \
216 kvtype m = mask80, va, vb, tmp; \
217 kvand_shr(va, v0, m, 7); \
218 kvand_shr(vb, v1, m, 6); \
219 kvand_shr_or(va, v2, m, 5); \
220 kvand_shr_or(vb, v3, m, 4); \
221 kvand_shr_or(va, v4, m, 3); \
222 kvand_shr_or(vb, v5, m, 2); \
223 kvand_shr_or(va, v6, m, 1); \
224 kvand_or(vb, v7, m); \
225 kvor(kp[0], va, vb); \
226 kp += 1; \
227 }
228
229
/*
 * Convert the mask's "internal" candidate key characters into bitsliced
 * (transposed) form and upload them to buffer_int_lm_keys on the GPU.
 *
 * For each of the up-to-MASK_FMT_INT_PLHDR mask placeholders, the candidate
 * characters are first gathered into key_page unions — LM_DEPTH candidates
 * per page, addressed as c[i & 7][i >> 3] within a page — then the
 * FINALIZE_NEXT_KEY_BIT_*g macros transpose the 8 bit positions of those
 * bytes into lm_vector words (one word per key-bit, one lane per candidate).
 * Each placeholder's finished pages are written at its own offset inside
 * buffer_int_lm_keys.
 */
static void lm_finalize_int_keys()
{
	key_page *int_key_page[MASK_FMT_INT_PLHDR];
	unsigned int *final_key_pages[MASK_FMT_INT_PLHDR], i, j;

	/* One page per LM_DEPTH candidates, rounded up; fill patterns (0x7f /
	   0xff) make unused tail entries harmless, never-matching values. */
	for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
		int_key_page[i] = (key_page *) mem_alloc(((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(key_page));
		final_key_pages[i] = (unsigned int *) mem_alloc(8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int));
		memset(int_key_page[i], 0x7f, ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(key_page));
		memset(final_key_pages[i], 0xff, 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int));
	}

	/* Scatter each candidate's character (mapped through opencl_lm_u) into
	   its page slot.  Placeholder 0 is always present; higher placeholders
	   only when their mask range is active (mask_skip_ranges[k] != -1). */
	for (i = 0; i < mask_int_cand.num_int_cand && mask_int_cand.int_cand; i++) {
		j = i >> LM_LOG_DEPTH;
		int_key_page[0][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[0] & 0xFF];
#if MASK_FMT_INT_PLHDR > 1
		if (mask_skip_ranges[1] != -1)
			int_key_page[1][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[1] & 0xFF];
#endif
#if MASK_FMT_INT_PLHDR > 2
		if (mask_skip_ranges[2] != -1)
			int_key_page[2][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[2] & 0xFF];
#endif
#if MASK_FMT_INT_PLHDR > 3
		if (mask_skip_ranges[3] != -1)
			int_key_page[3][j].c[(i & (LM_DEPTH - 1)) & 7][(i & (LM_DEPTH - 1)) >> 3] = opencl_lm_u[mask_int_cand.int_cand[i].x[3] & 0xFF];
#endif
	}

	/* Transpose every page of each active placeholder: the eight
	   FINALIZE_NEXT_KEY_BIT_*g steps emit 8 bitsliced words per page via
	   kp, reading the 8 input vectors loaded by LOAD_V. */
	for (j = 0; j < MASK_FMT_INT_PLHDR; j++) {
		if (mask_skip_ranges == NULL || mask_skip_ranges[j] == -1)
			continue;
		for (i = 0; i < ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH); i++) {
			lm_vector *kp = (lm_vector *)&final_key_pages[j][8 * i];
			lm_vector *vp = (lm_vector *)&int_key_page[j][i].v[0];
			LOAD_V
			FINALIZE_NEXT_KEY_BIT_0g
			FINALIZE_NEXT_KEY_BIT_1g
			FINALIZE_NEXT_KEY_BIT_2g
			FINALIZE_NEXT_KEY_BIT_3g
			FINALIZE_NEXT_KEY_BIT_4g
			FINALIZE_NEXT_KEY_BIT_5g
			FINALIZE_NEXT_KEY_BIT_6g
			FINALIZE_NEXT_KEY_BIT_7g
		}

		/* Blocking write of this placeholder's pages at its offset. */
		HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_lm_keys, CL_TRUE, j * 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int),
			8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int), final_key_pages[j], 0, NULL, NULL ), "Failed Copy data to gpu");
	}

	for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
		MEM_FREE(int_key_page[i]);
		MEM_FREE(final_key_pages[i]);
	}
}
285
create_buffer_gws(size_t gws)286 static void create_buffer_gws(size_t gws)
287 {
288 unsigned int i;
289
290 opencl_lm_all = (opencl_lm_combined*) mem_alloc ((gws + PADDING)* sizeof(opencl_lm_combined));
291 opencl_lm_keys = (opencl_lm_transfer*) mem_alloc ((gws + PADDING)* sizeof(opencl_lm_transfer));
292 opencl_lm_int_key_loc = (unsigned int*) mem_calloc((gws + PADDING), sizeof(unsigned int));
293
294 memset(opencl_lm_keys, 0x6f, (gws + PADDING)* sizeof(opencl_lm_transfer));
295
296 buffer_raw_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, (gws + PADDING) * sizeof(opencl_lm_transfer), NULL, &ret_code);
297 HANDLE_CLERROR(ret_code, "Failed creating buffer_raw_keys.");
298
299 buffer_lm_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (gws + PADDING) * sizeof(lm_vector) * 56, NULL, &ret_code);
300 HANDLE_CLERROR(ret_code, "Failed creating buffer_lm_keys.");
301
302 buffer_int_key_loc = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (gws + PADDING) * sizeof(unsigned int), NULL, &ret_code);
303 HANDLE_CLERROR(ret_code, "Failed creating buffer_lm_keys.");
304
305 for (i = 0; i < (gws + PADDING); i++)
306 opencl_lm_init(i);
307 }
308
set_kernel_args_gws()309 static void set_kernel_args_gws()
310 {
311 size_t static_param_size = 101;
312 char *kernel_name = (char *) mem_calloc(static_param_size, sizeof(char));
313 cl_uint num_args;
314
315 HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(cl_mem), &buffer_raw_keys), "Failed setting kernel argument buffer_raw_keys, kernel 0.");
316 HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(cl_mem), &buffer_int_key_loc), "Failed setting kernel argument buffer_int_key_loc, kernel 0.");
317
318 HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_FUNCTION_NAME, static_param_size - 1, kernel_name, NULL), "Failed to query kernel name.");
319 HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &num_args, NULL), "Failed to query kernel num args.");
320
321 if (!strncmp(kernel_name, "lm_bs_b", 7) && num_args == 10)
322 HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem), &buffer_lm_keys), "Failed setting kernel argument buffer_lm_keys, kernel lm_bs_b.");
323
324 if (!strncmp(kernel_name, "lm_bs_f", 7) && num_args == 9)
325 HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem), &buffer_lm_keys), "Failed setting kernel argument buffer_lm_keys, kernel lm_bs_f.");
326
327 MEM_FREE(kernel_name);
328 }
329
release_buffer_gws()330 static void release_buffer_gws()
331 {
332 if (opencl_lm_all) {
333 MEM_FREE(opencl_lm_all);
334 MEM_FREE(opencl_lm_keys);
335 MEM_FREE(opencl_lm_int_key_loc);
336 HANDLE_CLERROR(clReleaseMemObject(buffer_raw_keys), "Error releasing buffer_raw_keys.");
337 HANDLE_CLERROR(clReleaseMemObject(buffer_lm_keys), "Error releasing buffer_lm_keys.");
338 HANDLE_CLERROR(clReleaseMemObject(buffer_int_key_loc), "Error releasing buffer_int_key_loc.");
339 }
340 }
341
/*
 * Create the hash-lookup GPU state: key index table, internal mask keys,
 * offset/hash tables, bitmaps, the result array (hash_ids) and the
 * duplicate-suppression bitmap.  Finishes by uploading the bitsliced mask
 * keys via lm_finalize_int_keys().  Counterpart of release_buffer().
 *
 * NOTE(review): the num_loaded_hashes parameter shadows the file-scope
 * static of the same name — inside this function only the parameter is
 * used; confirm callers keep the two in sync.
 */
static void create_buffer(unsigned int num_loaded_hashes, OFFSET_TABLE_WORD *offset_table, unsigned int ot_size, unsigned int ht_size, unsigned int *bitmaps, unsigned int bmp_size_bits)
{
	unsigned int active_placeholders, i;

	/* hash_ids layout: [0] = match count, then 3 words per match. */
	hash_ids = (unsigned int *) mem_calloc (3 * num_loaded_hashes + 1, sizeof(unsigned int));
	/* One bit per hash-table entry, rounded up to whole words. */
	zero_buffer = (unsigned int *) mem_calloc (((ht_size - 1) / 32 + 1), sizeof(unsigned int));

	opencl_lm_init_index();

	/* Placeholder 0 always exists; count the extra active mask ranges. */
	active_placeholders = 1;
	if (mask_skip_ranges)
	for (i = 0; i < MASK_FMT_INT_PLHDR; i++) {
		if (mask_skip_ranges[i] != -1)
			active_placeholders++;
	}

	buffer_lm_key_idx = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 768 * sizeof(unsigned int), opencl_lm_index768, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_lm_key_idx.");

	/* 8 words per LM_DEPTH-candidate page, one region per placeholder. */
	buffer_int_lm_keys = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY, active_placeholders * 8 * ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH) * sizeof(unsigned int), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_int_lm_keys.");

	buffer_offset_table = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ot_size * sizeof(OFFSET_TABLE_WORD), offset_table, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_offset_table.");

	/* Two words (64 bits) per hash-table entry. */
	buffer_hash_table = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ht_size * sizeof(unsigned int) * 2, hash_table_64, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_hash_table.");

	/* bmp_size_bits is in bits; >> 3 converts to bytes. */
	buffer_bitmaps = clCreateBuffer(context[gpu_id], CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bmp_size_bits >> 3, bitmaps, &ret_code);
	HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_bitmaps.");

	buffer_hash_ids = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE, (3 * num_loaded_hashes + 1) * sizeof(unsigned int), NULL, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_hash_ids.");

	buffer_bitmap_dupe = clCreateBuffer(context[gpu_id], CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, ((ht_size - 1) / 32 + 1) * sizeof(unsigned int), zero_buffer, &ret_code);
	HANDLE_CLERROR(ret_code, "Failed creating buffer_bitmap_dupe.");

	/* Zero only the leading match counter of hash_ids on the device. */
	HANDLE_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_hash_ids.");

	lm_finalize_int_keys();
}
383
/*
 * Bind the GWS-independent buffers to the crypt kernel.  Argument slots 0
 * and 1 (raw keys, key locations) are handled by set_kernel_args_gws();
 * this routine starts at slot 2 and advances ctr past the optional
 * per-variant arguments so the shared tail (int keys, offset table, hash
 * table, bitmaps, hash ids, dupe bitmap) lands in the right slots:
 *  - lm_bs_b with 10 args: slot 2 is buffer_lm_keys (skipped), slot 3 is
 *    the key index table; the 9-arg variant takes the index table at 2.
 *  - lm_bs_f with 9 args: slot 2 is buffer_lm_keys (skipped), no index
 *    table (the fully-unrolled kernel needs none).
 */
static void set_kernel_args()
{
	size_t static_param_size = 101;
	unsigned int ctr = 2;
	char *kernel_name = (char *) mem_calloc(static_param_size, sizeof(char));
	cl_uint num_args;

	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_FUNCTION_NAME, static_param_size - 1, kernel_name, NULL), "Failed to query kernel name.");
	HANDLE_CLERROR(clGetKernelInfo(crypt_kernel, CL_KERNEL_NUM_ARGS, sizeof(cl_uint), &num_args, NULL), "Failed to query kernel num args.");

	if (!strncmp(kernel_name, "lm_bs_b", 7)) {
		if (num_args == 10)
			ctr++;	/* skip buffer_lm_keys at slot 2 */
		HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_lm_key_idx), "Failed setting kernel argument buffer_lm_key_idx, kernel 0.");
	}
	if (!strncmp(kernel_name, "lm_bs_f", 7) && num_args == 9)
		ctr++;	/* skip buffer_lm_keys at slot 2 */

	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_int_lm_keys), "Failed setting kernel argument buffer_int_lm_keys, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_offset_table), "Failed setting kernel argument buffer_offset_table, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_hash_table), "Failed setting kernel argument buffer_hash_table, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_bitmaps), "Failed setting kernel argument buffer_bitmaps, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_hash_ids), "Failed setting kernel argument buffer_hash_ids, kernel 0.");
	HANDLE_CLERROR(clSetKernelArg(crypt_kernel, ctr++, sizeof(cl_mem), &buffer_bitmap_dupe), "Failed setting kernel argument buffer_bitmap_dupe, kernel 0.");

	MEM_FREE(kernel_name);
}
411
release_buffer()412 static void release_buffer()
413 {
414 if (buffer_bitmap_dupe) {
415 MEM_FREE(hash_ids);
416 MEM_FREE(zero_buffer);
417 HANDLE_CLERROR(clReleaseMemObject(buffer_lm_key_idx), "Error releasing buffer_lm_key_idx");
418 HANDLE_CLERROR(clReleaseMemObject(buffer_hash_ids), "Error releasing buffer_hash_ids.");
419 HANDLE_CLERROR(clReleaseMemObject(buffer_offset_table), "Error releasing buffer_offset_table.");
420 HANDLE_CLERROR(clReleaseMemObject(buffer_hash_table), "Error releasing buffer_hash_table.");
421 HANDLE_CLERROR(clReleaseMemObject(buffer_bitmaps), "Error releasing buffer_bitmaps.");
422 HANDLE_CLERROR(clReleaseMemObject(buffer_bitmap_dupe), "Error releasing buffer_bitmap_dupe.");
423 HANDLE_CLERROR(clReleaseMemObject(buffer_int_lm_keys), "Error releasing buffer_int_lm_keys.");
424 buffer_bitmap_dupe = 0;
425 }
426 }
427
init_kernels(char * bitmap_params,unsigned int full_unroll,size_t s_mem_lws,unsigned int use_local_mem,unsigned int use_last_build_opt)428 static void init_kernels(char *bitmap_params, unsigned int full_unroll, size_t s_mem_lws, unsigned int use_local_mem, unsigned int use_last_build_opt)
429 {
430 static unsigned int last_build_opts[3];
431 char build_opts[500];
432 cl_ulong const_cache_size;
433 unsigned int i;
434
435 for (i = 0; i < MASK_FMT_INT_PLHDR; i++)
436 if (mask_skip_ranges && mask_skip_ranges[i] != -1)
437 static_gpu_locations[i] = mask_int_cand.int_cpu_mask_ctx->
438 ranges[mask_skip_ranges[i]].pos;
439 else
440 static_gpu_locations[i] = -1;
441
442 HANDLE_CLERROR(clGetDeviceInfo(devices[gpu_id], CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &const_cache_size, 0), "failed to get CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE.");
443
444 if (!use_last_build_opt) {
445 sprintf(build_opts, "-D FULL_UNROLL=%u -D USE_LOCAL_MEM=%u -D WORK_GROUP_SIZE="Zu""
446 " -D OFFSET_TABLE_SIZE=%u -D HASH_TABLE_SIZE=%u -D MASK_ENABLE=%u -D ITER_COUNT=%u -D LOC_0=%d"
447 #if MASK_FMT_INT_PLHDR > 1
448 " -D LOC_1=%d "
449 #endif
450 #if MASK_FMT_INT_PLHDR > 2
451 "-D LOC_2=%d "
452 #endif
453 #if MASK_FMT_INT_PLHDR > 3
454 "-D LOC_3=%d"
455 #endif
456 " -D IS_STATIC_GPU_MASK=%d -D CONST_CACHE_SIZE=%llu %s" ,
457 full_unroll, use_local_mem, s_mem_lws, offset_table_size, hash_table_size, mask_mode,
458 ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH), static_gpu_locations[0]
459 #if MASK_FMT_INT_PLHDR > 1
460 , static_gpu_locations[1]
461 #endif
462 #if MASK_FMT_INT_PLHDR > 2
463 , static_gpu_locations[2]
464 #endif
465 #if MASK_FMT_INT_PLHDR > 3
466 , static_gpu_locations[3]
467 #endif
468 , mask_gpu_is_static, (unsigned long long)const_cache_size, bitmap_params);
469
470 last_build_opts[0] = full_unroll;
471 last_build_opts[1] = use_local_mem;
472 last_build_opts[2] = s_mem_lws;
473 }
474 else {
475 sprintf(build_opts, "-cl-kernel-arg-info -D FULL_UNROLL=%u -D USE_LOCAL_MEM=%u -D WORK_GROUP_SIZE="Zu""
476 " -D OFFSET_TABLE_SIZE=%u -D HASH_TABLE_SIZE=%u -D MASK_ENABLE=%u -D ITER_COUNT=%u -D LOC_0=%d"
477 #if MASK_FMT_INT_PLHDR > 1
478 " -D LOC_1=%d "
479 #endif
480 #if MASK_FMT_INT_PLHDR > 2
481 "-D LOC_2=%d "
482 #endif
483 #if MASK_FMT_INT_PLHDR > 3
484 "-D LOC_3=%d"
485 #endif
486 " -D IS_STATIC_GPU_MASK=%d -D CONST_CACHE_SIZE=%llu %s" ,
487 last_build_opts[0], last_build_opts[1], (size_t)last_build_opts[2], offset_table_size, hash_table_size, mask_mode,
488 ((mask_int_cand.num_int_cand + LM_DEPTH - 1) >> LM_LOG_DEPTH), static_gpu_locations[0]
489 #if MASK_FMT_INT_PLHDR > 1
490 , static_gpu_locations[1]
491 #endif
492 #if MASK_FMT_INT_PLHDR > 2
493 , static_gpu_locations[2]
494 #endif
495 #if MASK_FMT_INT_PLHDR > 3
496 , static_gpu_locations[3]
497 #endif
498 , mask_gpu_is_static, (unsigned long long)const_cache_size, bitmap_params);
499 }
500
501
502 if (use_last_build_opt ? last_build_opts[0] : full_unroll)
503 opencl_build_kernel("$JOHN/kernels/lm_kernel_f.cl",
504 gpu_id, build_opts, 0);
505 else
506 opencl_build_kernel("$JOHN/kernels/lm_kernel_b.cl",
507 gpu_id, build_opts, 0);
508
509 if (use_last_build_opt ? last_build_opts[0] : full_unroll) {
510 crypt_kernel = clCreateKernel(program[gpu_id], "lm_bs_f", &ret_code);
511 HANDLE_CLERROR(ret_code, "Failed creating kernel lm_bs_f.");
512 }
513 else {
514 crypt_kernel = clCreateKernel(program[gpu_id], "lm_bs_b", &ret_code);
515 HANDLE_CLERROR(ret_code, "Failed creating kernel lm_bs_b.");
516 }
517 }
518
release_kernels()519 static void release_kernels()
520 {
521 if (crypt_kernel) {
522 HANDLE_CLERROR(clReleaseKernel(crypt_kernel), "Error releasing kernel 0");
523 crypt_kernel = 0;
524 }
525 }
526
clean_all_buffers()527 static void clean_all_buffers()
528 {
529 release_buffer_gws();
530 release_buffer();
531 release_kernels();
532 MEM_FREE(hash_table_64);
533 if (program[gpu_id]) {
534 HANDLE_CLERROR(clReleaseProgram(program[gpu_id]),
535 "Error releasing Program");
536 program[gpu_id] = 0;
537 }
538 }
539
540 /* if returns 0x800000, means there is no restriction on lws due to local memory limitations.*/
541 /* if returns 0, means local memory shouldn't be allocated.*/
/*
 * Compute the largest LWS the device's local memory permits for the chosen
 * kernel configuration.  Returns the sentinel 0x800000 when local memory
 * imposes no LWS restriction, and 0 when local memory should not be used
 * at all (see the comments above).  The per-work-item local footprint is
 * 56 lm_vector words of bitsliced key state; the non-unrolled kernel with
 * local memory additionally reserves 768 cl_shorts for its index table.
 */
static size_t find_smem_lws_limit(unsigned int full_unroll, unsigned int use_local_mem, unsigned int force_global_keys)
{
	cl_ulong s_mem_sz = get_local_memory_size(gpu_id);
	size_t expected_lws_limit;
	cl_uint warp_size;

	/* Keys stay in global memory: no per-work-item local usage, so LWS is
	   unrestricted as long as the shared index table (or no table, for the
	   unrolled kernel) fits. */
	if (force_global_keys) {
		if (s_mem_sz > 768 * sizeof(cl_short) || full_unroll)
			return 0x800000;
		else
			return 0;
	}

	if (!s_mem_sz)
		return 0;

	/* Round the limit down to a whole number of hardware SIMD lanes. */
	if (gpu_amd(device_info[gpu_id])) {
		if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WAVEFRONT_WIDTH_AMD,
			sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
			warp_size = 64;
	}
	else if (gpu_nvidia(device_info[gpu_id])) {
		if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WARP_SIZE_NV,
			sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
			warp_size = 32;
	}
	else
		return 0;

	if (full_unroll || !use_local_mem) {
		/* All of local memory is available for per-work-item key state. */
		expected_lws_limit = s_mem_sz /
				(sizeof(lm_vector) * 56);
		if (!expected_lws_limit)
			return 0;
		expected_lws_limit = GET_MULTIPLE_OR_ZERO(
				expected_lws_limit, warp_size);
	}
	else {
		/* Reserve the 768-entry index table first; if nothing is left
		   for keys, local memory still works for the table alone. */
		if (s_mem_sz > 768 * sizeof(cl_short)) {
			s_mem_sz -= 768 * sizeof(cl_short);
			expected_lws_limit = s_mem_sz /
					(sizeof(lm_vector) * 56);
			if (!expected_lws_limit)
				return 0x800000;
			expected_lws_limit = GET_MULTIPLE_OR_ZERO(
					expected_lws_limit, warp_size);
		}
		else
			return 0;
	}

	/* Without a warp granularity, fall back to a power-of-two limit. */
	if (warp_size == 1 && expected_lws_limit & (expected_lws_limit - 1)) {
		get_power_of_two(expected_lws_limit);
		expected_lws_limit >>= 1;
	}
	return expected_lws_limit;
}
599
/* Elapsed wall-clock time between two struct timeval samples, in
   milliseconds (long double).  NOTE: evaluates both arguments twice —
   pass plain variables only. */
#define calc_ms(start, end)	\
	((long double)(end.tv_sec - start.tv_sec) * 1000.000 + \
	 (long double)(end.tv_usec - start.tv_usec) / 1000.000)
603
604 /* Sets global_work_size and max_keys_per_crypt. */
/* Sets global_work_size and max_keys_per_crypt. */
/*
 * When gws_tune_flag is set, benchmark one crypt at gws_init keys and
 * scale global_work_size so a crypt takes roughly kernel_run_ms.  The
 * result is clamped to the device's max allocation (and, on 64-bit, to
 * 4G keys per crypt), rounded to a power of two, and never below
 * local_work_size.  Rebuilds the GWS-sized buffers either way.
 */
static void gws_tune(size_t gws_init, long double kernel_run_ms, int gws_tune_flag, struct fmt_main *format, int mask_mode)
{
	unsigned int i;
	char key[PLAINTEXT_LENGTH + 1] = "alterit";

	struct timeval startc, endc;
	long double time_ms = 0;
	int pcount;
	/* With mask mode each work-item is one key; otherwise LM_DEPTH keys
	   are packed per work-item (bitslice lanes). */
	unsigned int lm_log_depth = mask_mode ? 0 : LM_LOG_DEPTH;

	/* Largest GWS whose raw-key transfer buffer fits one allocation. */
	size_t gws_limit = get_max_mem_alloc_size(gpu_id) / sizeof(opencl_lm_transfer);
	if (gws_limit > PADDING)
		gws_limit -= PADDING;

	if (gws_limit & (gws_limit - 1)) {
		get_power_of_two(gws_limit);
		gws_limit >>= 1;
	}

#if SIZEOF_SIZE_T > 4
	/* We can't process more than 4G keys per crypt() */
	while (gws_limit * mask_int_cand.num_int_cand > 0xffffffffUL)
		gws_limit >>= 1;
#endif

	if (gws_tune_flag)
		global_work_size = gws_init;

	if (global_work_size > gws_limit)
		global_work_size = gws_limit;

	if (gws_tune_flag) {
		release_buffer_gws();
		create_buffer_gws(global_work_size);
		set_kernel_args_gws();

		/* Fill the key space with varied throwaway keys, time one
		   crypt, then scale GWS to hit the target run time. */
		format->methods.clear_keys();
		for (i = 0; i < (global_work_size << lm_log_depth); i++) {
			key[i & 3] = i & 255;
			key[(i & 3) + 3] = i ^ 0x3E;
			format->methods.set_key(key, i);
		}

		gettimeofday(&startc, NULL);
		pcount = (int)(global_work_size << lm_log_depth);
		lm_crypt((int *)&pcount, NULL);
		gettimeofday(&endc, NULL);

		time_ms = calc_ms(startc, endc);
		global_work_size = (size_t)((kernel_run_ms / time_ms) * (long double)global_work_size);
	}

	if (global_work_size < local_work_size)
		global_work_size = local_work_size;

	get_power_of_two(global_work_size);

	if (global_work_size > gws_limit)
		global_work_size = gws_limit;

	/* Rebuild the buffers at the final size. */
	release_buffer_gws();
	create_buffer_gws(global_work_size);
	set_kernel_args_gws();

	/* for hash_ids[3*x + 1], 27 bits for storing gid and 5 bits for bs depth. */
	//assert(global_work_size <= ((1U << 28) - 1));
	fmt_opencl_lm.params.max_keys_per_crypt = global_work_size << lm_log_depth;

	fmt_opencl_lm.params.min_keys_per_crypt =
		opencl_calc_min_kpc(local_work_size, global_work_size,
		                    1 << lm_log_depth);
}
677
auto_tune_all(char * bitmap_params,unsigned int num_loaded_hashes,long double kernel_run_ms,struct fmt_main * format,int mask_mode)678 static void auto_tune_all(char *bitmap_params, unsigned int num_loaded_hashes, long double kernel_run_ms, struct fmt_main *format, int mask_mode)
679 {
680 unsigned int full_unroll = 0;
681 unsigned int use_local_mem = 1;
682 unsigned int force_global_keys = 1;
683 unsigned int gws_tune_flag = 1;
684 unsigned int lws_tune_flag = 1;
685
686 size_t s_mem_limited_lws;
687
688 struct timeval startc, endc;
689 long double time_ms = 0;
690
691 char key[PLAINTEXT_LENGTH + 1] = "alterit";
692
693 unsigned int lm_log_depth = mask_mode ? 0 : LM_LOG_DEPTH;
694
695 if (cpu(device_info[gpu_id])) {
696 force_global_keys = 1;
697 use_local_mem = 0;
698 full_unroll = 1;
699 kernel_run_ms = 5;
700 }
701 else if (amd_vliw4(device_info[gpu_id]) || amd_vliw5(device_info[gpu_id]) || gpu_intel(device_info[gpu_id])) {
702 force_global_keys = 0;
703 use_local_mem = 1;
704 full_unroll = 0;
705 }
706 else if (platform_apple(platform_id) && gpu_nvidia(device_info[gpu_id])) {
707 force_global_keys = 1;
708 use_local_mem = 0;
709 full_unroll = 1;
710 }
711 else if (gpu(device_info[gpu_id])) {
712 force_global_keys = 0;
713 use_local_mem = 1;
714 full_unroll = 1;
715 }
716 else {
717 force_global_keys = 1;
718 use_local_mem = 0;
719 full_unroll = 0;
720 kernel_run_ms = 40;
721 }
722
723 local_work_size = 0;
724 global_work_size = 0;
725 gws_tune_flag = 1;
726 lws_tune_flag = 1;
727 opencl_get_user_preferences(FORMAT_LABEL);
728 if (global_work_size)
729 gws_tune_flag = 0;
730 if (local_work_size) {
731 lws_tune_flag = 0;
732 if (local_work_size & (local_work_size - 1)) {
733 get_power_of_two(local_work_size);
734 }
735 }
736
737 s_mem_limited_lws = find_smem_lws_limit(
738 full_unroll, use_local_mem, force_global_keys);
739 #if 0
740 fprintf(stdout, "%s() Limit_smem:"Zu", Full_unroll_flag:%u,"
741 "Use_local_mem:%u, Force_global_keys:%u\n",
742 __FUNCTION__,
743 s_mem_limited_lws, full_unroll, use_local_mem,
744 force_global_keys);
745 #endif
746
747 if (s_mem_limited_lws == 0x800000 || !s_mem_limited_lws) {
748 long double best_time_ms;
749 size_t best_lws, lws_limit;
750
751 release_kernels();
752 init_kernels(bitmap_params, full_unroll, 0, use_local_mem && s_mem_limited_lws, 0);
753 set_kernel_args();
754
755 gws_tune(1024, 2 * kernel_run_ms, gws_tune_flag, format, mask_mode);
756 gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
757
758 lws_limit = get_kernel_max_lws(gpu_id, crypt_kernel);
759
760 if (lws_limit > global_work_size)
761 lws_limit = global_work_size;
762
763 if (lws_tune_flag) {
764 if (gpu(device_info[gpu_id]) && lws_limit >= 32)
765 local_work_size = 32;
766 else
767 local_work_size = get_kernel_preferred_multiple(gpu_id, crypt_kernel);
768 }
769 if (local_work_size > lws_limit)
770 local_work_size = lws_limit;
771
772 if (lws_tune_flag) {
773 time_ms = 0;
774 best_time_ms = 999999.00;
775 best_lws = local_work_size;
776 while (local_work_size <= lws_limit &&
777 local_work_size <= PADDING) {
778 int pcount, i;
779 format->methods.clear_keys();
780 for (i = 0; i < (global_work_size << lm_log_depth); i++) {
781 key[i & 3] = i & 255;
782 key[(i & 3) + 3] = i ^ 0x3F;
783 format->methods.set_key(key, i);
784 }
785 gettimeofday(&startc, NULL);
786 pcount = (int)(global_work_size << lm_log_depth);
787 lm_crypt((int *)&pcount, NULL);
788 gettimeofday(&endc, NULL);
789
790 time_ms = calc_ms(startc, endc);
791
792 if (time_ms < best_time_ms) {
793 best_lws = local_work_size;
794 best_time_ms = time_ms;
795 }
796 #if 0
797 fprintf(stdout, "GWS: "Zu", LWS: "Zu", Limit_smem:"Zu", Limit_kernel:"Zu","
798 "Current time:%Lf, Best time:%Lf\n",
799 global_work_size, local_work_size, s_mem_limited_lws,
800 get_kernel_max_lws(gpu_id, crypt_kernel), time_ms,
801 best_time_ms);
802 #endif
803 local_work_size *= 2;
804 }
805 local_work_size = best_lws;
806 gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
807 }
808 }
809
810 else {
811 long double best_time_ms;
812 size_t best_lws;
813 cl_uint warp_size;
814
815 if (gpu_amd(device_info[gpu_id])) {
816 if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WAVEFRONT_WIDTH_AMD,
817 sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
818 warp_size = 64;
819 }
820 else if (gpu_nvidia(device_info[gpu_id])) {
821 if (clGetDeviceInfo(devices[gpu_id], CL_DEVICE_WARP_SIZE_NV,
822 sizeof(cl_uint), &warp_size, 0) != CL_SUCCESS)
823 warp_size = 32;
824 }
825 else {
826 warp_size = 1;
827 fprintf(stderr, "Possible auto_tune fail!!.\n");
828 }
829 if (lws_tune_flag)
830 local_work_size = warp_size;
831 if (local_work_size > s_mem_limited_lws)
832 local_work_size = s_mem_limited_lws;
833
834 release_kernels();
835 init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
836
837 if (local_work_size > get_kernel_max_lws(gpu_id, crypt_kernel)) {
838 local_work_size = get_kernel_max_lws(gpu_id, crypt_kernel);
839 release_kernels();
840 init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
841 }
842
843 set_kernel_args();
844 gws_tune(1024, 2 * kernel_run_ms, gws_tune_flag, format, mask_mode);
845 gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
846
847 if (global_work_size < s_mem_limited_lws) {
848 s_mem_limited_lws = global_work_size;
849 if (local_work_size > s_mem_limited_lws)
850 local_work_size = s_mem_limited_lws;
851 }
852
853 if (lws_tune_flag) {
854 best_time_ms = 999999.00;
855 best_lws = local_work_size;
856 while (local_work_size <= s_mem_limited_lws &&
857 local_work_size <= PADDING) {
858 int pcount, i;
859
860 release_kernels();
861 init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
862 set_kernel_args();
863 set_kernel_args_gws();
864
865 format->methods.clear_keys();
866 for (i = 0; i < (global_work_size << lm_log_depth); i++) {
867 key[i & 3] = i & 255;
868 key[(i & 3) + 3] = i ^ 0x3E;
869 format->methods.set_key(key, i);
870 }
871
872 gettimeofday(&startc, NULL);
873 pcount = (int)(global_work_size << lm_log_depth);
874 lm_crypt((int *)&pcount, NULL);
875 gettimeofday(&endc, NULL);
876 time_ms = calc_ms(startc, endc);
877
878 if (time_ms < best_time_ms &&
879 local_work_size <= get_kernel_max_lws(
880 gpu_id, crypt_kernel)) {
881 best_lws = local_work_size;
882 best_time_ms = time_ms;
883 }
884 #if 0
885 fprintf(stdout, "GWS: "Zu", LWS: "Zu", Limit_smem:"Zu", Limit_kernel:"Zu","
886 "Current time:%Lf, Best time:%Lf\n",
887 global_work_size, local_work_size, s_mem_limited_lws,
888 get_kernel_max_lws(gpu_id, crypt_kernel), time_ms,
889 best_time_ms);
890 #endif
891 if (gpu(device_info[gpu_id])) {
892 if (local_work_size < 16)
893 local_work_size = 16;
894 else if (local_work_size < 32)
895 local_work_size = 32;
896 else if (local_work_size < 64)
897 local_work_size = 64;
898 else if (local_work_size < 96)
899 local_work_size = 96;
900 else if (local_work_size < 128)
901 local_work_size = 128;
902 else
903 local_work_size += warp_size;
904 }
905 else
906 local_work_size *= 2;
907 }
908 local_work_size = best_lws;
909 release_kernels();
910 init_kernels(bitmap_params, full_unroll, local_work_size, use_local_mem, 0);
911 set_kernel_args();
912 gws_tune(global_work_size, kernel_run_ms, gws_tune_flag, format, mask_mode);
913 }
914 }
915 if (options.verbosity > VERB_LEGACY)
916 fprintf(stdout, "GWS: "Zu", LWS: "Zu"\n",
917 global_work_size, local_work_size);
918 }
919
920 /* Use only for smaller bitmaps < 16MB */
prepare_bitmap_2(cl_ulong bmp_sz_bits,cl_uint ** bitmaps_ptr,int * loaded_hashes)921 static void prepare_bitmap_2(cl_ulong bmp_sz_bits, cl_uint **bitmaps_ptr, int *loaded_hashes)
922 {
923 unsigned int i;
924 MEM_FREE(*bitmaps_ptr);
925 *bitmaps_ptr = (cl_uint*) mem_calloc((bmp_sz_bits >> 4), sizeof(cl_uint));
926
927 for (i = 0; i < num_loaded_hashes; i++) {
928 unsigned int bmp_idx = loaded_hashes[2 * i + 1] & (bmp_sz_bits - 1);
929 (*bitmaps_ptr)[bmp_idx >> 5] |= (1U << (bmp_idx & 31));
930
931 bmp_idx = loaded_hashes[2 * i] & (bmp_sz_bits - 1);
932 (*bitmaps_ptr)[(bmp_sz_bits >> 5) + (bmp_idx >> 5)] |=
933 (1U << (bmp_idx & 31));
934 }
935 }
936
/*
 * Build a single bitmap of bmp_sz_bits bits (SELECT_CMP_STEPS=1) from the
 * second word of each loaded hash.  Reads the file-scope num_loaded_hashes
 * count.  bmp_sz_bits must be a power of two (used as an AND mask).
 */
static void prepare_bitmap_1(cl_ulong bmp_sz_bits, cl_uint **bitmaps_ptr, int *loaded_hashes)
{
	unsigned int idx;

	MEM_FREE(*bitmaps_ptr);
	/* bmp_sz_bits bits / 32 bits-per-word == bmp_sz_bits >> 5 words. */
	*bitmaps_ptr = (cl_uint*) mem_calloc((bmp_sz_bits >> 5), sizeof(cl_uint));

	for (idx = 0; idx < num_loaded_hashes; idx++) {
		unsigned int bit = loaded_hashes[2 * idx + 1] & (bmp_sz_bits - 1);

		(*bitmaps_ptr)[bit >> 5] |= (1U << (bit & 31));
	}
}
948
/*
 * Pick a bitmap size (and the number of compare steps) suited to the hash
 * count and the device family, populate the bitmap via prepare_bitmap_1/2,
 * and return build options for the kernel as a static string.
 *
 * Returns a pointer to a static buffer (not thread-safe; valid until the
 * next call).  On return, *bitmap_size_bits holds the TOTAL bits across
 * all cmp_steps bitmaps, and *bitmaps_ptr the freshly allocated bitmap.
 */
static char* select_bitmap(unsigned int num_ld_hashes, int *loaded_hashes, unsigned int *bitmap_size_bits, unsigned int **bitmaps_ptr)
{
	static char kernel_params[200];
	unsigned int cmp_steps = 2, bits_req = 32;

	/* Size ladder: thresholds tuned per hash count; AMD GCN 1.0 and
	   VLIW4 devices get larger bitmaps at the low end. */
	if (num_ld_hashes <= 5100) {
		if (amd_gcn_10(device_info[gpu_id]) ||
			amd_vliw4(device_info[gpu_id]))
			*bitmap_size_bits = 512 * 1024;

		else
			*bitmap_size_bits = 256 * 1024;

	}

	else if (num_ld_hashes <= 10100) {
		if (amd_gcn_10(device_info[gpu_id]) ||
			amd_vliw4(device_info[gpu_id]))
			*bitmap_size_bits = 512 * 1024;

		else
			*bitmap_size_bits = 256 * 1024;
	}

	else if (num_ld_hashes <= 20100) {
		if (amd_gcn_10(device_info[gpu_id]))
			*bitmap_size_bits = 1024 * 1024;

		else
			*bitmap_size_bits = 512 * 1024;
	}

	else if (num_ld_hashes <= 250100)
		*bitmap_size_bits = 2048 * 1024;

	else if (num_ld_hashes <= 1100100) {
		if (!amd_gcn_11(device_info[gpu_id]))
			*bitmap_size_bits = 4096 * 1024;

		else
			*bitmap_size_bits = 2048 * 1024;
	}

	/* From here on a single larger bitmap beats two smaller ones. */
	else if (num_ld_hashes <= 1500100) {
		*bitmap_size_bits = 4096 * 1024 * 2;
		cmp_steps = 1;
	}

	else if (num_ld_hashes <= 2700100) {
		*bitmap_size_bits = 4096 * 1024 * 2 * 2;
		cmp_steps = 1;
	}

	else {
		/* Scale with the hash count, rounded up to a power of two,
		   then clamp to the device's max allocation (itself rounded
		   DOWN to a power of two) and a 512MB hard cap. */
		cl_ulong mult = num_ld_hashes / 2700100;
		cl_ulong buf_sz;
		*bitmap_size_bits = 4096 * 4096;
		get_power_of_two(mult);  /* NOTE: macro modifies its argument in place */
		*bitmap_size_bits *= mult;
		buf_sz = get_max_mem_alloc_size(gpu_id);
		if (buf_sz & (buf_sz - 1)) {
			get_power_of_two(buf_sz);
			buf_sz >>= 1;
		}
		if (buf_sz >= 536870912)
			buf_sz = 536870912;
		if (((*bitmap_size_bits) >> 3) > buf_sz)
			*bitmap_size_bits = buf_sz << 3;
		cmp_steps = 1;
	}

	if (cmp_steps == 1)
		prepare_bitmap_1(*bitmap_size_bits, bitmaps_ptr, loaded_hashes);

	else
		prepare_bitmap_2(*bitmap_size_bits, bitmaps_ptr, loaded_hashes);

	/* bits_req = number of bits needed to index into one bitmap. */
	get_num_bits(bits_req, (*bitmap_size_bits));

	sprintf(kernel_params,
		"-D SELECT_CMP_STEPS=%u"
		" -D BITMAP_SIZE_BITS_LESS_ONE="LLu" -D REQ_BITMAP_BITS=%u",
		cmp_steps, (unsigned long long)(*bitmap_size_bits) - 1, bits_req);

	/* Report total allocation size (all steps) to the caller. */
	*bitmap_size_bits *= cmp_steps;

	return kernel_params;
}
1037
/*
 * Collect the binaries of all hashes under 'salt' (pruning list entries
 * whose binary is NULL, in place), build a perfect hash table plus offset
 * table from them, and select/build the matching bitmap.
 *
 * Side effects: updates the file-scope num_loaded_hashes, offset_table_size
 * and hash_table_size, and the global hash_table_64 (via
 * create_perfect_hash_table).  Exits the process on failure.
 * Returns the kernel build-option string from select_bitmap (static buffer).
 */
static char* prepare_table(struct db_salt *salt, OFFSET_TABLE_WORD **offset_table_ptr, unsigned int *bitmap_size_bits, unsigned **bitmaps_ptr)
{
	int *bin, i;
	struct db_password *pw, *last;
	char *bitmap_params;
	int *loaded_hashes;

	num_loaded_hashes = salt->count;
	/* Two 32-bit words per hash. */
	loaded_hashes = (int *)mem_alloc(num_loaded_hashes * sizeof(int) * 2);

	last = pw = salt->list;
	i = 0;
	do {
		bin = (int *)pw->binary;
		if (bin == NULL) {
			/* Already-cracked entry: unlink it.  'last == pw' only
			   while we are still at the list head. */
			if (last == pw)
				salt->list = pw->next;
			else
				last->next = pw->next;
		} else {
			last = pw;
			loaded_hashes[2 * i] = bin[0];
			loaded_hashes[2 * i + 1] = bin[1];
			i++;
		}
	} while ((pw = pw->next)) ;

	/* Sanity check: we must not have gathered more hashes than the salt
	   claims to hold (fewer is fine - some may have been cracked). */
	if (i > (salt->count)) {
		fprintf(stderr,
			"Something went wrong while preparing hashes(%d, %d)..Exiting..\n", i, salt->count);
		error();
	}

	num_loaded_hashes = create_perfect_hash_table(64, (void *)loaded_hashes,
				num_loaded_hashes,
			        offset_table_ptr,
			        &offset_table_size,
			        &hash_table_size, 0);

	if (!num_loaded_hashes) {
		MEM_FREE(hash_table_64);
		MEM_FREE((*offset_table_ptr));
		fprintf(stderr, "Failed to create Hash Table for cracking.\n");
		error();
	}

	bitmap_params = select_bitmap(num_loaded_hashes, loaded_hashes, bitmap_size_bits, bitmaps_ptr);
	MEM_FREE(loaded_hashes);

	return bitmap_params;
}
1089
/*
 * get_key() for the non-mask path: the project-wide get_key_body() macro
 * (presumably defined in opencl_lm.h - confirm) expands to the full body,
 * including the return of the reconstructed plaintext for 'index'.
 */
static char *get_key(int index)
{
	get_key_body();
}
1094
/*
 * get_key() for mask mode: maps a crack 'index' back through hash_ids
 * (layout: [0]=count, then triplets per match) to the work-item (section),
 * bit-slice lane (depth) and internal-candidate iteration (iter) that
 * produced it, patches the mask placeholder characters back into the key
 * buffer, and extracts the plaintext from the bit-sliced storage.
 * Returns a pointer to a static buffer (valid until the next call).
 */
static char *get_key_mm(int index)
{
	static char out[PLAINTEXT_LENGTH + 1];
	unsigned int section, depth, iter;
	unsigned char *src, i;
	char *dst;

	/* No (valid) crack data for this index: fall back to slot 0. */
	if (hash_ids == NULL || hash_ids[0] == 0 ||
	    index > hash_ids[0] || hash_ids[0] > num_loaded_hashes) {
		section = 0;
		depth = 0;
		iter = 0;
	}
	else {
		/* 32 bit-slice lanes per section. */
		section = hash_ids[3 * index + 1] / 32;
		depth  = hash_ids[3 * index + 1] & 31;
		iter = hash_ids[3 * index + 2];
	}

	if (section > global_work_size ) {
		//fprintf(stderr, "Get key error! %u "Zu"\n", section, global_work_size);
		section = 0;
		depth = 0;
		iter = 0;
	}

	/* Re-apply the GPU-generated mask characters so the host-side key
	   buffer reflects the actual candidate that cracked. */
	if (mask_skip_ranges && mask_int_cand.num_int_cand > 1) {
		for (i = 0; i < MASK_FMT_INT_PLHDR && mask_skip_ranges[i] != -1; i++)
			if (mask_gpu_is_static)
				opencl_lm_keys[section].xkeys.c[static_gpu_locations[i]][depth & 7][depth >> 3] = opencl_lm_u[mask_int_cand.int_cand[iter * 32 + depth].x[i]];
			else
				opencl_lm_keys[section].xkeys.c[(opencl_lm_int_key_loc[section] & (0xff << (i * 8))) >> (i * 8)][depth & 7][depth >> 3] = opencl_lm_u[mask_int_cand.int_cand[iter * 32 + depth].x[i]];
	}

	/* Characters of one key are strided sizeof(lm_vector)*8 bytes apart
	   in the bit-sliced layout. */
	src = opencl_lm_all[section].pxkeys[depth];
	dst = out;
	while (dst < &out[PLAINTEXT_LENGTH] && (*dst = *src)) {
		src += sizeof(lm_vector) * 8;
		dst++;
	}
	*dst = 0;

	return out;
}
1139
/*
 * Format reset hook.  Two paths:
 *  - real cracking run (db->real == db): rebuild hash table, bitmaps and
 *    buffers from the actual loaded hashes, hook up mask-mode key handling
 *    if requested, and auto-tune LWS/GWS;
 *  - self-test/benchmark: build the same structures from the format's
 *    built-in test vectors instead.
 * Exits the process if the perfect hash table cannot be created.
 */
static void reset(struct db_main *db)
{
	if (db->real && db == db->real) {
		struct db_salt *salt;
		unsigned int *bitmaps = NULL;
		OFFSET_TABLE_WORD *offset_table = NULL;
		char *bitmap_params;
		unsigned int bitmap_size_bits = 0;

		/* Drop everything built for the self-test pass. */
		release_buffer();
		release_buffer_gws();
		release_kernels();
		MEM_FREE(hash_table_64);

		/* NOTE(review): only the first salt is used here - LM is
		   effectively saltless, so one table covers all hashes. */
		salt = db->salts;
		bitmap_params = prepare_table(salt, &offset_table, &bitmap_size_bits, &bitmaps);
		create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);

		/* Mask mode: swap in key handlers that cooperate with
		   GPU-side candidate generation. */
		if (options.flags & FLG_MASK_CHK) {
			mask_mode = 1;
			fmt_opencl_lm.methods.set_key = opencl_lm_set_key_mm;
			fmt_opencl_lm.methods.get_key = get_key_mm;
		}

		auto_tune_all(bitmap_params, num_loaded_hashes, 100, &fmt_opencl_lm, mask_mode);
		/* Device-side copies were made by create_buffer/init; host
		   copies are no longer needed. */
		MEM_FREE(offset_table);
		MEM_FREE(bitmaps);
	}
	else {
		int i, *binary;
		char *ciphertext, *bitmap_params;
		unsigned int *bitmaps = NULL;
		unsigned int bitmap_size_bits = 0;
		OFFSET_TABLE_WORD *offset_table = NULL;
		int *loaded_hashes;

		/* Count the built-in test vectors. */
		num_loaded_hashes = 0;
		while (fmt_opencl_lm.params.tests[num_loaded_hashes].ciphertext) num_loaded_hashes++;

		loaded_hashes = (int *) mem_alloc (num_loaded_hashes * sizeof(int) * 2);

		/* Canonicalize each test ciphertext and extract its binary. */
		i = 0;
		while (fmt_opencl_lm.params.tests[i].ciphertext) {
			char **fields = fmt_opencl_lm.params.tests[i].fields;
			if (!fields[1])
				fields[1] = fmt_opencl_lm.params.tests[i].ciphertext;
			ciphertext = fmt_opencl_lm.methods.split(fmt_opencl_lm.methods.prepare(fields, &fmt_opencl_lm), 0, &fmt_opencl_lm);
			binary = (int *)fmt_opencl_lm.methods.binary(ciphertext);
			loaded_hashes[2 * i] = binary[0];
			loaded_hashes[2 * i + 1] = binary[1];
			i++;
			//fprintf(stderr, "C:%s B:%d %d %d\n", ciphertext, binary[0], binary[1], i == num_loaded_hashes );
		}

		num_loaded_hashes = create_perfect_hash_table(64, (void *)loaded_hashes,
				num_loaded_hashes,
			        &offset_table,
			        &offset_table_size,
			        &hash_table_size, 0);

		if (!num_loaded_hashes) {
			MEM_FREE(hash_table_64);
			MEM_FREE(offset_table);
			fprintf(stderr, "Failed to create Hash Table for self test.\n");
			error();
		}
		bitmap_params = select_bitmap(num_loaded_hashes, loaded_hashes, &bitmap_size_bits, &bitmaps);
		create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);
		auto_tune_all(bitmap_params, num_loaded_hashes, 100, &fmt_opencl_lm, 0);

		MEM_FREE(offset_table);
		MEM_FREE(bitmaps);
		MEM_FREE(loaded_hashes);
		/* No cracks pending yet. */
		hash_ids[0] = 0;
	}
}
1216
init_global_variables()1217 static void init_global_variables()
1218 {
1219 mask_int_cand_target = opencl_speed_index(gpu_id) / 300;
1220 }
1221
lm_crypt(int * pcount,struct db_salt * salt)1222 static int lm_crypt(int *pcount, struct db_salt *salt)
1223 {
1224 const int count = mask_mode ?
1225 *pcount : (*pcount + LM_DEPTH - 1) >> LM_LOG_DEPTH;
1226 size_t *lws = local_work_size ? &local_work_size : NULL;
1227 current_gws = GET_NEXT_MULTIPLE(count, local_work_size);
1228
1229 #if 0
1230 fprintf(stderr, "pcount %d count %d lws "Zu" gws "Zu" cur_gws "Zu" static: %d\n", *pcount, count, local_work_size, global_work_size, current_gws, mask_gpu_is_static);
1231 #endif
1232 if (salt != NULL && salt->count > 4500 &&
1233 (num_loaded_hashes - num_loaded_hashes / 10) > salt->count) {
1234 char *bitmap_params;
1235 unsigned int *bitmaps = NULL;
1236 unsigned int bitmap_size_bits = 0;
1237 OFFSET_TABLE_WORD *offset_table = NULL;
1238
1239 release_buffer();
1240 release_kernels();
1241 MEM_FREE(hash_table_64);
1242
1243 bitmap_params = prepare_table(salt, &offset_table, &bitmap_size_bits, &bitmaps);
1244 create_buffer(num_loaded_hashes, offset_table, offset_table_size, hash_table_size, bitmaps, bitmap_size_bits);
1245
1246 init_kernels(bitmap_params, 0, 0, 0, 1);
1247
1248 set_kernel_args();
1249 set_kernel_args_gws();
1250
1251 MEM_FREE(offset_table);
1252 MEM_FREE(bitmaps);
1253 }
1254
1255 BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_raw_keys, CL_FALSE, 0, current_gws * sizeof(opencl_lm_transfer), opencl_lm_keys, 0, NULL, NULL ), "Failed Copy data to gpu");
1256
1257 if (!mask_gpu_is_static)
1258 BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_int_key_loc, CL_FALSE, 0, current_gws * sizeof(unsigned int), opencl_lm_int_key_loc, 0, NULL, NULL ), "Failed Copy data to gpu");
1259
1260 BENCH_CLERROR(clEnqueueNDRangeKernel(queue[gpu_id], crypt_kernel, 1, NULL, ¤t_gws, lws, 0, NULL, NULL), "Failed enqueue kernel lm_bs_*.");
1261 BENCH_CLERROR(clFinish(queue[gpu_id]), "Kernel failed");
1262
1263 BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(unsigned int), hash_ids, 0, NULL, NULL), "Read FAILED\n");
1264
1265 if (hash_ids[0] > num_loaded_hashes) {
1266 fprintf(stderr, "Error, crypt_all kernel.\n");
1267 error();
1268 }
1269
1270 if (hash_ids[0]) {
1271 BENCH_CLERROR(clEnqueueReadBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, (3 * hash_ids[0] + 1) * sizeof(unsigned int), hash_ids, 0, NULL, NULL), "Read FAILED\n");
1272 BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_bitmap_dupe, CL_TRUE, 0, ((hash_table_size - 1)/32 + 1) * sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_bitmap_dupe.");
1273 BENCH_CLERROR(clEnqueueWriteBuffer(queue[gpu_id], buffer_hash_ids, CL_TRUE, 0, sizeof(cl_uint), zero_buffer, 0, NULL, NULL), "failed in clEnqueueWriteBuffer buffer_hash_ids.");
1274 }
1275
1276 *pcount *= mask_int_cand.num_int_cand;
1277
1278 return hash_ids[0];
1279 }
1280
opencl_lm_get_hash_0(int index)1281 int opencl_lm_get_hash_0(int index)
1282 {
1283 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_0;
1284 }
1285
opencl_lm_get_hash_1(int index)1286 int opencl_lm_get_hash_1(int index)
1287 {
1288 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_1;
1289 }
1290
opencl_lm_get_hash_2(int index)1291 int opencl_lm_get_hash_2(int index)
1292 {
1293 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_2;
1294 }
1295
opencl_lm_get_hash_3(int index)1296 int opencl_lm_get_hash_3(int index)
1297 {
1298 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_3;
1299 }
1300
opencl_lm_get_hash_4(int index)1301 int opencl_lm_get_hash_4(int index)
1302 {
1303 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_4;
1304 }
1305
opencl_lm_get_hash_5(int index)1306 int opencl_lm_get_hash_5(int index)
1307 {
1308 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_5;
1309 }
1310
opencl_lm_get_hash_6(int index)1311 int opencl_lm_get_hash_6(int index)
1312 {
1313 return hash_table_64[hash_ids[3 + 3 * index]] & PH_MASK_6;
1314 }
1315
cmp_one(void * binary,int index)1316 static int cmp_one(void *binary, int index)
1317 {
1318 if (((int *)binary)[0] == hash_table_64[hash_ids[3 + 3 * index]])
1319 return 1;
1320 return 0;
1321 }
1322
cmp_exact(char * source,int index)1323 static int cmp_exact(char *source, int index)
1324 {
1325 int *binary = opencl_lm_get_binary(source + 4);
1326
1327 if (binary[1] == hash_table_64[hash_ids[3 + 3 * index] + hash_table_size])
1328 return 1;
1329 return 0;
1330 }
1331
opencl_lm_b_register_functions(struct fmt_main * fmt)1332 void opencl_lm_b_register_functions(struct fmt_main *fmt)
1333 {
1334 fmt->methods.done = &clean_all_buffers;
1335 fmt->methods.reset = &reset;
1336 fmt->methods.get_key = &get_key;
1337 fmt->methods.crypt_all = &lm_crypt;
1338 fmt->methods.cmp_exact = cmp_exact;
1339 fmt->methods.cmp_one = cmp_one;
1340 opencl_lm_init_global_variables = &init_global_variables;
1341 }
1342
1343 #endif /* #if HAVE_OPENCL */
1344