1 #include "opencl_DES_hst_dev_shared.h"
2 #include "opencl_device_info.h"
3 
/*
 * Unsigned vector word used by the bitslice macros in this file.
 * WORD is not defined here; presumably it comes from
 * opencl_DES_hst_dev_shared.h -- confirm.
 */
typedef unsigned WORD vtype;
5 
/*
 * Some devices/drivers have problems with the optimized 'goto' program flow.
 * Some AMD driver versions can't build the "fast goto" version, but those that
 * can run faster. Hawaii on 14.9 fails, Tahiti on 14.9 does not (!?).
 *
 * Nvidia can build either kernel, but GTX980 is significantly faster with the
 * "safe goto" version (7% faster for one salt, 16% for many salts).
 *
 * OS X's Intel HD4000 driver [1.2(Sep25 2014 22:26:04)] fails to build the
 * "fast goto" version.
 */
17 #if nvidia_sm_5x(DEVICE_INFO) || gpu_intel(DEVICE_INFO) || __MESA__ ||  \
18 	(gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1573 && !defined(__Tahiti__)) || \
19 	(gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1702)
20 //#warning Using 'safe goto' kernel
21 #define SAFE_GOTO
22 #else
23 //#warning Using 'fast goto' kernel
24 #endif
25 
/* Define RV7xx when the no_byte_addressable() device test is true. */
#if no_byte_addressable(DEVICE_INFO)
#define RV7xx
#endif
/*
 * Define _NV when the gpu_nvidia() device test is true. Later in this file
 * _NV selects the vsel() form and which S-box header is included.
 */
#if gpu_nvidia(DEVICE_INFO)
#define _NV
#endif
32 
/*
 * Bitwise vector primitives.
 *
 * vxorf() is an expression yielding a ^ b. Every other macro is an
 * assignment of the form "(dst) = ..." with no trailing semicolon, so the
 * caller supplies the terminating ';'.
 */

/* Expression form: a XOR b. */
#define vxorf(a, b) ((a) ^ (b))

/* dst = NOT a */
#define vnot(dst, a) (dst) = ~(a)

/* dst = a AND b */
#define vand(dst, a, b) (dst) = (a) & (b)

/* dst = a OR b */
#define vor(dst, a, b) (dst) = (a) | (b)

/* dst = a AND (NOT b) */
#define vandn(dst, a, b) (dst) = (a) & ~(b)

/* dst = a XOR b, built on vxorf() */
#define vxor(dst, a, b) (dst) = vxorf((a), (b))

/* dst = src shifted left by shift bits */
#define vshl(dst, src, shift) (dst) = (src) << (shift)

/* dst = src shifted right by shift bits */
#define vshr(dst, src, shift) (dst) = (src) >> (shift)

/* dst = src shifted left by one bit, built on vshl() */
#define vshl1(dst, src) vshl((dst), (src), 1)
51 
/*
 * vsel(dst, a, b, c): per-bit select. Each result bit is taken from b where
 * the corresponding bit of c is 1, and from a where it is 0. Like the other
 * assignment macros, the expansion has no trailing semicolon.
 *
 * Three interchangeable forms:
 *   - HAVE_LUT3: lut3() with table 0xd8. That constant equals
 *     (0xF0 & ~0xAA) | (0xCC & 0xAA), i.e. the same select under the usual
 *     three-input LUT convention (lut3 itself is defined elsewhere -- confirm).
 *   - neither _NV nor __CPU__: the bitselect() built-in.
 *   - otherwise (_NV or __CPU__): plain AND/XOR logic.
 */
#if HAVE_LUT3
#define vsel(dst, a, b, c) (dst) = lut3(a, b, c, 0xd8)
#elif !defined(_NV) && !__CPU__
#define vsel(dst, a, b, c) (dst) = bitselect((a),(b),(c))
#else
#define vsel(dst, a, b, c) (dst) = (((a) & ~(c)) ^ ((b) & (c)))
#endif
61 
62 #if defined(_NV) || __CPU__
63 #include "opencl_sboxes.h"
64 #else
65 #include "opencl_sboxes-s.h"
66 #endif
67 
/*
 * vst_private(dst, ofs, src): store src into the element located ofs
 * DES_bs_vector slots past the address of dst. The address is computed as a
 * __private DES_bs_vector pointer and written through a __private vtype
 * pointer. The expansion has no trailing semicolon.
 */
#define vst_private(dst, ofs, src) 			\
	*((__private vtype *)((__private DES_bs_vector *)&(dst) + (ofs))) = (src)

/*
 * DES_bs_clear_block_8(j): write `zero` to the eight consecutive elements
 * B[j] .. B[j + 7]. Both `B` and `zero` are taken from the scope where the
 * macro is expanded. Note that the expansion already ends with a semicolon.
 */
#define DES_bs_clear_block_8(j) 			\
	vst_private(B[j] , 0, zero); 			\
	vst_private(B[j] , 1, zero); 			\
	vst_private(B[j] , 2, zero); 			\
	vst_private(B[j] , 3, zero); 			\
	vst_private(B[j] , 4, zero); 			\
	vst_private(B[j] , 5, zero); 			\
	vst_private(B[j] , 6, zero); 			\
	vst_private(B[j] , 7, zero);

/*
 * DES_bs_clear_block: write `zero` to all 64 elements B[0] .. B[63], eight at
 * a time. Expands to multiple statements and ends with a semicolon, so it
 * must not be used as the unbraced body of an if/for/while.
 */
#define DES_bs_clear_block 				\
	DES_bs_clear_block_8(0); 			\
	DES_bs_clear_block_8(8); 			\
	DES_bs_clear_block_8(16); 			\
	DES_bs_clear_block_8(24); 			\
	DES_bs_clear_block_8(32); 			\
	DES_bs_clear_block_8(40); 			\
	DES_bs_clear_block_8(48); 			\
	DES_bs_clear_block_8(56);
90 
/*
 * Compare the 64 bitsliced vectors in B against every loaded hash and record
 * each hash the first time any lane matches it.
 *
 * B                    - 64 vectors; B[k] carries bit k of the computed
 *                        value, one candidate per bit position (lane).
 * uncracked_hashes     - two planes of num_uncracked_hashes ints: element i
 *                        holds bits 0..31 of hash i, and element
 *                        i + num_uncracked_hashes holds bits 32..63.
 * num_uncracked_hashes - number of hashes to test.
 * hash_ids             - hash_ids[0] is an atomically incremented hit
 *                        counter; for hit number s, hash_ids[1 + 2 * s]
 *                        receives section and hash_ids[2 + 2 * s] receives 0.
 * bitmap_dupe          - one bit per hash index; set atomically so that a
 *                        given hash is reported at most once.
 * cracked_hashes       - for hit number s, elements s * 64 .. s * 64 + 63
 *                        receive a copy of B[0..63].
 * section              - value stored alongside each reported hit.
 */
inline void cmp(__private unsigned DES_bs_vector *B,
	  __global int *uncracked_hashes,
	  int num_uncracked_hashes,
	  volatile __global uint *hash_ids,
	  volatile __global uint *bitmap_dupe,
	  __global DES_bs_vector *cracked_hashes,
	  int section) {

	int idx;

	for (idx = 0; idx < num_uncracked_hashes; idx++) {
		const int lo = uncracked_hashes[idx];
		const int hi = uncracked_hashes[idx + num_uncracked_hashes];
		const uint seen_bit = 1U << (idx % 32);
		int diff, k, slot;

		/*
		 * -(bit) broadcasts the hash bit to all lanes. After OR-ing all
		 * 64 positions, a lane's bit in diff is 0 only if that lane
		 * agreed with the hash on every bit.
		 */
		diff = B[0] ^ -(lo & 1);

		for (k = 1; k < 32; k++)
			diff |= B[k] ^ -((lo >> k) & 1);

		/* Upper 32 bits, handled two positions per iteration. */
		for (k = 0; k < 32; k += 2) {
			diff |= B[32 + k] ^ -((hi >> k) & 1);
			diff |= B[33 + k] ^ -((hi >> (k + 1)) & 1);
		}

		/*
		 * diff != all-ones means at least one lane matched. The && is
		 * short-circuit, so the bitmap is only touched on a match; the
		 * atomic_or returns the previous word, which tells us whether
		 * this hash was already reported.
		 */
		if (diff != ~(int)0 &&
		    !(atomic_or(&bitmap_dupe[idx / 32], seen_bit) & seen_bit)) {
			slot = atomic_inc(&hash_ids[0]);
			hash_ids[1 + 2 * slot] = section;
			hash_ids[2 + 2 * slot] = 0;
			for (k = 0; k < 64; k++)
				cracked_hashes[slot * 64 + k] = (DES_bs_vector)B[k];
		}
	}
}
128