1 #include "opencl_DES_hst_dev_shared.h"
2 #include "opencl_device_info.h"
3
4 typedef unsigned WORD vtype;
5
6 /*
7 * Some devices/drivers has problems with the optimized 'goto' program flow.
8 * Some AMD driver versions can't build the "fast goto" version but those who
9 * can runs faster. Hawaii on 14.9 fails, Tahiti on 14.9 does not (!?).
10 *
11 * Nvidia can build either kernel but GTX980 is significantly faster with the
12 * "safe goto" version (7% faster for one salt, 16% for many salts).
13 *
14 * OSX' Intel HD4000 driver [1.2(Sep25 2014 22:26:04)] fails building the
15 * "fast goto" version.
16 */
17 #if nvidia_sm_5x(DEVICE_INFO) || gpu_intel(DEVICE_INFO) || __MESA__ || \
18 (gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1573 && !defined(__Tahiti__)) || \
19 (gpu_amd(DEVICE_INFO) && DEV_VER_MAJOR >= 1702)
20 //#warning Using 'safe goto' kernel
21 #define SAFE_GOTO
22 #else
23 //#warning Using 'fast goto' kernel
24 #endif
25
26 #if no_byte_addressable(DEVICE_INFO)
27 #define RV7xx
28 #endif
29 #if gpu_nvidia(DEVICE_INFO)
30 #define _NV
31 #endif
32
33 #define vxorf(a, b) \
34 ((a) ^ (b))
35 #define vnot(dst, a) \
36 (dst) = ~(a)
37 #define vand(dst, a, b) \
38 (dst) = (a) & (b)
39 #define vor(dst, a, b) \
40 (dst) = (a) | (b)
41 #define vandn(dst, a, b) \
42 (dst) = (a) & ~(b)
43 #define vxor(dst, a, b) \
44 (dst) = vxorf((a), (b))
45 #define vshl(dst, src, shift) \
46 (dst) = (src) << (shift)
47 #define vshr(dst, src, shift) \
48 (dst) = (src) >> (shift)
49 #define vshl1(dst, src) \
50 vshl((dst), (src), 1)
51
52 #if HAVE_LUT3
53 #define vsel(dst, a, b, c) (dst) = lut3(a, b, c, 0xd8)
54 #elif defined(_NV) || __CPU__
55 #define vsel(dst, a, b, c) \
56 (dst) = (((a) & ~(c)) ^ ((b) & (c)))
57 #else
58 #define vsel(dst, a, b, c) \
59 (dst) = bitselect((a),(b),(c))
60 #endif
61
62 #if defined(_NV) || __CPU__
63 #include "opencl_sboxes.h"
64 #else
65 #include "opencl_sboxes-s.h"
66 #endif
67
68 #define vst_private(dst, ofs, src) \
69 *((__private vtype *)((__private DES_bs_vector *)&(dst) + (ofs))) = (src)
70
71 #define DES_bs_clear_block_8(j) \
72 vst_private(B[j] , 0, zero); \
73 vst_private(B[j] , 1, zero); \
74 vst_private(B[j] , 2, zero); \
75 vst_private(B[j] , 3, zero); \
76 vst_private(B[j] , 4, zero); \
77 vst_private(B[j] , 5, zero); \
78 vst_private(B[j] , 6, zero); \
79 vst_private(B[j] , 7, zero);
80
81 #define DES_bs_clear_block \
82 DES_bs_clear_block_8(0); \
83 DES_bs_clear_block_8(8); \
84 DES_bs_clear_block_8(16); \
85 DES_bs_clear_block_8(24); \
86 DES_bs_clear_block_8(32); \
87 DES_bs_clear_block_8(40); \
88 DES_bs_clear_block_8(48); \
89 DES_bs_clear_block_8(56);
90
/*
 * Compare the 64 bitsliced ciphertext bits in B against every uncracked
 * hash and record any lane (candidate key) that matches one of them.
 *
 * B                    64 bitslice vectors; bit k of B[j] is output bit j
 *                      of candidate k in this work-item's section.
 * uncracked_hashes     2 * num_uncracked_hashes ints: low 32 bits of hash i
 *                      at [i], high 32 bits at [i + num_uncracked_hashes].
 * hash_ids             [0] = global count of cracked hashes (atomically
 *                      incremented); per crack, two slots hold the section
 *                      id and a zero placeholder.
 * bitmap_dupe          one bit per uncracked hash; atomic_or makes sure
 *                      each hash is reported only once across work-items.
 * cracked_hashes       64 vectors per crack copied out of B so the host
 *                      can identify the matching lane.
 * section              this work-item's index, stored with the crack.
 */
inline void cmp(__private unsigned DES_bs_vector *B,
		__global int *uncracked_hashes,
		int num_uncracked_hashes,
		volatile __global uint *hash_ids,
		volatile __global uint *bitmap_dupe,
		__global DES_bs_vector *cracked_hashes,
		int section) {

	int value[2], mask, i, bit;

	for (i = 0; i < num_uncracked_hashes; i++) {

		/* Low and high 32 bits of the i-th target hash. */
		value[0] = uncracked_hashes[i];
		value[1] = uncracked_hashes[i + num_uncracked_hashes];

		/*
		 * -(b) turns bit b into an all-ones/all-zeros lane mask, so a
		 * lane bit of mask stays 0 only if that lane matches the target
		 * in every one of the 64 output bits.
		 */
		mask = B[0] ^ -(value[0] & 1);

		for (bit = 1; bit < 32; bit++)
			mask |= B[bit] ^ -((value[0] >> bit) & 1);

		/* Bits 32..63 come from value[1]; unrolled two at a time. */
		for (; bit < 64; bit += 2) {
			mask |= B[bit] ^ -((value[1] >> (bit & 0x1F)) & 1);
			mask |= B[bit + 1] ^ -((value[1] >> ((bit + 1) & 0x1F)) & 1);
		}

		if (mask != ~(int)0) {
			/* Some lane matched hash i; claim it exactly once. */
			if (!(atomic_or(&bitmap_dupe[i / 32], (1U << (i % 32))) & (1U << (i % 32)))) {
				mask = atomic_inc(&hash_ids[0]);	/* reuse mask as crack index */
				hash_ids[1 + 2 * mask] = section;
				hash_ids[2 + 2 * mask] = 0;
				/* Copy the whole block; host locates the matching lane. */
				for (bit = 0; bit < 64; bit++)
					cracked_hashes[mask * 64 + bit] = (DES_bs_vector)B[bit];

			}
		}
	}
}
128