1 /*
2 * OpenCL RC4
3 *
4 * Copyright (c) 2014, magnum
5 * This software is hereby released to the general public under
6 * the following terms: Redistribution and use in source and binary
7 * forms, with or without modification, are permitted.
8 *
9 * NOTICE: After changes in headers, you probably need to drop cached
10 * kernels to ensure the changes take effect.
11 *
12 */
13
14 #ifndef _OPENCL_RC4_H
15 #define _OPENCL_RC4_H
16
17 #include "opencl_misc.h"
18
/*
 * Build-time switches for rc4() below.
 *
 * RC4_IV32: initialise the state by copying the 64-word rc4_iv table
 * instead of storing 256 individual bytes.
 */
#define RC4_IV32

/*
 * RC4_UNROLLED_KEY selects the key schedule that is unrolled for the
 * 16-byte key; RC4_UNROLLED selects the keystream loop that produces one
 * 32-bit word per iteration. Both are left undefined for AMD devices
 * whose DEV_VER_MAJOR is 1445 or higher.
 */
#if !gpu_amd(DEVICE_INFO) || DEV_VER_MAJOR < 1445
/* bug in Catalyst 14.9, besides it is slower */
#define RC4_UNROLLED_KEY
#define RC4_UNROLLED
#endif

/*
 * RC4_USE_LOCAL: rc4() takes the state as a __local pointer argument
 * instead of declaring the state array itself.
 */
#if !defined(__OS_X__) && __GPU__ /* Actually we want discrete GPUs */
#define RC4_USE_LOCAL
#endif
30
#ifdef RC4_IV32
/*
 * Initial RC4 state packed as 64 32-bit words. Word k holds the byte
 * values 4k .. 4k+3, with the lowest value in the least significant byte,
 * so the table as a whole enumerates 0x00 .. 0xff. rc4() copies it into
 * the state word by word.
 */
__constant uint rc4_iv[64] = { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
                               0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,
                               0x23222120, 0x27262524, 0x2b2a2928, 0x2f2e2d2c,
                               0x33323130, 0x37363534, 0x3b3a3938, 0x3f3e3d3c,
                               0x43424140, 0x47464544, 0x4b4a4948, 0x4f4e4d4c,
                               0x53525150, 0x57565554, 0x5b5a5958, 0x5f5e5d5c,
                               0x63626160, 0x67666564, 0x6b6a6968, 0x6f6e6d6c,
                               0x73727170, 0x77767574, 0x7b7a7978, 0x7f7e7d7c,
                               0x83828180, 0x87868584, 0x8b8a8988, 0x8f8e8d8c,
                               0x93929190, 0x97969594, 0x9b9a9998, 0x9f9e9d9c,
                               0xa3a2a1a0, 0xa7a6a5a4, 0xabaaa9a8, 0xafaeadac,
                               0xb3b2b1b0, 0xb7b6b5b4, 0xbbbab9b8, 0xbfbebdbc,
                               0xc3c2c1c0, 0xc7c6c5c4, 0xcbcac9c8, 0xcfcecdcc,
                               0xd3d2d1d0, 0xd7d6d5d4, 0xdbdad9d8, 0xdfdedddc,
                               0xe3e2e1e0, 0xe7e6e5e4, 0xebeae9e8, 0xefeeedec,
                               0xf3f2f1f0, 0xf7f6f5f4, 0xfbfaf9f8, 0xfffefdfc };
#endif
49
/*
 * Without RC4_USE_LOCAL the state is an array declared inside rc4() rather
 * than a __local pointer argument, so the state accessors are redirected
 * to the generic GETCHAR/PUTCHAR macros.
 */
#ifndef RC4_USE_LOCAL
#undef GETCHAR_L
#define GETCHAR_L GETCHAR
#undef PUTCHAR_L
#define PUTCHAR_L PUTCHAR
#endif
56
/*
 * Helper macros for rc4(). They expand in place and rely on the caller
 * having `state`, `key`, `index1` and `index2` in scope. Arguments are
 * evaluated more than once, so pass plain variables or constants only.
 */

/* Exchange the state bytes at positions a and b. */
#undef swap_byte
#define swap_byte(a, b) { \
		uint tmp = GETCHAR_L(state, a); \
		PUTCHAR_L(state, a, GETCHAR_L(state, b)); \
		PUTCHAR_L(state, b, tmp); \
	}

/*
 * One key-schedule step for position n: fold key[index1] and state[n]
 * into index2, then swap state[n] with state[index2]. Neither index1 nor
 * n is advanced.
 */
#undef swap_no_inc
#define swap_no_inc(n) { \
		index2 = (GETCHAR(key, index1) + GETCHAR_L(state, n) + index2) & 255; \
		swap_byte(n, index2); \
	}

/* Key-schedule step that also advances index1, wrapping at the key length. */
#undef swap_state
#define swap_state(n) { \
		swap_no_inc(n); \
		index1 = (index1 + 1) & 15; /* WARNING: &15 == %keylen */ \
	}

/*
 * Key-schedule step for unrolled use: advances both index1 and n without
 * any wrapping, so n must be a modifiable variable and the caller is
 * responsible for resetting index1.
 */
#undef swap_and_inc
#define swap_and_inc(n) { \
		swap_no_inc(n); \
		index1++; n++; \
	}
78
79 /*
80 * One-shot RC4 with fixed keylen of 16. No byte addressed stores.
81 */
rc4(__local uint * restrict state,const uint * restrict key,uint * buf,uint len)82 inline void rc4(
83 #ifdef RC4_USE_LOCAL
84 __local uint *restrict state,
85 #endif
86 const uint *restrict key,
87 #ifdef RC4_IN_PLACE
88 uint *buf,
89 #else
90 MAYBE_CONSTANT uint *restrict in,
91 __global uint *restrict out,
92 #endif
93 uint len)
94 {
95 uint x;
96 uint y = 0;
97 uint index1 = 0;
98 uint index2 = 0;
99 #ifndef RC4_USE_LOCAL
100 uint state[256/4];
101 #endif
102
103 /* RC4_init() */
104 #ifdef RC4_IV32
105 for (x = 0; x < 256/4; x++)
106 state[x] = rc4_iv[x];
107 #else
108 for (x = 0; x < 256; x++)
109 PUTCHAR_L(state, x, x);
110 #endif
111
112 /* RC4_set_key() */
113 #ifdef RC4_UNROLLED_KEY
114 /* Unrolled for hard-coded key length 16 */
115 for (x = 0; x < 256; x++) {
116 swap_and_inc(x);
117 swap_and_inc(x);
118 swap_and_inc(x);
119 swap_and_inc(x);
120 swap_and_inc(x);
121 swap_and_inc(x);
122 swap_and_inc(x);
123 swap_and_inc(x);
124 swap_and_inc(x);
125 swap_and_inc(x);
126 swap_and_inc(x);
127 swap_and_inc(x);
128 swap_and_inc(x);
129 swap_and_inc(x);
130 swap_and_inc(x);
131 swap_no_inc(x);
132 index1 = 0;
133 }
134 #else
135 for (x = 0; x < 256; x++)
136 swap_state(x);
137 #endif
138
139 /* RC4() */
140 #ifdef RC4_UNROLLED
141 /* Unrolled to 32-bit xor */
142 for (x = 1; x <= len; x++) {
143 uint xor_word;
144
145 y = (GETCHAR_L(state, x) + y) & 255;
146 swap_byte(x, y);
147 xor_word = GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255);
148 x++;
149
150 y = (GETCHAR_L(state, x) + y) & 255;
151 swap_byte(x, y);
152 xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 8;
153 x++;
154
155 y = (GETCHAR_L(state, x) + y) & 255;
156 swap_byte(x, y);
157 xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 16;
158 x++;
159
160 y = (GETCHAR_L(state, x) + y) & 255;
161 swap_byte(x, y);
162 xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 24;
163
164 #ifdef RC4_IN_PLACE
165 *buf++ ^= xor_word;
166 #else
167 *out++ = *in++ ^ xor_word;
168 #endif
169 }
170 #else /* RC4_UNROLLED */
171 #pragma unroll
172 for (x = 1; x <= len; x++) {
173 y = (GETCHAR_L(state, x) + y) & 255;
174 swap_byte(x, y);
175 #ifdef RC4_IN_PLACE
176 XORCHAR(buf, x - 1, GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255));
177 #else
178 PUTCHAR_G(out, x - 1, GETCHAR_MC(in, x - 1) ^ (GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255)));
179 #endif
180 }
181 #endif /* RC4_UNROLLED */
182 }
183
184 #endif /* _OPENCL_RC4_H */
185