1 /*
2  * OpenCL RC4
3  *
4  * Copyright (c) 2014, magnum
5  * This software is hereby released to the general public under
6  * the following terms: Redistribution and use in source and binary
7  * forms, with or without modification, are permitted.
8  *
9  * NOTICE: After changes in headers, you probably need to drop cached
10  * kernels to ensure the changes take effect.
11  *
12  */
13 
14 #ifndef _OPENCL_RC4_H
15 #define _OPENCL_RC4_H
16 
17 #include "opencl_misc.h"
18 
/*
 * RC4_IV32: rc4() fills the initial state from the rc4_iv word table, one
 * uint at a time, instead of storing it byte by byte.
 */
#define RC4_IV32

/*
 * The unrolled key setup and the unrolled (32-bit xor) cipher loop are
 * enabled unless gpu_amd(DEVICE_INFO) is true and DEV_VER_MAJOR >= 1445.
 */
#if !gpu_amd(DEVICE_INFO) || DEV_VER_MAJOR < 1445
/* bug in Catalyst 14.9, besides it is slower */
#define RC4_UNROLLED_KEY
#define RC4_UNROLLED
#endif

/*
 * RC4_USE_LOCAL: rc4() takes its state buffer as a __local pointer supplied
 * by the caller; when it is not defined, rc4() declares the state array
 * itself.
 */
#if !defined(__OS_X__) && __GPU__ /* Actually we want discrete GPUs */
#define RC4_USE_LOCAL
#endif
30 
#ifdef RC4_IV32
/*
 * Initial RC4 state: the values 0..255 in ascending order, packed four per
 * uint with the lowest value in the least significant byte. rc4() copies
 * this table word by word into its state buffer.
 */
__constant uint rc4_iv[64] = { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
                               0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,
                               0x23222120, 0x27262524, 0x2b2a2928, 0x2f2e2d2c,
                               0x33323130, 0x37363534, 0x3b3a3938, 0x3f3e3d3c,
                               0x43424140, 0x47464544, 0x4b4a4948, 0x4f4e4d4c,
                               0x53525150, 0x57565554, 0x5b5a5958, 0x5f5e5d5c,
                               0x63626160, 0x67666564, 0x6b6a6968, 0x6f6e6d6c,
                               0x73727170, 0x77767574, 0x7b7a7978, 0x7f7e7d7c,
                               0x83828180, 0x87868584, 0x8b8a8988, 0x8f8e8d8c,
                               0x93929190, 0x97969594, 0x9b9a9998, 0x9f9e9d9c,
                               0xa3a2a1a0, 0xa7a6a5a4, 0xabaaa9a8, 0xafaeadac,
                               0xb3b2b1b0, 0xb7b6b5b4, 0xbbbab9b8, 0xbfbebdbc,
                               0xc3c2c1c0, 0xc7c6c5c4, 0xcbcac9c8, 0xcfcecdcc,
                               0xd3d2d1d0, 0xd7d6d5d4, 0xdbdad9d8, 0xdfdedddc,
                               0xe3e2e1e0, 0xe7e6e5e4, 0xebeae9e8, 0xefeeedec,
                               0xf3f2f1f0, 0xf7f6f5f4, 0xfbfaf9f8, 0xfffefdfc };
#endif
49 
/*
 * Without RC4_USE_LOCAL the state is an ordinary array declared inside
 * rc4(), so the "_L" accessors are simply aliased to the generic ones.
 */
#ifndef RC4_USE_LOCAL
#undef GETCHAR_L
#define GETCHAR_L GETCHAR
#undef PUTCHAR_L
#define PUTCHAR_L PUTCHAR
#endif

/*
 * Helper macros for rc4(). They are deliberately not self-contained: each
 * one reads and writes variables named `state`, `key`, `index1` and
 * `index2` that must exist in the scope where the macro is expanded.
 */

/* Exchange the state bytes at positions a and b. */
#undef swap_byte
#define swap_byte(a, b) {	  \
		uint tmp = GETCHAR_L(state, a); \
		PUTCHAR_L(state, a, GETCHAR_L(state, b)); \
		PUTCHAR_L(state, b, tmp); \
	}

/*
 * One key-schedule step for state position n: advance index2 by the current
 * key byte plus state[n], then swap state[n] with state[index2]. Neither n
 * nor index1 is modified.
 */
#undef swap_no_inc
#define swap_no_inc(n) {	  \
		index2 = (GETCHAR(key, index1) + GETCHAR_L(state, n) + index2) & 255; \
		swap_byte(n, index2); \
	}

/*
 * Key-schedule step that also advances the key index, wrapping it at the
 * fixed key length of 16. The caller advances n.
 */
#undef swap_state
#define swap_state(n) {	  \
		swap_no_inc(n); \
		index1 = (index1 + 1) & 15; /* WARNING: &15 == %keylen */ \
	}

/*
 * Key-schedule step that post-increments both the key index and n (so n
 * must be a modifiable lvalue). The key index is NOT wrapped here; the
 * caller resets it after every 16 steps.
 */
#undef swap_and_inc
#define swap_and_inc(n) {	  \
		swap_no_inc(n); \
		index1++; n++; \
	}
78 
79 /*
80  * One-shot RC4 with fixed keylen of 16. No byte addressed stores.
81  */
rc4(__local uint * restrict state,const uint * restrict key,uint * buf,uint len)82 inline void rc4(
83 #ifdef RC4_USE_LOCAL
84                 __local uint *restrict state,
85 #endif
86                 const uint *restrict key,
87 #ifdef RC4_IN_PLACE
88                 uint *buf,
89 #else
90                 MAYBE_CONSTANT uint *restrict in,
91                 __global uint *restrict out,
92 #endif
93                 uint len)
94 {
95 	uint x;
96 	uint y = 0;
97 	uint index1 = 0;
98 	uint index2 = 0;
99 #ifndef RC4_USE_LOCAL
100 	uint state[256/4];
101 #endif
102 
103 	/* RC4_init() */
104 #ifdef RC4_IV32
105 	for (x = 0; x < 256/4; x++)
106 		state[x] = rc4_iv[x];
107 #else
108 	for (x = 0; x < 256; x++)
109 		PUTCHAR_L(state, x, x);
110 #endif
111 
112 	/* RC4_set_key() */
113 #ifdef RC4_UNROLLED_KEY
114 	/* Unrolled for hard-coded key length 16 */
115 	for (x = 0; x < 256; x++) {
116 		swap_and_inc(x);
117 		swap_and_inc(x);
118 		swap_and_inc(x);
119 		swap_and_inc(x);
120 		swap_and_inc(x);
121 		swap_and_inc(x);
122 		swap_and_inc(x);
123 		swap_and_inc(x);
124 		swap_and_inc(x);
125 		swap_and_inc(x);
126 		swap_and_inc(x);
127 		swap_and_inc(x);
128 		swap_and_inc(x);
129 		swap_and_inc(x);
130 		swap_and_inc(x);
131 		swap_no_inc(x);
132 		index1 = 0;
133 	}
134 #else
135 	for (x = 0; x < 256; x++)
136 		swap_state(x);
137 #endif
138 
139 	/* RC4() */
140 #ifdef RC4_UNROLLED
141 	/* Unrolled to 32-bit xor */
142 	for (x = 1; x <= len; x++) {
143 		uint xor_word;
144 
145 		y = (GETCHAR_L(state, x) + y) & 255;
146 		swap_byte(x, y);
147 		xor_word = GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255);
148 		x++;
149 
150 		y = (GETCHAR_L(state, x) + y) & 255;
151 		swap_byte(x, y);
152 		xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 8;
153 		x++;
154 
155 		y = (GETCHAR_L(state, x) + y) & 255;
156 		swap_byte(x, y);
157 		xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 16;
158 		x++;
159 
160 		y = (GETCHAR_L(state, x) + y) & 255;
161 		swap_byte(x, y);
162 		xor_word += GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255) << 24;
163 
164 #ifdef RC4_IN_PLACE
165 		*buf++ ^= xor_word;
166 #else
167 		*out++ = *in++ ^ xor_word;
168 #endif
169 	}
170 #else /* RC4_UNROLLED */
171 #pragma unroll
172 	for (x = 1; x <= len; x++) {
173 		y = (GETCHAR_L(state, x) + y) & 255;
174 		swap_byte(x, y);
175 #ifdef RC4_IN_PLACE
176 		XORCHAR(buf, x - 1, GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255));
177 #else
178 		PUTCHAR_G(out, x - 1, GETCHAR_MC(in, x - 1) ^ (GETCHAR_L(state, (GETCHAR_L(state, x) + GETCHAR_L(state, y)) & 255)));
179 #endif
180 	}
181 #endif /* RC4_UNROLLED */
182 }
183 
184 #endif /* _OPENCL_RC4_H */
185