1 /**
2 * Author......: Jens Steube <jens.steube@gmail.com>
3 * License.....: MIT
4 */
5
/*
 * Four 256-entry lookup tables, one per byte position of a 32-bit word.
 * The round macros index tables[n] with byte n of a word and XOR the four
 * results together.  Judging by the data, the sub-tables occupy disjoint bit
 * ranges of the result:
 *
 *   tables[0] -> 0x0007f800
 *   tables[1] -> 0x07f80000
 *   tables[2] -> 0xf8000007
 *   tables[3] -> 0x000007f8
 *
 * NOTE(review): this looks like a byte-wise S-box substitution pre-combined
 * with an 11-bit left rotation; confirm against the algorithm reference.
 */
static const uint32_t tables[4][256] =
{
  /* tables[0]: indexed by bits 0..7 */
  {
    0x00072000, 0x00075000, 0x00074800, 0x00071000, 0x00076800, 0x00074000, 0x00070000, 0x00077000,
    0x00073000, 0x00075800, 0x00070800, 0x00076000, 0x00073800, 0x00077800, 0x00072800, 0x00071800,
    0x0005a000, 0x0005d000, 0x0005c800, 0x00059000, 0x0005e800, 0x0005c000, 0x00058000, 0x0005f000,
    0x0005b000, 0x0005d800, 0x00058800, 0x0005e000, 0x0005b800, 0x0005f800, 0x0005a800, 0x00059800,
    0x00022000, 0x00025000, 0x00024800, 0x00021000, 0x00026800, 0x00024000, 0x00020000, 0x00027000,
    0x00023000, 0x00025800, 0x00020800, 0x00026000, 0x00023800, 0x00027800, 0x00022800, 0x00021800,
    0x00062000, 0x00065000, 0x00064800, 0x00061000, 0x00066800, 0x00064000, 0x00060000, 0x00067000,
    0x00063000, 0x00065800, 0x00060800, 0x00066000, 0x00063800, 0x00067800, 0x00062800, 0x00061800,
    0x00032000, 0x00035000, 0x00034800, 0x00031000, 0x00036800, 0x00034000, 0x00030000, 0x00037000,
    0x00033000, 0x00035800, 0x00030800, 0x00036000, 0x00033800, 0x00037800, 0x00032800, 0x00031800,
    0x0006a000, 0x0006d000, 0x0006c800, 0x00069000, 0x0006e800, 0x0006c000, 0x00068000, 0x0006f000,
    0x0006b000, 0x0006d800, 0x00068800, 0x0006e000, 0x0006b800, 0x0006f800, 0x0006a800, 0x00069800,
    0x0007a000, 0x0007d000, 0x0007c800, 0x00079000, 0x0007e800, 0x0007c000, 0x00078000, 0x0007f000,
    0x0007b000, 0x0007d800, 0x00078800, 0x0007e000, 0x0007b800, 0x0007f800, 0x0007a800, 0x00079800,
    0x00052000, 0x00055000, 0x00054800, 0x00051000, 0x00056800, 0x00054000, 0x00050000, 0x00057000,
    0x00053000, 0x00055800, 0x00050800, 0x00056000, 0x00053800, 0x00057800, 0x00052800, 0x00051800,
    0x00012000, 0x00015000, 0x00014800, 0x00011000, 0x00016800, 0x00014000, 0x00010000, 0x00017000,
    0x00013000, 0x00015800, 0x00010800, 0x00016000, 0x00013800, 0x00017800, 0x00012800, 0x00011800,
    0x0001a000, 0x0001d000, 0x0001c800, 0x00019000, 0x0001e800, 0x0001c000, 0x00018000, 0x0001f000,
    0x0001b000, 0x0001d800, 0x00018800, 0x0001e000, 0x0001b800, 0x0001f800, 0x0001a800, 0x00019800,
    0x00042000, 0x00045000, 0x00044800, 0x00041000, 0x00046800, 0x00044000, 0x00040000, 0x00047000,
    0x00043000, 0x00045800, 0x00040800, 0x00046000, 0x00043800, 0x00047800, 0x00042800, 0x00041800,
    0x0000a000, 0x0000d000, 0x0000c800, 0x00009000, 0x0000e800, 0x0000c000, 0x00008000, 0x0000f000,
    0x0000b000, 0x0000d800, 0x00008800, 0x0000e000, 0x0000b800, 0x0000f800, 0x0000a800, 0x00009800,
    0x00002000, 0x00005000, 0x00004800, 0x00001000, 0x00006800, 0x00004000, 0x00000000, 0x00007000,
    0x00003000, 0x00005800, 0x00000800, 0x00006000, 0x00003800, 0x00007800, 0x00002800, 0x00001800,
    0x0003a000, 0x0003d000, 0x0003c800, 0x00039000, 0x0003e800, 0x0003c000, 0x00038000, 0x0003f000,
    0x0003b000, 0x0003d800, 0x00038800, 0x0003e000, 0x0003b800, 0x0003f800, 0x0003a800, 0x00039800,
    0x0002a000, 0x0002d000, 0x0002c800, 0x00029000, 0x0002e800, 0x0002c000, 0x00028000, 0x0002f000,
    0x0002b000, 0x0002d800, 0x00028800, 0x0002e000, 0x0002b800, 0x0002f800, 0x0002a800, 0x00029800,
    0x0004a000, 0x0004d000, 0x0004c800, 0x00049000, 0x0004e800, 0x0004c000, 0x00048000, 0x0004f000,
    0x0004b000, 0x0004d800, 0x00048800, 0x0004e000, 0x0004b800, 0x0004f800, 0x0004a800, 0x00049800,
  },
  /* tables[1]: indexed by bits 8..15 */
  {
    0x03a80000, 0x03c00000, 0x03880000, 0x03e80000, 0x03d00000, 0x03980000, 0x03a00000, 0x03900000,
    0x03f00000, 0x03f80000, 0x03e00000, 0x03b80000, 0x03b00000, 0x03800000, 0x03c80000, 0x03d80000,
    0x06a80000, 0x06c00000, 0x06880000, 0x06e80000, 0x06d00000, 0x06980000, 0x06a00000, 0x06900000,
    0x06f00000, 0x06f80000, 0x06e00000, 0x06b80000, 0x06b00000, 0x06800000, 0x06c80000, 0x06d80000,
    0x05280000, 0x05400000, 0x05080000, 0x05680000, 0x05500000, 0x05180000, 0x05200000, 0x05100000,
    0x05700000, 0x05780000, 0x05600000, 0x05380000, 0x05300000, 0x05000000, 0x05480000, 0x05580000,
    0x00a80000, 0x00c00000, 0x00880000, 0x00e80000, 0x00d00000, 0x00980000, 0x00a00000, 0x00900000,
    0x00f00000, 0x00f80000, 0x00e00000, 0x00b80000, 0x00b00000, 0x00800000, 0x00c80000, 0x00d80000,
    0x00280000, 0x00400000, 0x00080000, 0x00680000, 0x00500000, 0x00180000, 0x00200000, 0x00100000,
    0x00700000, 0x00780000, 0x00600000, 0x00380000, 0x00300000, 0x00000000, 0x00480000, 0x00580000,
    0x04280000, 0x04400000, 0x04080000, 0x04680000, 0x04500000, 0x04180000, 0x04200000, 0x04100000,
    0x04700000, 0x04780000, 0x04600000, 0x04380000, 0x04300000, 0x04000000, 0x04480000, 0x04580000,
    0x04a80000, 0x04c00000, 0x04880000, 0x04e80000, 0x04d00000, 0x04980000, 0x04a00000, 0x04900000,
    0x04f00000, 0x04f80000, 0x04e00000, 0x04b80000, 0x04b00000, 0x04800000, 0x04c80000, 0x04d80000,
    0x07a80000, 0x07c00000, 0x07880000, 0x07e80000, 0x07d00000, 0x07980000, 0x07a00000, 0x07900000,
    0x07f00000, 0x07f80000, 0x07e00000, 0x07b80000, 0x07b00000, 0x07800000, 0x07c80000, 0x07d80000,
    0x07280000, 0x07400000, 0x07080000, 0x07680000, 0x07500000, 0x07180000, 0x07200000, 0x07100000,
    0x07700000, 0x07780000, 0x07600000, 0x07380000, 0x07300000, 0x07000000, 0x07480000, 0x07580000,
    0x02280000, 0x02400000, 0x02080000, 0x02680000, 0x02500000, 0x02180000, 0x02200000, 0x02100000,
    0x02700000, 0x02780000, 0x02600000, 0x02380000, 0x02300000, 0x02000000, 0x02480000, 0x02580000,
    0x03280000, 0x03400000, 0x03080000, 0x03680000, 0x03500000, 0x03180000, 0x03200000, 0x03100000,
    0x03700000, 0x03780000, 0x03600000, 0x03380000, 0x03300000, 0x03000000, 0x03480000, 0x03580000,
    0x06280000, 0x06400000, 0x06080000, 0x06680000, 0x06500000, 0x06180000, 0x06200000, 0x06100000,
    0x06700000, 0x06780000, 0x06600000, 0x06380000, 0x06300000, 0x06000000, 0x06480000, 0x06580000,
    0x05a80000, 0x05c00000, 0x05880000, 0x05e80000, 0x05d00000, 0x05980000, 0x05a00000, 0x05900000,
    0x05f00000, 0x05f80000, 0x05e00000, 0x05b80000, 0x05b00000, 0x05800000, 0x05c80000, 0x05d80000,
    0x01280000, 0x01400000, 0x01080000, 0x01680000, 0x01500000, 0x01180000, 0x01200000, 0x01100000,
    0x01700000, 0x01780000, 0x01600000, 0x01380000, 0x01300000, 0x01000000, 0x01480000, 0x01580000,
    0x02a80000, 0x02c00000, 0x02880000, 0x02e80000, 0x02d00000, 0x02980000, 0x02a00000, 0x02900000,
    0x02f00000, 0x02f80000, 0x02e00000, 0x02b80000, 0x02b00000, 0x02800000, 0x02c80000, 0x02d80000,
    0x01a80000, 0x01c00000, 0x01880000, 0x01e80000, 0x01d00000, 0x01980000, 0x01a00000, 0x01900000,
    0x01f00000, 0x01f80000, 0x01e00000, 0x01b80000, 0x01b00000, 0x01800000, 0x01c80000, 0x01d80000,
  },
  /* tables[2]: indexed by bits 16..23 */
  {
    0x30000002, 0x60000002, 0x38000002, 0x08000002, 0x28000002, 0x78000002, 0x68000002, 0x40000002,
    0x20000002, 0x50000002, 0x48000002, 0x70000002, 0x00000002, 0x18000002, 0x58000002, 0x10000002,
    0xb0000005, 0xe0000005, 0xb8000005, 0x88000005, 0xa8000005, 0xf8000005, 0xe8000005, 0xc0000005,
    0xa0000005, 0xd0000005, 0xc8000005, 0xf0000005, 0x80000005, 0x98000005, 0xd8000005, 0x90000005,
    0x30000005, 0x60000005, 0x38000005, 0x08000005, 0x28000005, 0x78000005, 0x68000005, 0x40000005,
    0x20000005, 0x50000005, 0x48000005, 0x70000005, 0x00000005, 0x18000005, 0x58000005, 0x10000005,
    0x30000000, 0x60000000, 0x38000000, 0x08000000, 0x28000000, 0x78000000, 0x68000000, 0x40000000,
    0x20000000, 0x50000000, 0x48000000, 0x70000000, 0x00000000, 0x18000000, 0x58000000, 0x10000000,
    0xb0000003, 0xe0000003, 0xb8000003, 0x88000003, 0xa8000003, 0xf8000003, 0xe8000003, 0xc0000003,
    0xa0000003, 0xd0000003, 0xc8000003, 0xf0000003, 0x80000003, 0x98000003, 0xd8000003, 0x90000003,
    0x30000001, 0x60000001, 0x38000001, 0x08000001, 0x28000001, 0x78000001, 0x68000001, 0x40000001,
    0x20000001, 0x50000001, 0x48000001, 0x70000001, 0x00000001, 0x18000001, 0x58000001, 0x10000001,
    0xb0000000, 0xe0000000, 0xb8000000, 0x88000000, 0xa8000000, 0xf8000000, 0xe8000000, 0xc0000000,
    0xa0000000, 0xd0000000, 0xc8000000, 0xf0000000, 0x80000000, 0x98000000, 0xd8000000, 0x90000000,
    0xb0000006, 0xe0000006, 0xb8000006, 0x88000006, 0xa8000006, 0xf8000006, 0xe8000006, 0xc0000006,
    0xa0000006, 0xd0000006, 0xc8000006, 0xf0000006, 0x80000006, 0x98000006, 0xd8000006, 0x90000006,
    0xb0000001, 0xe0000001, 0xb8000001, 0x88000001, 0xa8000001, 0xf8000001, 0xe8000001, 0xc0000001,
    0xa0000001, 0xd0000001, 0xc8000001, 0xf0000001, 0x80000001, 0x98000001, 0xd8000001, 0x90000001,
    0x30000003, 0x60000003, 0x38000003, 0x08000003, 0x28000003, 0x78000003, 0x68000003, 0x40000003,
    0x20000003, 0x50000003, 0x48000003, 0x70000003, 0x00000003, 0x18000003, 0x58000003, 0x10000003,
    0x30000004, 0x60000004, 0x38000004, 0x08000004, 0x28000004, 0x78000004, 0x68000004, 0x40000004,
    0x20000004, 0x50000004, 0x48000004, 0x70000004, 0x00000004, 0x18000004, 0x58000004, 0x10000004,
    0xb0000002, 0xe0000002, 0xb8000002, 0x88000002, 0xa8000002, 0xf8000002, 0xe8000002, 0xc0000002,
    0xa0000002, 0xd0000002, 0xc8000002, 0xf0000002, 0x80000002, 0x98000002, 0xd8000002, 0x90000002,
    0xb0000004, 0xe0000004, 0xb8000004, 0x88000004, 0xa8000004, 0xf8000004, 0xe8000004, 0xc0000004,
    0xa0000004, 0xd0000004, 0xc8000004, 0xf0000004, 0x80000004, 0x98000004, 0xd8000004, 0x90000004,
    0x30000006, 0x60000006, 0x38000006, 0x08000006, 0x28000006, 0x78000006, 0x68000006, 0x40000006,
    0x20000006, 0x50000006, 0x48000006, 0x70000006, 0x00000006, 0x18000006, 0x58000006, 0x10000006,
    0xb0000007, 0xe0000007, 0xb8000007, 0x88000007, 0xa8000007, 0xf8000007, 0xe8000007, 0xc0000007,
    0xa0000007, 0xd0000007, 0xc8000007, 0xf0000007, 0x80000007, 0x98000007, 0xd8000007, 0x90000007,
    0x30000007, 0x60000007, 0x38000007, 0x08000007, 0x28000007, 0x78000007, 0x68000007, 0x40000007,
    0x20000007, 0x50000007, 0x48000007, 0x70000007, 0x00000007, 0x18000007, 0x58000007, 0x10000007,
  },
  /* tables[3]: indexed by bits 24..31 */
  {
    0x000000e8, 0x000000d8, 0x000000a0, 0x00000088, 0x00000098, 0x000000f8, 0x000000a8, 0x000000c8,
    0x00000080, 0x000000d0, 0x000000f0, 0x000000b8, 0x000000b0, 0x000000c0, 0x00000090, 0x000000e0,
    0x000007e8, 0x000007d8, 0x000007a0, 0x00000788, 0x00000798, 0x000007f8, 0x000007a8, 0x000007c8,
    0x00000780, 0x000007d0, 0x000007f0, 0x000007b8, 0x000007b0, 0x000007c0, 0x00000790, 0x000007e0,
    0x000006e8, 0x000006d8, 0x000006a0, 0x00000688, 0x00000698, 0x000006f8, 0x000006a8, 0x000006c8,
    0x00000680, 0x000006d0, 0x000006f0, 0x000006b8, 0x000006b0, 0x000006c0, 0x00000690, 0x000006e0,
    0x00000068, 0x00000058, 0x00000020, 0x00000008, 0x00000018, 0x00000078, 0x00000028, 0x00000048,
    0x00000000, 0x00000050, 0x00000070, 0x00000038, 0x00000030, 0x00000040, 0x00000010, 0x00000060,
    0x000002e8, 0x000002d8, 0x000002a0, 0x00000288, 0x00000298, 0x000002f8, 0x000002a8, 0x000002c8,
    0x00000280, 0x000002d0, 0x000002f0, 0x000002b8, 0x000002b0, 0x000002c0, 0x00000290, 0x000002e0,
    0x000003e8, 0x000003d8, 0x000003a0, 0x00000388, 0x00000398, 0x000003f8, 0x000003a8, 0x000003c8,
    0x00000380, 0x000003d0, 0x000003f0, 0x000003b8, 0x000003b0, 0x000003c0, 0x00000390, 0x000003e0,
    0x00000568, 0x00000558, 0x00000520, 0x00000508, 0x00000518, 0x00000578, 0x00000528, 0x00000548,
    0x00000500, 0x00000550, 0x00000570, 0x00000538, 0x00000530, 0x00000540, 0x00000510, 0x00000560,
    0x00000268, 0x00000258, 0x00000220, 0x00000208, 0x00000218, 0x00000278, 0x00000228, 0x00000248,
    0x00000200, 0x00000250, 0x00000270, 0x00000238, 0x00000230, 0x00000240, 0x00000210, 0x00000260,
    0x000004e8, 0x000004d8, 0x000004a0, 0x00000488, 0x00000498, 0x000004f8, 0x000004a8, 0x000004c8,
    0x00000480, 0x000004d0, 0x000004f0, 0x000004b8, 0x000004b0, 0x000004c0, 0x00000490, 0x000004e0,
    0x00000168, 0x00000158, 0x00000120, 0x00000108, 0x00000118, 0x00000178, 0x00000128, 0x00000148,
    0x00000100, 0x00000150, 0x00000170, 0x00000138, 0x00000130, 0x00000140, 0x00000110, 0x00000160,
    0x000001e8, 0x000001d8, 0x000001a0, 0x00000188, 0x00000198, 0x000001f8, 0x000001a8, 0x000001c8,
    0x00000180, 0x000001d0, 0x000001f0, 0x000001b8, 0x000001b0, 0x000001c0, 0x00000190, 0x000001e0,
    0x00000768, 0x00000758, 0x00000720, 0x00000708, 0x00000718, 0x00000778, 0x00000728, 0x00000748,
    0x00000700, 0x00000750, 0x00000770, 0x00000738, 0x00000730, 0x00000740, 0x00000710, 0x00000760,
    0x00000368, 0x00000358, 0x00000320, 0x00000308, 0x00000318, 0x00000378, 0x00000328, 0x00000348,
    0x00000300, 0x00000350, 0x00000370, 0x00000338, 0x00000330, 0x00000340, 0x00000310, 0x00000360,
    0x000005e8, 0x000005d8, 0x000005a0, 0x00000588, 0x00000598, 0x000005f8, 0x000005a8, 0x000005c8,
    0x00000580, 0x000005d0, 0x000005f0, 0x000005b8, 0x000005b0, 0x000005c0, 0x00000590, 0x000005e0,
    0x00000468, 0x00000458, 0x00000420, 0x00000408, 0x00000418, 0x00000478, 0x00000428, 0x00000448,
    0x00000400, 0x00000450, 0x00000470, 0x00000438, 0x00000430, 0x00000440, 0x00000410, 0x00000460,
    0x00000668, 0x00000658, 0x00000620, 0x00000608, 0x00000618, 0x00000678, 0x00000628, 0x00000648,
    0x00000600, 0x00000650, 0x00000670, 0x00000638, 0x00000630, 0x00000640, 0x00000610, 0x00000660,
  }
};
273
/*
 * round (k1, k2): two chained table-lookup steps.
 *
 * Operates on two uint32_t variables named `l` and `r` that must exist in the
 * scope where the macro is expanded, and on the file-level `tables`:
 *
 *   l ^= T (k1 + r);
 *   r ^= T (k2 + l);      (uses the already updated l)
 *
 * where T (t) XORs together tables[n][byte n of t] for n = 0..3.
 * The additions wrap modulo 2^32.
 */
#define round(k1,k2) \
{ \
  uint32_t t = (k1) + r; \
  l ^= tables[3][(t >> 24) & 0xff] \
     ^ tables[2][(t >> 16) & 0xff] \
     ^ tables[1][(t >>  8) & 0xff] \
     ^ tables[0][(t >>  0) & 0xff]; \
  t = (k2) + l; \
  r ^= tables[3][(t >> 24) & 0xff] \
     ^ tables[2][(t >> 16) & 0xff] \
     ^ tables[1][(t >>  8) & 0xff] \
     ^ tables[0][(t >>  0) & 0xff]; \
}
288
/*
 * R (k, h, s, i): run the 16 `round` invocations over one pair of words.
 *
 *   k  array of 8 key words, read as k[0]..k[7]
 *   h  input array;  h[i] is loaded into r, h[i + 1] into l
 *   s  output array; s[i] receives l, s[i + 1] receives r
 *   i  index of the first word of the pair
 *
 * Key schedule as written below: three passes over k[0]..k[7] in ascending
 * order, followed by one pass in descending order.
 *
 * Every macro parameter is parenthesized so that expression arguments
 * (e.g. `state + 8` or `n & 6`) index as the caller intends.
 * The locals must stay named `r` and `l`: `round` refers to them by name.
 */
#define R(k,h,s,i) \
{ \
  uint32_t r; \
  uint32_t l; \
  r = (h)[(i) + 0]; \
  l = (h)[(i) + 1]; \
  /* passes 1..3: ascending key order */ \
  round ((k)[0], (k)[1]); \
  round ((k)[2], (k)[3]); \
  round ((k)[4], (k)[5]); \
  round ((k)[6], (k)[7]); \
  round ((k)[0], (k)[1]); \
  round ((k)[2], (k)[3]); \
  round ((k)[4], (k)[5]); \
  round ((k)[6], (k)[7]); \
  round ((k)[0], (k)[1]); \
  round ((k)[2], (k)[3]); \
  round ((k)[4], (k)[5]); \
  round ((k)[6], (k)[7]); \
  /* pass 4: descending key order */ \
  round ((k)[7], (k)[6]); \
  round ((k)[5], (k)[4]); \
  round ((k)[3], (k)[2]); \
  round ((k)[1], (k)[0]); \
  /* halves are stored swapped relative to how they were loaded */ \
  (s)[(i) + 0] = l; \
  (s)[(i) + 1] = r; \
}
314
/*
 * X (w, u, v): w[n] = u[n] ^ v[n] for n = 0..7.
 *
 * The body is wrapped in a brace block so that the whole macro is guarded
 * when used as the body of an `if`/`for`/`while`, and every parameter is
 * parenthesized so pointer expressions such as `buf + 8` can be passed.
 * Arguments are evaluated more than once; do not pass expressions with side
 * effects.
 */
#define X(w,u,v) \
{ \
  (w)[0] = (u)[0] ^ (v)[0]; \
  (w)[1] = (u)[1] ^ (v)[1]; \
  (w)[2] = (u)[2] ^ (v)[2]; \
  (w)[3] = (u)[3] ^ (v)[3]; \
  (w)[4] = (u)[4] ^ (v)[4]; \
  (w)[5] = (u)[5] ^ (v)[5]; \
  (w)[6] = (u)[6] ^ (v)[6]; \
  (w)[7] = (u)[7] ^ (v)[7]; \
}
324
/*
 * P (k, w): byte transposition of eight 32-bit words.
 *
 * k[0..3] are built from the even words w[0], w[2], w[4], w[6]:
 *   byte j of k[n] = byte n of w[2 * j]          (n, j = 0..3)
 * k[4..7] are built the same way from the odd words w[1], w[3], w[5], w[7]:
 *   byte j of k[4 + n] = byte n of w[2 * j + 1]
 *
 * Wrapped in a brace block and with parenthesized parameters so the macro is
 * safe as a single statement and with expression arguments.  k must not
 * alias w: k[] is written while w[] is still being read.
 */
#define P(k,w) \
{ \
  (k)[0] = (((w)[0] & 0x000000ff) <<  0) \
         | (((w)[2] & 0x000000ff) <<  8) \
         | (((w)[4] & 0x000000ff) << 16) \
         | (((w)[6] & 0x000000ff) << 24); \
  (k)[1] = (((w)[0] & 0x0000ff00) >>  8) \
         | (((w)[2] & 0x0000ff00) >>  0) \
         | (((w)[4] & 0x0000ff00) <<  8) \
         | (((w)[6] & 0x0000ff00) << 16); \
  (k)[2] = (((w)[0] & 0x00ff0000) >> 16) \
         | (((w)[2] & 0x00ff0000) >>  8) \
         | (((w)[4] & 0x00ff0000) <<  0) \
         | (((w)[6] & 0x00ff0000) <<  8); \
  (k)[3] = (((w)[0] & 0xff000000) >> 24) \
         | (((w)[2] & 0xff000000) >> 16) \
         | (((w)[4] & 0xff000000) >>  8) \
         | (((w)[6] & 0xff000000) >>  0); \
  (k)[4] = (((w)[1] & 0x000000ff) <<  0) \
         | (((w)[3] & 0x000000ff) <<  8) \
         | (((w)[5] & 0x000000ff) << 16) \
         | (((w)[7] & 0x000000ff) << 24); \
  (k)[5] = (((w)[1] & 0x0000ff00) >>  8) \
         | (((w)[3] & 0x0000ff00) >>  0) \
         | (((w)[5] & 0x0000ff00) <<  8) \
         | (((w)[7] & 0x0000ff00) << 16); \
  (k)[6] = (((w)[1] & 0x00ff0000) >> 16) \
         | (((w)[3] & 0x00ff0000) >>  8) \
         | (((w)[5] & 0x00ff0000) <<  0) \
         | (((w)[7] & 0x00ff0000) <<  8); \
  (k)[7] = (((w)[1] & 0xff000000) >> 24) \
         | (((w)[3] & 0xff000000) >> 16) \
         | (((w)[5] & 0xff000000) >>  8) \
         | (((w)[7] & 0xff000000) >>  0); \
}
358
/*
 * A (x): in-place update of an 8-word array.
 *
 *   new x[0..5] = old x[2..7]
 *   new x[6]    = old x[0] ^ old x[2]
 *   new x[7]    = old x[1] ^ old x[3]
 *
 * The parameter is parenthesized so that pointer expressions such as
 * `state + 8` can be passed.  The argument is evaluated several times.
 */
#define A(x) \
{ \
  uint32_t l; \
  uint32_t r; \
  /* capture the XORs before x[0..3] are overwritten */ \
  l = (x)[0] ^ (x)[2]; \
  r = (x)[1] ^ (x)[3]; \
  (x)[0] = (x)[2]; \
  (x)[1] = (x)[3]; \
  (x)[2] = (x)[4]; \
  (x)[3] = (x)[5]; \
  (x)[4] = (x)[6]; \
  (x)[5] = (x)[7]; \
  (x)[6] = l; \
  (x)[7] = r; \
}
374
/*
 * AA (x): in-place update of an 8-word array, applied separately to the even
 * words (x[0], x[2], x[4], x[6]) and the odd words (x[1], x[3], x[5], x[7]).
 * For the even words:
 *
 *   new x[0] = old x[4]
 *   new x[2] = old x[6]
 *   new x[4] = old x[0] ^ old x[2]
 *   new x[6] = old x[4] ^ old x[2]     (reads the already updated x[0])
 *
 * and likewise for the odd words with every index + 1.
 *
 * The parameter is parenthesized so that pointer expressions can be passed.
 * Statement order matters; do not reorder.
 */
#define AA(x) \
{ \
  uint32_t l; \
  uint32_t r; \
  /* even words */ \
  l = (x)[0]; \
  r = (x)[2]; \
  (x)[0] = (x)[4]; \
  (x)[2] = (x)[6]; \
  (x)[4] = l ^ r; \
  (x)[6] = (x)[0] ^ r; \
  /* odd words */ \
  l = (x)[1]; \
  r = (x)[3]; \
  (x)[1] = (x)[5]; \
  (x)[3] = (x)[7]; \
  (x)[5] = l ^ r; \
  (x)[7] = (x)[1] ^ r; \
}
392
/*
 * C (x): XOR a fixed 8-word constant into x[0..7] in place.
 *
 * Wrapped in a brace block so the macro behaves as one statement, and the
 * parameter is parenthesized so pointer expressions can be passed.
 */
#define C(x) \
{ \
  (x)[0] ^= 0xff00ff00; \
  (x)[1] ^= 0xff00ff00; \
  (x)[2] ^= 0x00ff00ff; \
  (x)[3] ^= 0x00ff00ff; \
  (x)[4] ^= 0x00ffff00; \
  (x)[5] ^= 0xff0000ff; \
  (x)[6] ^= 0x000000ff; \
  (x)[7] ^= 0xff00ffff; \
}
402
/*
 * SHIFT12 (u, m, s): u[n] = m[n] ^ (fixed XOR combination of 16-bit halves of
 * s[0..7]) for n = 0..7.  Only XOR, 16-bit shifts and 16-bit masks are used.
 *
 * Terms are grouped per source word s[j]; XOR is commutative, so the grouping
 * does not affect the result.
 *
 * Wrapped in a brace block so the macro behaves as one statement, and all
 * parameters are parenthesized.  u must not alias s: u[] is written while
 * s[] is still being read.
 */
#define SHIFT12(u,m,s) \
{ \
  (u)[0] = (m)[0] ^ (s)[6]; \
  (u)[1] = (m)[1] ^ (s)[7]; \
  (u)[2] = (m)[2] \
         ^ ((s)[0] << 16) ^ ((s)[0] >> 16) ^ ((s)[0] & 0x0000ffff) \
         ^ ((s)[1] & 0x0000ffff) ^ ((s)[1] >> 16) \
         ^ ((s)[2] << 16) \
         ^ (s)[6] ^ ((s)[6] << 16) \
         ^ ((s)[7] & 0xffff0000) ^ ((s)[7] >> 16); \
  (u)[3] = (m)[3] \
         ^ ((s)[0] & 0x0000ffff) ^ ((s)[0] << 16) \
         ^ ((s)[1] & 0x0000ffff) ^ ((s)[1] << 16) ^ ((s)[1] >> 16) \
         ^ ((s)[2] << 16) ^ ((s)[2] >> 16) \
         ^ ((s)[3] << 16) \
         ^ (s)[6] ^ ((s)[6] << 16) ^ ((s)[6] >> 16) \
         ^ ((s)[7] & 0x0000ffff) ^ ((s)[7] << 16) ^ ((s)[7] >> 16); \
  (u)[4] = (m)[4] \
         ^ ((s)[0] & 0xffff0000) ^ ((s)[0] << 16) ^ ((s)[0] >> 16) \
         ^ ((s)[1] & 0xffff0000) ^ ((s)[1] >> 16) \
         ^ ((s)[2] << 16) ^ ((s)[2] >> 16) \
         ^ ((s)[3] << 16) ^ ((s)[3] >> 16) \
         ^ ((s)[4] << 16) \
         ^ ((s)[6] << 16) ^ ((s)[6] >> 16) \
         ^ ((s)[7] & 0x0000ffff) ^ ((s)[7] << 16) ^ ((s)[7] >> 16); \
  (u)[5] = (m)[5] \
         ^ ((s)[0] << 16) ^ ((s)[0] >> 16) ^ ((s)[0] & 0xffff0000) \
         ^ ((s)[1] & 0x0000ffff) \
         ^ (s)[2] ^ ((s)[2] >> 16) \
         ^ ((s)[3] << 16) ^ ((s)[3] >> 16) \
         ^ ((s)[4] << 16) ^ ((s)[4] >> 16) \
         ^ ((s)[5] << 16) \
         ^ ((s)[6] << 16) ^ ((s)[6] >> 16) \
         ^ ((s)[7] & 0xffff0000) ^ ((s)[7] << 16) ^ ((s)[7] >> 16); \
  (u)[6] = (m)[6] \
         ^ (s)[0] \
         ^ ((s)[1] >> 16) \
         ^ ((s)[2] << 16) \
         ^ (s)[3] ^ ((s)[3] >> 16) \
         ^ ((s)[4] << 16) ^ ((s)[4] >> 16) \
         ^ ((s)[5] << 16) ^ ((s)[5] >> 16) \
         ^ (s)[6] ^ ((s)[6] << 16) ^ ((s)[6] >> 16) \
         ^ ((s)[7] << 16); \
  (u)[7] = (m)[7] \
         ^ ((s)[0] & 0xffff0000) ^ ((s)[0] << 16) \
         ^ ((s)[1] & 0x0000ffff) ^ ((s)[1] << 16) \
         ^ ((s)[2] >> 16) \
         ^ ((s)[3] << 16) \
         ^ (s)[4] ^ ((s)[4] >> 16) \
         ^ ((s)[5] << 16) ^ ((s)[5] >> 16) \
         ^ ((s)[6] >> 16) \
         ^ ((s)[7] & 0x0000ffff) ^ ((s)[7] << 16) ^ ((s)[7] >> 16); \
}
488
/*
 * SHIFT16 (h, v, u): v[n] = h[n] ^ (combination of 16-bit halves of u[]).
 *
 * For n = 0..6 the term is (u[n + 1] << 16) ^ (u[n] >> 16), i.e. the 32-bit
 * window starting 16 bits into u[n].  v[7] uses the longer expression below.
 *
 * Wrapped in a brace block so the macro behaves as one statement, and all
 * parameters are parenthesized.  v must not alias u.
 */
#define SHIFT16(h,v,u) \
{ \
  (v)[0] = (h)[0] ^ ((u)[1] << 16) ^ ((u)[0] >> 16); \
  (v)[1] = (h)[1] ^ ((u)[2] << 16) ^ ((u)[1] >> 16); \
  (v)[2] = (h)[2] ^ ((u)[3] << 16) ^ ((u)[2] >> 16); \
  (v)[3] = (h)[3] ^ ((u)[4] << 16) ^ ((u)[3] >> 16); \
  (v)[4] = (h)[4] ^ ((u)[5] << 16) ^ ((u)[4] >> 16); \
  (v)[5] = (h)[5] ^ ((u)[6] << 16) ^ ((u)[5] >> 16); \
  (v)[6] = (h)[6] ^ ((u)[7] << 16) ^ ((u)[6] >> 16); \
  (v)[7] = (h)[7] \
         ^ ((u)[0] & 0xffff0000) ^ ((u)[0] << 16) \
         ^ ((u)[7] >> 16) \
         ^ ((u)[1] & 0xffff0000) ^ ((u)[1] << 16) \
         ^ ((u)[6] << 16) \
         ^ ((u)[7] & 0xffff0000); \
}
511
/*
 * SHIFT61 (h, v): h[n] = fixed XOR combination of 16-bit halves of v[0..7]
 * for n = 0..7.  The previous contents of h are not read.
 *
 * Terms are grouped per source word v[j]; XOR is commutative, so the grouping
 * does not affect the result.
 *
 * Wrapped in a brace block so the macro behaves as one statement, and both
 * parameters are parenthesized.  h must not alias v: h[] is written while
 * v[] is still being read.
 */
#define SHIFT61(h,v) \
{ \
  (h)[0] = ((v)[0] & 0xffff0000) ^ ((v)[0] << 16) ^ ((v)[0] >> 16) \
         ^ ((v)[1] >> 16) ^ ((v)[1] & 0xffff0000) \
         ^ ((v)[2] << 16) \
         ^ ((v)[3] >> 16) \
         ^ ((v)[4] << 16) \
         ^ ((v)[5] >> 16) ^ (v)[5] \
         ^ ((v)[6] >> 16) \
         ^ ((v)[7] << 16) ^ ((v)[7] >> 16) ^ ((v)[7] & 0x0000ffff); \
  (h)[1] = ((v)[0] << 16) ^ ((v)[0] >> 16) ^ ((v)[0] & 0xffff0000) \
         ^ ((v)[1] & 0x0000ffff) \
         ^ (v)[2] ^ ((v)[2] >> 16) \
         ^ ((v)[3] << 16) \
         ^ ((v)[4] >> 16) \
         ^ ((v)[5] << 16) \
         ^ ((v)[6] << 16) ^ (v)[6] \
         ^ ((v)[7] & 0xffff0000) ^ ((v)[7] >> 16); \
  (h)[2] = ((v)[0] & 0x0000ffff) ^ ((v)[0] << 16) \
         ^ ((v)[1] << 16) ^ ((v)[1] >> 16) ^ ((v)[1] & 0xffff0000) \
         ^ ((v)[2] << 16) \
         ^ ((v)[3] >> 16) ^ (v)[3] \
         ^ ((v)[4] << 16) \
         ^ ((v)[5] >> 16) \
         ^ (v)[6] ^ ((v)[6] >> 16) \
         ^ ((v)[7] & 0x0000ffff) ^ ((v)[7] << 16) ^ ((v)[7] >> 16); \
  (h)[3] = ((v)[0] << 16) ^ ((v)[0] >> 16) ^ ((v)[0] & 0xffff0000) \
         ^ ((v)[1] & 0xffff0000) ^ ((v)[1] >> 16) \
         ^ ((v)[2] << 16) ^ ((v)[2] >> 16) ^ (v)[2] \
         ^ ((v)[3] << 16) \
         ^ ((v)[4] >> 16) ^ (v)[4] \
         ^ ((v)[5] << 16) \
         ^ ((v)[6] << 16) \
         ^ ((v)[7] & 0x0000ffff) ^ ((v)[7] >> 16); \
  (h)[4] = ((v)[0] >> 16) \
         ^ ((v)[1] << 16) ^ (v)[1] \
         ^ ((v)[2] >> 16) ^ (v)[2] \
         ^ ((v)[3] << 16) ^ ((v)[3] >> 16) ^ (v)[3] \
         ^ ((v)[4] << 16) \
         ^ ((v)[5] >> 16) ^ (v)[5] \
         ^ ((v)[6] << 16) ^ ((v)[6] >> 16) \
         ^ ((v)[7] << 16); \
  (h)[5] = ((v)[0] << 16) ^ ((v)[0] & 0xffff0000) \
         ^ ((v)[1] << 16) ^ ((v)[1] >> 16) ^ ((v)[1] & 0xffff0000) \
         ^ ((v)[2] << 16) ^ (v)[2] \
         ^ ((v)[3] >> 16) ^ (v)[3] \
         ^ ((v)[4] << 16) ^ ((v)[4] >> 16) ^ (v)[4] \
         ^ ((v)[5] << 16) \
         ^ ((v)[6] << 16) ^ ((v)[6] >> 16) ^ (v)[6] \
         ^ ((v)[7] << 16) ^ ((v)[7] >> 16) ^ ((v)[7] & 0xffff0000); \
  (h)[6] = (v)[0] \
         ^ (v)[2] ^ ((v)[2] >> 16) \
         ^ (v)[3] ^ ((v)[3] << 16) \
         ^ (v)[4] ^ ((v)[4] >> 16) \
         ^ ((v)[5] << 16) ^ ((v)[5] >> 16) ^ (v)[5] \
         ^ ((v)[6] << 16) ^ ((v)[6] >> 16) ^ (v)[6] \
         ^ ((v)[7] << 16) ^ (v)[7]; \
  (h)[7] = (v)[0] ^ ((v)[0] >> 16) \
         ^ ((v)[1] << 16) ^ ((v)[1] >> 16) \
         ^ ((v)[2] << 16) \
         ^ ((v)[3] >> 16) ^ (v)[3] \
         ^ ((v)[4] << 16) ^ (v)[4] \
         ^ ((v)[5] >> 16) ^ (v)[5] \
         ^ ((v)[6] << 16) ^ ((v)[6] >> 16) \
         ^ ((v)[7] << 16) ^ (v)[7]; \
}
633
/*
 * PASS0 (h, s, u, v): first of the four PASS steps.
 *
 *   1. w = u ^ v                      (X)
 *   2. k = byte transposition of w    (P)
 *   3. s[0..1] = R (k, h[0..1])       (R with i = 0)
 *   4. advance u with A and v with AA for the next step
 *
 * h is only read; s[0] and s[1] are written; u and v are modified in place.
 * Caller arguments are parenthesized before being forwarded to the nested
 * macros so that expression arguments expand as intended.
 */
#define PASS0(h,s,u,v) \
{ \
  uint32_t k[8]; \
  uint32_t w[8]; \
  X (w, (u), (v)); \
  P (k, w); \
  R (k, (h), (s), 0); \
  A ((u)); \
  AA ((v)); \
}
644
/*
 * PASS2 (h, s, u, v): second of the four PASS steps.
 *
 *   1. w = u ^ v                      (X)
 *   2. k = byte transposition of w    (P)
 *   3. s[2..3] = R (k, h[2..3])       (R with i = 2)
 *   4. advance u with A, then XOR the fixed constant into u (C);
 *      advance v with AA
 *
 * This is the only PASS step that applies C.  h is only read; s[2] and s[3]
 * are written; u and v are modified in place.  Caller arguments are
 * parenthesized before being forwarded to the nested macros.
 */
#define PASS2(h,s,u,v) \
{ \
  uint32_t k[8]; \
  uint32_t w[8]; \
  X (w, (u), (v)); \
  P (k, w); \
  R (k, (h), (s), 2); \
  A ((u)); \
  C ((u)); \
  AA ((v)); \
}
656
/*
 * PASS4 (h, s, u, v): third of the four PASS steps.
 *
 *   1. w = u ^ v                      (X)
 *   2. k = byte transposition of w    (P)
 *   3. s[4..5] = R (k, h[4..5])       (R with i = 4)
 *   4. advance u with A and v with AA for the next step
 *
 * h is only read; s[4] and s[5] are written; u and v are modified in place.
 * Caller arguments are parenthesized before being forwarded to the nested
 * macros.
 */
#define PASS4(h,s,u,v) \
{ \
  uint32_t k[8]; \
  uint32_t w[8]; \
  X (w, (u), (v)); \
  P (k, w); \
  R (k, (h), (s), 4); \
  A ((u)); \
  AA ((v)); \
}
667
/*
 * PASS6 (h, s, u, v): last of the four PASS steps.
 *
 *   1. w = u ^ v                      (X)
 *   2. k = byte transposition of w    (P)
 *   3. s[6..7] = R (k, h[6..7])       (R with i = 6)
 *
 * Unlike the earlier steps, u and v are left untouched.  h is only read;
 * s[6] and s[7] are written.  Caller arguments are parenthesized before being
 * forwarded to the nested macros.
 */
#define PASS6(h,s,u,v) \
{ \
  uint32_t k[8]; \
  uint32_t w[8]; \
  X (w, (u), (v)); \
  P (k, w); \
  R (k, (h), (s), 6); \
}
676
677
/////////////////////////////////////
// SSE2 XOR REDUCTION MACROS START //
/////////////////////////////////////
/*
 * XOR10 (store, x0 .. x9): store = x0 ^ x1 ^ ... ^ x9 on __m128i values.
 * The operands are combined pairwise as a tree rather than a linear chain.
 * The expansion ends in a semicolon.
 */
#define XOR10(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 (x8, x9));
687
/*
 * XOR11 (store, x0 .. x10): store = x0 ^ x1 ^ ... ^ x10 on __m128i values.
 * The operands are combined pairwise as a tree rather than a linear chain.
 * The expansion ends in a semicolon.
 */
#define XOR11(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 (_mm_xor_si128 (x8, x9), x10));
/*
 * XOR13 (store, x0 .. x12): store = x0 ^ x1 ^ ... ^ x12 on __m128i values.
 * The operands are combined pairwise as a tree rather than a linear chain.
 * The expansion does NOT end in a semicolon; the caller must supply one.
 */
#define XOR13(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
              x12))
703
/*
 * XOR14 (store, x0 .. x13): store = x0 ^ x1 ^ ... ^ x13 on __m128i values.
 * The operands are combined pairwise as a tree rather than a linear chain.
 * The expansion ends in a semicolon.
 */
#define XOR14(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
              _mm_xor_si128 (x12, x13)));
712
/*
 * XOR15 (store, x0 .. x14): store = x0 ^ x1 ^ ... ^ x14 on __m128i values.
 * The operands are combined pairwise as a tree rather than a linear chain.
 * The expansion ends in a semicolon.
 */
#define XOR15(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
              _mm_xor_si128 (_mm_xor_si128 (x12, x13), x14)));
722
/*
 * XOR16 (store, x0 .. x15): store = x0 ^ x1 ^ ... ^ x15 on __m128i values.
 * The operands are combined pairwise as a balanced tree.
 * The expansion ends in a semicolon.
 */
#define XOR16(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
              _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
            _mm_xor_si128 ( \
              _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
              _mm_xor_si128 (_mm_xor_si128 (x12, x13), _mm_xor_si128 (x14, x15))));
732
/*
 * XOR17 (store, x0 .. x16): store = x0 ^ x1 ^ ... ^ x16 on __m128i values.
 * x0..x15 are combined as a balanced tree; x16 is folded in last.
 * The expansion ends in a semicolon.
 */
#define XOR17(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
                _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
                _mm_xor_si128 (_mm_xor_si128 (x12, x13), _mm_xor_si128 (x14, x15)))), \
            x16);
743
/*
 * XOR19 (store, x0 .. x18): store = x0 ^ x1 ^ ... ^ x18 on __m128i values.
 * x0..x15 are combined as a balanced tree; x16..x18 form a second, smaller
 * tree that is folded in last.
 * The expansion does NOT end in a semicolon; the caller must supply one.
 */
#define XOR19(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
                _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
                _mm_xor_si128 (_mm_xor_si128 (x12, x13), _mm_xor_si128 (x14, x15)))), \
            _mm_xor_si128 (_mm_xor_si128 (x16, x17), x18))
755
/*
 * XOR20 (store, x0 .. x19): store = x0 ^ x1 ^ ... ^ x19 on __m128i values.
 * x0..x15 are combined as a balanced tree; x16..x19 form a second, smaller
 * tree that is folded in last.
 * The expansion ends in a semicolon.
 */
#define XOR20(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19) \
  store = _mm_xor_si128 ( \
            _mm_xor_si128 ( \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x0, x1), _mm_xor_si128 (x2, x3)), \
                _mm_xor_si128 (_mm_xor_si128 (x4, x5), _mm_xor_si128 (x6, x7))), \
              _mm_xor_si128 ( \
                _mm_xor_si128 (_mm_xor_si128 (x8, x9), _mm_xor_si128 (x10, x11)), \
                _mm_xor_si128 (_mm_xor_si128 (x12, x13), _mm_xor_si128 (x14, x15)))), \
            _mm_xor_si128 (_mm_xor_si128 (x16, x17), _mm_xor_si128 (x18, x19)));
767
///////////////////////////////////
// SSE2 XOR REDUCTION MACROS END //
///////////////////////////////////
771
772
773 ////////////////////////
774 // SSE2 DEFINES START //
775 ////////////////////////
776
/*
 * round0_SSE2 (k1, k2): the scalar two-step round applied to 32-bit lane 0
 * only.
 *
 * Expects two addressable variables named `l` and `r` in the expanding scope
 * and addressable key operands; all four are viewed as arrays of uint32_t:
 *
 *   l[0] ^= T (k1[0] + r[0]);
 *   r[0] ^= T (k2[0] + l[0]);      (uses the already updated l[0])
 *
 * where T (t) XORs together tables[n][byte n of t] for n = 0..3.
 * NOTE(review): the operands are accessed through uint32_t pointer casts;
 * this relies on the compiler tolerating that type pun.
 */
#define round0_SSE2(k1,k2) \
{ \
  uint32_t *key_a = (uint32_t *) &k1; \
  uint32_t *key_b = (uint32_t *) &k2; \
  uint32_t *left  = (uint32_t *) &l; \
  uint32_t *right = (uint32_t *) &r; \
  uint32_t  t     = key_a[0] + right[0]; \
  left[0]  ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
  t = key_b[0] + left[0]; \
  right[0] ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
}
795
/*
 * round1_SSE2 (k1, k2): the scalar two-step round applied to 32-bit lane 1
 * only.
 *
 * Expects two addressable variables named `l` and `r` in the expanding scope
 * and addressable key operands; all four are viewed as arrays of uint32_t:
 *
 *   l[1] ^= T (k1[1] + r[1]);
 *   r[1] ^= T (k2[1] + l[1]);      (uses the already updated l[1])
 *
 * where T (t) XORs together tables[n][byte n of t] for n = 0..3.
 * NOTE(review): the operands are accessed through uint32_t pointer casts;
 * this relies on the compiler tolerating that type pun.
 */
#define round1_SSE2(k1,k2) \
{ \
  uint32_t *key_a = (uint32_t *) &k1; \
  uint32_t *key_b = (uint32_t *) &k2; \
  uint32_t *left  = (uint32_t *) &l; \
  uint32_t *right = (uint32_t *) &r; \
  uint32_t  t     = key_a[1] + right[1]; \
  left[1]  ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
  t = key_b[1] + left[1]; \
  right[1] ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
}
814
/*
 * round2_SSE2 (k1, k2): the scalar two-step round applied to 32-bit lane 2
 * only.
 *
 * Expects two addressable variables named `l` and `r` in the expanding scope
 * and addressable key operands; all four are viewed as arrays of uint32_t:
 *
 *   l[2] ^= T (k1[2] + r[2]);
 *   r[2] ^= T (k2[2] + l[2]);      (uses the already updated l[2])
 *
 * where T (t) XORs together tables[n][byte n of t] for n = 0..3.
 * NOTE(review): the operands are accessed through uint32_t pointer casts;
 * this relies on the compiler tolerating that type pun.
 */
#define round2_SSE2(k1,k2) \
{ \
  uint32_t *key_a = (uint32_t *) &k1; \
  uint32_t *key_b = (uint32_t *) &k2; \
  uint32_t *left  = (uint32_t *) &l; \
  uint32_t *right = (uint32_t *) &r; \
  uint32_t  t     = key_a[2] + right[2]; \
  left[2]  ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
  t = key_b[2] + left[2]; \
  right[2] ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
}
833
/*
 * round3_SSE2 (k1, k2): the scalar two-step round applied to 32-bit lane 3
 * only.
 *
 * Expects two addressable variables named `l` and `r` in the expanding scope
 * and addressable key operands; all four are viewed as arrays of uint32_t:
 *
 *   l[3] ^= T (k1[3] + r[3]);
 *   r[3] ^= T (k2[3] + l[3]);      (uses the already updated l[3])
 *
 * where T (t) XORs together tables[n][byte n of t] for n = 0..3.
 * NOTE(review): the operands are accessed through uint32_t pointer casts;
 * this relies on the compiler tolerating that type pun.
 */
#define round3_SSE2(k1,k2) \
{ \
  uint32_t *key_a = (uint32_t *) &k1; \
  uint32_t *key_b = (uint32_t *) &k2; \
  uint32_t *left  = (uint32_t *) &l; \
  uint32_t *right = (uint32_t *) &r; \
  uint32_t  t     = key_a[3] + right[3]; \
  left[3]  ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
  t = key_b[3] + left[3]; \
  right[3] ^= tables[3][(t >> 24) & 0xff] \
            ^ tables[2][(t >> 16) & 0xff] \
            ^ tables[1][(t >>  8) & 0xff] \
            ^ tables[0][(t >>  0) & 0xff]; \
}
852
/*
 * Private helper for R_SSE2: apply one key pair to all four 32-bit lanes,
 * lane 0 first and lane 3 last.  Expands to exactly the four per-lane round
 * macros, in that order.
 */
#define R_SSE2_ALL_LANES(ka,kb) \
{ \
  round0_SSE2 (ka, kb); \
  round1_SSE2 (ka, kb); \
  round2_SSE2 (ka, kb); \
  round3_SSE2 (ka, kb); \
}

/*
 * R_SSE2 (k, h, s, i): four-lane counterpart of R.
 *
 *   k  array of 8 __m128i key words, read as k[0]..k[7]
 *   h  input array;  h[i] is loaded into r, h[i + 1] into l
 *   s  output array; s[i] receives l, s[i + 1] receives r
 *   i  index of the first word of the pair
 *
 * Key schedule as written below: three passes over k[0]..k[7] in ascending
 * order, followed by one pass in descending order; each key pair is applied
 * to lanes 0, 1, 2, 3 in turn.
 *
 * Every macro parameter is parenthesized so that expression arguments expand
 * as the caller intends.  The locals must stay named `r` and `l`: the
 * per-lane round macros refer to them by name.
 */
#define R_SSE2(k,h,s,i) \
{ \
  __m128i r; \
  __m128i l; \
  r = (h)[(i) + 0]; \
  l = (h)[(i) + 1]; \
  /* passes 1..3: ascending key order */ \
  R_SSE2_ALL_LANES ((k)[0], (k)[1]); \
  R_SSE2_ALL_LANES ((k)[2], (k)[3]); \
  R_SSE2_ALL_LANES ((k)[4], (k)[5]); \
  R_SSE2_ALL_LANES ((k)[6], (k)[7]); \
  R_SSE2_ALL_LANES ((k)[0], (k)[1]); \
  R_SSE2_ALL_LANES ((k)[2], (k)[3]); \
  R_SSE2_ALL_LANES ((k)[4], (k)[5]); \
  R_SSE2_ALL_LANES ((k)[6], (k)[7]); \
  R_SSE2_ALL_LANES ((k)[0], (k)[1]); \
  R_SSE2_ALL_LANES ((k)[2], (k)[3]); \
  R_SSE2_ALL_LANES ((k)[4], (k)[5]); \
  R_SSE2_ALL_LANES ((k)[6], (k)[7]); \
  /* pass 4: descending key order */ \
  R_SSE2_ALL_LANES ((k)[7], (k)[6]); \
  R_SSE2_ALL_LANES ((k)[5], (k)[4]); \
  R_SSE2_ALL_LANES ((k)[3], (k)[2]); \
  R_SSE2_ALL_LANES ((k)[1], (k)[0]); \
  /* halves are stored swapped relative to how they were loaded */ \
  (s)[(i) + 0] = l; \
  (s)[(i) + 1] = r; \
}
926
/*
 * X_SSE2 (w, u, v): w[n] = u[n] ^ v[n] for n = 0..7 on __m128i elements.
 *
 * Wrapped in a brace block so the whole macro is guarded when used as the
 * body of an `if`/`for`/`while`, and every parameter is parenthesized so
 * pointer expressions can be passed.  Arguments are evaluated more than once.
 */
#define X_SSE2(w,u,v) \
{ \
  (w)[0] = _mm_xor_si128 ((u)[0], (v)[0]); \
  (w)[1] = _mm_xor_si128 ((u)[1], (v)[1]); \
  (w)[2] = _mm_xor_si128 ((u)[2], (v)[2]); \
  (w)[3] = _mm_xor_si128 ((u)[3], (v)[3]); \
  (w)[4] = _mm_xor_si128 ((u)[4], (v)[4]); \
  (w)[5] = _mm_xor_si128 ((u)[5], (v)[5]); \
  (w)[6] = _mm_xor_si128 ((u)[6], (v)[6]); \
  (w)[7] = _mm_xor_si128 ((u)[7], (v)[7]); \
}
936
/*
 * P_SSE2 (k, w): byte transposition of eight vectors of 32-bit words.
 *
 * Within every 32-bit lane, for i = 0..3 and j = 0..3:
 *
 *   byte j of k[i]     = byte i of w[2 * j]
 *   byte j of k[4 + i] = byte i of w[2 * j + 1]
 *
 * (byte 0 is the least significant byte). k and w must be distinct arrays of
 * eight __m128i; both arguments are evaluated several times. The macro is a
 * single statement; terminate the call with a semicolon.
 */
#define P_SSE2(k,w) \
  do \
  { \
    const __m128i p_byte0_ = _mm_set1_epi32 (0x000000ff); \
    const __m128i p_byte1_ = _mm_set1_epi32 (0x0000ff00); \
    const __m128i p_byte2_ = _mm_set1_epi32 (0x00ff0000); \
    const __m128i p_byte3_ = _mm_set1_epi32 ((int) 0xff000000u); \
    \
    (k)[0] = _mm_or_si128 (_mm_or_si128 (                _mm_and_si128 ((w)[0], p_byte0_), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[2], p_byte0_),  8)), \
                           _mm_or_si128 (_mm_slli_epi32 (_mm_and_si128 ((w)[4], p_byte0_), 16), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[6], p_byte0_), 24))); \
    (k)[1] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[0], p_byte1_),  8), \
                                                         _mm_and_si128 ((w)[2], p_byte1_)), \
                           _mm_or_si128 (_mm_slli_epi32 (_mm_and_si128 ((w)[4], p_byte1_),  8), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[6], p_byte1_), 16))); \
    (k)[2] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[0], p_byte2_), 16), \
                                         _mm_srli_epi32 (_mm_and_si128 ((w)[2], p_byte2_),  8)), \
                           _mm_or_si128 (                _mm_and_si128 ((w)[4], p_byte2_), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[6], p_byte2_),  8))); \
    (k)[3] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[0], p_byte3_), 24), \
                                         _mm_srli_epi32 (_mm_and_si128 ((w)[2], p_byte3_), 16)), \
                           _mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[4], p_byte3_),  8), \
                                                         _mm_and_si128 ((w)[6], p_byte3_))); \
    (k)[4] = _mm_or_si128 (_mm_or_si128 (                _mm_and_si128 ((w)[1], p_byte0_), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[3], p_byte0_),  8)), \
                           _mm_or_si128 (_mm_slli_epi32 (_mm_and_si128 ((w)[5], p_byte0_), 16), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[7], p_byte0_), 24))); \
    (k)[5] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[1], p_byte1_),  8), \
                                                         _mm_and_si128 ((w)[3], p_byte1_)), \
                           _mm_or_si128 (_mm_slli_epi32 (_mm_and_si128 ((w)[5], p_byte1_),  8), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[7], p_byte1_), 16))); \
    (k)[6] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[1], p_byte2_), 16), \
                                         _mm_srli_epi32 (_mm_and_si128 ((w)[3], p_byte2_),  8)), \
                           _mm_or_si128 (                _mm_and_si128 ((w)[5], p_byte2_), \
                                         _mm_slli_epi32 (_mm_and_si128 ((w)[7], p_byte2_),  8))); \
    (k)[7] = _mm_or_si128 (_mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[1], p_byte3_), 24), \
                                         _mm_srli_epi32 (_mm_and_si128 ((w)[3], p_byte3_), 16)), \
                           _mm_or_si128 (_mm_srli_epi32 (_mm_and_si128 ((w)[5], p_byte3_),  8), \
                                                         _mm_and_si128 ((w)[7], p_byte3_))); \
  } while (0)
970
/*
 * A_SSE2 (x): in-place update of an array of eight __m128i.
 *
 *   (x0, x1, x2, x3, x4, x5, x6, x7)
 *     -> (x2, x3, x4, x5, x6, x7, x0 ^ x2, x1 ^ x3)
 *
 * The two XORs are computed from the old values before anything is shifted
 * down. x is evaluated many times. The macro is a single statement;
 * terminate the call with a semicolon.
 */
#define A_SSE2(x) \
  do \
  { \
    const __m128i a_even_ = _mm_xor_si128 ((x)[0], (x)[2]); \
    const __m128i a_odd_  = _mm_xor_si128 ((x)[1], (x)[3]); \
    \
    (x)[0] = (x)[2]; \
    (x)[1] = (x)[3]; \
    (x)[2] = (x)[4]; \
    (x)[3] = (x)[5]; \
    (x)[4] = (x)[6]; \
    (x)[5] = (x)[7]; \
    (x)[6] = a_even_; \
    (x)[7] = a_odd_; \
  } while (0)
986
/*
 * AA_SSE2 (x): in-place update of an array of eight __m128i, equal to
 * applying A_SSE2 twice.
 *
 *   (x0, x1, x2, x3, x4, x5, x6, x7)
 *     -> (x4, x5, x6, x7, x0 ^ x2, x1 ^ x3, x2 ^ x4, x3 ^ x5)
 *
 * Even and odd elements are handled independently. x is evaluated many
 * times. The macro is a single statement; terminate the call with a
 * semicolon.
 */
#define AA_SSE2(x) \
  do \
  { \
    const __m128i aa_x0_ = (x)[0]; \
    const __m128i aa_x2_ = (x)[2]; \
    const __m128i aa_x1_ = (x)[1]; \
    const __m128i aa_x3_ = (x)[3]; \
    \
    (x)[0] = (x)[4]; \
    (x)[2] = (x)[6]; \
    (x)[4] = _mm_xor_si128 (aa_x0_, aa_x2_); \
    (x)[6] = _mm_xor_si128 ((x)[0], aa_x2_);   /* (x)[0] already holds the old x4 */ \
    \
    (x)[1] = (x)[5]; \
    (x)[3] = (x)[7]; \
    (x)[5] = _mm_xor_si128 (aa_x1_, aa_x3_); \
    (x)[7] = _mm_xor_si128 ((x)[1], aa_x3_);   /* (x)[1] already holds the old x5 */ \
  } while (0)
1004
/*
 * C_SSE2 (x): XOR a fixed 8-word constant into an array of eight __m128i.
 * Word i of the constant is broadcast to every 32-bit lane of x[i].
 *
 * x is evaluated sixteen times. The macro is a single statement; terminate
 * the call with a semicolon. Constants with the top bit set are cast to int
 * explicitly because _mm_set1_epi32 takes an int.
 */
#define C_SSE2(x) \
  do \
  { \
    (x)[0] = _mm_xor_si128 ((x)[0], _mm_set1_epi32 ((int) 0xff00ff00u)); \
    (x)[1] = _mm_xor_si128 ((x)[1], _mm_set1_epi32 ((int) 0xff00ff00u)); \
    (x)[2] = _mm_xor_si128 ((x)[2], _mm_set1_epi32 ((int) 0x00ff00ffu)); \
    (x)[3] = _mm_xor_si128 ((x)[3], _mm_set1_epi32 ((int) 0x00ff00ffu)); \
    (x)[4] = _mm_xor_si128 ((x)[4], _mm_set1_epi32 ((int) 0x00ffff00u)); \
    (x)[5] = _mm_xor_si128 ((x)[5], _mm_set1_epi32 ((int) 0xff0000ffu)); \
    (x)[6] = _mm_xor_si128 ((x)[6], _mm_set1_epi32 ((int) 0x000000ffu)); \
    (x)[7] = _mm_xor_si128 ((x)[7], _mm_set1_epi32 ((int) 0xff00ffffu)); \
  } while (0)
1014
/*
 * SHIFT12_SSE2 (u, m, s): fill u[0..7] from m[0..7] and s[0..7].
 *
 *   u[0] = m[0] ^ s[6]
 *   u[1] = m[1] ^ s[7]
 *   u[2..7] = m[i] combined with 16-bit shifted / masked copies of the s
 *             words through the XORnn helpers (defined elsewhere in this
 *             file; presumably XORnn (dst, a1..ann) stores the XOR of its nn
 *             operands into dst -- confirm against their definition).
 *
 * The operand lists are kept in their original order. u must not alias m or
 * s. All arguments are evaluated many times. The macro is a single
 * statement; terminate the call with a semicolon.
 */
#define SHIFT12_SSE2(u,m,s) \
  do \
  { \
    const __m128i s12_lo_ = _mm_set1_epi32 (0x0000ffff); \
    const __m128i s12_hi_ = _mm_set1_epi32 ((int) 0xffff0000u); \
    \
    (u)[0] = _mm_xor_si128 ((m)[0], (s)[6]); \
    (u)[1] = _mm_xor_si128 ((m)[1], (s)[7]); \
    \
    XOR11 ((u)[2], (m)[2], \
           _mm_slli_epi32 ((s)[0], 16), _mm_srli_epi32 ((s)[0], 16), _mm_and_si128 ((s)[0], s12_lo_), \
           _mm_and_si128 ((s)[1], s12_lo_), _mm_srli_epi32 ((s)[1], 16), \
           _mm_slli_epi32 ((s)[2], 16), \
           (s)[6], _mm_slli_epi32 ((s)[6], 16), \
           _mm_and_si128 ((s)[7], s12_hi_), _mm_srli_epi32 ((s)[7], 16)); \
    \
    XOR15 ((u)[3], (m)[3], \
           _mm_and_si128 ((s)[0], s12_lo_), _mm_slli_epi32 ((s)[0], 16), \
           _mm_and_si128 ((s)[1], s12_lo_), _mm_slli_epi32 ((s)[1], 16), _mm_srli_epi32 ((s)[1], 16), \
           _mm_slli_epi32 ((s)[2], 16), _mm_srli_epi32 ((s)[2], 16), \
           _mm_slli_epi32 ((s)[3], 16), \
           (s)[6], _mm_slli_epi32 ((s)[6], 16), _mm_srli_epi32 ((s)[6], 16), \
           _mm_and_si128 ((s)[7], s12_lo_), _mm_slli_epi32 ((s)[7], 16), _mm_srli_epi32 ((s)[7], 16)); \
    \
    XOR16 ((u)[4], (m)[4], \
           _mm_and_si128 ((s)[0], s12_hi_), _mm_slli_epi32 ((s)[0], 16), _mm_srli_epi32 ((s)[0], 16), \
           _mm_and_si128 ((s)[1], s12_hi_), _mm_srli_epi32 ((s)[1], 16), \
           _mm_slli_epi32 ((s)[2], 16), _mm_srli_epi32 ((s)[2], 16), \
           _mm_slli_epi32 ((s)[3], 16), _mm_srli_epi32 ((s)[3], 16), \
           _mm_slli_epi32 ((s)[4], 16), \
           _mm_slli_epi32 ((s)[6], 16), _mm_srli_epi32 ((s)[6], 16), \
           _mm_and_si128 ((s)[7], s12_lo_), _mm_slli_epi32 ((s)[7], 16), _mm_srli_epi32 ((s)[7], 16)); \
    \
    XOR17 ((u)[5], (m)[5], \
           _mm_slli_epi32 ((s)[0], 16), _mm_srli_epi32 ((s)[0], 16), _mm_and_si128 ((s)[0], s12_hi_), \
           _mm_and_si128 ((s)[1], s12_lo_), \
           (s)[2], _mm_srli_epi32 ((s)[2], 16), \
           _mm_slli_epi32 ((s)[3], 16), _mm_srli_epi32 ((s)[3], 16), \
           _mm_slli_epi32 ((s)[4], 16), _mm_srli_epi32 ((s)[4], 16), \
           _mm_slli_epi32 ((s)[5], 16), \
           _mm_slli_epi32 ((s)[6], 16), _mm_srli_epi32 ((s)[6], 16), \
           _mm_and_si128 ((s)[7], s12_hi_), _mm_slli_epi32 ((s)[7], 16), _mm_srli_epi32 ((s)[7], 16)); \
    \
    XOR14 ((u)[6], (m)[6], \
           (s)[0], \
           _mm_srli_epi32 ((s)[1], 16), \
           _mm_slli_epi32 ((s)[2], 16), \
           (s)[3], _mm_srli_epi32 ((s)[3], 16), \
           _mm_slli_epi32 ((s)[4], 16), _mm_srli_epi32 ((s)[4], 16), \
           _mm_slli_epi32 ((s)[5], 16), _mm_srli_epi32 ((s)[5], 16), \
           (s)[6], _mm_slli_epi32 ((s)[6], 16), _mm_srli_epi32 ((s)[6], 16), \
           _mm_slli_epi32 ((s)[7], 16)); \
    \
    XOR15 ((u)[7], (m)[7], \
           _mm_and_si128 ((s)[0], s12_hi_), _mm_slli_epi32 ((s)[0], 16), \
           _mm_and_si128 ((s)[1], s12_lo_), _mm_slli_epi32 ((s)[1], 16), \
           _mm_srli_epi32 ((s)[2], 16), \
           _mm_slli_epi32 ((s)[3], 16), \
           (s)[4], _mm_srli_epi32 ((s)[4], 16), \
           _mm_slli_epi32 ((s)[5], 16), _mm_srli_epi32 ((s)[5], 16), \
           _mm_srli_epi32 ((s)[6], 16), \
           _mm_and_si128 ((s)[7], s12_lo_), _mm_slli_epi32 ((s)[7], 16), _mm_srli_epi32 ((s)[7], 16)); \
  } while (0)
1112
/*
 * SHIFT16_SSE2 (h, v, u): fill v[0..7] from h[0..7] and u[0..7].
 *
 *   v[i] = h[i] ^ (u[i + 1] << 16) ^ (u[i] >> 16)          for i = 0..6
 *   v[7] = h[7] ^ (u[0] & 0xffff0000) ^ (u[0] << 16) ^ (u[7] >> 16)
 *               ^ (u[1] & 0xffff0000) ^ (u[1] << 16)
 *               ^ (u[6] << 16)        ^ (u[7] & 0xffff0000)
 *
 * Shifts are logical and apply to each 32-bit lane. v must not alias h or
 * u. All arguments are evaluated many times. The macro is a single
 * statement; terminate the call with a semicolon.
 */
#define SHIFT16_SSE2(h,v,u) \
  do \
  { \
    const __m128i s16_hi_ = _mm_set1_epi32 ((int) 0xffff0000u); \
    \
    (v)[0] = _mm_xor_si128 (_mm_xor_si128 ((h)[0], _mm_slli_epi32 ((u)[1], 16)), _mm_srli_epi32 ((u)[0], 16)); \
    (v)[1] = _mm_xor_si128 (_mm_xor_si128 ((h)[1], _mm_slli_epi32 ((u)[2], 16)), _mm_srli_epi32 ((u)[1], 16)); \
    (v)[2] = _mm_xor_si128 (_mm_xor_si128 ((h)[2], _mm_slli_epi32 ((u)[3], 16)), _mm_srli_epi32 ((u)[2], 16)); \
    (v)[3] = _mm_xor_si128 (_mm_xor_si128 ((h)[3], _mm_slli_epi32 ((u)[4], 16)), _mm_srli_epi32 ((u)[3], 16)); \
    (v)[4] = _mm_xor_si128 (_mm_xor_si128 ((h)[4], _mm_slli_epi32 ((u)[5], 16)), _mm_srli_epi32 ((u)[4], 16)); \
    (v)[5] = _mm_xor_si128 (_mm_xor_si128 ((h)[5], _mm_slli_epi32 ((u)[6], 16)), _mm_srli_epi32 ((u)[5], 16)); \
    (v)[6] = _mm_xor_si128 (_mm_xor_si128 ((h)[6], _mm_slli_epi32 ((u)[7], 16)), _mm_srli_epi32 ((u)[6], 16)); \
    (v)[7] = _mm_xor_si128 ( \
      _mm_xor_si128 (_mm_xor_si128 ((h)[7], _mm_and_si128 ((u)[0], s16_hi_)), \
                     _mm_xor_si128 (_mm_slli_epi32 ((u)[0], 16), _mm_srli_epi32 ((u)[7], 16))), \
      _mm_xor_si128 (_mm_xor_si128 (_mm_and_si128 ((u)[1], s16_hi_), _mm_slli_epi32 ((u)[1], 16)), \
                     _mm_xor_si128 (_mm_slli_epi32 ((u)[6], 16), _mm_and_si128 ((u)[7], s16_hi_)))); \
  } while (0)
1136
/*
 * SHIFT61_SSE2 (h, v): fill h[0..7] from v[0..7].
 *
 * Every h[i] is produced by an XORnn helper (defined elsewhere in this file;
 * presumably XORnn (dst, a1..ann) stores the XOR of its nn operands into dst
 * -- confirm against their definition) over whole, 16-bit shifted and
 * half-masked copies of the v words. The operand lists are kept in their
 * original order.
 *
 * h must not alias v. Both arguments are evaluated many times. The macro is
 * a single statement; terminate the call with a semicolon.
 */
#define SHIFT61_SSE2(h,v) \
  do \
  { \
    const __m128i s61_lo_ = _mm_set1_epi32 (0x0000ffff); \
    const __m128i s61_hi_ = _mm_set1_epi32 ((int) 0xffff0000u); \
    \
    XOR14 ((h)[0], \
           _mm_and_si128 ((v)[0], s61_hi_), _mm_slli_epi32 ((v)[0], 16), _mm_srli_epi32 ((v)[0], 16), \
           _mm_srli_epi32 ((v)[1], 16), _mm_and_si128 ((v)[1], s61_hi_), \
           _mm_slli_epi32 ((v)[2], 16), _mm_srli_epi32 ((v)[3], 16), _mm_slli_epi32 ((v)[4], 16), \
           _mm_srli_epi32 ((v)[5], 16), (v)[5], \
           _mm_srli_epi32 ((v)[6], 16), \
           _mm_slli_epi32 ((v)[7], 16), _mm_and_si128 ((v)[7], s61_lo_), _mm_srli_epi32 ((v)[7], 16)); \
    \
    XOR13 ((h)[1], \
           _mm_slli_epi32 ((v)[0], 16), _mm_srli_epi32 ((v)[0], 16), _mm_and_si128 ((v)[0], s61_hi_), \
           _mm_and_si128 ((v)[1], s61_lo_), \
           _mm_srli_epi32 ((v)[2], 16), (v)[2], \
           _mm_slli_epi32 ((v)[3], 16), _mm_srli_epi32 ((v)[4], 16), _mm_slli_epi32 ((v)[5], 16), \
           _mm_slli_epi32 ((v)[6], 16), \
           _mm_and_si128 ((v)[7], s61_hi_), \
           (v)[6], \
           _mm_srli_epi32 ((v)[7], 16)); \
    \
    XOR15 ((h)[2], \
           _mm_and_si128 ((v)[0], s61_lo_), _mm_slli_epi32 ((v)[0], 16), \
           _mm_slli_epi32 ((v)[1], 16), _mm_srli_epi32 ((v)[1], 16), _mm_and_si128 ((v)[1], s61_hi_), \
           _mm_slli_epi32 ((v)[2], 16), \
           _mm_srli_epi32 ((v)[3], 16), (v)[3], \
           _mm_slli_epi32 ((v)[4], 16), _mm_srli_epi32 ((v)[5], 16), \
           _mm_srli_epi32 ((v)[6], 16), (v)[6], \
           _mm_and_si128 ((v)[7], s61_lo_), _mm_slli_epi32 ((v)[7], 16), _mm_srli_epi32 ((v)[7], 16)); \
    \
    XOR15 ((h)[3], \
           _mm_slli_epi32 ((v)[0], 16), _mm_srli_epi32 ((v)[0], 16), _mm_and_si128 ((v)[0], s61_hi_), \
           _mm_and_si128 ((v)[1], s61_hi_), _mm_srli_epi32 ((v)[1], 16), \
           _mm_slli_epi32 ((v)[2], 16), _mm_srli_epi32 ((v)[2], 16), (v)[2], \
           _mm_slli_epi32 ((v)[3], 16), _mm_srli_epi32 ((v)[4], 16), _mm_slli_epi32 ((v)[5], 16), \
           (v)[4], \
           _mm_and_si128 ((v)[7], s61_lo_), \
           _mm_slli_epi32 ((v)[6], 16), \
           _mm_srli_epi32 ((v)[7], 16)); \
    \
    XOR14 ((h)[4], \
           _mm_srli_epi32 ((v)[0], 16), _mm_slli_epi32 ((v)[1], 16), _mm_srli_epi32 ((v)[2], 16), \
           (v)[1], \
           _mm_slli_epi32 ((v)[3], 16), \
           (v)[2], \
           _mm_srli_epi32 ((v)[3], 16), (v)[3], \
           _mm_slli_epi32 ((v)[4], 16), _mm_srli_epi32 ((v)[5], 16), _mm_slli_epi32 ((v)[6], 16), \
           (v)[5], \
           _mm_srli_epi32 ((v)[6], 16), _mm_slli_epi32 ((v)[7], 16)); \
    \
    XOR19 ((h)[5], \
           _mm_and_si128 ((v)[0], s61_hi_), _mm_slli_epi32 ((v)[0], 16), \
           _mm_slli_epi32 ((v)[1], 16), _mm_srli_epi32 ((v)[1], 16), _mm_and_si128 ((v)[1], s61_hi_), \
           _mm_slli_epi32 ((v)[2], 16), _mm_srli_epi32 ((v)[3], 16), \
           (v)[2], \
           _mm_slli_epi32 ((v)[4], 16), \
           (v)[3], \
           _mm_srli_epi32 ((v)[4], 16), (v)[4], \
           _mm_slli_epi32 ((v)[5], 16), \
           _mm_slli_epi32 ((v)[6], 16), _mm_srli_epi32 ((v)[6], 16), (v)[6], \
           _mm_slli_epi32 ((v)[7], 16), _mm_srli_epi32 ((v)[7], 16), _mm_and_si128 ((v)[7], s61_hi_)); \
    \
    XOR15 ((h)[6], \
           (v)[0], \
           (v)[2], _mm_srli_epi32 ((v)[2], 16), \
           (v)[3], _mm_slli_epi32 ((v)[3], 16), \
           (v)[4], _mm_srli_epi32 ((v)[4], 16), \
           _mm_slli_epi32 ((v)[5], 16), _mm_srli_epi32 ((v)[5], 16), (v)[5], \
           _mm_slli_epi32 ((v)[6], 16), _mm_srli_epi32 ((v)[6], 16), \
           _mm_slli_epi32 ((v)[7], 16), \
           (v)[6], \
           (v)[7]); \
    \
    XOR15 ((h)[7], \
           _mm_srli_epi32 ((v)[0], 16), (v)[0], \
           _mm_slli_epi32 ((v)[1], 16), _mm_srli_epi32 ((v)[1], 16), \
           _mm_slli_epi32 ((v)[2], 16), _mm_srli_epi32 ((v)[3], 16), _mm_slli_epi32 ((v)[4], 16), \
           (v)[3], \
           _mm_srli_epi32 ((v)[5], 16), \
           (v)[4], \
           _mm_slli_epi32 ((v)[6], 16), \
           (v)[5], \
           _mm_srli_epi32 ((v)[6], 16), _mm_slli_epi32 ((v)[7], 16), \
           (v)[7]); \
  } while (0)
1266
/*
 * PASS0_SSE2 (h, s, u, v): first of the four passes of one compression step.
 *
 *   w = u ^ v                 (X_SSE2)
 *   k = byte-transposed w     (P_SSE2)
 *   R_SSE2 (k, h, s, 0)       -- stores its result in s[0] and s[1]
 *   u = A (u), v = A (A (v))  (A_SSE2 / AA_SSE2) to prepare the next pass
 *
 * u and v are modified in place. The macro is a single statement; terminate
 * the call with a semicolon.
 */
#define PASS0_SSE2(h,s,u,v) \
  do \
  { \
    __m128i k[8]; \
    __m128i w[8]; \
    \
    X_SSE2 (w, u, v); \
    P_SSE2 (k, w); \
    R_SSE2 (k, h, s, 0); \
    A_SSE2 (u); \
    AA_SSE2 (v); \
  } while (0)
1277
/*
 * PASS2_SSE2 (h, s, u, v): second of the four passes of one compression step.
 *
 *   w = u ^ v                 (X_SSE2)
 *   k = byte-transposed w     (P_SSE2)
 *   R_SSE2 (k, h, s, 2)       -- stores its result in s[2] and s[3]
 *   u = C (A (u))             (A_SSE2, then the constant XOR C_SSE2)
 *   v = A (A (v))             (AA_SSE2)
 *
 * This is the only pass that applies C_SSE2. u and v are modified in place.
 * The macro is a single statement; terminate the call with a semicolon.
 */
#define PASS2_SSE2(h,s,u,v) \
  do \
  { \
    __m128i k[8]; \
    __m128i w[8]; \
    \
    X_SSE2 (w, u, v); \
    P_SSE2 (k, w); \
    R_SSE2 (k, h, s, 2); \
    A_SSE2 (u); \
    C_SSE2 (u); \
    AA_SSE2 (v); \
  } while (0)
1289
/*
 * PASS4_SSE2 (h, s, u, v): third of the four passes of one compression step.
 *
 *   w = u ^ v                 (X_SSE2)
 *   k = byte-transposed w     (P_SSE2)
 *   R_SSE2 (k, h, s, 4)       -- stores its result in s[4] and s[5]
 *   u = A (u), v = A (A (v))  (A_SSE2 / AA_SSE2) to prepare the next pass
 *
 * u and v are modified in place. The macro is a single statement; terminate
 * the call with a semicolon.
 */
#define PASS4_SSE2(h,s,u,v) \
  do \
  { \
    __m128i k[8]; \
    __m128i w[8]; \
    \
    X_SSE2 (w, u, v); \
    P_SSE2 (k, w); \
    R_SSE2 (k, h, s, 4); \
    A_SSE2 (u); \
    AA_SSE2 (v); \
  } while (0)
1300
/*
 * PASS6_SSE2 (h, s, u, v): last of the four passes of one compression step.
 *
 *   w = u ^ v                 (X_SSE2)
 *   k = byte-transposed w     (P_SSE2)
 *   R_SSE2 (k, h, s, 6)       -- stores its result in s[6] and s[7]
 *
 * Unlike the earlier passes, u and v are left untouched because no further
 * pass follows. The macro is a single statement; terminate the call with a
 * semicolon.
 */
#define PASS6_SSE2(h,s,u,v) \
  do \
  { \
    __m128i k[8]; \
    __m128i w[8]; \
    \
    X_SSE2 (w, u, v); \
    P_SSE2 (k, w); \
    R_SSE2 (k, h, s, 6); \
  } while (0)
1309
1310
1311 //////////////////////
1312 // SSE2 DEFINES END //
1313 //////////////////////
1314
1315
/**
 * Scalar digest computation for four independent inputs, one per lane.
 *
 * For each lane id = 0..3 the function
 *   1. loads the eight message words blocks[0..7][id];
 *   2. starts with state[0..7] = 0 and keeps a copy of the message words in
 *      state[8..15];
 *   3. runs three compression steps (PASS0/2/4/6 followed by
 *      SHIFT12/16/61, all defined elsewhere in this file) over
 *        gost1: the message words,
 *        gost2: a block whose first word is blocks[15][id] and whose other
 *               words are zero (presumably the length block -- confirm
 *               with the caller),
 *        gost3: state[8..15] (presumably the checksum, which for a single
 *               block equals the message -- confirm);
 *   4. stores state[0..7] to digests[0..7][id] and applies BYTESWAP to each
 *      stored word.
 *
 * blocks[8..14] are not read.
 *
 * @param digests  output, digests[word][lane]
 * @param blocks   input,  blocks[word][lane]
 */
void hashcat_gost_64 (uint32_t digests[8][4], uint32_t blocks[16][4])
{
  int id;

  for (id = 0; id < 4; id++)
  {
    uint32_t data[8];
    uint32_t state[16];
    uint32_t state_m[8];
    uint32_t data_m[8];
    uint32_t tmp[8];
    int word;

    /* load this lane's message and set up the initial state */

    for (word = 0; word < 8; word++)
    {
      data[word]      = blocks[word][id];
      state[word]     = 0;
      state[word + 8] = data[word];
    }

    /* gost1: the PASS macros modify their last two arguments, so work on copies */

    for (word = 0; word < 8; word++)
    {
      state_m[word] = state[word];
      data_m[word]  = data[word];
    }

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

    /* gost2: block made of blocks[15] followed by zeros */

    data[0] = blocks[15][id];

    for (word = 1; word < 8; word++)
    {
      data[word] = 0;
    }

    for (word = 0; word < 8; word++)
    {
      state_m[word] = state[word];
      data_m[word]  = data[word];
    }

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

    /* gost3: block taken from the upper half of the state */

    for (word = 0; word < 8; word++)
    {
      data[word] = state[word + 8];
    }

    for (word = 0; word < 8; word++)
    {
      state_m[word] = state[word];
      data_m[word]  = data[word];
    }

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

    /* store */

    for (word = 0; word < 8; word++)
    {
      digests[word][id] = state[word];

      BYTESWAP (digests[word][id]);
    }
  }
}
1487
1488
/**
 * SSE2 digest computation: the same sequence as hashcat_gost_64, with the
 * four inputs packed into the four 32-bit lanes of each __m128i.
 *
 * The function
 *   1. loads the eight message vectors blocks[0..7];
 *   2. starts with state[0..7] = 0 and keeps a copy of the message in
 *      state[8..15];
 *   3. runs three compression steps (PASS0/2/4/6_SSE2 followed by
 *      SHIFT12/16/61_SSE2) over
 *        gost1: the message vectors,
 *        gost2: a block whose first vector is blocks[15] and whose other
 *               vectors are zero (presumably the length block -- confirm
 *               with the caller),
 *        gost3: state[8..15] (presumably the checksum, which for a single
 *               block equals the message -- confirm);
 *   4. applies BYTESWAP (defined elsewhere in this file) to every 32-bit
 *      lane of state[0..7] and stores the result in digests[0..7].
 *
 * blocks[8..14] are not read.
 *
 * @param digests  output, one vector per digest word
 * @param blocks   input, one vector per message word
 */
void hashcat_gost_64_sse2 (__m128i digests[8], __m128i blocks[16])
{
  const __m128i zero = _mm_set1_epi32 (0);

  __m128i data[8];
  __m128i state[16];
  __m128i state_m[8];
  __m128i data_m[8];
  __m128i tmp[8];

  int word;

  /* load the message and set up the initial state */

  for (word = 0; word < 8; word++)
  {
    data[word]      = blocks[word];
    state[word]     = zero;
    state[word + 8] = data[word];
  }

  /* gost1: the PASS macros modify their last two arguments, so work on copies */

  for (word = 0; word < 8; word++)
  {
    state_m[word] = state[word];
    data_m[word]  = data[word];
  }

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

  /* gost2: block made of blocks[15] followed by zeros */

  data[0] = blocks[15];

  for (word = 1; word < 8; word++)
  {
    data[word] = zero;
  }

  for (word = 0; word < 8; word++)
  {
    state_m[word] = state[word];
    data_m[word]  = data[word];
  }

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

  /* gost3: block taken from the upper half of the state */

  for (word = 0; word < 8; word++)
  {
    data[word] = state[word + 8];
  }

  for (word = 0; word < 8; word++)
  {
    state_m[word] = state[word];
    data_m[word]  = data[word];
  }

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

  /* store */

  for (word = 0; word < 8; word++)
  {
    /*
     * Swap each lane through a real uint32_t buffer instead of casting the
     * __m128i to uint32_t *, which would type-pun the vector object.
     */

    uint32_t lanes[4];

    _mm_storeu_si128 ((__m128i *) lanes, state[word]);

    BYTESWAP (lanes[0]);
    BYTESWAP (lanes[1]);
    BYTESWAP (lanes[2]);
    BYTESWAP (lanes[3]);

    digests[word] = _mm_loadu_si128 ((const __m128i *) lanes);
  }
}