/**
 * Author......: Jens Steube <jens.steube@gmail.com>
 * License.....: MIT
 */

static const uint32_t tables[4][256] =
{
  {
    0x00072000, 0x00075000, 0x00074800, 0x00071000,
    0x00076800, 0x00074000, 0x00070000, 0x00077000,
    0x00073000, 0x00075800, 0x00070800, 0x00076000,
    0x00073800, 0x00077800, 0x00072800, 0x00071800,
    0x0005a000, 0x0005d000, 0x0005c800, 0x00059000,
    0x0005e800, 0x0005c000, 0x00058000, 0x0005f000,
    0x0005b000, 0x0005d800, 0x00058800, 0x0005e000,
    0x0005b800, 0x0005f800, 0x0005a800, 0x00059800,
    0x00022000, 0x00025000, 0x00024800, 0x00021000,
    0x00026800, 0x00024000, 0x00020000, 0x00027000,
    0x00023000, 0x00025800, 0x00020800, 0x00026000,
    0x00023800, 0x00027800, 0x00022800, 0x00021800,
    0x00062000, 0x00065000, 0x00064800, 0x00061000,
    0x00066800, 0x00064000, 0x00060000, 0x00067000,
    0x00063000, 0x00065800, 0x00060800, 0x00066000,
    0x00063800, 0x00067800, 0x00062800, 0x00061800,
    0x00032000, 0x00035000, 0x00034800, 0x00031000,
    0x00036800, 0x00034000, 0x00030000, 0x00037000,
    0x00033000, 0x00035800, 0x00030800, 0x00036000,
    0x00033800, 0x00037800, 0x00032800, 0x00031800,
    0x0006a000, 0x0006d000, 0x0006c800, 0x00069000,
    0x0006e800, 0x0006c000, 0x00068000, 0x0006f000,
    0x0006b000, 0x0006d800, 0x00068800, 0x0006e000,
    0x0006b800, 0x0006f800, 0x0006a800, 0x00069800,
    0x0007a000, 0x0007d000, 0x0007c800, 0x00079000,
    0x0007e800, 0x0007c000, 0x00078000, 0x0007f000,
    0x0007b000, 0x0007d800, 0x00078800, 0x0007e000,
    0x0007b800, 0x0007f800, 0x0007a800, 0x00079800,
    0x00052000, 0x00055000, 0x00054800, 0x00051000,
    0x00056800, 0x00054000, 0x00050000, 0x00057000,
    0x00053000, 0x00055800, 0x00050800, 0x00056000,
    0x00053800, 0x00057800, 0x00052800, 0x00051800,
    0x00012000, 0x00015000, 0x00014800, 0x00011000,
    0x00016800, 0x00014000, 0x00010000, 0x00017000,
    0x00013000, 0x00015800, 0x00010800, 0x00016000,
    0x00013800, 0x00017800, 0x00012800, 0x00011800,
    0x0001a000, 0x0001d000, 0x0001c800, 0x00019000,
    0x0001e800, 0x0001c000, 0x00018000, 0x0001f000,
    0x0001b000, 0x0001d800, 0x00018800, 0x0001e000,
    0x0001b800, 0x0001f800, 0x0001a800, 0x00019800,
    0x00042000, 0x00045000, 0x00044800, 0x00041000,
    0x00046800, 0x00044000, 0x00040000, 0x00047000,
    0x00043000, 0x00045800, 0x00040800, 0x00046000,
    0x00043800, 0x00047800, 0x00042800, 0x00041800,
    0x0000a000, 0x0000d000, 0x0000c800, 0x00009000,
    0x0000e800, 0x0000c000, 0x00008000, 0x0000f000,
    0x0000b000, 0x0000d800, 0x00008800, 0x0000e000,
    0x0000b800, 0x0000f800, 0x0000a800, 0x00009800,
    0x00002000, 0x00005000, 0x00004800, 0x00001000,
    0x00006800, 0x00004000, 0x00000000, 0x00007000,
    0x00003000, 0x00005800, 0x00000800, 0x00006000,
    0x00003800, 0x00007800, 0x00002800, 0x00001800,
    0x0003a000, 0x0003d000, 0x0003c800, 0x00039000,
    0x0003e800, 0x0003c000, 0x00038000, 0x0003f000,
    0x0003b000, 0x0003d800, 0x00038800, 0x0003e000,
    0x0003b800, 0x0003f800, 0x0003a800, 0x00039800,
    0x0002a000, 0x0002d000, 0x0002c800, 0x00029000,
    0x0002e800, 0x0002c000, 0x00028000, 0x0002f000,
    0x0002b000, 0x0002d800, 0x00028800, 0x0002e000,
    0x0002b800, 0x0002f800, 0x0002a800, 0x00029800,
    0x0004a000, 0x0004d000, 0x0004c800, 0x00049000,
    0x0004e800, 0x0004c000, 0x00048000, 0x0004f000,
    0x0004b000, 0x0004d800, 0x00048800, 0x0004e000,
    0x0004b800, 0x0004f800, 0x0004a800, 0x00049800,
  },
  {
    0x03a80000, 0x03c00000, 0x03880000, 0x03e80000,
    0x03d00000, 0x03980000, 0x03a00000, 0x03900000,
    0x03f00000, 0x03f80000, 0x03e00000, 0x03b80000,
    0x03b00000, 0x03800000, 0x03c80000, 0x03d80000,
    0x06a80000, 0x06c00000, 0x06880000, 0x06e80000,
    0x06d00000, 0x06980000, 0x06a00000, 0x06900000,
    0x06f00000, 0x06f80000, 0x06e00000, 0x06b80000,
    0x06b00000, 0x06800000, 0x06c80000, 0x06d80000,
    0x05280000, 0x05400000, 0x05080000, 0x05680000,
    0x05500000, 0x05180000, 0x05200000, 0x05100000,
    0x05700000, 0x05780000, 0x05600000, 0x05380000,
    0x05300000, 0x05000000, 0x05480000, 0x05580000,
    0x00a80000, 0x00c00000, 0x00880000, 0x00e80000,
    0x00d00000, 0x00980000, 0x00a00000, 0x00900000,
    0x00f00000, 0x00f80000, 0x00e00000, 0x00b80000,
    0x00b00000, 0x00800000, 0x00c80000, 0x00d80000,
    0x00280000, 0x00400000, 0x00080000, 0x00680000,
    0x00500000, 0x00180000, 0x00200000, 0x00100000,
    0x00700000, 0x00780000, 0x00600000, 0x00380000,
    0x00300000, 0x00000000, 0x00480000, 0x00580000,
    0x04280000, 0x04400000, 0x04080000, 0x04680000,
    0x04500000, 0x04180000, 0x04200000, 0x04100000,
    0x04700000, 0x04780000, 0x04600000, 0x04380000,
    0x04300000, 0x04000000, 0x04480000, 0x04580000,
    0x04a80000, 0x04c00000, 0x04880000, 0x04e80000,
    0x04d00000, 0x04980000, 0x04a00000, 0x04900000,
    0x04f00000, 0x04f80000, 0x04e00000, 0x04b80000,
    0x04b00000, 0x04800000, 0x04c80000, 0x04d80000,
    0x07a80000, 0x07c00000, 0x07880000, 0x07e80000,
    0x07d00000, 0x07980000, 0x07a00000, 0x07900000,
    0x07f00000, 0x07f80000, 0x07e00000, 0x07b80000,
    0x07b00000, 0x07800000, 0x07c80000, 0x07d80000,
    0x07280000, 0x07400000, 0x07080000, 0x07680000,
    0x07500000, 0x07180000, 0x07200000, 0x07100000,
    0x07700000, 0x07780000, 0x07600000, 0x07380000,
    0x07300000, 0x07000000, 0x07480000, 0x07580000,
    0x02280000, 0x02400000, 0x02080000, 0x02680000,
    0x02500000, 0x02180000, 0x02200000, 0x02100000,
    0x02700000, 0x02780000, 0x02600000, 0x02380000,
    0x02300000, 0x02000000, 0x02480000, 0x02580000,
    0x03280000, 0x03400000, 0x03080000, 0x03680000,
    0x03500000, 0x03180000, 0x03200000, 0x03100000,
    0x03700000, 0x03780000, 0x03600000, 0x03380000,
    0x03300000, 0x03000000, 0x03480000, 0x03580000,
    0x06280000, 0x06400000, 0x06080000, 0x06680000,
    0x06500000, 0x06180000, 0x06200000, 0x06100000,
    0x06700000, 0x06780000, 0x06600000, 0x06380000,
    0x06300000, 0x06000000, 0x06480000, 0x06580000,
    0x05a80000, 0x05c00000, 0x05880000, 0x05e80000,
    0x05d00000, 0x05980000, 0x05a00000, 0x05900000,
    0x05f00000, 0x05f80000, 0x05e00000, 0x05b80000,
    0x05b00000, 0x05800000, 0x05c80000, 0x05d80000,
    0x01280000, 0x01400000, 0x01080000, 0x01680000,
    0x01500000, 0x01180000, 0x01200000, 0x01100000,
    0x01700000, 0x01780000, 0x01600000, 0x01380000,
    0x01300000, 0x01000000, 0x01480000, 0x01580000,
    0x02a80000, 0x02c00000, 0x02880000, 0x02e80000,
    0x02d00000, 0x02980000, 0x02a00000, 0x02900000,
    0x02f00000, 0x02f80000, 0x02e00000, 0x02b80000,
    0x02b00000, 0x02800000, 0x02c80000, 0x02d80000,
    0x01a80000, 0x01c00000, 0x01880000, 0x01e80000,
    0x01d00000, 0x01980000, 0x01a00000, 0x01900000,
    0x01f00000, 0x01f80000, 0x01e00000, 0x01b80000,
    0x01b00000, 0x01800000, 0x01c80000, 0x01d80000,
  },
  {
    0x30000002, 0x60000002, 0x38000002, 0x08000002,
    0x28000002, 0x78000002, 0x68000002, 0x40000002,
    0x20000002, 0x50000002, 0x48000002, 0x70000002,
    0x00000002, 0x18000002, 0x58000002, 0x10000002,
    0xb0000005, 0xe0000005, 0xb8000005, 0x88000005,
    0xa8000005, 0xf8000005, 0xe8000005, 0xc0000005,
    0xa0000005, 0xd0000005, 0xc8000005, 0xf0000005,
    0x80000005, 0x98000005, 0xd8000005, 0x90000005,
    0x30000005, 0x60000005, 0x38000005, 0x08000005,
    0x28000005, 0x78000005, 0x68000005, 0x40000005,
    0x20000005, 0x50000005, 0x48000005, 0x70000005,
    0x00000005, 0x18000005, 0x58000005, 0x10000005,
    0x30000000, 0x60000000, 0x38000000, 0x08000000,
    0x28000000, 0x78000000, 0x68000000, 0x40000000,
    0x20000000, 0x50000000, 0x48000000, 0x70000000,
    0x00000000, 0x18000000, 0x58000000, 0x10000000,
    0xb0000003, 0xe0000003, 0xb8000003, 0x88000003,
    0xa8000003, 0xf8000003, 0xe8000003, 0xc0000003,
    0xa0000003, 0xd0000003, 0xc8000003, 0xf0000003,
    0x80000003, 0x98000003, 0xd8000003, 0x90000003,
    0x30000001, 0x60000001, 0x38000001, 0x08000001,
    0x28000001, 0x78000001, 0x68000001, 0x40000001,
    0x20000001, 0x50000001, 0x48000001, 0x70000001,
    0x00000001, 0x18000001, 0x58000001, 0x10000001,
    0xb0000000, 0xe0000000, 0xb8000000, 0x88000000,
    0xa8000000, 0xf8000000, 0xe8000000, 0xc0000000,
    0xa0000000, 0xd0000000, 0xc8000000, 0xf0000000,
    0x80000000, 0x98000000, 0xd8000000, 0x90000000,
    0xb0000006, 0xe0000006, 0xb8000006, 0x88000006,
    0xa8000006, 0xf8000006, 0xe8000006, 0xc0000006,
    0xa0000006, 0xd0000006, 0xc8000006, 0xf0000006,
    0x80000006, 0x98000006, 0xd8000006, 0x90000006,
    0xb0000001, 0xe0000001, 0xb8000001, 0x88000001,
    0xa8000001, 0xf8000001, 0xe8000001, 0xc0000001,
    0xa0000001, 0xd0000001, 0xc8000001, 0xf0000001,
    0x80000001, 0x98000001, 0xd8000001, 0x90000001,
    0x30000003, 0x60000003, 0x38000003, 0x08000003,
    0x28000003, 0x78000003, 0x68000003, 0x40000003,
    0x20000003, 0x50000003, 0x48000003, 0x70000003,
    0x00000003, 0x18000003, 0x58000003, 0x10000003,
    0x30000004, 0x60000004, 0x38000004, 0x08000004,
    0x28000004, 0x78000004, 0x68000004, 0x40000004,
    0x20000004, 0x50000004, 0x48000004, 0x70000004,
    0x00000004, 0x18000004, 0x58000004, 0x10000004,
    0xb0000002, 0xe0000002, 0xb8000002, 0x88000002,
    0xa8000002, 0xf8000002, 0xe8000002, 0xc0000002,
    0xa0000002, 0xd0000002, 0xc8000002, 0xf0000002,
    0x80000002, 0x98000002, 0xd8000002, 0x90000002,
    0xb0000004, 0xe0000004, 0xb8000004, 0x88000004,
    0xa8000004, 0xf8000004, 0xe8000004, 0xc0000004,
    0xa0000004, 0xd0000004, 0xc8000004, 0xf0000004,
    0x80000004, 0x98000004, 0xd8000004, 0x90000004,
    0x30000006, 0x60000006, 0x38000006, 0x08000006,
    0x28000006, 0x78000006, 0x68000006, 0x40000006,
    0x20000006, 0x50000006, 0x48000006, 0x70000006,
    0x00000006, 0x18000006, 0x58000006, 0x10000006,
    0xb0000007, 0xe0000007, 0xb8000007, 0x88000007,
    0xa8000007, 0xf8000007, 0xe8000007, 0xc0000007,
    0xa0000007, 0xd0000007, 0xc8000007, 0xf0000007,
    0x80000007, 0x98000007, 0xd8000007, 0x90000007,
    0x30000007, 0x60000007, 0x38000007, 0x08000007,
    0x28000007, 0x78000007, 0x68000007, 0x40000007,
    0x20000007, 0x50000007, 0x48000007, 0x70000007,
    0x00000007, 0x18000007, 0x58000007, 0x10000007,
  },
  {
    0x000000e8, 0x000000d8, 0x000000a0, 0x00000088,
    0x00000098, 0x000000f8, 0x000000a8, 0x000000c8,
    0x00000080, 0x000000d0, 0x000000f0, 0x000000b8,
    0x000000b0, 0x000000c0, 0x00000090, 0x000000e0,
    0x000007e8, 0x000007d8, 0x000007a0, 0x00000788,
    0x00000798, 0x000007f8, 0x000007a8, 0x000007c8,
    0x00000780, 0x000007d0, 0x000007f0, 0x000007b8,
    0x000007b0, 0x000007c0, 0x00000790, 0x000007e0,
    0x000006e8, 0x000006d8, 0x000006a0, 0x00000688,
    0x00000698, 0x000006f8, 0x000006a8, 0x000006c8,
    0x00000680, 0x000006d0, 0x000006f0, 0x000006b8,
    0x000006b0, 0x000006c0, 0x00000690, 0x000006e0,
    0x00000068, 0x00000058, 0x00000020, 0x00000008,
    0x00000018, 0x00000078, 0x00000028, 0x00000048,
    0x00000000, 0x00000050, 0x00000070, 0x00000038,
    0x00000030, 0x00000040, 0x00000010, 0x00000060,
    0x000002e8, 0x000002d8, 0x000002a0, 0x00000288,
    0x00000298, 0x000002f8, 0x000002a8, 0x000002c8,
    0x00000280, 0x000002d0, 0x000002f0, 0x000002b8,
    0x000002b0, 0x000002c0, 0x00000290, 0x000002e0,
    0x000003e8, 0x000003d8, 0x000003a0, 0x00000388,
    0x00000398, 0x000003f8, 0x000003a8, 0x000003c8,
    0x00000380, 0x000003d0, 0x000003f0, 0x000003b8,
    0x000003b0, 0x000003c0, 0x00000390, 0x000003e0,
    0x00000568, 0x00000558, 0x00000520, 0x00000508,
    0x00000518, 0x00000578, 0x00000528, 0x00000548,
    0x00000500, 0x00000550, 0x00000570, 0x00000538,
    0x00000530, 0x00000540, 0x00000510, 0x00000560,
    0x00000268, 0x00000258, 0x00000220, 0x00000208,
    0x00000218, 0x00000278, 0x00000228, 0x00000248,
    0x00000200, 0x00000250, 0x00000270, 0x00000238,
    0x00000230, 0x00000240, 0x00000210, 0x00000260,
    0x000004e8, 0x000004d8, 0x000004a0, 0x00000488,
    0x00000498, 0x000004f8, 0x000004a8, 0x000004c8,
    0x00000480, 0x000004d0, 0x000004f0, 0x000004b8,
    0x000004b0, 0x000004c0, 0x00000490, 0x000004e0,
    0x00000168, 0x00000158, 0x00000120, 0x00000108,
    0x00000118, 0x00000178, 0x00000128, 0x00000148,
    0x00000100, 0x00000150, 0x00000170, 0x00000138,
    0x00000130, 0x00000140, 0x00000110, 0x00000160,
    0x000001e8, 0x000001d8, 0x000001a0, 0x00000188,
    0x00000198, 0x000001f8, 0x000001a8, 0x000001c8,
    0x00000180, 0x000001d0, 0x000001f0, 0x000001b8,
    0x000001b0, 0x000001c0, 0x00000190, 0x000001e0,
    0x00000768, 0x00000758, 0x00000720, 0x00000708,
    0x00000718, 0x00000778, 0x00000728, 0x00000748,
    0x00000700, 0x00000750, 0x00000770, 0x00000738,
    0x00000730, 0x00000740, 0x00000710, 0x00000760,
    0x00000368, 0x00000358, 0x00000320, 0x00000308,
    0x00000318, 0x00000378, 0x00000328, 0x00000348,
    0x00000300, 0x00000350, 0x00000370, 0x00000338,
    0x00000330, 0x00000340, 0x00000310, 0x00000360,
    0x000005e8, 0x000005d8, 0x000005a0, 0x00000588,
    0x00000598, 0x000005f8, 0x000005a8, 0x000005c8,
    0x00000580, 0x000005d0, 0x000005f0, 0x000005b8,
    0x000005b0, 0x000005c0, 0x00000590, 0x000005e0,
    0x00000468, 0x00000458, 0x00000420, 0x00000408,
    0x00000418, 0x00000478, 0x00000428, 0x00000448,
    0x00000400, 0x00000450, 0x00000470, 0x00000438,
    0x00000430, 0x00000440, 0x00000410, 0x00000460,
    0x00000668, 0x00000658, 0x00000620, 0x00000608,
    0x00000618, 0x00000678, 0x00000628, 0x00000648,
    0x00000600, 0x00000650, 0x00000670, 0x00000638,
    0x00000630, 0x00000640, 0x00000610, 0x00000660,
  }
};

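/*
 * The four tables above are the usual "fused" form of the GOST 28147-89 round
 * function: each entry already contains the two 4-bit s-box substitutions for
 * one input byte, shifted into its byte position and rotated left by 11 bits,
 * so a round reduces to four table lookups and XORs (see round() below).
 *
 * For illustration only: such tables are typically generated from eight 4-bit
 * s-boxes as sketched here. The function name and the sbox[][] parameter are
 * placeholders; the s-box values folded into the tables are not shown in this
 * file.
 */
static void gost_build_tables (uint32_t t[4][256], const uint8_t sbox[8][16])
{
  int i;
  int b;

  for (i = 0; i < 4; i++)
  {
    for (b = 0; b < 256; b++)
    {
      /* substitute both nibbles, move to byte position i, rotate left by 11 */
      uint32_t x = (uint32_t) ((sbox[2 * i + 1][b >> 4] << 4) | sbox[2 * i][b & 15]) << (8 * i);

      t[i][b] = (x << 11) | (x >> 21);
    }
  }
}
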
#define round(k1,k2)                  \
{                                     \
  uint32_t t;                         \
  t = (k1) + r;                       \
  l ^= tables[0][(t >>  0) & 0xff] ^  \
       tables[1][(t >>  8) & 0xff] ^  \
       tables[2][(t >> 16) & 0xff] ^  \
       tables[3][(t >> 24) & 0xff];   \
  t = (k2) + l;                       \
  r ^= tables[0][(t >>  0) & 0xff] ^  \
       tables[1][(t >>  8) & 0xff] ^  \
       tables[2][(t >> 16) & 0xff] ^  \
       tables[3][(t >> 24) & 0xff];   \
}

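/*
 * round() above performs two consecutive GOST 28147-89 rounds, one on each
 * block half, so no explicit swap is needed; the s-box and 11-bit rotate are
 * folded into tables[][]. It expects uint32_t variables l and r in the
 * enclosing scope. Note: the macro name collides with round() from <math.h>
 * if that header is visible in the same translation unit.
 */
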
#define R(k,h,s,i)    \
{                     \
  uint32_t r;         \
  uint32_t l;         \
  r = h[i + 0];       \
  l = h[i + 1];       \
  round (k[0], k[1]); \
  round (k[2], k[3]); \
  round (k[4], k[5]); \
  round (k[6], k[7]); \
  round (k[0], k[1]); \
  round (k[2], k[3]); \
  round (k[4], k[5]); \
  round (k[6], k[7]); \
  round (k[0], k[1]); \
  round (k[2], k[3]); \
  round (k[4], k[5]); \
  round (k[6], k[7]); \
  round (k[7], k[6]); \
  round (k[5], k[4]); \
  round (k[3], k[2]); \
  round (k[1], k[0]); \
  s[i + 0] = l;       \
  s[i + 1] = r;       \
}

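/*
 * R() above encrypts the 64-bit block h[i], h[i+1] with GOST 28147-89 under
 * the round keys k[0..7]: the schedule is k0..k7 three times followed by
 * k7..k0 in reverse, i.e. 32 rounds in total. The ciphertext lands in s[i],
 * s[i+1] and is later consumed by the output transformation (SHIFT12/16/61).
 */
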
#define X(w,u,v)      \
  w[0] = u[0] ^ v[0]; \
  w[1] = u[1] ^ v[1]; \
  w[2] = u[2] ^ v[2]; \
  w[3] = u[3] ^ v[3]; \
  w[4] = u[4] ^ v[4]; \
  w[5] = u[5] ^ v[5]; \
  w[6] = u[6] ^ v[6]; \
  w[7] = u[7] ^ v[7];

#define P(k,w)                        \
  k[0] = ((w[0] & 0x000000ff) <<  0)  \
       | ((w[2] & 0x000000ff) <<  8)  \
       | ((w[4] & 0x000000ff) << 16)  \
       | ((w[6] & 0x000000ff) << 24); \
  k[1] = ((w[0] & 0x0000ff00) >>  8)  \
       | ((w[2] & 0x0000ff00) >>  0)  \
       | ((w[4] & 0x0000ff00) <<  8)  \
       | ((w[6] & 0x0000ff00) << 16); \
  k[2] = ((w[0] & 0x00ff0000) >> 16)  \
       | ((w[2] & 0x00ff0000) >>  8)  \
       | ((w[4] & 0x00ff0000) <<  0)  \
       | ((w[6] & 0x00ff0000) <<  8); \
  k[3] = ((w[0] & 0xff000000) >> 24)  \
       | ((w[2] & 0xff000000) >> 16)  \
       | ((w[4] & 0xff000000) >>  8)  \
       | ((w[6] & 0xff000000) >>  0); \
  k[4] = ((w[1] & 0x000000ff) <<  0)  \
       | ((w[3] & 0x000000ff) <<  8)  \
       | ((w[5] & 0x000000ff) << 16)  \
       | ((w[7] & 0x000000ff) << 24); \
  k[5] = ((w[1] & 0x0000ff00) >>  8)  \
       | ((w[3] & 0x0000ff00) >>  0)  \
       | ((w[5] & 0x0000ff00) <<  8)  \
       | ((w[7] & 0x0000ff00) << 16); \
  k[6] = ((w[1] & 0x00ff0000) >> 16)  \
       | ((w[3] & 0x00ff0000) >>  8)  \
       | ((w[5] & 0x00ff0000) <<  0)  \
       | ((w[7] & 0x00ff0000) <<  8); \
  k[7] = ((w[1] & 0xff000000) >> 24)  \
       | ((w[3] & 0xff000000) >> 16)  \
       | ((w[5] & 0xff000000) >>  8)  \
       | ((w[7] & 0xff000000) >>  0);

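/*
 * X() above XORs two 256-bit vectors word by word; P() is the byte
 * transposition (the "P transformation" of GOST R 34.11-94) that turns the
 * 256-bit value w into the eight 32-bit round keys k[0..7] fed to R().
 */
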
#define A(x)        \
{                   \
  uint32_t l;       \
  uint32_t r;       \
  l = x[0] ^ x[2];  \
  r = x[1] ^ x[3];  \
  x[0] = x[2];      \
  x[1] = x[3];      \
  x[2] = x[4];      \
  x[3] = x[5];      \
  x[4] = x[6];      \
  x[5] = x[7];      \
  x[6] = l;         \
  x[7] = r;         \
}

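/*
 * A() above is the key-schedule transformation A(Y) of GOST R 34.11-94 on a
 * 256-bit value stored as eight 32-bit words: the XOR of the first two 64-bit
 * words moves to the top and the remaining words shift down. AA() below is
 * the same transformation applied twice, as used for the message-side input.
 */
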
#define AA(x)       \
{                   \
  uint32_t l;       \
  uint32_t r;       \
  l    = x[0];      \
  r    = x[2];      \
  x[0] = x[4];      \
  x[2] = x[6];      \
  x[4] = l ^ r;     \
  x[6] = x[0] ^ r;  \
  l    = x[1];      \
  r    = x[3];      \
  x[1] = x[5];      \
  x[3] = x[7];      \
  x[5] = l ^ r;     \
  x[7] = x[1] ^ r;  \
}

#define C(x)          \
  x[0] ^= 0xff00ff00; \
  x[1] ^= 0xff00ff00; \
  x[2] ^= 0x00ff00ff; \
  x[3] ^= 0x00ff00ff; \
  x[4] ^= 0x00ffff00; \
  x[5] ^= 0xff0000ff; \
  x[6] ^= 0x000000ff; \
  x[7] ^= 0xff00ffff;

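/*
 * C() above XORs in the constant C_3 of GOST R 34.11-94; it enters the key
 * schedule exactly once, in the step that prepares the third set of round
 * keys (the C (u) call inside PASS2 further down).
 */
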
#define SHIFT12(u,m,s)              \
  u[0] = m[0] ^ s[6];               \
  u[1] = m[1] ^ s[7];               \
  u[2] = m[2] ^ (s[0] << 16)        \
              ^ (s[0] >> 16)        \
              ^ (s[0] & 0x0000ffff) \
              ^ (s[1] & 0x0000ffff) \
              ^ (s[1] >> 16)        \
              ^ (s[2] << 16)        \
              ^ s[6]                \
              ^ (s[6] << 16)        \
              ^ (s[7] & 0xffff0000) \
              ^ (s[7] >> 16);       \
  u[3] = m[3] ^ (s[0] & 0x0000ffff) \
              ^ (s[0] << 16)        \
              ^ (s[1] & 0x0000ffff) \
              ^ (s[1] << 16)        \
              ^ (s[1] >> 16)        \
              ^ (s[2] << 16)        \
              ^ (s[2] >> 16)        \
              ^ (s[3] << 16)        \
              ^ s[6]                \
              ^ (s[6] << 16)        \
              ^ (s[6] >> 16)        \
              ^ (s[7] & 0x0000ffff) \
              ^ (s[7] << 16)        \
              ^ (s[7] >> 16);       \
  u[4] = m[4] ^ (s[0] & 0xffff0000) \
              ^ (s[0] << 16)        \
              ^ (s[0] >> 16)        \
              ^ (s[1] & 0xffff0000) \
              ^ (s[1] >> 16)        \
              ^ (s[2] << 16)        \
              ^ (s[2] >> 16)        \
              ^ (s[3] << 16)        \
              ^ (s[3] >> 16)        \
              ^ (s[4] << 16)        \
              ^ (s[6] << 16)        \
              ^ (s[6] >> 16)        \
              ^ (s[7] & 0x0000ffff) \
              ^ (s[7] << 16)        \
              ^ (s[7] >> 16);       \
  u[5] = m[5] ^ (s[0] << 16)        \
              ^ (s[0] >> 16)        \
              ^ (s[0] & 0xffff0000) \
              ^ (s[1] & 0x0000ffff) \
              ^ s[2]                \
              ^ (s[2] >> 16)        \
              ^ (s[3] << 16)        \
              ^ (s[3] >> 16)        \
              ^ (s[4] << 16)        \
              ^ (s[4] >> 16)        \
              ^ (s[5] << 16)        \
              ^ (s[6] << 16)        \
              ^ (s[6] >> 16)        \
              ^ (s[7] & 0xffff0000) \
              ^ (s[7] << 16)        \
              ^ (s[7] >> 16);       \
  u[6] = m[6] ^ s[0]                \
              ^ (s[1] >> 16)        \
              ^ (s[2] << 16)        \
              ^ s[3]                \
              ^ (s[3] >> 16)        \
              ^ (s[4] << 16)        \
              ^ (s[4] >> 16)        \
              ^ (s[5] << 16)        \
              ^ (s[5] >> 16)        \
              ^ s[6]                \
              ^ (s[6] << 16)        \
              ^ (s[6] >> 16)        \
              ^ (s[7] << 16);       \
  u[7] = m[7] ^ (s[0] & 0xffff0000) \
              ^ (s[0] << 16)        \
              ^ (s[1] & 0x0000ffff) \
              ^ (s[1] << 16)        \
              ^ (s[2] >> 16)        \
              ^ (s[3] << 16)        \
              ^ s[4]                \
              ^ (s[4] >> 16)        \
              ^ (s[5] << 16)        \
              ^ (s[5] >> 16)        \
              ^ (s[6] >> 16)        \
              ^ (s[7] & 0x0000ffff) \
              ^ (s[7] << 16)        \
              ^ (s[7] >> 16);

#define SHIFT16(h,v,u)              \
  v[0] = h[0] ^ (u[1] << 16)        \
              ^ (u[0] >> 16);       \
  v[1] = h[1] ^ (u[2] << 16)        \
              ^ (u[1] >> 16);       \
  v[2] = h[2] ^ (u[3] << 16)        \
              ^ (u[2] >> 16);       \
  v[3] = h[3] ^ (u[4] << 16)        \
              ^ (u[3] >> 16);       \
  v[4] = h[4] ^ (u[5] << 16)        \
              ^ (u[4] >> 16);       \
  v[5] = h[5] ^ (u[6] << 16)        \
              ^ (u[5] >> 16);       \
  v[6] = h[6] ^ (u[7] << 16)        \
              ^ (u[6] >> 16);       \
  v[7] = h[7] ^ (u[0] & 0xffff0000) \
              ^ (u[0] << 16)        \
              ^ (u[7] >> 16)        \
              ^ (u[1] & 0xffff0000) \
              ^ (u[1] << 16)        \
              ^ (u[6] << 16)        \
              ^ (u[7] & 0xffff0000);

#define SHIFT61(h,v)          \
  h[0] = (v[0] & 0xffff0000)  \
       ^ (v[0] << 16)         \
       ^ (v[0] >> 16)         \
       ^ (v[1] >> 16)         \
       ^ (v[1] & 0xffff0000)  \
       ^ (v[2] << 16)         \
       ^ (v[3] >> 16)         \
       ^ (v[4] << 16)         \
       ^ (v[5] >> 16)         \
       ^ v[5]                 \
       ^ (v[6] >> 16)         \
       ^ (v[7] << 16)         \
       ^ (v[7] >> 16)         \
       ^ (v[7] & 0x0000ffff); \
  h[1] = (v[0] << 16)         \
       ^ (v[0] >> 16)         \
       ^ (v[0] & 0xffff0000)  \
       ^ (v[1] & 0x0000ffff)  \
       ^ v[2]                 \
       ^ (v[2] >> 16)         \
       ^ (v[3] << 16)         \
       ^ (v[4] >> 16)         \
       ^ (v[5] << 16)         \
       ^ (v[6] << 16)         \
       ^ v[6]                 \
       ^ (v[7] & 0xffff0000)  \
       ^ (v[7] >> 16);        \
  h[2] = (v[0] & 0x0000ffff)  \
       ^ (v[0] << 16)         \
       ^ (v[1] << 16)         \
       ^ (v[1] >> 16)         \
       ^ (v[1] & 0xffff0000)  \
       ^ (v[2] << 16)         \
       ^ (v[3] >> 16)         \
       ^ v[3]                 \
       ^ (v[4] << 16)         \
       ^ (v[5] >> 16)         \
       ^ v[6]                 \
       ^ (v[6] >> 16)         \
       ^ (v[7] & 0x0000ffff)  \
       ^ (v[7] << 16)         \
       ^ (v[7] >> 16);        \
  h[3] = (v[0] << 16)         \
       ^ (v[0] >> 16)         \
       ^ (v[0] & 0xffff0000)  \
       ^ (v[1] & 0xffff0000)  \
       ^ (v[1] >> 16)         \
       ^ (v[2] << 16)         \
       ^ (v[2] >> 16)         \
       ^ v[2]                 \
       ^ (v[3] << 16)         \
       ^ (v[4] >> 16)         \
       ^ v[4]                 \
       ^ (v[5] << 16)         \
       ^ (v[6] << 16)         \
       ^ (v[7] & 0x0000ffff)  \
       ^ (v[7] >> 16);        \
  h[4] = (v[0] >> 16)         \
       ^ (v[1] << 16)         \
       ^ v[1]                 \
       ^ (v[2] >> 16)         \
       ^ v[2]                 \
       ^ (v[3] << 16)         \
       ^ (v[3] >> 16)         \
       ^ v[3]                 \
       ^ (v[4] << 16)         \
       ^ (v[5] >> 16)         \
       ^ v[5]                 \
       ^ (v[6] << 16)         \
       ^ (v[6] >> 16)         \
       ^ (v[7] << 16);        \
  h[5] = (v[0] << 16)         \
       ^ (v[0] & 0xffff0000)  \
       ^ (v[1] << 16)         \
       ^ (v[1] >> 16)         \
       ^ (v[1] & 0xffff0000)  \
       ^ (v[2] << 16)         \
       ^ v[2]                 \
       ^ (v[3] >> 16)         \
       ^ v[3]                 \
       ^ (v[4] << 16)         \
       ^ (v[4] >> 16)         \
       ^ v[4]                 \
       ^ (v[5] << 16)         \
       ^ (v[6] << 16)         \
       ^ (v[6] >> 16)         \
       ^ v[6]                 \
       ^ (v[7] << 16)         \
       ^ (v[7] >> 16)         \
       ^ (v[7] & 0xffff0000); \
  h[6] = v[0]                 \
       ^ v[2]                 \
       ^ (v[2] >> 16)         \
       ^ v[3]                 \
       ^ (v[3] << 16)         \
       ^ v[4]                 \
       ^ (v[4] >> 16)         \
       ^ (v[5] << 16)         \
       ^ (v[5] >> 16)         \
       ^ v[5]                 \
       ^ (v[6] << 16)         \
       ^ (v[6] >> 16)         \
       ^ v[6]                 \
       ^ (v[7] << 16)         \
       ^ v[7];                \
  h[7] = v[0]                 \
       ^ (v[0] >> 16)         \
       ^ (v[1] << 16)         \
       ^ (v[1] >> 16)         \
       ^ (v[2] << 16)         \
       ^ (v[3] >> 16)         \
       ^ v[3]                 \
       ^ (v[4] << 16)         \
       ^ v[4]                 \
       ^ (v[5] >> 16)         \
       ^ v[5]                 \
       ^ (v[6] << 16)         \
       ^ (v[6] >> 16)         \
       ^ (v[7] << 16)         \
       ^ v[7];

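/*
 * SHIFT12(), SHIFT16() and SHIFT61() above implement the output
 * transformation of GOST R 34.11-94 as flat, precomputed XOR expressions over
 * the 16-bit-word LFSR psi: SHIFT12 folds twelve psi steps of s and XORs in
 * the message block m, SHIFT16 applies one more psi step and XORs in the old
 * chaining value h (the name refers to the 16-bit shifts), and SHIFT61
 * applies the remaining 61 psi steps, i.e. the standard
 * H_new = psi^61(H ^ psi(M ^ psi^12(S))).
 */
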
#define PASS0(h,s,u,v)  \
{                       \
  uint32_t k[8];        \
  uint32_t w[8];        \
  X (w, u, v);          \
  P (k, w);             \
  R (k, h, s, 0);       \
  A (u);                \
  AA (v);               \
}

#define PASS2(h,s,u,v)  \
{                       \
  uint32_t k[8];        \
  uint32_t w[8];        \
  X (w, u, v);          \
  P (k, w);             \
  R (k, h, s, 2);       \
  A (u);                \
  C (u);                \
  AA (v);               \
}

#define PASS4(h,s,u,v)  \
{                       \
  uint32_t k[8];        \
  uint32_t w[8];        \
  X (w, u, v);          \
  P (k, w);             \
  R (k, h, s, 4);       \
  A (u);                \
  AA (v);               \
}

#define PASS6(h,s,u,v)  \
{                       \
  uint32_t k[8];        \
  uint32_t w[8];        \
  X (w, u, v);          \
  P (k, w);             \
  R (k, h, s, 6);       \
}

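/*
 * Illustrative only (nothing in this file calls it): a sketch of how the
 * macros above chain into one GOST R 34.11-94 compression step, assuming h is
 * the 256-bit chaining value and m the 256-bit message block, each as eight
 * uint32_t. The function name is hypothetical.
 */
static void gost_compress_sketch (uint32_t h[8], const uint32_t m[8])
{
  uint32_t s[8];
  uint32_t u[8];
  uint32_t v[8];

  int i;

  for (i = 0; i < 8; i++) u[i] = h[i];
  for (i = 0; i < 8; i++) v[i] = m[i];

  PASS0 (h, s, u, v); /* keys from u ^ v, encrypt h[0..1] into s[0..1]      */
  PASS2 (h, s, u, v); /* same for h[2..3], constant C_3 applied to u        */
  PASS4 (h, s, u, v); /* same for h[4..5]                                   */
  PASS6 (h, s, u, v); /* same for h[6..7], no further key-schedule update   */

  SHIFT12 (u, m, s);  /* output transformation: LFSR mixing, fold in m      */
  SHIFT16 (h, v, u);  /* one more LFSR step, fold in the old h              */
  SHIFT61 (h, v);     /* final mixing, produces the new chaining value in h */
}
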

////////////////////////////////////
// FUCKING SMART XOR MACROS START //
////////////////////////////////////
#define XOR10(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9)                 \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(x0, x1),   \
                                                    _mm_xor_si128(x2, x3)),  \
                                      _mm_xor_si128(_mm_xor_si128(x4, x5),   \
                                                    _mm_xor_si128(x6, x7))), \
                        _mm_xor_si128(x8, x9));

#define XOR11(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10)            \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(x0, x1),   \
                                                    _mm_xor_si128(x2, x3)),  \
                                      _mm_xor_si128(_mm_xor_si128(x4, x5),   \
                                                    _mm_xor_si128(x6, x7))), \
                        _mm_xor_si128(_mm_xor_si128(x8, x9),                 \
                                      x10));
#define XOR13(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12)    \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),   \
                                                    _mm_xor_si128( x2,  x3)),  \
                                      _mm_xor_si128(_mm_xor_si128( x4,  x5),   \
                                                    _mm_xor_si128( x6,  x7))), \
                        _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),   \
                                                    _mm_xor_si128(x10, x11)),  \
                                      x12))

#define XOR14(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),     \
                                                    _mm_xor_si128( x2,  x3)),    \
                                      _mm_xor_si128(_mm_xor_si128( x4,  x5),     \
                                                    _mm_xor_si128( x6,  x7))),   \
                        _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),     \
                                                    _mm_xor_si128(x10, x11)),    \
                                      _mm_xor_si128(x12, x13)));

#define XOR15(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),          \
                                                    _mm_xor_si128( x2,  x3)),         \
                                      _mm_xor_si128(_mm_xor_si128( x4,  x5),          \
                                                    _mm_xor_si128( x6,  x7))),        \
                        _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),          \
                                                    _mm_xor_si128(x10, x11)),         \
                                      _mm_xor_si128(_mm_xor_si128(x12, x13),          \
                                                    x14)));

#define XOR16(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),               \
                                                    _mm_xor_si128( x2,  x3)),              \
                                      _mm_xor_si128(_mm_xor_si128( x4,  x5),               \
                                                    _mm_xor_si128( x6,  x7))),             \
                        _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),               \
                                                    _mm_xor_si128(x10, x11)),              \
                                      _mm_xor_si128(_mm_xor_si128(x12, x13),               \
                                                    _mm_xor_si128(x14, x15))));

#define XOR17(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),      \
                                                                  _mm_xor_si128( x2,  x3)),     \
                                                    _mm_xor_si128(_mm_xor_si128( x4,  x5),      \
                                                                  _mm_xor_si128( x6,  x7))),    \
                                      _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),      \
                                                                  _mm_xor_si128(x10, x11)),     \
                                                    _mm_xor_si128(_mm_xor_si128(x12, x13),      \
                                                                  _mm_xor_si128(x14, x15)))),   \
                        x16);

#define XOR19(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),                \
                                                                  _mm_xor_si128( x2,  x3)),               \
                                                    _mm_xor_si128(_mm_xor_si128( x4,  x5),                \
                                                                  _mm_xor_si128( x6,  x7))),              \
                                      _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),                \
                                                                  _mm_xor_si128(x10, x11)),               \
                                                    _mm_xor_si128(_mm_xor_si128(x12, x13),                \
                                                                  _mm_xor_si128(x14, x15)))),             \
                        _mm_xor_si128(_mm_xor_si128(x16, x17),                                            \
                                      x18))

#define XOR20(store, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19) \
  store = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x0,  x1),                     \
                                                                  _mm_xor_si128( x2,  x3)),                    \
                                                    _mm_xor_si128(_mm_xor_si128( x4,  x5),                     \
                                                                  _mm_xor_si128( x6,  x7))),                   \
                                      _mm_xor_si128(_mm_xor_si128(_mm_xor_si128( x8,  x9),                     \
                                                                  _mm_xor_si128(x10, x11)),                    \
                                                    _mm_xor_si128(_mm_xor_si128(x12, x13),                     \
                                                                  _mm_xor_si128(x14, x15)))),                  \
                        _mm_xor_si128(_mm_xor_si128(x16, x17),                                                 \
                                      _mm_xor_si128(x18, x19)));

//////////////////////////////////
// FUCKING SMART XOR MACROS END //
//////////////////////////////////


////////////////////////
// SSE2 DEFINES START //
////////////////////////

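/*
 * The _SSE2 variants below process four independent GOST states at once: each
 * __m128i holds the same 32-bit state word for four parallel computations.
 * Only the XOR/AND/shift layers vectorize; SSE2 has no gather instruction, so
 * the round macros that follow drop to per-lane scalar table lookups through
 * pointer casts into the vector registers.
 */
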
#define round0_SSE2(k1,k2)               \
{                                        \
  uint32_t t;                            \
  uint32_t *_k1 = (uint32_t *)&k1;       \
  uint32_t *_k2 = (uint32_t *)&k2;       \
  uint32_t *_l  = (uint32_t *)&l;        \
  uint32_t *_r  = (uint32_t *)&r;        \
  t = (_k1[0]) + _r[0];                  \
  _l[0] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
  t = (_k2[0]) + _l[0];                  \
  _r[0] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
}

#define round1_SSE2(k1,k2)               \
{                                        \
  uint32_t t;                            \
  uint32_t *_k1 = (uint32_t *)&k1;       \
  uint32_t *_k2 = (uint32_t *)&k2;       \
  uint32_t *_l  = (uint32_t *)&l;        \
  uint32_t *_r  = (uint32_t *)&r;        \
  t = (_k1[1]) + _r[1];                  \
  _l[1] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
  t = (_k2[1]) + _l[1];                  \
  _r[1] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
}

#define round2_SSE2(k1,k2)               \
{                                        \
  uint32_t t;                            \
  uint32_t *_k1 = (uint32_t *)&k1;       \
  uint32_t *_k2 = (uint32_t *)&k2;       \
  uint32_t *_l  = (uint32_t *)&l;        \
  uint32_t *_r  = (uint32_t *)&r;        \
  t = (_k1[2]) + _r[2];                  \
  _l[2] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
  t = (_k2[2]) + _l[2];                  \
  _r[2] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
}

#define round3_SSE2(k1,k2)               \
{                                        \
  uint32_t t;                            \
  uint32_t *_k1 = (uint32_t *)&k1;       \
  uint32_t *_k2 = (uint32_t *)&k2;       \
  uint32_t *_l  = (uint32_t *)&l;        \
  uint32_t *_r  = (uint32_t *)&r;        \
  t = (_k1[3]) + _r[3];                  \
  _l[3] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
  t = (_k2[3]) + _l[3];                  \
  _r[3] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
}

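/*
 * The four macros above differ only in the lane index they touch. For
 * illustration only (nothing below uses it), they could be collapsed into one
 * lane-parameterized form; the name roundN_SSE2 and the parameter n (0..3)
 * are hypothetical.
 */
#define roundN_SSE2(k1,k2,n)             \
{                                        \
  uint32_t t;                            \
  uint32_t *_k1 = (uint32_t *)&k1;       \
  uint32_t *_k2 = (uint32_t *)&k2;       \
  uint32_t *_l  = (uint32_t *)&l;        \
  uint32_t *_r  = (uint32_t *)&r;        \
  t = (_k1[n]) + _r[n];                  \
  _l[n] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
  t = (_k2[n]) + _l[n];                  \
  _r[n] ^= tables[0][(t >>  0) & 0xff] ^ \
           tables[1][(t >>  8) & 0xff] ^ \
           tables[2][(t >> 16) & 0xff] ^ \
           tables[3][(t >> 24) & 0xff];  \
}
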
#define R_SSE2(k,h,s,i)     \
{                           \
  __m128i r;                \
  __m128i l;                \
  r = h[i + 0];             \
  l = h[i + 1];             \
  round0_SSE2 (k[0], k[1]); \
  round1_SSE2 (k[0], k[1]); \
  round2_SSE2 (k[0], k[1]); \
  round3_SSE2 (k[0], k[1]); \
  round0_SSE2 (k[2], k[3]); \
  round1_SSE2 (k[2], k[3]); \
  round2_SSE2 (k[2], k[3]); \
  round3_SSE2 (k[2], k[3]); \
  round0_SSE2 (k[4], k[5]); \
  round1_SSE2 (k[4], k[5]); \
  round2_SSE2 (k[4], k[5]); \
  round3_SSE2 (k[4], k[5]); \
  round0_SSE2 (k[6], k[7]); \
  round1_SSE2 (k[6], k[7]); \
  round2_SSE2 (k[6], k[7]); \
  round3_SSE2 (k[6], k[7]); \
  round0_SSE2 (k[0], k[1]); \
  round1_SSE2 (k[0], k[1]); \
  round2_SSE2 (k[0], k[1]); \
  round3_SSE2 (k[0], k[1]); \
  round0_SSE2 (k[2], k[3]); \
  round1_SSE2 (k[2], k[3]); \
  round2_SSE2 (k[2], k[3]); \
  round3_SSE2 (k[2], k[3]); \
  round0_SSE2 (k[4], k[5]); \
  round1_SSE2 (k[4], k[5]); \
  round2_SSE2 (k[4], k[5]); \
  round3_SSE2 (k[4], k[5]); \
  round0_SSE2 (k[6], k[7]); \
  round1_SSE2 (k[6], k[7]); \
  round2_SSE2 (k[6], k[7]); \
  round3_SSE2 (k[6], k[7]); \
  round0_SSE2 (k[0], k[1]); \
  round1_SSE2 (k[0], k[1]); \
  round2_SSE2 (k[0], k[1]); \
  round3_SSE2 (k[0], k[1]); \
  round0_SSE2 (k[2], k[3]); \
  round1_SSE2 (k[2], k[3]); \
  round2_SSE2 (k[2], k[3]); \
  round3_SSE2 (k[2], k[3]); \
  round0_SSE2 (k[4], k[5]); \
  round1_SSE2 (k[4], k[5]); \
  round2_SSE2 (k[4], k[5]); \
  round3_SSE2 (k[4], k[5]); \
  round0_SSE2 (k[6], k[7]); \
  round1_SSE2 (k[6], k[7]); \
  round2_SSE2 (k[6], k[7]); \
  round3_SSE2 (k[6], k[7]); \
  round0_SSE2 (k[7], k[6]); \
  round1_SSE2 (k[7], k[6]); \
  round2_SSE2 (k[7], k[6]); \
  round3_SSE2 (k[7], k[6]); \
  round0_SSE2 (k[5], k[4]); \
  round1_SSE2 (k[5], k[4]); \
  round2_SSE2 (k[5], k[4]); \
  round3_SSE2 (k[5], k[4]); \
  round0_SSE2 (k[3], k[2]); \
  round1_SSE2 (k[3], k[2]); \
  round2_SSE2 (k[3], k[2]); \
  round3_SSE2 (k[3], k[2]); \
  round0_SSE2 (k[1], k[0]); \
  round1_SSE2 (k[1], k[0]); \
  round2_SSE2 (k[1], k[0]); \
  round3_SSE2 (k[1], k[0]); \
  s[i + 0] = l;             \
  s[i + 1] = r;             \
}

#define X_SSE2(w,u,v)               \
  w[0] = _mm_xor_si128(u[0], v[0]); \
  w[1] = _mm_xor_si128(u[1], v[1]); \
  w[2] = _mm_xor_si128(u[2], v[2]); \
  w[3] = _mm_xor_si128(u[3], v[3]); \
  w[4] = _mm_xor_si128(u[4], v[4]); \
  w[5] = _mm_xor_si128(u[5], v[5]); \
  w[6] = _mm_xor_si128(u[6], v[6]); \
  w[7] = _mm_xor_si128(u[7], v[7]);

#define P_SSE2(k,w)                                                                                             \
  k[0] = _mm_or_si128(_mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x000000ff)),  0),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x000000ff)),  8) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x000000ff)), 16),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x000000ff)), 24) ) ); \
  k[1] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x0000ff00)),  8),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x0000ff00)),  0) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x0000ff00)),  8),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x0000ff00)), 16) ) ); \
  k[2] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0x00ff0000)), 16),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0x00ff0000)),  8) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0x00ff0000)),  0),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0x00ff0000)),  8) ) ); \
  k[3] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[0], _mm_set1_epi32 (0xff000000)), 24),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[2], _mm_set1_epi32 (0xff000000)), 16) ),   \
                      _mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[4], _mm_set1_epi32 (0xff000000)),  8),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[6], _mm_set1_epi32 (0xff000000)),  0) ) ); \
  k[4] = _mm_or_si128(_mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x000000ff)),  0),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x000000ff)),  8) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x000000ff)), 16),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x000000ff)), 24) ) ); \
  k[5] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x0000ff00)),  8),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x0000ff00)),  0) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x0000ff00)),  8),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x0000ff00)), 16) ) ); \
  k[6] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0x00ff0000)), 16),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0x00ff0000)),  8) ),   \
                      _mm_or_si128( _mm_slli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0x00ff0000)),  0),     \
                                    _mm_slli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0x00ff0000)),  8) ) ); \
  k[7] = _mm_or_si128(_mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[1], _mm_set1_epi32 (0xff000000)), 24),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[3], _mm_set1_epi32 (0xff000000)), 16) ),   \
                      _mm_or_si128( _mm_srli_epi32( _mm_and_si128 (w[5], _mm_set1_epi32 (0xff000000)),  8),     \
                                    _mm_srli_epi32( _mm_and_si128 (w[7], _mm_set1_epi32 (0xff000000)),  0) ) );

#define A_SSE2(x)                \
{                                \
  __m128i l;                     \
  __m128i r;                     \
  l = _mm_xor_si128(x[0], x[2]); \
  r = _mm_xor_si128(x[1], x[3]); \
  x[0] = x[2];                   \
  x[1] = x[3];                   \
  x[2] = x[4];                   \
  x[3] = x[5];                   \
  x[4] = x[6];                   \
  x[5] = x[7];                   \
  x[6] = l;                      \
  x[7] = r;                      \
}

#define AA_SSE2(x)               \
{                                \
  __m128i l;                     \
  __m128i r;                     \
  l    = x[0];                   \
  r    = x[2];                   \
  x[0] = x[4];                   \
  x[2] = x[6];                   \
  x[4] = _mm_xor_si128(l, r);    \
  x[6] = _mm_xor_si128(x[0], r); \
  l    = x[1];                   \
  r    = x[3];                   \
  x[1] = x[5];                   \
  x[3] = x[7];                   \
  x[5] = _mm_xor_si128(l, r);    \
  x[7] = _mm_xor_si128(x[1], r); \
}

#define C_SSE2(x)                                         \
  x[0] = _mm_xor_si128(x[0], _mm_set1_epi32(0xff00ff00)); \
  x[1] = _mm_xor_si128(x[1], _mm_set1_epi32(0xff00ff00)); \
  x[2] = _mm_xor_si128(x[2], _mm_set1_epi32(0x00ff00ff)); \
  x[3] = _mm_xor_si128(x[3], _mm_set1_epi32(0x00ff00ff)); \
  x[4] = _mm_xor_si128(x[4], _mm_set1_epi32(0x00ffff00)); \
  x[5] = _mm_xor_si128(x[5], _mm_set1_epi32(0xff0000ff)); \
  x[6] = _mm_xor_si128(x[6], _mm_set1_epi32(0x000000ff)); \
  x[7] = _mm_xor_si128(x[7], _mm_set1_epi32(0xff00ffff));

1015 #define SHIFT12_SSE2(u,m,s)                               \
1016   u[0] = _mm_xor_si128(m[0], s[6]);                       \
1017   u[1] = _mm_xor_si128(m[1], s[7]);                       \
1018   XOR11(u[2],                                             \
1019         m[2],                                             \
1020         _mm_slli_epi32(s[0], 16),                         \
1021         _mm_srli_epi32(s[0], 16),                         \
1022         _mm_and_si128(s[0], _mm_set1_epi32(0x0000ffff)),  \
1023         _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)),  \
1024         _mm_srli_epi32(s[1], 16),                         \
1025         _mm_slli_epi32(s[2], 16),                         \
1026         s[6],                                             \
1027         _mm_slli_epi32(s[6], 16),                         \
1028         _mm_and_si128(s[7], _mm_set1_epi32(0xffff0000)),  \
1029         _mm_srli_epi32(s[7], 16));                        \
1030   XOR15(u[3],                                             \
1031         m[3],                                             \
1032         _mm_and_si128(s[0], _mm_set1_epi32(0x0000ffff)),  \
1033         _mm_slli_epi32(s[0], 16),                         \
1034         _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)),  \
1035         _mm_slli_epi32(s[1], 16),                         \
1036         _mm_srli_epi32(s[1], 16),                         \
1037         _mm_slli_epi32(s[2], 16),                         \
1038         _mm_srli_epi32(s[2], 16),                         \
1039         _mm_slli_epi32(s[3], 16),                         \
1040         s[6],                                             \
1041         _mm_slli_epi32(s[6], 16),                         \
1042         _mm_srli_epi32(s[6], 16),                         \
1043         _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)),  \
1044         _mm_slli_epi32(s[7], 16),                         \
1045         _mm_srli_epi32(s[7], 16));                        \
1046   XOR16(u[4],                                             \
1047         m[4],                                             \
1048         _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)),  \
1049         _mm_slli_epi32(s[0], 16),                         \
1050         _mm_srli_epi32(s[0], 16),                         \
1051         _mm_and_si128(s[1], _mm_set1_epi32(0xffff0000)),  \
1052         _mm_srli_epi32(s[1], 16),                         \
1053         _mm_slli_epi32(s[2], 16),                         \
1054         _mm_srli_epi32(s[2], 16),                         \
1055         _mm_slli_epi32(s[3], 16),                         \
1056         _mm_srli_epi32(s[3], 16),                         \
1057         _mm_slli_epi32(s[4], 16),                         \
1058         _mm_slli_epi32(s[6], 16),                         \
1059         _mm_srli_epi32(s[6], 16),                         \
1060         _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)),  \
1061         _mm_slli_epi32(s[7], 16),                         \
1062         _mm_srli_epi32(s[7], 16));                        \
1063   XOR17(u[5],                                             \
1064         m[5],                                             \
1065         _mm_slli_epi32(s[0], 16),                         \
1066         _mm_srli_epi32(s[0], 16),                         \
1067         _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)),  \
1068         _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)),  \
1069         s[2],                                             \
1070         _mm_srli_epi32(s[2], 16),                         \
1071         _mm_slli_epi32(s[3], 16),                         \
1072         _mm_srli_epi32(s[3], 16),                         \
1073         _mm_slli_epi32(s[4], 16),                         \
1074         _mm_srli_epi32(s[4], 16),                         \
1075         _mm_slli_epi32(s[5], 16),                         \
1076         _mm_slli_epi32(s[6], 16),                         \
1077         _mm_srli_epi32(s[6], 16),                         \
1078         _mm_and_si128(s[7], _mm_set1_epi32(0xffff0000)),  \
1079         _mm_slli_epi32(s[7], 16),                         \
1080         _mm_srli_epi32(s[7], 16));                        \
1081   XOR14(u[6],                                             \
1082         m[6],                                             \
1083         s[0],                                             \
1084         _mm_srli_epi32(s[1], 16),                         \
1085         _mm_slli_epi32(s[2], 16),                         \
1086         s[3],                                             \
1087         _mm_srli_epi32(s[3], 16),                         \
1088         _mm_slli_epi32(s[4], 16),                         \
1089         _mm_srli_epi32(s[4], 16),                         \
1090         _mm_slli_epi32(s[5], 16),                         \
1091         _mm_srli_epi32(s[5], 16),                         \
1092         s[6],                                             \
1093         _mm_slli_epi32(s[6], 16),                         \
1094         _mm_srli_epi32(s[6], 16),                         \
1095         _mm_slli_epi32(s[7], 16));                        \
1096   XOR15(u[7],                                             \
1097         m[7],                                             \
1098         _mm_and_si128(s[0], _mm_set1_epi32(0xffff0000)),  \
1099         _mm_slli_epi32(s[0], 16),                         \
1100         _mm_and_si128(s[1], _mm_set1_epi32(0x0000ffff)),  \
1101         _mm_slli_epi32(s[1], 16),                         \
1102         _mm_srli_epi32(s[2], 16),                         \
1103         _mm_slli_epi32(s[3], 16),                         \
1104         s[4],                                             \
1105         _mm_srli_epi32(s[4], 16),                         \
1106         _mm_slli_epi32(s[5], 16),                         \
1107         _mm_srli_epi32(s[5], 16),                         \
1108         _mm_srli_epi32(s[6], 16),                         \
1109         _mm_and_si128(s[7], _mm_set1_epi32(0x0000ffff)),  \
1110         _mm_slli_epi32(s[7], 16),                         \
1111         _mm_srli_epi32(s[7], 16));
1112 
1113 #define SHIFT16_SSE2(h,v,u)                                                                             \
1114   v[0] = _mm_xor_si128( _mm_xor_si128(h[0],_mm_slli_epi32(u[1], 16)),                                   \
1115                         _mm_srli_epi32(u[0], 16));                                                      \
1116   v[1] = _mm_xor_si128(_mm_xor_si128( h[1], _mm_slli_epi32(u[2], 16)),                                  \
1117                         _mm_srli_epi32(u[1], 16));                                                      \
1118   v[2] = _mm_xor_si128(_mm_xor_si128( h[2], _mm_slli_epi32(u[3], 16)),                                  \
1119                         _mm_srli_epi32(u[2], 16));                                                      \
1120   v[3] = _mm_xor_si128(_mm_xor_si128( h[3], _mm_slli_epi32(u[4], 16)),                                  \
1121                         _mm_srli_epi32(u[3], 16));                                                      \
1122   v[4] = _mm_xor_si128(_mm_xor_si128( h[4], _mm_slli_epi32(u[5], 16)),                                  \
1123                         _mm_srli_epi32(u[4], 16));                                                      \
1124   v[5] = _mm_xor_si128(_mm_xor_si128( h[5], _mm_slli_epi32(u[6], 16)),                                  \
1125                         _mm_srli_epi32(u[5], 16));                                                      \
1126   v[6] = _mm_xor_si128(_mm_xor_si128( h[6], _mm_slli_epi32(u[7], 16)),                                  \
1127                         _mm_srli_epi32(u[6], 16));                                                      \
1128   v[7] =  _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(h[7],                                               \
1129                                                     _mm_and_si128(u[0], _mm_set1_epi32(0xffff0000))),   \
1130                                       _mm_xor_si128(_mm_slli_epi32(u[0], 16),                           \
1131                                                     _mm_srli_epi32(u[7], 16))),                         \
1132                         _mm_xor_si128(_mm_xor_si128(_mm_and_si128(u[1], _mm_set1_epi32(0xffff0000)),    \
1133                                                     _mm_slli_epi32(u[1], 16)),                          \
1134                                       _mm_xor_si128(_mm_slli_epi32(u[6], 16),                           \
1135                                                     _mm_and_si128(u[7], _mm_set1_epi32(0xffff0000)))));
1136 
#define SHIFT61_SSE2(h,v)                                 \
  XOR14(h[0],                                             \
        _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)),  \
        _mm_slli_epi32(v[0], 16),                         \
        _mm_srli_epi32(v[0], 16),                         \
        _mm_srli_epi32(v[1], 16),                         \
        _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)),  \
        _mm_slli_epi32(v[2], 16),                         \
        _mm_srli_epi32(v[3], 16),                         \
        _mm_slli_epi32(v[4], 16),                         \
        _mm_srli_epi32(v[5], 16),                         \
        v[5],                                             \
        _mm_srli_epi32(v[6], 16),                         \
        _mm_slli_epi32(v[7], 16),                         \
        _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)),  \
        _mm_srli_epi32(v[7], 16));                        \
  XOR13(h[1],                                             \
        _mm_slli_epi32(v[0], 16),                         \
        _mm_srli_epi32(v[0], 16),                         \
        _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)),  \
        _mm_and_si128(v[1], _mm_set1_epi32(0x0000ffff)),  \
        _mm_srli_epi32(v[2], 16),                         \
        v[2],                                             \
        _mm_slli_epi32(v[3], 16),                         \
        _mm_srli_epi32(v[4], 16),                         \
        _mm_slli_epi32(v[5], 16),                         \
        _mm_slli_epi32(v[6], 16),                         \
        _mm_and_si128(v[7], _mm_set1_epi32(0xffff0000)),  \
        v[6],                                             \
        _mm_srli_epi32(v[7], 16));                        \
  XOR15(h[2],                                             \
        _mm_and_si128(v[0], _mm_set1_epi32(0x0000ffff)),  \
        _mm_slli_epi32(v[0], 16),                         \
        _mm_slli_epi32(v[1], 16),                         \
        _mm_srli_epi32(v[1], 16),                         \
        _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)),  \
        _mm_slli_epi32(v[2], 16),                         \
        _mm_srli_epi32(v[3], 16),                         \
        v[3],                                             \
        _mm_slli_epi32(v[4], 16),                         \
        _mm_srli_epi32(v[5], 16),                         \
        _mm_srli_epi32(v[6], 16),                         \
        v[6],                                             \
        _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)),  \
        _mm_slli_epi32(v[7], 16),                         \
        _mm_srli_epi32(v[7], 16));                        \
  XOR15(h[3],                                             \
        _mm_slli_epi32(v[0], 16),                         \
        _mm_srli_epi32(v[0], 16),                         \
        _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)),  \
        _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)),  \
        _mm_srli_epi32(v[1], 16),                         \
        _mm_slli_epi32(v[2], 16),                         \
        _mm_srli_epi32(v[2], 16),                         \
        v[2],                                             \
        _mm_slli_epi32(v[3], 16),                         \
        _mm_srli_epi32(v[4], 16),                         \
        _mm_slli_epi32(v[5], 16),                         \
        v[4],                                             \
        _mm_and_si128(v[7], _mm_set1_epi32(0x0000ffff)),  \
        _mm_slli_epi32(v[6], 16),                         \
        _mm_srli_epi32(v[7], 16));                        \
  XOR14(h[4],                                             \
        _mm_srli_epi32(v[0], 16),                         \
        _mm_slli_epi32(v[1], 16),                         \
        _mm_srli_epi32(v[2], 16),                         \
        v[1],                                             \
        _mm_slli_epi32(v[3], 16),                         \
        v[2],                                             \
        _mm_srli_epi32(v[3], 16),                         \
        v[3],                                             \
        _mm_slli_epi32(v[4], 16),                         \
        _mm_srli_epi32(v[5], 16),                         \
        _mm_slli_epi32(v[6], 16),                         \
        v[5],                                             \
        _mm_srli_epi32(v[6], 16),                         \
        _mm_slli_epi32(v[7], 16));                        \
  XOR19(h[5],                                             \
        _mm_and_si128(v[0], _mm_set1_epi32(0xffff0000)),  \
        _mm_slli_epi32(v[0], 16),                         \
        _mm_slli_epi32(v[1], 16),                         \
        _mm_srli_epi32(v[1], 16),                         \
        _mm_and_si128(v[1], _mm_set1_epi32(0xffff0000)),  \
        _mm_slli_epi32(v[2], 16),                         \
        _mm_srli_epi32(v[3], 16),                         \
        v[2],                                             \
        _mm_slli_epi32(v[4], 16),                         \
        v[3],                                             \
        _mm_srli_epi32(v[4], 16),                         \
        v[4],                                             \
        _mm_slli_epi32(v[5], 16),                         \
        _mm_slli_epi32(v[6], 16),                         \
        _mm_srli_epi32(v[6], 16),                         \
        v[6],                                             \
        _mm_slli_epi32(v[7], 16),                         \
        _mm_srli_epi32(v[7], 16),                         \
        _mm_and_si128(v[7], _mm_set1_epi32(0xffff0000))); \
  XOR15(h[6],                                             \
        v[0],                                             \
        v[2],                                             \
        _mm_srli_epi32(v[2], 16),                         \
        v[3],                                             \
        _mm_slli_epi32(v[3], 16),                         \
        v[4],                                             \
        _mm_srli_epi32(v[4], 16),                         \
        _mm_slli_epi32(v[5], 16),                         \
        _mm_srli_epi32(v[5], 16),                         \
        v[5],                                             \
        _mm_slli_epi32(v[6], 16),                         \
        _mm_srli_epi32(v[6], 16),                         \
        _mm_slli_epi32(v[7], 16),                         \
        v[6],                                             \
        v[7]);                                            \
  XOR15(h[7],                                             \
        _mm_srli_epi32(v[0], 16),                         \
        v[0],                                             \
        _mm_slli_epi32(v[1], 16),                         \
        _mm_srli_epi32(v[1], 16),                         \
        _mm_slli_epi32(v[2], 16),                         \
        _mm_srli_epi32(v[3], 16),                         \
        _mm_slli_epi32(v[4], 16),                         \
        v[3],                                             \
        _mm_srli_epi32(v[5], 16),                         \
        v[4],                                             \
        _mm_slli_epi32(v[6], 16),                         \
        v[5],                                             \
        _mm_srli_epi32(v[6], 16),                         \
        _mm_slli_epi32(v[7], 16),                         \
        v[7]);

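/*
 * PASS{0,2,4,6}_SSE2 each run one quarter of the GOST compression function on
 * the vectorized state: X_SSE2 combines u and v, P_SSE2 applies the P
 * permutation to form a round key, R_SSE2 performs the encryption step for
 * key index 0/2/4/6, and A_SSE2/C_SSE2/AA_SSE2 advance u and v for the next
 * pass. The helpers are defined earlier in this file; this summary is
 * inferred from their call pattern.
 */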
#define PASS0_SSE2(h,s,u,v)  \
{                            \
  __m128i k[8];              \
  __m128i w[8];              \
  X_SSE2 (w, u, v);          \
  P_SSE2 (k, w);             \
  R_SSE2 (k, h, s, 0);       \
  A_SSE2 (u);                \
  AA_SSE2 (v);               \
}

#define PASS2_SSE2(h,s,u,v)  \
{                            \
  __m128i k[8];              \
  __m128i w[8];              \
  X_SSE2 (w, u, v);          \
  P_SSE2 (k, w);             \
  R_SSE2 (k, h, s, 2);       \
  A_SSE2 (u);                \
  C_SSE2 (u);                \
  AA_SSE2 (v);               \
}

#define PASS4_SSE2(h,s,u,v)  \
{                            \
  __m128i k[8];              \
  __m128i w[8];              \
  X_SSE2 (w, u, v);          \
  P_SSE2 (k, w);             \
  R_SSE2 (k, h, s, 4);       \
  A_SSE2 (u);                \
  AA_SSE2 (v);               \
}

#define PASS6_SSE2(h,s,u,v)  \
{                            \
  __m128i k[8];              \
  __m128i w[8];              \
  X_SSE2 (w, u, v);          \
  P_SSE2 (k, w);             \
  R_SSE2 (k, h, s, 6);       \
}


//////////////////////
// SSE2 DEFINES END //
//////////////////////


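/*
 * hashcat_gost_64: scalar path. Four independent GOST R 34.11-94 digests are
 * computed per call; lane `id` selects one 32-bit slot from every blocks[] /
 * digests[] entry. Three compressions are performed per hash (message block,
 * then what appears to be the length block, then the checksum block),
 * matching the SSE2 path below.
 */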
void hashcat_gost_64 (uint32_t digests[8][4], uint32_t blocks[16][4])
{
  /**
   * base
   */

  int id;

  for (id = 0; id < 4; id++)
  {
    uint32_t data[8];

    data[0] = blocks[0][id];
    data[1] = blocks[1][id];
    data[2] = blocks[2][id];
    data[3] = blocks[3][id];
    data[4] = blocks[4][id];
    data[5] = blocks[5][id];
    data[6] = blocks[6][id];
    data[7] = blocks[7][id];

    uint32_t state[16];

    state[ 0] = 0;
    state[ 1] = 0;
    state[ 2] = 0;
    state[ 3] = 0;
    state[ 4] = 0;
    state[ 5] = 0;
    state[ 6] = 0;
    state[ 7] = 0;
    state[ 8] = data[0];
    state[ 9] = data[1];
    state[10] = data[2];
    state[11] = data[3];
    state[12] = data[4];
    state[13] = data[5];
    state[14] = data[6];
    state[15] = data[7];

    uint32_t state_m[8];
    uint32_t data_m[8];

    /* gost1 */

    state_m[0] = state[0];
    state_m[1] = state[1];
    state_m[2] = state[2];
    state_m[3] = state[3];
    state_m[4] = state[4];
    state_m[5] = state[5];
    state_m[6] = state[6];
    state_m[7] = state[7];

    data_m[0] = data[0];
    data_m[1] = data[1];
    data_m[2] = data[2];
    data_m[3] = data[3];
    data_m[4] = data[4];
    data_m[5] = data[5];
    data_m[6] = data[6];
    data_m[7] = data[7];

    uint32_t tmp[8];

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

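    /* Second compression: blocks[15] appears to hold the message bit length
       for this single-block input; the remaining data words are zero. */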
    data[0] = blocks[15][id];
    data[1] = 0;
    data[2] = 0;
    data[3] = 0;
    data[4] = 0;
    data[5] = 0;
    data[6] = 0;
    data[7] = 0;

    /* gost2 */

    state_m[0] = state[0];
    state_m[1] = state[1];
    state_m[2] = state[2];
    state_m[3] = state[3];
    state_m[4] = state[4];
    state_m[5] = state[5];
    state_m[6] = state[6];
    state_m[7] = state[7];

    data_m[0] = data[0];
    data_m[1] = data[1];
    data_m[2] = data[2];
    data_m[3] = data[3];
    data_m[4] = data[4];
    data_m[5] = data[5];
    data_m[6] = data[6];
    data_m[7] = data[7];

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

    /* gost3 */

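    /* Third compression: state[8..15] still holds the original message block,
       which for a one-block input equals the GOST checksum. */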
    data[0] = state[ 8];
    data[1] = state[ 9];
    data[2] = state[10];
    data[3] = state[11];
    data[4] = state[12];
    data[5] = state[13];
    data[6] = state[14];
    data[7] = state[15];

    state_m[0] = state[0];
    state_m[1] = state[1];
    state_m[2] = state[2];
    state_m[3] = state[3];
    state_m[4] = state[4];
    state_m[5] = state[5];
    state_m[6] = state[6];
    state_m[7] = state[7];

    data_m[0] = data[0];
    data_m[1] = data[1];
    data_m[2] = data[2];
    data_m[3] = data[3];
    data_m[4] = data[4];
    data_m[5] = data[5];
    data_m[6] = data[6];
    data_m[7] = data[7];

    PASS0 (state, tmp, state_m, data_m);
    PASS2 (state, tmp, state_m, data_m);
    PASS4 (state, tmp, state_m, data_m);
    PASS6 (state, tmp, state_m, data_m);

    SHIFT12 (state_m, data, tmp);
    SHIFT16 (state, data_m, state_m);
    SHIFT61 (state, data_m);

    /* store */

    digests[0][id] = state[0];
    digests[1][id] = state[1];
    digests[2][id] = state[2];
    digests[3][id] = state[3];
    digests[4][id] = state[4];
    digests[5][id] = state[5];
    digests[6][id] = state[6];
    digests[7][id] = state[7];

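    /* BYTESWAP converts each 32-bit digest word to the output byte order. */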
    BYTESWAP (digests[0][id]);
    BYTESWAP (digests[1][id]);
    BYTESWAP (digests[2][id]);
    BYTESWAP (digests[3][id]);
    BYTESWAP (digests[4][id]);
    BYTESWAP (digests[5][id]);
    BYTESWAP (digests[6][id]);
    BYTESWAP (digests[7][id]);
  }
}


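/*
 * hashcat_gost_64_sse2: same computation as hashcat_gost_64 above, but the
 * four parallel hashes live in the four 32-bit lanes of each __m128i, so the
 * whole state is advanced with SSE2 intrinsics instead of a per-lane loop.
 */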
void hashcat_gost_64_sse2 (__m128i digests[8], __m128i blocks[16])
{
  __m128i data[8];

  data[0] = blocks[0];
  data[1] = blocks[1];
  data[2] = blocks[2];
  data[3] = blocks[3];
  data[4] = blocks[4];
  data[5] = blocks[5];
  data[6] = blocks[6];
  data[7] = blocks[7];

  __m128i state[16];

  state[ 0] = _mm_set1_epi32 (0);
  state[ 1] = _mm_set1_epi32 (0);
  state[ 2] = _mm_set1_epi32 (0);
  state[ 3] = _mm_set1_epi32 (0);
  state[ 4] = _mm_set1_epi32 (0);
  state[ 5] = _mm_set1_epi32 (0);
  state[ 6] = _mm_set1_epi32 (0);
  state[ 7] = _mm_set1_epi32 (0);
  state[ 8] = data[0];
  state[ 9] = data[1];
  state[10] = data[2];
  state[11] = data[3];
  state[12] = data[4];
  state[13] = data[5];
  state[14] = data[6];
  state[15] = data[7];

  __m128i state_m[8];
  __m128i data_m[8];

  /* gost1 */

  state_m[0] = state[0];
  state_m[1] = state[1];
  state_m[2] = state[2];
  state_m[3] = state[3];
  state_m[4] = state[4];
  state_m[5] = state[5];
  state_m[6] = state[6];
  state_m[7] = state[7];

  data_m[0] = data[0];
  data_m[1] = data[1];
  data_m[2] = data[2];
  data_m[3] = data[3];
  data_m[4] = data[4];
  data_m[5] = data[5];
  data_m[6] = data[6];
  data_m[7] = data[7];

  __m128i tmp[8];

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

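  /* Second compression: blocks[15] appears to hold the message bit length. */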
  data[0] = blocks[15];
  data[1] = _mm_set1_epi32 (0);
  data[2] = _mm_set1_epi32 (0);
  data[3] = _mm_set1_epi32 (0);
  data[4] = _mm_set1_epi32 (0);
  data[5] = _mm_set1_epi32 (0);
  data[6] = _mm_set1_epi32 (0);
  data[7] = _mm_set1_epi32 (0);

  /* gost2 */

  state_m[0] = state[0];
  state_m[1] = state[1];
  state_m[2] = state[2];
  state_m[3] = state[3];
  state_m[4] = state[4];
  state_m[5] = state[5];
  state_m[6] = state[6];
  state_m[7] = state[7];

  data_m[0] = data[0];
  data_m[1] = data[1];
  data_m[2] = data[2];
  data_m[3] = data[3];
  data_m[4] = data[4];
  data_m[5] = data[5];
  data_m[6] = data[6];
  data_m[7] = data[7];

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

  /* gost3 */

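  /* Third compression: state[8..15] is the untouched message block, i.e. the
     checksum for a one-block input. */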
  data[0] = state[ 8];
  data[1] = state[ 9];
  data[2] = state[10];
  data[3] = state[11];
  data[4] = state[12];
  data[5] = state[13];
  data[6] = state[14];
  data[7] = state[15];

  state_m[0] = state[0];
  state_m[1] = state[1];
  state_m[2] = state[2];
  state_m[3] = state[3];
  state_m[4] = state[4];
  state_m[5] = state[5];
  state_m[6] = state[6];
  state_m[7] = state[7];

  data_m[0] = data[0];
  data_m[1] = data[1];
  data_m[2] = data[2];
  data_m[3] = data[3];
  data_m[4] = data[4];
  data_m[5] = data[5];
  data_m[6] = data[6];
  data_m[7] = data[7];

  PASS0_SSE2 (state, tmp, state_m, data_m);
  PASS2_SSE2 (state, tmp, state_m, data_m);
  PASS4_SSE2 (state, tmp, state_m, data_m);
  PASS6_SSE2 (state, tmp, state_m, data_m);

  SHIFT12_SSE2 (state_m, data, tmp);
  SHIFT16_SSE2 (state, data_m, state_m);
  SHIFT61_SSE2 (state, data_m);

  /* store */

  uint32_t * tmpA;

  digests[0] = state[0];
  digests[1] = state[1];
  digests[2] = state[2];
  digests[3] = state[3];
  digests[4] = state[4];
  digests[5] = state[5];
  digests[6] = state[6];
  digests[7] = state[7];

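  /* Byte-swap every 32-bit lane of each digest word in place; tmpA
     reinterprets the __m128i values as four uint32_t lanes. */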
  tmpA = (uint32_t *)&digests[0];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[1];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[2];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[3];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[4];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[5];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[6];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
  tmpA = (uint32_t *)&digests[7];
  BYTESWAP (tmpA[0]);
  BYTESWAP (tmpA[1]);
  BYTESWAP (tmpA[2]);
  BYTESWAP (tmpA[3]);
}