1/**
2 * Author......: See docs/credits.txt
3 * License.....: MIT
4 * NOTE........: sboxes for maxwell were taken from DeepLearningJohnDoe, license below
5 *             : sboxes for others were takes fron JtR, license below
6 */
7
8#ifdef KERNEL_STATIC
9#include "inc_vendor.h"
10#include "inc_types.h"
11#include "inc_platform.cl"
12#include "inc_common.cl"
13#endif
14
15#define COMPARE_S "inc_comp_single_bs.cl"
16#define COMPARE_M "inc_comp_multi_bs.cl"
17
18#ifdef IS_NV
19#define KXX_DECL
20#endif
21
22#if (defined IS_AMD || defined IS_HIP)
23#define KXX_DECL
24#endif
25
26#ifdef IS_GENERIC
27#define KXX_DECL
28#endif
29
30#ifdef IS_NV
31
32#if CUDA_ARCH >= 500
33
34//
35// Bitslice DES S-boxes with LOP3.LUT instructions
36// For NVIDIA Maxwell architecture and CUDA 7.5 RC
37// by DeepLearningJohnDoe, version 0.1.6, 2015/07/19
38//
39// Gate counts: 25 24 25 18 25 24 24 23
40// Average: 23.5
41// Depth: 8 7 7 6 8 10 10 8
42// Average: 8
43//
44// Note that same S-box function with a lower gate count isn't necessarily faster.
45//
46// These Boolean expressions corresponding to DES S-boxes were
47// discovered by <deeplearningjohndoe at gmail.com>
48//
49// This file itself is Copyright (c) 2015 by <deeplearningjohndoe at gmail.com>
50// Redistribution and use in source and binary forms, with or without
51// modification, are permitted.
52//
53// The underlying mathematical formulas are NOT copyrighted.
54//
55
56#define LUT(a,b,c,d,e) u32 a; asm ("lop3.b32 %0, %1, %2, %3, "#e";" : "=r"(a): "r"(b), "r"(c), "r"(d));
57
58DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
59{
60  LUT(xAA55AA5500550055, a1, a4, a6, 0xC1)
61  LUT(xA55AA55AF0F5F0F5, a3, a6, xAA55AA5500550055, 0x9E)
62  LUT(x5F5F5F5FA5A5A5A5, a1, a3, a6, 0xD6)
63  LUT(xF5A0F5A0A55AA55A, a4, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x56)
64  LUT(x947A947AD1E7D1E7, a2, xA55AA55AF0F5F0F5, xF5A0F5A0A55AA55A, 0x6C)
65  LUT(x5FFF5FFFFFFAFFFA, a6, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x7B)
66  LUT(xB96CB96C69936993, a2, xF5A0F5A0A55AA55A, x5FFF5FFFFFFAFFFA, 0xD6)
67  LUT(x3, a5, x947A947AD1E7D1E7, xB96CB96C69936993, 0x6A)
68  LUT(x55EE55EE55EE55EE, a1, a2, a4, 0x7A)
69  LUT(x084C084CB77BB77B, a2, a6, xF5A0F5A0A55AA55A, 0xC9)
70  LUT(x9C329C32E295E295, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x72)
71  LUT(xA51EA51E50E050E0, a3, a6, x55EE55EE55EE55EE, 0x29)
72  LUT(x4AD34AD3BE3CBE3C, a2, x947A947AD1E7D1E7, xA51EA51E50E050E0, 0x95)
73  LUT(x2, a5, x9C329C32E295E295, x4AD34AD3BE3CBE3C, 0xC6)
74  LUT(xD955D95595D195D1, a1, a2, x9C329C32E295E295, 0xD2)
75  LUT(x8058805811621162, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x90)
76  LUT(x7D0F7D0FC4B3C4B3, xA51EA51E50E050E0, xD955D95595D195D1, x8058805811621162, 0x76)
77  LUT(x0805080500010001, a3, xAA55AA5500550055, xD955D95595D195D1, 0x80)
78  LUT(x4A964A96962D962D, xB96CB96C69936993, x4AD34AD3BE3CBE3C, x0805080500010001, 0xA6)
79  LUT(x4, a5, x7D0F7D0FC4B3C4B3, x4A964A96962D962D, 0xA6)
80  LUT(x148014807B087B08, a1, xAA55AA5500550055, x947A947AD1E7D1E7, 0x21)
81  LUT(x94D894D86B686B68, xA55AA55AF0F5F0F5, x8058805811621162, x148014807B087B08, 0x6A)
82  LUT(x5555555540044004, a1, a6, x084C084CB77BB77B, 0x70)
83  LUT(xAFB4AFB4BF5BBF5B, x5F5F5F5FA5A5A5A5, xA51EA51E50E050E0, x5555555540044004, 0x97)
84  LUT(x1, a5, x94D894D86B686B68, xAFB4AFB4BF5BBF5B, 0x6C)
85
86  *out1 ^= x1;
87  *out2 ^= x2;
88  *out3 ^= x3;
89  *out4 ^= x4;
90}
91
92DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
93{
94  LUT(xEEEEEEEE99999999, a1, a2, a6, 0x97)
95  LUT(xFFFFEEEE66666666, a5, a6, xEEEEEEEE99999999, 0x67)
96  LUT(x5555FFFFFFFF0000, a1, a5, a6, 0x76)
97  LUT(x6666DDDD5555AAAA, a2, xFFFFEEEE66666666, x5555FFFFFFFF0000, 0x69)
98  LUT(x6969D3D35353ACAC, a3, xFFFFEEEE66666666, x6666DDDD5555AAAA, 0x6A)
99  LUT(xCFCF3030CFCF3030, a2, a3, a5, 0x65)
100  LUT(xE4E4EEEE9999F0F0, a3, xEEEEEEEE99999999, x5555FFFFFFFF0000, 0x8D)
101  LUT(xE5E5BABACDCDB0B0, a1, xCFCF3030CFCF3030, xE4E4EEEE9999F0F0, 0xCA)
102  LUT(x3, a4, x6969D3D35353ACAC, xE5E5BABACDCDB0B0, 0xC6)
103  LUT(x3333CCCC00000000, a2, a5, a6, 0x14)
104  LUT(xCCCCDDDDFFFF0F0F, a5, xE4E4EEEE9999F0F0, x3333CCCC00000000, 0xB5)
105  LUT(x00000101F0F0F0F0, a3, a6, xFFFFEEEE66666666, 0x1C)
106  LUT(x9A9A64646A6A9595, a1, xCFCF3030CFCF3030, x00000101F0F0F0F0, 0x96)
107  LUT(x2, a4, xCCCCDDDDFFFF0F0F, x9A9A64646A6A9595, 0x6A)
108  LUT(x3333BBBB3333FFFF, a1, a2, x6666DDDD5555AAAA, 0xDE)
109  LUT(x1414141441410000, a1, a3, xE4E4EEEE9999F0F0, 0x90)
110  LUT(x7F7FF3F3F5F53939, x6969D3D35353ACAC, x9A9A64646A6A9595, x3333BBBB3333FFFF, 0x79)
111  LUT(x9494E3E34B4B3939, a5, x1414141441410000, x7F7FF3F3F5F53939, 0x29)
112  LUT(x1, a4, x3333BBBB3333FFFF, x9494E3E34B4B3939, 0xA6)
113  LUT(xB1B1BBBBCCCCA5A5, a1, a1, xE4E4EEEE9999F0F0, 0x4A)
114  LUT(xFFFFECECEEEEDDDD, a2, x3333CCCC00000000, x9A9A64646A6A9595, 0xEF)
115  LUT(xB1B1A9A9DCDC8787, xE5E5BABACDCDB0B0, xB1B1BBBBCCCCA5A5, xFFFFECECEEEEDDDD, 0x8D)
116  LUT(xFFFFCCCCEEEE4444, a2, a5, xFFFFEEEE66666666, 0x2B)
117  LUT(x4, a4, xB1B1A9A9DCDC8787, xFFFFCCCCEEEE4444, 0x6C)
118
119  *out1 ^= x1;
120  *out2 ^= x2;
121  *out3 ^= x3;
122  *out4 ^= x4;
123}
124
125DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
126{
127  LUT(xA50FA50FA50FA50F, a1, a3, a4, 0xC9)
128  LUT(xF0F00F0FF0F0F0F0, a3, a5, a6, 0x4B)
129  LUT(xAF0FA0AAAF0FAF0F, a1, xA50FA50FA50FA50F, xF0F00F0FF0F0F0F0, 0x4D)
130  LUT(x5AA5A55A5AA55AA5, a1, a4, xF0F00F0FF0F0F0F0, 0x69)
131  LUT(xAA005FFFAA005FFF, a3, a5, xA50FA50FA50FA50F, 0xD6)
132  LUT(x5AA5A55A0F5AFAA5, a6, x5AA5A55A5AA55AA5, xAA005FFFAA005FFF, 0x9C)
133  LUT(x1, a2, xAF0FA0AAAF0FAF0F, x5AA5A55A0F5AFAA5, 0xA6)
134  LUT(xAA55AA5500AA00AA, a1, a4, a6, 0x49)
135  LUT(xFAFAA50FFAFAA50F, a1, a5, xA50FA50FA50FA50F, 0x9B)
136  LUT(x50AF0F5AFA50A5A5, a1, xAA55AA5500AA00AA, xFAFAA50FFAFAA50F, 0x66)
137  LUT(xAFAFAFAFFAFAFAFA, a1, a3, a6, 0x6F)
138  LUT(xAFAFFFFFFFFAFAFF, a4, x50AF0F5AFA50A5A5, xAFAFAFAFFAFAFAFA, 0xEB)
139  LUT(x4, a2, x50AF0F5AFA50A5A5, xAFAFFFFFFFFAFAFF, 0x6C)
140  LUT(x500F500F500F500F, a1, a3, a4, 0x98)
141  LUT(xF0505A0505A5050F, x5AA5A55A0F5AFAA5, xAA55AA5500AA00AA, xAFAFAFAFFAFAFAFA, 0x1D)
142  LUT(xF0505A05AA55AAFF, a6, x500F500F500F500F, xF0505A0505A5050F, 0x9A)
143  LUT(xFF005F55FF005F55, a1, a4, xAA005FFFAA005FFF, 0xB2)
144  LUT(xA55F5AF0A55F5AF0, a5, xA50FA50FA50FA50F, x5AA5A55A5AA55AA5, 0x3D)
145  LUT(x5A5F05A5A55F5AF0, a6, xFF005F55FF005F55, xA55F5AF0A55F5AF0, 0xA6)
146  LUT(x3, a2, xF0505A05AA55AAFF, x5A5F05A5A55F5AF0, 0xA6)
147  LUT(x0F0F0F0FA5A5A5A5, a1, a3, a6, 0xC6)
148  LUT(x5FFFFF5FFFA0FFA0, x5AA5A55A5AA55AA5, xAFAFAFAFFAFAFAFA, x0F0F0F0FA5A5A5A5, 0xDB)
149  LUT(xF5555AF500A05FFF, a5, xFAFAA50FFAFAA50F, xF0505A0505A5050F, 0xB9)
150  LUT(x05A5AAF55AFA55A5, xF0505A05AA55AAFF, x0F0F0F0FA5A5A5A5, xF5555AF500A05FFF, 0x9B)
151  LUT(x2, a2, x5FFFFF5FFFA0FFA0, x05A5AAF55AFA55A5, 0xA6)
152
153  *out1 ^= x1;
154  *out2 ^= x2;
155  *out3 ^= x3;
156  *out4 ^= x4;
157}
158
159DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
160{
161  LUT(x55F055F055F055F0, a1, a3, a4, 0x72)
162  LUT(xA500F5F0A500F5F0, a3, a5, x55F055F055F055F0, 0xAD)
163  LUT(xF50AF50AF50AF50A, a1, a3, a4, 0x59)
164  LUT(xF5FA0FFFF5FA0FFF, a3, a5, xF50AF50AF50AF50A, 0xE7)
165  LUT(x61C8F93C61C8F93C, a2, xA500F5F0A500F5F0, xF5FA0FFFF5FA0FFF, 0xC6)
166  LUT(x9999666699996666, a1, a2, a5, 0x69)
167  LUT(x22C022C022C022C0, a2, a4, x55F055F055F055F0, 0x18)
168  LUT(xB35C94A6B35C94A6, xF5FA0FFFF5FA0FFF, x9999666699996666, x22C022C022C022C0, 0x63)
169  LUT(x4, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x6A)
170  LUT(x4848484848484848, a1, a2, a3, 0x12)
171  LUT(x55500AAA55500AAA, a1, a5, xF5FA0FFFF5FA0FFF, 0x28)
172  LUT(x3C90B3D63C90B3D6, x61C8F93C61C8F93C, x4848484848484848, x55500AAA55500AAA, 0x1E)
173  LUT(x8484333384843333, a1, x9999666699996666, x4848484848484848, 0x14)
174  LUT(x4452F1AC4452F1AC, xF50AF50AF50AF50A, xF5FA0FFFF5FA0FFF, xB35C94A6B35C94A6, 0x78)
175  LUT(x9586CA379586CA37, x55500AAA55500AAA, x8484333384843333, x4452F1AC4452F1AC, 0xD6)
176  LUT(x2, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0x6A)
177  LUT(x1, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0xA9)
178  LUT(x3, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x56)
179
180  *out1 ^= x1;
181  *out2 ^= x2;
182  *out3 ^= x3;
183  *out4 ^= x4;
184}
185
186DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
187{
188  LUT(xA0A0A0A0FFFFFFFF, a1, a3, a6, 0xAB)
189  LUT(xFFFF00005555FFFF, a1, a5, a6, 0xB9)
190  LUT(xB3B320207777FFFF, a2, xA0A0A0A0FFFFFFFF, xFFFF00005555FFFF, 0xE8)
191  LUT(x50505A5A5A5A5050, a1, a3, xFFFF00005555FFFF, 0x34)
192  LUT(xA2A2FFFF2222FFFF, a1, a5, xB3B320207777FFFF, 0xCE)
193  LUT(x2E2E6969A4A46363, a2, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, 0x29)
194  LUT(x3, a4, xB3B320207777FFFF, x2E2E6969A4A46363, 0xA6)
195  LUT(xA5A50A0AA5A50A0A, a1, a3, a5, 0x49)
196  LUT(x969639396969C6C6, a2, a6, xA5A50A0AA5A50A0A, 0x96)
197  LUT(x1B1B1B1B1B1B1B1B, a1, a2, a3, 0xCA)
198  LUT(xBFBFBFBFF6F6F9F9, a3, xA0A0A0A0FFFFFFFF, x969639396969C6C6, 0x7E)
199  LUT(x5B5BA4A4B8B81D1D, xFFFF00005555FFFF, x1B1B1B1B1B1B1B1B, xBFBFBFBFF6F6F9F9, 0x96)
200  LUT(x2, a4, x969639396969C6C6, x5B5BA4A4B8B81D1D, 0xCA)
201  LUT(x5555BBBBFFFF5555, a1, a2, xFFFF00005555FFFF, 0xE5)
202  LUT(x6D6D9C9C95956969, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, x969639396969C6C6, 0x97)
203  LUT(x1A1A67676A6AB4B4, xA5A50A0AA5A50A0A, x5555BBBBFFFF5555, x6D6D9C9C95956969, 0x47)
204  LUT(xA0A0FFFFAAAA0000, a3, xFFFF00005555FFFF, xA5A50A0AA5A50A0A, 0x3B)
205  LUT(x36369C9CC1C1D6D6, x969639396969C6C6, x6D6D9C9C95956969, xA0A0FFFFAAAA0000, 0xD9)
206  LUT(x1, a4, x1A1A67676A6AB4B4, x36369C9CC1C1D6D6, 0xCA)
207  LUT(x5555F0F0F5F55555, a1, a3, xFFFF00005555FFFF, 0xB1)
208  LUT(x79790202DCDC0808, xA2A2FFFF2222FFFF, xA5A50A0AA5A50A0A, x969639396969C6C6, 0x47)
209  LUT(x6C6CF2F229295D5D, xBFBFBFBFF6F6F9F9, x5555F0F0F5F55555, x79790202DCDC0808, 0x6E)
210  LUT(xA3A3505010101A1A, a2, xA2A2FFFF2222FFFF, x36369C9CC1C1D6D6, 0x94)
211  LUT(x7676C7C74F4FC7C7, a1, x2E2E6969A4A46363, xA3A3505010101A1A, 0xD9)
212  LUT(x4, a4, x6C6CF2F229295D5D, x7676C7C74F4FC7C7, 0xC6)
213
214  *out1 ^= x1;
215  *out2 ^= x2;
216  *out3 ^= x3;
217  *out4 ^= x4;
218}
219
220DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
221{
222  LUT(x5050F5F55050F5F5, a1, a3, a5, 0xB2)
223  LUT(x6363C6C66363C6C6, a1, a2, x5050F5F55050F5F5, 0x66)
224  LUT(xAAAA5555AAAA5555, a1, a1, a5, 0xA9)
225  LUT(x3A3A65653A3A6565, a3, x6363C6C66363C6C6, xAAAA5555AAAA5555, 0xA9)
226  LUT(x5963A3C65963A3C6, a4, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xC6)
227  LUT(xE7E76565E7E76565, a5, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xAD)
228  LUT(x455D45DF455D45DF, a1, a4, xE7E76565E7E76565, 0xE4)
229  LUT(x4, a6, x5963A3C65963A3C6, x455D45DF455D45DF, 0x6C)
230  LUT(x1101220211012202, a2, xAAAA5555AAAA5555, x5963A3C65963A3C6, 0x20)
231  LUT(xF00F0FF0F00F0FF0, a3, a4, a5, 0x69)
232  LUT(x16E94A9716E94A97, xE7E76565E7E76565, x1101220211012202, xF00F0FF0F00F0FF0, 0x9E)
233  LUT(x2992922929929229, a1, a2, xF00F0FF0F00F0FF0, 0x49)
234  LUT(xAFAF9823AFAF9823, a5, x5050F5F55050F5F5, x2992922929929229, 0x93)
235  LUT(x3, a6, x16E94A9716E94A97, xAFAF9823AFAF9823, 0x6C)
236  LUT(x4801810248018102, a4, x5963A3C65963A3C6, x1101220211012202, 0xA4)
237  LUT(x5EE8FFFD5EE8FFFD, a5, x16E94A9716E94A97, x4801810248018102, 0x76)
238  LUT(xF0FF00FFF0FF00FF, a3, a4, a5, 0xCD)
239  LUT(x942D9A67942D9A67, x3A3A65653A3A6565, x5EE8FFFD5EE8FFFD, xF0FF00FFF0FF00FF, 0x86)
240  LUT(x1, a6, x5EE8FFFD5EE8FFFD, x942D9A67942D9A67, 0xA6)
241  LUT(x6A40D4ED6F4DD4EE, a2, x4, xAFAF9823AFAF9823, 0x2D)
242  LUT(x6CA89C7869A49C79, x1101220211012202, x16E94A9716E94A97, x6A40D4ED6F4DD4EE, 0x26)
243  LUT(xD6DE73F9D6DE73F9, a3, x6363C6C66363C6C6, x455D45DF455D45DF, 0x6B)
244  LUT(x925E63E1965A63E1, x3A3A65653A3A6565, x6CA89C7869A49C79, xD6DE73F9D6DE73F9, 0xA2)
245  LUT(x2, a6, x6CA89C7869A49C79, x925E63E1965A63E1, 0xCA)
246
247  *out1 ^= x1;
248  *out2 ^= x2;
249  *out3 ^= x3;
250  *out4 ^= x4;
251}
252
253DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
254{
255  LUT(x88AA88AA88AA88AA, a1, a2, a4, 0x0B)
256  LUT(xAAAAFF00AAAAFF00, a1, a4, a5, 0x27)
257  LUT(xADAFF8A5ADAFF8A5, a3, x88AA88AA88AA88AA, xAAAAFF00AAAAFF00, 0x9E)
258  LUT(x0A0AF5F50A0AF5F5, a1, a3, a5, 0xA6)
259  LUT(x6B69C5DC6B69C5DC, a2, xADAFF8A5ADAFF8A5, x0A0AF5F50A0AF5F5, 0x6B)
260  LUT(x1C69B2DC1C69B2DC, a4, x88AA88AA88AA88AA, x6B69C5DC6B69C5DC, 0xA9)
261  LUT(x1, a6, xADAFF8A5ADAFF8A5, x1C69B2DC1C69B2DC, 0x6A)
262  LUT(x9C9C9C9C9C9C9C9C, a1, a2, a3, 0x63)
263  LUT(xE6E63BFDE6E63BFD, a2, xAAAAFF00AAAAFF00, x0A0AF5F50A0AF5F5, 0xE7)
264  LUT(x6385639E6385639E, a4, x9C9C9C9C9C9C9C9C, xE6E63BFDE6E63BFD, 0x93)
265  LUT(x5959C4CE5959C4CE, a2, x6B69C5DC6B69C5DC, xE6E63BFDE6E63BFD, 0x5D)
266  LUT(x5B53F53B5B53F53B, a4, x0A0AF5F50A0AF5F5, x5959C4CE5959C4CE, 0x6E)
267  LUT(x3, a6, x6385639E6385639E, x5B53F53B5B53F53B, 0xC6)
268  LUT(xFAF505FAFAF505FA, a3, a4, x0A0AF5F50A0AF5F5, 0x6D)
269  LUT(x6A65956A6A65956A, a3, x9C9C9C9C9C9C9C9C, xFAF505FAFAF505FA, 0xA6)
270  LUT(x8888CCCC8888CCCC, a1, a2, a5, 0x23)
271  LUT(x94E97A9494E97A94, x1C69B2DC1C69B2DC, x6A65956A6A65956A, x8888CCCC8888CCCC, 0x72)
272  LUT(x4, a6, x6A65956A6A65956A, x94E97A9494E97A94, 0xAC)
273  LUT(xA050A050A050A050, a1, a3, a4, 0x21)
274  LUT(xC1B87A2BC1B87A2B, xAAAAFF00AAAAFF00, x5B53F53B5B53F53B, x94E97A9494E97A94, 0xA4)
275  LUT(xE96016B7E96016B7, x8888CCCC8888CCCC, xA050A050A050A050, xC1B87A2BC1B87A2B, 0x96)
276  LUT(xE3CF1FD5E3CF1FD5, x88AA88AA88AA88AA, x6A65956A6A65956A, xE96016B7E96016B7, 0x3E)
277  LUT(x6776675B6776675B, xADAFF8A5ADAFF8A5, x94E97A9494E97A94, xE3CF1FD5E3CF1FD5, 0x6B)
278  LUT(x2, a6, xE96016B7E96016B7, x6776675B6776675B, 0xC6)
279
280  *out1 ^= x1;
281  *out2 ^= x2;
282  *out3 ^= x3;
283  *out4 ^= x4;
284}
285
286DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
287{
288  LUT(xEEEE3333EEEE3333, a1, a2, a5, 0x9D)
289  LUT(xBBBBBBBBBBBBBBBB, a1, a1, a2, 0x83)
290  LUT(xDDDDAAAADDDDAAAA, a1, a2, a5, 0x5B)
291  LUT(x29295A5A29295A5A, a3, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0x85)
292  LUT(xC729695AC729695A, a4, xEEEE3333EEEE3333, x29295A5A29295A5A, 0xA6)
293  LUT(x3BF77B7B3BF77B7B, a2, a5, xC729695AC729695A, 0xF9)
294  LUT(x2900FF002900FF00, a4, a5, x29295A5A29295A5A, 0x0E)
295  LUT(x56B3803F56B3803F, xBBBBBBBBBBBBBBBB, x3BF77B7B3BF77B7B, x2900FF002900FF00, 0x61)
296  LUT(x4, a6, xC729695AC729695A, x56B3803F56B3803F, 0x6C)
297  LUT(xFBFBFBFBFBFBFBFB, a1, a2, a3, 0xDF)
298  LUT(x3012B7B73012B7B7, a2, a5, xC729695AC729695A, 0xD4)
299  LUT(x34E9B34C34E9B34C, a4, xFBFBFBFBFBFBFBFB, x3012B7B73012B7B7, 0x69)
300  LUT(xBFEAEBBEBFEAEBBE, a1, x29295A5A29295A5A, x34E9B34C34E9B34C, 0x6F)
301  LUT(xFFAEAFFEFFAEAFFE, a3, xBBBBBBBBBBBBBBBB, xBFEAEBBEBFEAEBBE, 0xB9)
302  LUT(x2, a6, x34E9B34C34E9B34C, xFFAEAFFEFFAEAFFE, 0xC6)
303  LUT(xCFDE88BBCFDE88BB, a2, xDDDDAAAADDDDAAAA, x34E9B34C34E9B34C, 0x5C)
304  LUT(x3055574530555745, a1, xC729695AC729695A, xCFDE88BBCFDE88BB, 0x71)
305  LUT(x99DDEEEE99DDEEEE, a4, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0xB9)
306  LUT(x693CD926693CD926, x3BF77B7B3BF77B7B, x34E9B34C34E9B34C, x99DDEEEE99DDEEEE, 0x69)
307  LUT(x3, a6, x3055574530555745, x693CD926693CD926, 0x6A)
308  LUT(x9955EE559955EE55, a1, a4, x99DDEEEE99DDEEEE, 0xE2)
309  LUT(x9D48FA949D48FA94, x3BF77B7B3BF77B7B, xBFEAEBBEBFEAEBBE, x9955EE559955EE55, 0x9C)
310  LUT(x1, a6, xC729695AC729695A, x9D48FA949D48FA94, 0x39)
311
312  *out1 ^= x1;
313  *out2 ^= x2;
314  *out3 ^= x3;
315  *out4 ^= x4;
316}
317
318#else
319
320/*
321 * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC
322 * architectures.  These use AND, OR, XOR, NOT, and AND-NOT gates.
323 *
324 * Gate counts: 49 44 46 33 48 46 46 41
325 * Average: 44.125
326 *
327 * Several same-gate-count expressions for each S-box are included (for use on
328 * different CPUs/GPUs).
329 *
330 * These Boolean expressions corresponding to DES S-boxes have been generated
331 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
332 * John the Ripper password cracker: http://www.openwall.com/john/
333 * Being mathematical formulas, they are not copyrighted and are free for reuse
334 * by anyone.
335 *
336 * This file (a specific representation of the S-box expressions, surrounding
337 * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
338 * Redistribution and use in source and binary forms, with or without
339 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
340 *
341 * The effort has been sponsored by Rapid7: http://www.rapid7.com
342 */
343
344DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
345{
346    u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969,
347        x25202160;
348    u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
349    u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
350    u32 x0A0A0000, x0AD80096, x00999900, x0AD99996;
351    u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
352    u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
353    u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
354    u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
355    u32 x00, x01, x10, x11, x20, x21, x30, x31;
356
357    x55005500 = a1 & ~a5;
358    x5A0F5A0F = a4 ^ x55005500;
359    x3333FFFF = a3 | a6;
360    x66666666 = a1 ^ a3;
361    x22226666 = x3333FFFF & x66666666;
362    x2D2D6969 = a4 ^ x22226666;
363    x25202160 = x2D2D6969 & ~x5A0F5A0F;
364
365    x00FFFF00 = a5 ^ a6;
366    x33CCCC33 = a3 ^ x00FFFF00;
367    x4803120C = x5A0F5A0F & ~x33CCCC33;
368    x2222FFFF = a6 | x22226666;
369    x6A21EDF3 = x4803120C ^ x2222FFFF;
370    x4A01CC93 = x6A21EDF3 & ~x25202160;
371
372    x5555FFFF = a1 | a6;
373    x7F75FFFF = x6A21EDF3 | x5555FFFF;
374    x00D20096 = a5 & ~x2D2D6969;
375    x7FA7FF69 = x7F75FFFF ^ x00D20096;
376
377    x0A0A0000 = a4 & ~x5555FFFF;
378    x0AD80096 = x00D20096 ^ x0A0A0000;
379    x00999900 = x00FFFF00 & ~x66666666;
380    x0AD99996 = x0AD80096 | x00999900;
381
382    x22332233 = a3 & ~x55005500;
383    x257AA5F0 = x5A0F5A0F ^ x7F75FFFF;
384    x054885C0 = x257AA5F0 & ~x22332233;
385    xFAB77A3F = ~x054885C0;
386    x2221EDF3 = x3333FFFF & x6A21EDF3;
387    xD89697CC = xFAB77A3F ^ x2221EDF3;
388    x20 = x7FA7FF69 & ~a2;
389    x21 = x20 ^ xD89697CC;
390    *out3 ^= x21;
391
392    x05B77AC0 = x00FFFF00 ^ x054885C0;
393    x05F77AD6 = x00D20096 | x05B77AC0;
394    x36C48529 = x3333FFFF ^ x05F77AD6;
395    x6391D07C = a1 ^ x36C48529;
396    xBB0747B0 = xD89697CC ^ x6391D07C;
397    x00 = x25202160 | a2;
398    x01 = x00 ^ xBB0747B0;
399    *out1 ^= x01;
400
401    x4C460000 = x3333FFFF ^ x7F75FFFF;
402    x4EDF9996 = x0AD99996 | x4C460000;
403    x2D4E49EA = x6391D07C ^ x4EDF9996;
404    xBBFFFFB0 = x00FFFF00 | xBB0747B0;
405    x96B1B65A = x2D4E49EA ^ xBBFFFFB0;
406    x10 = x4A01CC93 | a2;
407    x11 = x10 ^ x96B1B65A;
408    *out2 ^= x11;
409
410    x5AFF5AFF = a5 | x5A0F5A0F;
411    x52B11215 = x5AFF5AFF & ~x2D4E49EA;
412    x4201C010 = x4A01CC93 & x6391D07C;
413    x10B0D205 = x52B11215 ^ x4201C010;
414    x30 = x10B0D205 | a2;
415    x31 = x30 ^ x0AD99996;
416    *out4 ^= x31;
417}
418
419DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
420{
421    u32 x33CC33CC;
422    u32 x55550000, x00AA00FF, x33BB33FF;
423    u32 x33CC0000, x11441144, x11BB11BB, x003311BB;
424    u32 x00000F0F, x336600FF, x332200FF, x332200F0;
425    u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
426    u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
427    u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
428    u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
429    u32 x0A451047, xBBDFDD7B, xB19ACD3C;
430    u32 x00, x01, x10, x11, x20, x21, x30, x31;
431
432    x33CC33CC = a2 ^ a5;
433
434    x55550000 = a1 & ~a6;
435    x00AA00FF = a5 & ~x55550000;
436    x33BB33FF = a2 | x00AA00FF;
437
438    x33CC0000 = x33CC33CC & ~a6;
439    x11441144 = a1 & x33CC33CC;
440    x11BB11BB = a5 ^ x11441144;
441    x003311BB = x11BB11BB & ~x33CC0000;
442
443    x00000F0F = a3 & a6;
444    x336600FF = x00AA00FF ^ x33CC0000;
445    x332200FF = x33BB33FF & x336600FF;
446    x332200F0 = x332200FF & ~x00000F0F;
447
448    x0302000F = a3 & x332200FF;
449    xAAAAAAAA = ~a1;
450    xA9A8AAA5 = x0302000F ^ xAAAAAAAA;
451    x33CCCC33 = a6 ^ x33CC33CC;
452    x33CCC030 = x33CCCC33 & ~x00000F0F;
453    x9A646A95 = xA9A8AAA5 ^ x33CCC030;
454    x10 = a4 & ~x332200F0;
455    x11 = x10 ^ x9A646A95;
456    *out2 ^= x11;
457
458    x00333303 = a2 & ~x33CCC030;
459    x118822B8 = x11BB11BB ^ x00333303;
460    xA8208805 = xA9A8AAA5 & ~x118822B8;
461    x3CC3C33C = a3 ^ x33CCCC33;
462    x94E34B39 = xA8208805 ^ x3CC3C33C;
463    x00 = x33BB33FF & ~a4;
464    x01 = x00 ^ x94E34B39;
465    *out1 ^= x01;
466
467    x0331330C = x0302000F ^ x00333303;
468    x3FF3F33C = x3CC3C33C | x0331330C;
469    xA9DF596A = x33BB33FF ^ x9A646A95;
470    xA9DF5F6F = x00000F0F | xA9DF596A;
471    x962CAC53 = x3FF3F33C ^ xA9DF5F6F;
472
473    xA9466A6A = x332200FF ^ x9A646A95;
474    x3DA52153 = x94E34B39 ^ xA9466A6A;
475    x29850143 = xA9DF5F6F & x3DA52153;
476    x33C0330C = x33CC33CC & x3FF3F33C;
477    x1A45324F = x29850143 ^ x33C0330C;
478    x20 = x1A45324F | a4;
479    x21 = x20 ^ x962CAC53;
480    *out3 ^= x21;
481
482    x0A451047 = x1A45324F & ~x118822B8;
483    xBBDFDD7B = x33CCCC33 | xA9DF596A;
484    xB19ACD3C = x0A451047 ^ xBBDFDD7B;
485    x30 = x003311BB | a4;
486    x31 = x30 ^ xB19ACD3C;
487    *out4 ^= x31;
488}
489
490DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
491{
492    u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4;
493    u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00;
494    u32 x00005EF4, x00FF5EFF, x00555455, x3C699796;
495    u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F;
496    u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8;
497    u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A;
498    u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356;
499    u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B;
500    u32 x00, x01, x10, x11, x20, x21, x30, x31;
501
502    x44444444 = a1 & ~a2;
503    x0F0FF0F0 = a3 ^ a6;
504    x4F4FF4F4 = x44444444 | x0F0FF0F0;
505    x00FFFF00 = a4 ^ a6;
506    x00AAAA00 = x00FFFF00 & ~a1;
507    x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00;
508
509    x3C3CC3C3 = a2 ^ x0F0FF0F0;
510    x3C3C0000 = x3C3CC3C3 & ~a6;
511    x7373F4F4 = x4F4FF4F4 ^ x3C3C0000;
512    x0C840A00 = x4FE55EF4 & ~x7373F4F4;
513
514    x00005EF4 = a6 & x4FE55EF4;
515    x00FF5EFF = a4 | x00005EF4;
516    x00555455 = a1 & x00FF5EFF;
517    x3C699796 = x3C3CC3C3 ^ x00555455;
518    x30 = x4FE55EF4 & ~a5;
519    x31 = x30 ^ x3C699796;
520    *out4 ^= x31;
521
522    x000FF000 = x0F0FF0F0 & x00FFFF00;
523    x55AA55AA = a1 ^ a4;
524    x26D9A15E = x7373F4F4 ^ x55AA55AA;
525    x2FDFAF5F = a3 | x26D9A15E;
526    x2FD00F5F = x2FDFAF5F & ~x000FF000;
527
528    x55AAFFAA = x00AAAA00 | x55AA55AA;
529    x28410014 = x3C699796 & ~x55AAFFAA;
530    x000000FF = a4 & a6;
531    x000000CC = x000000FF & ~a2;
532    x284100D8 = x28410014 ^ x000000CC;
533
534    x204100D0 = x7373F4F4 & x284100D8;
535    x3C3CC3FF = x3C3CC3C3 | x000000FF;
536    x1C3CC32F = x3C3CC3FF & ~x204100D0;
537    x4969967A = a1 ^ x1C3CC32F;
538    x10 = x2FD00F5F & a5;
539    x11 = x10 ^ x4969967A;
540    *out2 ^= x11;
541
542    x4CC44CC4 = x4FE55EF4 & ~a2;
543    x40C040C0 = x4CC44CC4 & ~a3;
544    xC3C33C3C = ~x3C3CC3C3;
545    x9669C396 = x55AAFFAA ^ xC3C33C3C;
546    xD6A98356 = x40C040C0 ^ x9669C396;
547    x00 = a5 & ~x0C840A00;
548    x01 = x00 ^ xD6A98356;
549    *out1 ^= x01;
550
551    xD6E9C3D6 = x40C040C0 | x9669C396;
552    x4CEEEEC4 = x00AAAA00 | x4CC44CC4;
553    x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4;
554    x001A000B = a4 & ~x4FE55EF4;
555    x9A1F2D1B = x9A072D12 | x001A000B;
556    x20 = a5 & ~x284100D8;
557    x21 = x20 ^ x9A1F2D1B;
558    *out3 ^= x21;
559}
560
561DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
562{
563    u32 x5A5A5A5A, x0F0FF0F0;
564    u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F,
565        x52FBCA0F, x61C8F93C;
566    u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6;
567    u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1;
568    u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6;
569    u32 x00, x01, x10, x11, x20, x21, x30, x31;
570
571    x5A5A5A5A = a1 ^ a3;
572    x0F0FF0F0 = a3 ^ a5;
573    x33FF33FF = a2 | a4;
574    x33FFCC00 = a5 ^ x33FF33FF;
575    x0C0030F0 = x0F0FF0F0 & ~x33FFCC00;
576    x0C0CC0C0 = x0F0FF0F0 & ~a2;
577    x0CF3C03F = a4 ^ x0C0CC0C0;
578    x5EFBDA7F = x5A5A5A5A | x0CF3C03F;
579    x52FBCA0F = x5EFBDA7F & ~x0C0030F0;
580    x61C8F93C = a2 ^ x52FBCA0F;
581
582    x00C0C03C = x0CF3C03F & x61C8F93C;
583    x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C;
584    x3B92A366 = x5A5A5A5A ^ x61C8F93C;
585    x30908326 = x3B92A366 & ~x0F0F30C0;
586    x3C90B3D6 = x0C0030F0 ^ x30908326;
587
588    x33CC33CC = a2 ^ a4;
589    x0C0CFFFF = a5 | x0C0CC0C0;
590    x379E5C99 = x3B92A366 ^ x0C0CFFFF;
591    x04124C11 = x379E5C99 & ~x33CC33CC;
592    x56E9861E = x52FBCA0F ^ x04124C11;
593    x00 = a6 & ~x3C90B3D6;
594    x01 = x00 ^ x56E9861E;
595    *out1 ^= x01;
596
597    xA91679E1 = ~x56E9861E;
598    x10 = x3C90B3D6 & ~a6;
599    x11 = x10 ^ xA91679E1;
600    *out2 ^= x11;
601
602    x9586CA37 = x3C90B3D6 ^ xA91679E1;
603    x8402C833 = x9586CA37 & ~x33CC33CC;
604    x84C2C83F = x00C0C03C | x8402C833;
605    xB35C94A6 = x379E5C99 ^ x84C2C83F;
606    x20 = x61C8F93C | a6;
607    x21 = x20 ^ xB35C94A6;
608    *out3 ^= x21;
609
610    x30 = a6 & x61C8F93C;
611    x31 = x30 ^ xB35C94A6;
612    *out4 ^= x31;
613}
614
615DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
616{
617    u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F;
618    u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B;
619    u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7;
620    u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF;
621    u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A;
622    u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2;
623    u32 x22222222, x16BCEE97, x0F080B04, x19B4E593;
624    u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D;
625    u32 x00, x01, x10, x11, x20, x21, x30, x31;
626
627    x77777777 = a1 | a3;
628    x77770000 = x77777777 & ~a6;
629    x22225555 = a1 ^ x77770000;
630    x11116666 = a3 ^ x22225555;
631    x1F1F6F6F = a4 | x11116666;
632
633    x70700000 = x77770000 & ~a4;
634    x43433333 = a3 ^ x70700000;
635    x00430033 = a5 & x43433333;
636    x55557777 = a1 | x11116666;
637    x55167744 = x00430033 ^ x55557777;
638    x5A19784B = a4 ^ x55167744;
639
640    x5A1987B4 = a6 ^ x5A19784B;
641    x7A3BD7F5 = x22225555 | x5A1987B4;
642    x003B00F5 = a5 & x7A3BD7F5;
643    x221955A0 = x22225555 ^ x003B00F5;
644    x05050707 = a4 & x55557777;
645    x271C52A7 = x221955A0 ^ x05050707;
646
647    x2A2A82A0 = x7A3BD7F5 & ~a1;
648    x6969B193 = x43433333 ^ x2A2A82A0;
649    x1FE06F90 = a5 ^ x1F1F6F6F;
650    x16804E00 = x1FE06F90 & ~x6969B193;
651    xE97FB1FF = ~x16804E00;
652    x20 = xE97FB1FF & ~a2;
653    x21 = x20 ^ x5A19784B;
654    *out3 ^= x21;
655
656    x43403302 = x43433333 & ~x003B00F5;
657    x35CAED30 = x2A2A82A0 ^ x1FE06F90;
658    x37DEFFB7 = x271C52A7 | x35CAED30;
659    x349ECCB5 = x37DEFFB7 & ~x43403302;
660    x0B01234A = x1F1F6F6F & ~x349ECCB5;
661
662    x101884B4 = x5A1987B4 & x349ECCB5;
663    x0FF8EB24 = x1FE06F90 ^ x101884B4;
664    x41413333 = x43433333 & x55557777;
665    x4FF9FB37 = x0FF8EB24 | x41413333;
666    x4FC2FBC2 = x003B00F5 ^ x4FF9FB37;
667    x30 = x4FC2FBC2 & a2;
668    x31 = x30 ^ x271C52A7;
669    *out4 ^= x31;
670
671    x22222222 = a1 ^ x77777777;
672    x16BCEE97 = x349ECCB5 ^ x22222222;
673    x0F080B04 = a4 & x0FF8EB24;
674    x19B4E593 = x16BCEE97 ^ x0F080B04;
675    x00 = x0B01234A | a2;
676    x01 = x00 ^ x19B4E593;
677    *out1 ^= x01;
678
679    x5C5C5C5C = x1F1F6F6F ^ x43433333;
680    x4448184C = x5C5C5C5C & ~x19B4E593;
681    x2DDABE71 = x22225555 ^ x0FF8EB24;
682    x6992A63D = x4448184C ^ x2DDABE71;
683    x10 = x1F1F6F6F & a2;
684    x11 = x10 ^ x6992A63D;
685    *out2 ^= x11;
686}
687
688DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
689{
690    u32 x33CC33CC;
691    u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099;
692    u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6;
693    u32 x09030C06, x09030000, x336622FF, x3A6522FF;
694    u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD;
695    u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B;
696    u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479;
697    u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5;
698    u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67;
699    u32 x00, x01, x10, x11, x20, x21, x30, x31;
700
701    x33CC33CC = a2 ^ a5;
702
703    x3333FFFF = a2 | a6;
704    x11115555 = a1 & x3333FFFF;
705    x22DD6699 = x33CC33CC ^ x11115555;
706    x22DD9966 = a6 ^ x22DD6699;
707    x00220099 = a5 & ~x22DD9966;
708
709    x00551144 = a1 & x22DD9966;
710    x33662277 = a2 ^ x00551144;
711    x5A5A5A5A = a1 ^ a3;
712    x7B7E7A7F = x33662277 | x5A5A5A5A;
713    x59A31CE6 = x22DD6699 ^ x7B7E7A7F;
714
715    x09030C06 = a3 & x59A31CE6;
716    x09030000 = x09030C06 & ~a6;
717    x336622FF = x00220099 | x33662277;
718    x3A6522FF = x09030000 ^ x336622FF;
719    x30 = x3A6522FF & a4;
720    x31 = x30 ^ x59A31CE6;
721    *out4 ^= x31;
722
723    x484D494C = a2 ^ x7B7E7A7F;
724    x0000B6B3 = a6 & ~x484D494C;
725    x0F0FB9BC = a3 ^ x0000B6B3;
726    x00FC00F9 = a5 & ~x09030C06;
727    x0FFFB9FD = x0F0FB9BC | x00FC00F9;
728
729    x5DF75DF7 = a1 | x59A31CE6;
730    x116600F7 = x336622FF & x5DF75DF7;
731    x1E69B94B = x0F0FB9BC ^ x116600F7;
732    x1668B94B = x1E69B94B & ~x09030000;
733    x20 = x00220099 | a4;
734    x21 = x20 ^ x1668B94B;
735    *out3 ^= x21;
736
737    x7B7B7B7B = a2 | x5A5A5A5A;
738    x411E5984 = x3A6522FF ^ x7B7B7B7B;
739    x1FFFFDFD = x11115555 | x0FFFB9FD;
740    x5EE1A479 = x411E5984 ^ x1FFFFDFD;
741
742    x3CB4DFD2 = x22DD6699 ^ x1E69B94B;
743    x004B002D = a5 & ~x3CB4DFD2;
744    xB7B2B6B3 = ~x484D494C;
745    xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3;
746    xCC82CDE5 = x004B002D ^ xCCC9CDC8;
747    x10 = xCC82CDE5 & ~a4;
748    x11 = x10 ^ x5EE1A479;
749    *out2 ^= x11;
750
751    x0055EEBB = a6 ^ x00551144;
752    x5A5AECE9 = a1 ^ x0F0FB9BC;
753    x0050ECA9 = x0055EEBB & x5A5AECE9;
754    xC5CAC1CE = x09030C06 ^ xCCC9CDC8;
755    xC59A2D67 = x0050ECA9 ^ xC5CAC1CE;
756    x00 = x0FFFB9FD & ~a4;
757    x01 = x00 ^ xC59A2D67;
758    *out1 ^= x01;
759}
760
761DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
762{
763    u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841;
764    u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78;
765    u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D;
766    u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B;
767    u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB;
768    u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867;
769    u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD;
770    u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A;
771    u32 x00, x01, x10, x11, x20, x21, x30, x31;
772
773    x0FF00FF0 = a4 ^ a5;
774    x3CC33CC3 = a3 ^ x0FF00FF0;
775    x00003CC3 = a6 & x3CC33CC3;
776    x0F000F00 = a4 & x0FF00FF0;
777    x5A555A55 = a2 ^ x0F000F00;
778    x00001841 = x00003CC3 & x5A555A55;
779
780    x00000F00 = a6 & x0F000F00;
781    x33333C33 = a3 ^ x00000F00;
782    x7B777E77 = x5A555A55 | x33333C33;
783    x0FF0F00F = a6 ^ x0FF00FF0;
784    x74878E78 = x7B777E77 ^ x0FF0F00F;
785    x30 = a1 & ~x00001841;
786    x31 = x30 ^ x74878E78;
787    *out4 ^= x31;
788
789    x003C003C = a5 & ~x3CC33CC3;
790    x5A7D5A7D = x5A555A55 | x003C003C;
791    x333300F0 = x00003CC3 ^ x33333C33;
792    x694E5A8D = x5A7D5A7D ^ x333300F0;
793
794    x0FF0CCCC = x00003CC3 ^ x0FF0F00F;
795    x000F0303 = a4 & ~x0FF0CCCC;
796    x5A505854 = x5A555A55 & ~x000F0303;
797    x33CC000F = a5 ^ x333300F0;
798    x699C585B = x5A505854 ^ x33CC000F;
799
800    x7F878F78 = x0F000F00 | x74878E78;
801    x21101013 = a3 & x699C585B;
802    x7F979F7B = x7F878F78 | x21101013;
803    x30030CC0 = x3CC33CC3 & ~x0FF0F00F;
804    x4F9493BB = x7F979F7B ^ x30030CC0;
805    x00 = x4F9493BB & ~a1;
806    x01 = x00 ^ x694E5A8D;
807    *out1 ^= x01;
808
809    x6F9CDBFB = x699C585B | x4F9493BB;
810    x0000DBFB = a6 & x6F9CDBFB;
811    x00005151 = a2 & x0000DBFB;
812    x26DAC936 = x694E5A8D ^ x4F9493BB;
813    x26DA9867 = x00005151 ^ x26DAC936;
814
815    x27DA9877 = x21101013 | x26DA9867;
816    x27DA438C = x0000DBFB ^ x27DA9877;
817    x2625C9C9 = a5 ^ x26DAC936;
818    x27FFCBCD = x27DA438C | x2625C9C9;
819    x20 = x27FFCBCD & a1;
820    x21 = x20 ^ x699C585B;
821    *out3 ^= x21;
822
823    x27FF1036 = x0000DBFB ^ x27FFCBCD;
824    x27FF103E = x003C003C | x27FF1036;
825    xB06B6C44 = ~x4F9493BB;
826    x97947C7A = x27FF103E ^ xB06B6C44;
827    x10 = x97947C7A & ~a1;
828    x11 = x10 ^ x26DA9867;
829    *out2 ^= x11;
830}
831
832DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
833{
834    u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
835    u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
836    u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
837    u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
838    u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
839    u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
840    u32 xF700A600, x61008000, x03B7856B, x62B7056B;
841    u32 x00, x01, x10, x11, x20, x21, x30, x31;
842
843    x0C0C0C0C = a3 & ~a2;
844    x0000F0F0 = a5 & ~a3;
845    x00FFF00F = a4 ^ x0000F0F0;
846    x00555005 = a1 & x00FFF00F;
847    x00515001 = x00555005 & ~x0C0C0C0C;
848
849    x33000330 = a2 & ~x00FFF00F;
850    x77555775 = a1 | x33000330;
851    x30303030 = a2 & ~a3;
852    x3030CFCF = a5 ^ x30303030;
853    x30104745 = x77555775 & x3030CFCF;
854    x30555745 = x00555005 | x30104745;
855
856    xFF000FF0 = ~x00FFF00F;
857    xCF1048B5 = x30104745 ^ xFF000FF0;
858    x080A080A = a3 & ~x77555775;
859    xC71A40BF = xCF1048B5 ^ x080A080A;
860    xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
861    x10 = x00515001 | a6;
862    x11 = x10 ^ xCB164CB3;
863    *out2 ^= x11;
864
865    x9E4319E6 = a1 ^ xCB164CB3;
866    x000019E6 = a5 & x9E4319E6;
867    xF429738C = a2 ^ xC71A40BF;
868    xF4296A6A = x000019E6 ^ xF429738C;
869    xC729695A = x33000330 ^ xF4296A6A;
870
871    xC47C3D2F = x30555745 ^ xF4296A6A;
872    xF77F3F3F = a2 | xC47C3D2F;
873    x9E43E619 = a5 ^ x9E4319E6;
874    x693CD926 = xF77F3F3F ^ x9E43E619;
875    x20 = x30555745 & a6;
876    x21 = x20 ^ x693CD926;
877    *out3 ^= x21;
878
879    xF719A695 = x3030CFCF ^ xC729695A;
880    xF4FF73FF = a4 | xF429738C;
881    x03E6D56A = xF719A695 ^ xF4FF73FF;
882    x56B3803F = a1 ^ x03E6D56A;
883    x30 = x56B3803F & a6;
884    x31 = x30 ^ xC729695A;
885    *out4 ^= x31;
886
887    xF700A600 = xF719A695 & ~a4;
888    x61008000 = x693CD926 & xF700A600;
889    x03B7856B = x00515001 ^ x03E6D56A;
890    x62B7056B = x61008000 ^ x03B7856B;
891    x00 = x62B7056B | a6;
892    x01 = x00 ^ xC729695A;
893    *out1 ^= x01;
894}
895
896#endif
897#endif
898
899#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
900
901/*
902 * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC
903 * architectures.  These use AND, OR, XOR, NOT, and AND-NOT gates.
904 *
905 * Gate counts: 49 44 46 33 48 46 46 41
906 * Average: 44.125
907 *
908 * Several same-gate-count expressions for each S-box are included (for use on
909 * different CPUs/GPUs).
910 *
911 * These Boolean expressions corresponding to DES S-boxes have been generated
912 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
913 * John the Ripper password cracker: http://www.openwall.com/john/
914 * Being mathematical formulas, they are not copyrighted and are free for reuse
915 * by anyone.
916 *
917 * This file (a specific representation of the S-box expressions, surrounding
918 * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
919 * Redistribution and use in source and binary forms, with or without
920 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
921 *
922 * The effort has been sponsored by Rapid7: http://www.rapid7.com
923 */
924
925DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
926{
927    u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969,
928        x25202160;
929    u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
930    u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
931    u32 x0A0A0000, x0AD80096, x00999900, x0AD99996;
932    u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
933    u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
934    u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
935    u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
936    u32 x00, x01, x10, x11, x20, x21, x30, x31;
937
938    x55005500 = a1 & ~a5;
939    x5A0F5A0F = a4 ^ x55005500;
940    x3333FFFF = a3 | a6;
941    x66666666 = a1 ^ a3;
942    x22226666 = x3333FFFF & x66666666;
943    x2D2D6969 = a4 ^ x22226666;
944    x25202160 = x2D2D6969 & ~x5A0F5A0F;
945
946    x00FFFF00 = a5 ^ a6;
947    x33CCCC33 = a3 ^ x00FFFF00;
948    x4803120C = x5A0F5A0F & ~x33CCCC33;
949    x2222FFFF = a6 | x22226666;
950    x6A21EDF3 = x4803120C ^ x2222FFFF;
951    x4A01CC93 = x6A21EDF3 & ~x25202160;
952
953    x5555FFFF = a1 | a6;
954    x7F75FFFF = x6A21EDF3 | x5555FFFF;
955    x00D20096 = a5 & ~x2D2D6969;
956    x7FA7FF69 = x7F75FFFF ^ x00D20096;
957
958    x0A0A0000 = a4 & ~x5555FFFF;
959    x0AD80096 = x00D20096 ^ x0A0A0000;
960    x00999900 = x00FFFF00 & ~x66666666;
961    x0AD99996 = x0AD80096 | x00999900;
962
963    x22332233 = a3 & ~x55005500;
964    x257AA5F0 = x5A0F5A0F ^ x7F75FFFF;
965    x054885C0 = x257AA5F0 & ~x22332233;
966    xFAB77A3F = ~x054885C0;
967    x2221EDF3 = x3333FFFF & x6A21EDF3;
968    xD89697CC = xFAB77A3F ^ x2221EDF3;
969    x20 = x7FA7FF69 & ~a2;
970    x21 = x20 ^ xD89697CC;
971    *out3 ^= x21;
972
973    x05B77AC0 = x00FFFF00 ^ x054885C0;
974    x05F77AD6 = x00D20096 | x05B77AC0;
975    x36C48529 = x3333FFFF ^ x05F77AD6;
976    x6391D07C = a1 ^ x36C48529;
977    xBB0747B0 = xD89697CC ^ x6391D07C;
978    x00 = x25202160 | a2;
979    x01 = x00 ^ xBB0747B0;
980    *out1 ^= x01;
981
982    x4C460000 = x3333FFFF ^ x7F75FFFF;
983    x4EDF9996 = x0AD99996 | x4C460000;
984    x2D4E49EA = x6391D07C ^ x4EDF9996;
985    xBBFFFFB0 = x00FFFF00 | xBB0747B0;
986    x96B1B65A = x2D4E49EA ^ xBBFFFFB0;
987    x10 = x4A01CC93 | a2;
988    x11 = x10 ^ x96B1B65A;
989    *out2 ^= x11;
990
991    x5AFF5AFF = a5 | x5A0F5A0F;
992    x52B11215 = x5AFF5AFF & ~x2D4E49EA;
993    x4201C010 = x4A01CC93 & x6391D07C;
994    x10B0D205 = x52B11215 ^ x4201C010;
995    x30 = x10B0D205 | a2;
996    x31 = x30 ^ x0AD99996;
997    *out4 ^= x31;
998}
999
1000DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1001{
1002    u32 x33CC33CC;
1003    u32 x55550000, x00AA00FF, x33BB33FF;
1004    u32 x33CC0000, x11441144, x11BB11BB, x003311BB;
1005    u32 x00000F0F, x336600FF, x332200FF, x332200F0;
1006    u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
1007    u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
1008    u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
1009    u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
1010    u32 x0A451047, xBBDFDD7B, xB19ACD3C;
1011    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1012
1013    x33CC33CC = a2 ^ a5;
1014
1015    x55550000 = a1 & ~a6;
1016    x00AA00FF = a5 & ~x55550000;
1017    x33BB33FF = a2 | x00AA00FF;
1018
1019    x33CC0000 = x33CC33CC & ~a6;
1020    x11441144 = a1 & x33CC33CC;
1021    x11BB11BB = a5 ^ x11441144;
1022    x003311BB = x11BB11BB & ~x33CC0000;
1023
1024    x00000F0F = a3 & a6;
1025    x336600FF = x00AA00FF ^ x33CC0000;
1026    x332200FF = x33BB33FF & x336600FF;
1027    x332200F0 = x332200FF & ~x00000F0F;
1028
1029    x0302000F = a3 & x332200FF;
1030    xAAAAAAAA = ~a1;
1031    xA9A8AAA5 = x0302000F ^ xAAAAAAAA;
1032    x33CCCC33 = a6 ^ x33CC33CC;
1033    x33CCC030 = x33CCCC33 & ~x00000F0F;
1034    x9A646A95 = xA9A8AAA5 ^ x33CCC030;
1035    x10 = a4 & ~x332200F0;
1036    x11 = x10 ^ x9A646A95;
1037    *out2 ^= x11;
1038
1039    x00333303 = a2 & ~x33CCC030;
1040    x118822B8 = x11BB11BB ^ x00333303;
1041    xA8208805 = xA9A8AAA5 & ~x118822B8;
1042    x3CC3C33C = a3 ^ x33CCCC33;
1043    x94E34B39 = xA8208805 ^ x3CC3C33C;
1044    x00 = x33BB33FF & ~a4;
1045    x01 = x00 ^ x94E34B39;
1046    *out1 ^= x01;
1047
1048    x0331330C = x0302000F ^ x00333303;
1049    x3FF3F33C = x3CC3C33C | x0331330C;
1050    xA9DF596A = x33BB33FF ^ x9A646A95;
1051    xA9DF5F6F = x00000F0F | xA9DF596A;
1052    x962CAC53 = x3FF3F33C ^ xA9DF5F6F;
1053
1054    xA9466A6A = x332200FF ^ x9A646A95;
1055    x3DA52153 = x94E34B39 ^ xA9466A6A;
1056    x29850143 = xA9DF5F6F & x3DA52153;
1057    x33C0330C = x33CC33CC & x3FF3F33C;
1058    x1A45324F = x29850143 ^ x33C0330C;
1059    x20 = x1A45324F | a4;
1060    x21 = x20 ^ x962CAC53;
1061    *out3 ^= x21;
1062
1063    x0A451047 = x1A45324F & ~x118822B8;
1064    xBBDFDD7B = x33CCCC33 | xA9DF596A;
1065    xB19ACD3C = x0A451047 ^ xBBDFDD7B;
1066    x30 = x003311BB | a4;
1067    x31 = x30 ^ xB19ACD3C;
1068    *out4 ^= x31;
1069}
1070
1071DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1072{
1073    u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4;
1074    u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00;
1075    u32 x00005EF4, x00FF5EFF, x00555455, x3C699796;
1076    u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F;
1077    u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8;
1078    u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A;
1079    u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356;
1080    u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B;
1081    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1082
1083    x44444444 = a1 & ~a2;
1084    x0F0FF0F0 = a3 ^ a6;
1085    x4F4FF4F4 = x44444444 | x0F0FF0F0;
1086    x00FFFF00 = a4 ^ a6;
1087    x00AAAA00 = x00FFFF00 & ~a1;
1088    x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00;
1089
1090    x3C3CC3C3 = a2 ^ x0F0FF0F0;
1091    x3C3C0000 = x3C3CC3C3 & ~a6;
1092    x7373F4F4 = x4F4FF4F4 ^ x3C3C0000;
1093    x0C840A00 = x4FE55EF4 & ~x7373F4F4;
1094
1095    x00005EF4 = a6 & x4FE55EF4;
1096    x00FF5EFF = a4 | x00005EF4;
1097    x00555455 = a1 & x00FF5EFF;
1098    x3C699796 = x3C3CC3C3 ^ x00555455;
1099    x30 = x4FE55EF4 & ~a5;
1100    x31 = x30 ^ x3C699796;
1101    *out4 ^= x31;
1102
1103    x000FF000 = x0F0FF0F0 & x00FFFF00;
1104    x55AA55AA = a1 ^ a4;
1105    x26D9A15E = x7373F4F4 ^ x55AA55AA;
1106    x2FDFAF5F = a3 | x26D9A15E;
1107    x2FD00F5F = x2FDFAF5F & ~x000FF000;
1108
1109    x55AAFFAA = x00AAAA00 | x55AA55AA;
1110    x28410014 = x3C699796 & ~x55AAFFAA;
1111    x000000FF = a4 & a6;
1112    x000000CC = x000000FF & ~a2;
1113    x284100D8 = x28410014 ^ x000000CC;
1114
1115    x204100D0 = x7373F4F4 & x284100D8;
1116    x3C3CC3FF = x3C3CC3C3 | x000000FF;
1117    x1C3CC32F = x3C3CC3FF & ~x204100D0;
1118    x4969967A = a1 ^ x1C3CC32F;
1119    x10 = x2FD00F5F & a5;
1120    x11 = x10 ^ x4969967A;
1121    *out2 ^= x11;
1122
1123    x4CC44CC4 = x4FE55EF4 & ~a2;
1124    x40C040C0 = x4CC44CC4 & ~a3;
1125    xC3C33C3C = ~x3C3CC3C3;
1126    x9669C396 = x55AAFFAA ^ xC3C33C3C;
1127    xD6A98356 = x40C040C0 ^ x9669C396;
1128    x00 = a5 & ~x0C840A00;
1129    x01 = x00 ^ xD6A98356;
1130    *out1 ^= x01;
1131
1132    xD6E9C3D6 = x40C040C0 | x9669C396;
1133    x4CEEEEC4 = x00AAAA00 | x4CC44CC4;
1134    x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4;
1135    x001A000B = a4 & ~x4FE55EF4;
1136    x9A1F2D1B = x9A072D12 | x001A000B;
1137    x20 = a5 & ~x284100D8;
1138    x21 = x20 ^ x9A1F2D1B;
1139    *out3 ^= x21;
1140}
1141
1142DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1143{
1144    u32 x5A5A5A5A, x0F0FF0F0;
1145    u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F,
1146        x52FBCA0F, x61C8F93C;
1147    u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6;
1148    u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1;
1149    u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6;
1150    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1151
1152    x5A5A5A5A = a1 ^ a3;
1153    x0F0FF0F0 = a3 ^ a5;
1154    x33FF33FF = a2 | a4;
1155    x33FFCC00 = a5 ^ x33FF33FF;
1156    x0C0030F0 = x0F0FF0F0 & ~x33FFCC00;
1157    x0C0CC0C0 = x0F0FF0F0 & ~a2;
1158    x0CF3C03F = a4 ^ x0C0CC0C0;
1159    x5EFBDA7F = x5A5A5A5A | x0CF3C03F;
1160    x52FBCA0F = x5EFBDA7F & ~x0C0030F0;
1161    x61C8F93C = a2 ^ x52FBCA0F;
1162
1163    x00C0C03C = x0CF3C03F & x61C8F93C;
1164    x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C;
1165    x3B92A366 = x5A5A5A5A ^ x61C8F93C;
1166    x30908326 = x3B92A366 & ~x0F0F30C0;
1167    x3C90B3D6 = x0C0030F0 ^ x30908326;
1168
1169    x33CC33CC = a2 ^ a4;
1170    x0C0CFFFF = a5 | x0C0CC0C0;
1171    x379E5C99 = x3B92A366 ^ x0C0CFFFF;
1172    x04124C11 = x379E5C99 & ~x33CC33CC;
1173    x56E9861E = x52FBCA0F ^ x04124C11;
1174    x00 = a6 & ~x3C90B3D6;
1175    x01 = x00 ^ x56E9861E;
1176    *out1 ^= x01;
1177
1178    xA91679E1 = ~x56E9861E;
1179    x10 = x3C90B3D6 & ~a6;
1180    x11 = x10 ^ xA91679E1;
1181    *out2 ^= x11;
1182
1183    x9586CA37 = x3C90B3D6 ^ xA91679E1;
1184    x8402C833 = x9586CA37 & ~x33CC33CC;
1185    x84C2C83F = x00C0C03C | x8402C833;
1186    xB35C94A6 = x379E5C99 ^ x84C2C83F;
1187    x20 = x61C8F93C | a6;
1188    x21 = x20 ^ xB35C94A6;
1189    *out3 ^= x21;
1190
1191    x30 = a6 & x61C8F93C;
1192    x31 = x30 ^ xB35C94A6;
1193    *out4 ^= x31;
1194}
1195
1196DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1197{
1198    u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F;
1199    u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B;
1200    u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7;
1201    u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF;
1202    u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A;
1203    u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2;
1204    u32 x22222222, x16BCEE97, x0F080B04, x19B4E593;
1205    u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D;
1206    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1207
1208    x77777777 = a1 | a3;
1209    x77770000 = x77777777 & ~a6;
1210    x22225555 = a1 ^ x77770000;
1211    x11116666 = a3 ^ x22225555;
1212    x1F1F6F6F = a4 | x11116666;
1213
1214    x70700000 = x77770000 & ~a4;
1215    x43433333 = a3 ^ x70700000;
1216    x00430033 = a5 & x43433333;
1217    x55557777 = a1 | x11116666;
1218    x55167744 = x00430033 ^ x55557777;
1219    x5A19784B = a4 ^ x55167744;
1220
1221    x5A1987B4 = a6 ^ x5A19784B;
1222    x7A3BD7F5 = x22225555 | x5A1987B4;
1223    x003B00F5 = a5 & x7A3BD7F5;
1224    x221955A0 = x22225555 ^ x003B00F5;
1225    x05050707 = a4 & x55557777;
1226    x271C52A7 = x221955A0 ^ x05050707;
1227
1228    x2A2A82A0 = x7A3BD7F5 & ~a1;
1229    x6969B193 = x43433333 ^ x2A2A82A0;
1230    x1FE06F90 = a5 ^ x1F1F6F6F;
1231    x16804E00 = x1FE06F90 & ~x6969B193;
1232    xE97FB1FF = ~x16804E00;
1233    x20 = xE97FB1FF & ~a2;
1234    x21 = x20 ^ x5A19784B;
1235    *out3 ^= x21;
1236
1237    x43403302 = x43433333 & ~x003B00F5;
1238    x35CAED30 = x2A2A82A0 ^ x1FE06F90;
1239    x37DEFFB7 = x271C52A7 | x35CAED30;
1240    x349ECCB5 = x37DEFFB7 & ~x43403302;
1241    x0B01234A = x1F1F6F6F & ~x349ECCB5;
1242
1243    x101884B4 = x5A1987B4 & x349ECCB5;
1244    x0FF8EB24 = x1FE06F90 ^ x101884B4;
1245    x41413333 = x43433333 & x55557777;
1246    x4FF9FB37 = x0FF8EB24 | x41413333;
1247    x4FC2FBC2 = x003B00F5 ^ x4FF9FB37;
1248    x30 = x4FC2FBC2 & a2;
1249    x31 = x30 ^ x271C52A7;
1250    *out4 ^= x31;
1251
1252    x22222222 = a1 ^ x77777777;
1253    x16BCEE97 = x349ECCB5 ^ x22222222;
1254    x0F080B04 = a4 & x0FF8EB24;
1255    x19B4E593 = x16BCEE97 ^ x0F080B04;
1256    x00 = x0B01234A | a2;
1257    x01 = x00 ^ x19B4E593;
1258    *out1 ^= x01;
1259
1260    x5C5C5C5C = x1F1F6F6F ^ x43433333;
1261    x4448184C = x5C5C5C5C & ~x19B4E593;
1262    x2DDABE71 = x22225555 ^ x0FF8EB24;
1263    x6992A63D = x4448184C ^ x2DDABE71;
1264    x10 = x1F1F6F6F & a2;
1265    x11 = x10 ^ x6992A63D;
1266    *out2 ^= x11;
1267}
1268
1269DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1270{
1271    u32 x33CC33CC;
1272    u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099;
1273    u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6;
1274    u32 x09030C06, x09030000, x336622FF, x3A6522FF;
1275    u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD;
1276    u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B;
1277    u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479;
1278    u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5;
1279    u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67;
1280    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1281
1282    x33CC33CC = a2 ^ a5;
1283
1284    x3333FFFF = a2 | a6;
1285    x11115555 = a1 & x3333FFFF;
1286    x22DD6699 = x33CC33CC ^ x11115555;
1287    x22DD9966 = a6 ^ x22DD6699;
1288    x00220099 = a5 & ~x22DD9966;
1289
1290    x00551144 = a1 & x22DD9966;
1291    x33662277 = a2 ^ x00551144;
1292    x5A5A5A5A = a1 ^ a3;
1293    x7B7E7A7F = x33662277 | x5A5A5A5A;
1294    x59A31CE6 = x22DD6699 ^ x7B7E7A7F;
1295
1296    x09030C06 = a3 & x59A31CE6;
1297    x09030000 = x09030C06 & ~a6;
1298    x336622FF = x00220099 | x33662277;
1299    x3A6522FF = x09030000 ^ x336622FF;
1300    x30 = x3A6522FF & a4;
1301    x31 = x30 ^ x59A31CE6;
1302    *out4 ^= x31;
1303
1304    x484D494C = a2 ^ x7B7E7A7F;
1305    x0000B6B3 = a6 & ~x484D494C;
1306    x0F0FB9BC = a3 ^ x0000B6B3;
1307    x00FC00F9 = a5 & ~x09030C06;
1308    x0FFFB9FD = x0F0FB9BC | x00FC00F9;
1309
1310    x5DF75DF7 = a1 | x59A31CE6;
1311    x116600F7 = x336622FF & x5DF75DF7;
1312    x1E69B94B = x0F0FB9BC ^ x116600F7;
1313    x1668B94B = x1E69B94B & ~x09030000;
1314    x20 = x00220099 | a4;
1315    x21 = x20 ^ x1668B94B;
1316    *out3 ^= x21;
1317
1318    x7B7B7B7B = a2 | x5A5A5A5A;
1319    x411E5984 = x3A6522FF ^ x7B7B7B7B;
1320    x1FFFFDFD = x11115555 | x0FFFB9FD;
1321    x5EE1A479 = x411E5984 ^ x1FFFFDFD;
1322
1323    x3CB4DFD2 = x22DD6699 ^ x1E69B94B;
1324    x004B002D = a5 & ~x3CB4DFD2;
1325    xB7B2B6B3 = ~x484D494C;
1326    xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3;
1327    xCC82CDE5 = x004B002D ^ xCCC9CDC8;
1328    x10 = xCC82CDE5 & ~a4;
1329    x11 = x10 ^ x5EE1A479;
1330    *out2 ^= x11;
1331
1332    x0055EEBB = a6 ^ x00551144;
1333    x5A5AECE9 = a1 ^ x0F0FB9BC;
1334    x0050ECA9 = x0055EEBB & x5A5AECE9;
1335    xC5CAC1CE = x09030C06 ^ xCCC9CDC8;
1336    xC59A2D67 = x0050ECA9 ^ xC5CAC1CE;
1337    x00 = x0FFFB9FD & ~a4;
1338    x01 = x00 ^ xC59A2D67;
1339    *out1 ^= x01;
1340}
1341
1342DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1343{
1344    u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841;
1345    u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78;
1346    u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D;
1347    u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B;
1348    u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB;
1349    u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867;
1350    u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD;
1351    u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A;
1352    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1353
1354    x0FF00FF0 = a4 ^ a5;
1355    x3CC33CC3 = a3 ^ x0FF00FF0;
1356    x00003CC3 = a6 & x3CC33CC3;
1357    x0F000F00 = a4 & x0FF00FF0;
1358    x5A555A55 = a2 ^ x0F000F00;
1359    x00001841 = x00003CC3 & x5A555A55;
1360
1361    x00000F00 = a6 & x0F000F00;
1362    x33333C33 = a3 ^ x00000F00;
1363    x7B777E77 = x5A555A55 | x33333C33;
1364    x0FF0F00F = a6 ^ x0FF00FF0;
1365    x74878E78 = x7B777E77 ^ x0FF0F00F;
1366    x30 = a1 & ~x00001841;
1367    x31 = x30 ^ x74878E78;
1368    *out4 ^= x31;
1369
1370    x003C003C = a5 & ~x3CC33CC3;
1371    x5A7D5A7D = x5A555A55 | x003C003C;
1372    x333300F0 = x00003CC3 ^ x33333C33;
1373    x694E5A8D = x5A7D5A7D ^ x333300F0;
1374
1375    x0FF0CCCC = x00003CC3 ^ x0FF0F00F;
1376    x000F0303 = a4 & ~x0FF0CCCC;
1377    x5A505854 = x5A555A55 & ~x000F0303;
1378    x33CC000F = a5 ^ x333300F0;
1379    x699C585B = x5A505854 ^ x33CC000F;
1380
1381    x7F878F78 = x0F000F00 | x74878E78;
1382    x21101013 = a3 & x699C585B;
1383    x7F979F7B = x7F878F78 | x21101013;
1384    x30030CC0 = x3CC33CC3 & ~x0FF0F00F;
1385    x4F9493BB = x7F979F7B ^ x30030CC0;
1386    x00 = x4F9493BB & ~a1;
1387    x01 = x00 ^ x694E5A8D;
1388    *out1 ^= x01;
1389
1390    x6F9CDBFB = x699C585B | x4F9493BB;
1391    x0000DBFB = a6 & x6F9CDBFB;
1392    x00005151 = a2 & x0000DBFB;
1393    x26DAC936 = x694E5A8D ^ x4F9493BB;
1394    x26DA9867 = x00005151 ^ x26DAC936;
1395
1396    x27DA9877 = x21101013 | x26DA9867;
1397    x27DA438C = x0000DBFB ^ x27DA9877;
1398    x2625C9C9 = a5 ^ x26DAC936;
1399    x27FFCBCD = x27DA438C | x2625C9C9;
1400    x20 = x27FFCBCD & a1;
1401    x21 = x20 ^ x699C585B;
1402    *out3 ^= x21;
1403
1404    x27FF1036 = x0000DBFB ^ x27FFCBCD;
1405    x27FF103E = x003C003C | x27FF1036;
1406    xB06B6C44 = ~x4F9493BB;
1407    x97947C7A = x27FF103E ^ xB06B6C44;
1408    x10 = x97947C7A & ~a1;
1409    x11 = x10 ^ x26DA9867;
1410    *out2 ^= x11;
1411}
1412
1413DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1414{
1415    u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
1416    u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
1417    u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
1418    u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
1419    u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
1420    u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
1421    u32 xF700A600, x61008000, x03B7856B, x62B7056B;
1422    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1423
1424    x0C0C0C0C = a3 & ~a2;
1425    x0000F0F0 = a5 & ~a3;
1426    x00FFF00F = a4 ^ x0000F0F0;
1427    x00555005 = a1 & x00FFF00F;
1428    x00515001 = x00555005 & ~x0C0C0C0C;
1429
1430    x33000330 = a2 & ~x00FFF00F;
1431    x77555775 = a1 | x33000330;
1432    x30303030 = a2 & ~a3;
1433    x3030CFCF = a5 ^ x30303030;
1434    x30104745 = x77555775 & x3030CFCF;
1435    x30555745 = x00555005 | x30104745;
1436
1437    xFF000FF0 = ~x00FFF00F;
1438    xCF1048B5 = x30104745 ^ xFF000FF0;
1439    x080A080A = a3 & ~x77555775;
1440    xC71A40BF = xCF1048B5 ^ x080A080A;
1441    xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
1442    x10 = x00515001 | a6;
1443    x11 = x10 ^ xCB164CB3;
1444    *out2 ^= x11;
1445
1446    x9E4319E6 = a1 ^ xCB164CB3;
1447    x000019E6 = a5 & x9E4319E6;
1448    xF429738C = a2 ^ xC71A40BF;
1449    xF4296A6A = x000019E6 ^ xF429738C;
1450    xC729695A = x33000330 ^ xF4296A6A;
1451
1452    xC47C3D2F = x30555745 ^ xF4296A6A;
1453    xF77F3F3F = a2 | xC47C3D2F;
1454    x9E43E619 = a5 ^ x9E4319E6;
1455    x693CD926 = xF77F3F3F ^ x9E43E619;
1456    x20 = x30555745 & a6;
1457    x21 = x20 ^ x693CD926;
1458    *out3 ^= x21;
1459
1460    xF719A695 = x3030CFCF ^ xC729695A;
1461    xF4FF73FF = a4 | xF429738C;
1462    x03E6D56A = xF719A695 ^ xF4FF73FF;
1463    x56B3803F = a1 ^ x03E6D56A;
1464    x30 = x56B3803F & a6;
1465    x31 = x30 ^ xC729695A;
1466    *out4 ^= x31;
1467
1468    xF700A600 = xF719A695 & ~a4;
1469    x61008000 = x693CD926 & xF700A600;
1470    x03B7856B = x00515001 ^ x03E6D56A;
1471    x62B7056B = x61008000 ^ x03B7856B;
1472    x00 = x62B7056B | a6;
1473    x01 = x00 ^ xC729695A;
1474    *out1 ^= x01;
1475}
1476
1477#endif
1478
1479#define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; }
1480
1481#define DATASWAP   \
1482  SWAP (D00, D32); \
1483  SWAP (D01, D33); \
1484  SWAP (D02, D34); \
1485  SWAP (D03, D35); \
1486  SWAP (D04, D36); \
1487  SWAP (D05, D37); \
1488  SWAP (D06, D38); \
1489  SWAP (D07, D39); \
1490  SWAP (D08, D40); \
1491  SWAP (D09, D41); \
1492  SWAP (D10, D42); \
1493  SWAP (D11, D43); \
1494  SWAP (D12, D44); \
1495  SWAP (D13, D45); \
1496  SWAP (D14, D46); \
1497  SWAP (D15, D47); \
1498  SWAP (D16, D48); \
1499  SWAP (D17, D49); \
1500  SWAP (D18, D50); \
1501  SWAP (D19, D51); \
1502  SWAP (D20, D52); \
1503  SWAP (D21, D53); \
1504  SWAP (D22, D54); \
1505  SWAP (D23, D55); \
1506  SWAP (D24, D56); \
1507  SWAP (D25, D57); \
1508  SWAP (D26, D58); \
1509  SWAP (D27, D59); \
1510  SWAP (D28, D60); \
1511  SWAP (D29, D61); \
1512  SWAP (D30, D62); \
1513  SWAP (D31, D63);
1514
1515#define KEYSET00 { k00 = K08; k01 = K44; k02 = K29; k03 = K52; k04 = K42; k05 = K14; k06 = K28; k07 = K49; k08 = K01; k09 = K07; k10 = K16; k11 = K36; k12 = K02; k13 = K30; k14 = K22; k15 = K21; k16 = K38; k17 = K50; k18 = K51; k19 = K00; k20 = K31; k21 = K23; k22 = K15; k23 = K35; k24 = K19; k25 = K24; k26 = K34; k27 = K47; k28 = K32; k29 = K03; k30 = K41; k31 = K26; k32 = K04; k33 = K46; k34 = K20; k35 = K25; k36 = K53; k37 = K18; k38 = K33; k39 = K55; k40 = K13; k41 = K17; k42 = K39; k43 = K12; k44 = K11; k45 = K54; k46 = K48; k47 = K27; }
1516#define KEYSET10 { k00 = K49; k01 = K28; k02 = K45; k03 = K36; k04 = K01; k05 = K30; k06 = K44; k07 = K08; k08 = K42; k09 = K23; k10 = K00; k11 = K52; k12 = K43; k13 = K14; k14 = K38; k15 = K37; k16 = K22; k17 = K09; k18 = K35; k19 = K16; k20 = K15; k21 = K07; k22 = K31; k23 = K51; k24 = K03; k25 = K40; k26 = K46; k27 = K04; k28 = K20; k29 = K19; k30 = K53; k31 = K10; k32 = K47; k33 = K34; k34 = K32; k35 = K13; k36 = K41; k37 = K06; k38 = K17; k39 = K12; k40 = K25; k41 = K33; k42 = K27; k43 = K55; k44 = K54; k45 = K11; k46 = K05; k47 = K39; }
1517#define KEYSET01 { k00 = K01; k01 = K37; k02 = K22; k03 = K45; k04 = K35; k05 = K07; k06 = K21; k07 = K42; k08 = K51; k09 = K00; k10 = K09; k11 = K29; k12 = K52; k13 = K23; k14 = K15; k15 = K14; k16 = K31; k17 = K43; k18 = K44; k19 = K50; k20 = K49; k21 = K16; k22 = K08; k23 = K28; k24 = K12; k25 = K17; k26 = K27; k27 = K40; k28 = K25; k29 = K55; k30 = K34; k31 = K19; k32 = K24; k33 = K39; k34 = K13; k35 = K18; k36 = K46; k37 = K11; k38 = K26; k39 = K48; k40 = K06; k41 = K10; k42 = K32; k43 = K05; k44 = K04; k45 = K47; k46 = K41; k47 = K20; }
1518#define KEYSET11 { k00 = K35; k01 = K14; k02 = K31; k03 = K22; k04 = K44; k05 = K16; k06 = K30; k07 = K51; k08 = K28; k09 = K09; k10 = K43; k11 = K38; k12 = K29; k13 = K00; k14 = K49; k15 = K23; k16 = K08; k17 = K52; k18 = K21; k19 = K02; k20 = K01; k21 = K50; k22 = K42; k23 = K37; k24 = K48; k25 = K26; k26 = K32; k27 = K17; k28 = K06; k29 = K05; k30 = K39; k31 = K55; k32 = K33; k33 = K20; k34 = K18; k35 = K54; k36 = K27; k37 = K47; k38 = K03; k39 = K53; k40 = K11; k41 = K19; k42 = K13; k43 = K41; k44 = K40; k45 = K24; k46 = K46; k47 = K25; }
1519#define KEYSET02 { k00 = K44; k01 = K23; k02 = K08; k03 = K31; k04 = K21; k05 = K50; k06 = K07; k07 = K28; k08 = K37; k09 = K43; k10 = K52; k11 = K15; k12 = K38; k13 = K09; k14 = K01; k15 = K00; k16 = K42; k17 = K29; k18 = K30; k19 = K36; k20 = K35; k21 = K02; k22 = K51; k23 = K14; k24 = K53; k25 = K03; k26 = K13; k27 = K26; k28 = K11; k29 = K41; k30 = K20; k31 = K05; k32 = K10; k33 = K25; k34 = K54; k35 = K04; k36 = K32; k37 = K24; k38 = K12; k39 = K34; k40 = K47; k41 = K55; k42 = K18; k43 = K46; k44 = K17; k45 = K33; k46 = K27; k47 = K06; }
1520#define KEYSET12 { k00 = K21; k01 = K00; k02 = K42; k03 = K08; k04 = K30; k05 = K02; k06 = K16; k07 = K37; k08 = K14; k09 = K52; k10 = K29; k11 = K49; k12 = K15; k13 = K43; k14 = K35; k15 = K09; k16 = K51; k17 = K38; k18 = K07; k19 = K45; k20 = K44; k21 = K36; k22 = K28; k23 = K23; k24 = K34; k25 = K12; k26 = K18; k27 = K03; k28 = K47; k29 = K46; k30 = K25; k31 = K41; k32 = K19; k33 = K06; k34 = K04; k35 = K40; k36 = K13; k37 = K33; k38 = K48; k39 = K39; k40 = K24; k41 = K05; k42 = K54; k43 = K27; k44 = K26; k45 = K10; k46 = K32; k47 = K11; }
1521#define KEYSET03 { k00 = K30; k01 = K09; k02 = K51; k03 = K42; k04 = K07; k05 = K36; k06 = K50; k07 = K14; k08 = K23; k09 = K29; k10 = K38; k11 = K01; k12 = K49; k13 = K52; k14 = K44; k15 = K43; k16 = K28; k17 = K15; k18 = K16; k19 = K22; k20 = K21; k21 = K45; k22 = K37; k23 = K00; k24 = K39; k25 = K48; k26 = K54; k27 = K12; k28 = K24; k29 = K27; k30 = K06; k31 = K46; k32 = K55; k33 = K11; k34 = K40; k35 = K17; k36 = K18; k37 = K10; k38 = K53; k39 = K20; k40 = K33; k41 = K41; k42 = K04; k43 = K32; k44 = K03; k45 = K19; k46 = K13; k47 = K47; }
1522#define KEYSET13 { k00 = K07; k01 = K43; k02 = K28; k03 = K51; k04 = K16; k05 = K45; k06 = K02; k07 = K23; k08 = K00; k09 = K38; k10 = K15; k11 = K35; k12 = K01; k13 = K29; k14 = K21; k15 = K52; k16 = K37; k17 = K49; k18 = K50; k19 = K31; k20 = K30; k21 = K22; k22 = K14; k23 = K09; k24 = K20; k25 = K53; k26 = K04; k27 = K48; k28 = K33; k29 = K32; k30 = K11; k31 = K27; k32 = K05; k33 = K47; k34 = K17; k35 = K26; k36 = K54; k37 = K19; k38 = K34; k39 = K25; k40 = K10; k41 = K46; k42 = K40; k43 = K13; k44 = K12; k45 = K55; k46 = K18; k47 = K24; }
1523#define KEYSET04 { k00 = K16; k01 = K52; k02 = K37; k03 = K28; k04 = K50; k05 = K22; k06 = K36; k07 = K00; k08 = K09; k09 = K15; k10 = K49; k11 = K44; k12 = K35; k13 = K38; k14 = K30; k15 = K29; k16 = K14; k17 = K01; k18 = K02; k19 = K08; k20 = K07; k21 = K31; k22 = K23; k23 = K43; k24 = K25; k25 = K34; k26 = K40; k27 = K53; k28 = K10; k29 = K13; k30 = K47; k31 = K32; k32 = K41; k33 = K24; k34 = K26; k35 = K03; k36 = K04; k37 = K55; k38 = K39; k39 = K06; k40 = K19; k41 = K27; k42 = K17; k43 = K18; k44 = K48; k45 = K05; k46 = K54; k47 = K33; }
1524#define KEYSET14 { k00 = K50; k01 = K29; k02 = K14; k03 = K37; k04 = K02; k05 = K31; k06 = K45; k07 = K09; k08 = K43; k09 = K49; k10 = K01; k11 = K21; k12 = K44; k13 = K15; k14 = K07; k15 = K38; k16 = K23; k17 = K35; k18 = K36; k19 = K42; k20 = K16; k21 = K08; k22 = K00; k23 = K52; k24 = K06; k25 = K39; k26 = K17; k27 = K34; k28 = K19; k29 = K18; k30 = K24; k31 = K13; k32 = K46; k33 = K33; k34 = K03; k35 = K12; k36 = K40; k37 = K05; k38 = K20; k39 = K11; k40 = K55; k41 = K32; k42 = K26; k43 = K54; k44 = K53; k45 = K41; k46 = K04; k47 = K10; }
1525#define KEYSET05 { k00 = K02; k01 = K38; k02 = K23; k03 = K14; k04 = K36; k05 = K08; k06 = K22; k07 = K43; k08 = K52; k09 = K01; k10 = K35; k11 = K30; k12 = K21; k13 = K49; k14 = K16; k15 = K15; k16 = K00; k17 = K44; k18 = K45; k19 = K51; k20 = K50; k21 = K42; k22 = K09; k23 = K29; k24 = K11; k25 = K20; k26 = K26; k27 = K39; k28 = K55; k29 = K54; k30 = K33; k31 = K18; k32 = K27; k33 = K10; k34 = K12; k35 = K48; k36 = K17; k37 = K41; k38 = K25; k39 = K47; k40 = K05; k41 = K13; k42 = K03; k43 = K04; k44 = K34; k45 = K46; k46 = K40; k47 = K19; }
1526#define KEYSET15 { k00 = K36; k01 = K15; k02 = K00; k03 = K23; k04 = K45; k05 = K42; k06 = K31; k07 = K52; k08 = K29; k09 = K35; k10 = K44; k11 = K07; k12 = K30; k13 = K01; k14 = K50; k15 = K49; k16 = K09; k17 = K21; k18 = K22; k19 = K28; k20 = K02; k21 = K51; k22 = K43; k23 = K38; k24 = K47; k25 = K25; k26 = K03; k27 = K20; k28 = K05; k29 = K04; k30 = K10; k31 = K54; k32 = K32; k33 = K19; k34 = K48; k35 = K53; k36 = K26; k37 = K46; k38 = K06; k39 = K24; k40 = K41; k41 = K18; k42 = K12; k43 = K40; k44 = K39; k45 = K27; k46 = K17; k47 = K55; }
1527#define KEYSET06 { k00 = K45; k01 = K49; k02 = K09; k03 = K00; k04 = K22; k05 = K51; k06 = K08; k07 = K29; k08 = K38; k09 = K44; k10 = K21; k11 = K16; k12 = K07; k13 = K35; k14 = K02; k15 = K01; k16 = K43; k17 = K30; k18 = K31; k19 = K37; k20 = K36; k21 = K28; k22 = K52; k23 = K15; k24 = K24; k25 = K06; k26 = K12; k27 = K25; k28 = K41; k29 = K40; k30 = K19; k31 = K04; k32 = K13; k33 = K55; k34 = K53; k35 = K34; k36 = K03; k37 = K27; k38 = K11; k39 = K33; k40 = K46; k41 = K54; k42 = K48; k43 = K17; k44 = K20; k45 = K32; k46 = K26; k47 = K05; }
1528#define KEYSET16 { k00 = K22; k01 = K01; k02 = K43; k03 = K09; k04 = K31; k05 = K28; k06 = K42; k07 = K38; k08 = K15; k09 = K21; k10 = K30; k11 = K50; k12 = K16; k13 = K44; k14 = K36; k15 = K35; k16 = K52; k17 = K07; k18 = K08; k19 = K14; k20 = K45; k21 = K37; k22 = K29; k23 = K49; k24 = K33; k25 = K11; k26 = K48; k27 = K06; k28 = K46; k29 = K17; k30 = K55; k31 = K40; k32 = K18; k33 = K05; k34 = K34; k35 = K39; k36 = K12; k37 = K32; k38 = K47; k39 = K10; k40 = K27; k41 = K04; k42 = K53; k43 = K26; k44 = K25; k45 = K13; k46 = K03; k47 = K41; }
1529#define KEYSET07 { k00 = K31; k01 = K35; k02 = K52; k03 = K43; k04 = K08; k05 = K37; k06 = K51; k07 = K15; k08 = K49; k09 = K30; k10 = K07; k11 = K02; k12 = K50; k13 = K21; k14 = K45; k15 = K44; k16 = K29; k17 = K16; k18 = K42; k19 = K23; k20 = K22; k21 = K14; k22 = K38; k23 = K01; k24 = K10; k25 = K47; k26 = K53; k27 = K11; k28 = K27; k29 = K26; k30 = K05; k31 = K17; k32 = K54; k33 = K41; k34 = K39; k35 = K20; k36 = K48; k37 = K13; k38 = K24; k39 = K19; k40 = K32; k41 = K40; k42 = K34; k43 = K03; k44 = K06; k45 = K18; k46 = K12; k47 = K46; }
1530#define KEYSET17 { k00 = K15; k01 = K51; k02 = K36; k03 = K02; k04 = K49; k05 = K21; k06 = K35; k07 = K31; k08 = K08; k09 = K14; k10 = K23; k11 = K43; k12 = K09; k13 = K37; k14 = K29; k15 = K28; k16 = K45; k17 = K00; k18 = K01; k19 = K07; k20 = K38; k21 = K30; k22 = K22; k23 = K42; k24 = K26; k25 = K04; k26 = K41; k27 = K54; k28 = K39; k29 = K10; k30 = K48; k31 = K33; k32 = K11; k33 = K53; k34 = K27; k35 = K32; k36 = K05; k37 = K25; k38 = K40; k39 = K03; k40 = K20; k41 = K24; k42 = K46; k43 = K19; k44 = K18; k45 = K06; k46 = K55; k47 = K34; }
1531
1532DECLSPEC void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63)
1533{
1534  KXX_DECL u32 k00, k01, k02, k03, k04, k05;
1535  KXX_DECL u32 k06, k07, k08, k09, k10, k11;
1536  KXX_DECL u32 k12, k13, k14, k15, k16, k17;
1537  KXX_DECL u32 k18, k19, k20, k21, k22, k23;
1538  KXX_DECL u32 k24, k25, k26, k27, k28, k29;
1539  KXX_DECL u32 k30, k31, k32, k33, k34, k35;
1540  KXX_DECL u32 k36, k37, k38, k39, k40, k41;
1541  KXX_DECL u32 k42, k43, k44, k45, k46, k47;
1542
1543  #ifdef _unroll
1544  #pragma unroll
1545  #endif
1546  for (u32 i = 0; i < 2; i++)
1547  {
1548    if (i) KEYSET10 else KEYSET00
1549
1550    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
1551    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
1552    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
1553    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
1554    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
1555    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
1556    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
1557    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);
1558
1559    if (i) KEYSET11 else KEYSET01
1560
1561    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
1562    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
1563    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
1564    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
1565    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
1566    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
1567    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
1568    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
1569
1570    if (i) KEYSET12 else KEYSET02
1571
1572    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
1573    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
1574    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
1575    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
1576    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
1577    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
1578    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
1579    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);
1580
1581    if (i) KEYSET13 else KEYSET03
1582
1583    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
1584    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
1585    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
1586    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
1587    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
1588    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
1589    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
1590    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
1591
1592    if (i) KEYSET14 else KEYSET04
1593
1594    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
1595    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
1596    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
1597    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
1598    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
1599    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
1600    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
1601    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);
1602
1603    if (i) KEYSET15 else KEYSET05
1604
1605    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
1606    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
1607    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
1608    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
1609    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
1610    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
1611    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
1612    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
1613
1614    if (i) KEYSET16 else KEYSET06
1615
1616    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
1617    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
1618    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
1619    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
1620    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
1621    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
1622    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
1623    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);
1624
1625    if (i) KEYSET17 else KEYSET07
1626
1627    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
1628    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
1629    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
1630    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
1631    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
1632    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
1633    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
1634    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
1635  }
1636}
1637
1638DECLSPEC void transpose32c (u32 *data)
1639{
1640  #define swap(x,y,j,m)               \
1641     t  = ((x) ^ ((y) >> (j))) & (m); \
1642    (x) = (x) ^ t;                    \
1643    (y) = (y) ^ (t << (j));
1644
1645  u32 t;
1646
1647  swap (data[ 0], data[16], 16, 0x0000ffff);
1648  swap (data[ 1], data[17], 16, 0x0000ffff);
1649  swap (data[ 2], data[18], 16, 0x0000ffff);
1650  swap (data[ 3], data[19], 16, 0x0000ffff);
1651  swap (data[ 4], data[20], 16, 0x0000ffff);
1652  swap (data[ 5], data[21], 16, 0x0000ffff);
1653  swap (data[ 6], data[22], 16, 0x0000ffff);
1654  swap (data[ 7], data[23], 16, 0x0000ffff);
1655  swap (data[ 8], data[24], 16, 0x0000ffff);
1656  swap (data[ 9], data[25], 16, 0x0000ffff);
1657  swap (data[10], data[26], 16, 0x0000ffff);
1658  swap (data[11], data[27], 16, 0x0000ffff);
1659  swap (data[12], data[28], 16, 0x0000ffff);
1660  swap (data[13], data[29], 16, 0x0000ffff);
1661  swap (data[14], data[30], 16, 0x0000ffff);
1662  swap (data[15], data[31], 16, 0x0000ffff);
1663  swap (data[ 0], data[ 8],  8, 0x00ff00ff);
1664  swap (data[ 1], data[ 9],  8, 0x00ff00ff);
1665  swap (data[ 2], data[10],  8, 0x00ff00ff);
1666  swap (data[ 3], data[11],  8, 0x00ff00ff);
1667  swap (data[ 4], data[12],  8, 0x00ff00ff);
1668  swap (data[ 5], data[13],  8, 0x00ff00ff);
1669  swap (data[ 6], data[14],  8, 0x00ff00ff);
1670  swap (data[ 7], data[15],  8, 0x00ff00ff);
1671  swap (data[ 0], data[ 4],  4, 0x0f0f0f0f);
1672  swap (data[ 1], data[ 5],  4, 0x0f0f0f0f);
1673  swap (data[ 2], data[ 6],  4, 0x0f0f0f0f);
1674  swap (data[ 3], data[ 7],  4, 0x0f0f0f0f);
1675  swap (data[ 0], data[ 2],  2, 0x33333333);
1676  swap (data[ 1], data[ 3],  2, 0x33333333);
1677  swap (data[ 0], data[ 1],  1, 0x55555555);
1678  swap (data[ 2], data[ 3],  1, 0x55555555);
1679  swap (data[ 4], data[ 6],  2, 0x33333333);
1680  swap (data[ 5], data[ 7],  2, 0x33333333);
1681  swap (data[ 4], data[ 5],  1, 0x55555555);
1682  swap (data[ 6], data[ 7],  1, 0x55555555);
1683  swap (data[ 8], data[12],  4, 0x0f0f0f0f);
1684  swap (data[ 9], data[13],  4, 0x0f0f0f0f);
1685  swap (data[10], data[14],  4, 0x0f0f0f0f);
1686  swap (data[11], data[15],  4, 0x0f0f0f0f);
1687  swap (data[ 8], data[10],  2, 0x33333333);
1688  swap (data[ 9], data[11],  2, 0x33333333);
1689  swap (data[ 8], data[ 9],  1, 0x55555555);
1690  swap (data[10], data[11],  1, 0x55555555);
1691  swap (data[12], data[14],  2, 0x33333333);
1692  swap (data[13], data[15],  2, 0x33333333);
1693  swap (data[12], data[13],  1, 0x55555555);
1694  swap (data[14], data[15],  1, 0x55555555);
1695  swap (data[16], data[24],  8, 0x00ff00ff);
1696  swap (data[17], data[25],  8, 0x00ff00ff);
1697  swap (data[18], data[26],  8, 0x00ff00ff);
1698  swap (data[19], data[27],  8, 0x00ff00ff);
1699  swap (data[20], data[28],  8, 0x00ff00ff);
1700  swap (data[21], data[29],  8, 0x00ff00ff);
1701  swap (data[22], data[30],  8, 0x00ff00ff);
1702  swap (data[23], data[31],  8, 0x00ff00ff);
1703  swap (data[16], data[20],  4, 0x0f0f0f0f);
1704  swap (data[17], data[21],  4, 0x0f0f0f0f);
1705  swap (data[18], data[22],  4, 0x0f0f0f0f);
1706  swap (data[19], data[23],  4, 0x0f0f0f0f);
1707  swap (data[16], data[18],  2, 0x33333333);
1708  swap (data[17], data[19],  2, 0x33333333);
1709  swap (data[16], data[17],  1, 0x55555555);
1710  swap (data[18], data[19],  1, 0x55555555);
1711  swap (data[20], data[22],  2, 0x33333333);
1712  swap (data[21], data[23],  2, 0x33333333);
1713  swap (data[20], data[21],  1, 0x55555555);
1714  swap (data[22], data[23],  1, 0x55555555);
1715  swap (data[24], data[28],  4, 0x0f0f0f0f);
1716  swap (data[25], data[29],  4, 0x0f0f0f0f);
1717  swap (data[26], data[30],  4, 0x0f0f0f0f);
1718  swap (data[27], data[31],  4, 0x0f0f0f0f);
1719  swap (data[24], data[26],  2, 0x33333333);
1720  swap (data[25], data[27],  2, 0x33333333);
1721  swap (data[24], data[25],  1, 0x55555555);
1722  swap (data[26], data[27],  1, 0x55555555);
1723  swap (data[28], data[30],  2, 0x33333333);
1724  swap (data[29], data[31],  2, 0x33333333);
1725  swap (data[28], data[29],  1, 0x55555555);
1726  swap (data[30], data[31],  1, 0x55555555);
1727}
1728
1729//
1730// transpose bitslice mod  : attention race conditions, need different buffers for *in and *out
1731//
1732
1733KERNEL_FQ void m03000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_b)
1734{
1735  const u64 gid = get_global_id (0);
1736
1737  const u32 block = gid / 32;
1738  const u32 slice = gid % 32;
1739
1740  const u32 w0 = mod[gid];
1741
1742  for (int i = 0; i < 32; i += 8)
1743  {
1744    hc_atomic_or (&words_buf_b[block].b[i + 0], (((w0 >> (i + 7)) & 1) << slice));
1745    hc_atomic_or (&words_buf_b[block].b[i + 1], (((w0 >> (i + 6)) & 1) << slice));
1746    hc_atomic_or (&words_buf_b[block].b[i + 2], (((w0 >> (i + 5)) & 1) << slice));
1747    hc_atomic_or (&words_buf_b[block].b[i + 3], (((w0 >> (i + 4)) & 1) << slice));
1748    hc_atomic_or (&words_buf_b[block].b[i + 4], (((w0 >> (i + 3)) & 1) << slice));
1749    hc_atomic_or (&words_buf_b[block].b[i + 5], (((w0 >> (i + 2)) & 1) << slice));
1750    hc_atomic_or (&words_buf_b[block].b[i + 6], (((w0 >> (i + 1)) & 1) << slice));
1751    hc_atomic_or (&words_buf_b[block].b[i + 7], (((w0 >> (i + 0)) & 1) << slice));
1752  }
1753}
1754
1755KERNEL_FQ void m03000_mxx (KERN_ATTR_BITSLICE ())
1756{
1757  /**
1758   * base
1759   */
1760
1761  const u64 gid = get_global_id (0);
1762  const u64 lid = get_local_id (0);
1763
1764  if (gid >= gid_max) return;
1765
1766  /**
1767   * base
1768   */
1769
1770  const u32 w0s = pws[gid].i[0];
1771  const u32 w1s = pws[gid].i[1];
1772
1773  #define K00 (((w0s >> ( 0 + 7)) & 1) ? -1 : 0)
1774  #define K01 (((w0s >> ( 0 + 6)) & 1) ? -1 : 0)
1775  #define K02 (((w0s >> ( 0 + 5)) & 1) ? -1 : 0)
1776  #define K03 (((w0s >> ( 0 + 4)) & 1) ? -1 : 0)
1777  #define K04 (((w0s >> ( 0 + 3)) & 1) ? -1 : 0)
1778  #define K05 (((w0s >> ( 0 + 2)) & 1) ? -1 : 0)
1779  #define K06 (((w0s >> ( 0 + 1)) & 1) ? -1 : 0)
1780  #define K07 (((w0s >> ( 0 + 0)) & 1) ? -1 : 0)
1781  #define K08 (((w0s >> ( 8 + 7)) & 1) ? -1 : 0)
1782  #define K09 (((w0s >> ( 8 + 6)) & 1) ? -1 : 0)
1783  #define K10 (((w0s >> ( 8 + 5)) & 1) ? -1 : 0)
1784  #define K11 (((w0s >> ( 8 + 4)) & 1) ? -1 : 0)
1785  #define K12 (((w0s >> ( 8 + 3)) & 1) ? -1 : 0)
1786  #define K13 (((w0s >> ( 8 + 2)) & 1) ? -1 : 0)
1787  #define K14 (((w0s >> ( 8 + 1)) & 1) ? -1 : 0)
1788  #define K15 (((w0s >> ( 8 + 0)) & 1) ? -1 : 0)
1789  #define K16 (((w0s >> (16 + 7)) & 1) ? -1 : 0)
1790  #define K17 (((w0s >> (16 + 6)) & 1) ? -1 : 0)
1791  #define K18 (((w0s >> (16 + 5)) & 1) ? -1 : 0)
1792  #define K19 (((w0s >> (16 + 4)) & 1) ? -1 : 0)
1793  #define K20 (((w0s >> (16 + 3)) & 1) ? -1 : 0)
1794  #define K21 (((w0s >> (16 + 2)) & 1) ? -1 : 0)
1795  #define K22 (((w0s >> (16 + 1)) & 1) ? -1 : 0)
1796  #define K23 (((w0s >> (16 + 0)) & 1) ? -1 : 0)
1797  #define K24 (((w0s >> (24 + 7)) & 1) ? -1 : 0)
1798  #define K25 (((w0s >> (24 + 6)) & 1) ? -1 : 0)
1799  #define K26 (((w0s >> (24 + 5)) & 1) ? -1 : 0)
1800  #define K27 (((w0s >> (24 + 4)) & 1) ? -1 : 0)
1801  #define K28 (((w0s >> (24 + 3)) & 1) ? -1 : 0)
1802  #define K29 (((w0s >> (24 + 2)) & 1) ? -1 : 0)
1803  #define K30 (((w0s >> (24 + 1)) & 1) ? -1 : 0)
1804  #define K31 (((w0s >> (24 + 0)) & 1) ? -1 : 0)
1805  #define K32 (((w1s >> ( 0 + 7)) & 1) ? -1 : 0)
1806  #define K33 (((w1s >> ( 0 + 6)) & 1) ? -1 : 0)
1807  #define K34 (((w1s >> ( 0 + 5)) & 1) ? -1 : 0)
1808  #define K35 (((w1s >> ( 0 + 4)) & 1) ? -1 : 0)
1809  #define K36 (((w1s >> ( 0 + 3)) & 1) ? -1 : 0)
1810  #define K37 (((w1s >> ( 0 + 2)) & 1) ? -1 : 0)
1811  #define K38 (((w1s >> ( 0 + 1)) & 1) ? -1 : 0)
1812  #define K39 (((w1s >> ( 0 + 0)) & 1) ? -1 : 0)
1813  #define K40 (((w1s >> ( 8 + 7)) & 1) ? -1 : 0)
1814  #define K41 (((w1s >> ( 8 + 6)) & 1) ? -1 : 0)
1815  #define K42 (((w1s >> ( 8 + 5)) & 1) ? -1 : 0)
1816  #define K43 (((w1s >> ( 8 + 4)) & 1) ? -1 : 0)
1817  #define K44 (((w1s >> ( 8 + 3)) & 1) ? -1 : 0)
1818  #define K45 (((w1s >> ( 8 + 2)) & 1) ? -1 : 0)
1819  #define K46 (((w1s >> ( 8 + 1)) & 1) ? -1 : 0)
1820  #define K47 (((w1s >> ( 8 + 0)) & 1) ? -1 : 0)
1821  #define K48 (((w1s >> (16 + 7)) & 1) ? -1 : 0)
1822  #define K49 (((w1s >> (16 + 6)) & 1) ? -1 : 0)
1823  #define K50 (((w1s >> (16 + 5)) & 1) ? -1 : 0)
1824  #define K51 (((w1s >> (16 + 4)) & 1) ? -1 : 0)
1825  #define K52 (((w1s >> (16 + 3)) & 1) ? -1 : 0)
1826  #define K53 (((w1s >> (16 + 2)) & 1) ? -1 : 0)
1827  #define K54 (((w1s >> (16 + 1)) & 1) ? -1 : 0)
1828  #define K55 (((w1s >> (16 + 0)) & 1) ? -1 : 0)
1829
1830  /**
1831   * inner loop
1832   */
1833
1834  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 32)
1835  {
1836    u32 k00 = K00;
1837    u32 k01 = K01;
1838    u32 k02 = K02;
1839    u32 k03 = K03;
1840    u32 k04 = K04;
1841    u32 k05 = K05;
1842    u32 k06 = K06;
1843    u32 k07 = K07;
1844    u32 k08 = K08;
1845    u32 k09 = K09;
1846    u32 k10 = K10;
1847    u32 k11 = K11;
1848    u32 k12 = K12;
1849    u32 k13 = K13;
1850    u32 k14 = K14;
1851    u32 k15 = K15;
1852    u32 k16 = K16;
1853    u32 k17 = K17;
1854    u32 k18 = K18;
1855    u32 k19 = K19;
1856    u32 k20 = K20;
1857    u32 k21 = K21;
1858    u32 k22 = K22;
1859    u32 k23 = K23;
1860    u32 k24 = K24;
1861    u32 k25 = K25;
1862    u32 k26 = K26;
1863    u32 k27 = K27;
1864    u32 k28 = K28;
1865    u32 k29 = K29;
1866    u32 k30 = K30;
1867    u32 k31 = K31;
1868
1869    const u32 pc_pos = il_pos / 32;
1870
1871    k00 |= words_buf_s[pc_pos].b[ 0];
1872    k01 |= words_buf_s[pc_pos].b[ 1];
1873    k02 |= words_buf_s[pc_pos].b[ 2];
1874    k03 |= words_buf_s[pc_pos].b[ 3];
1875    k04 |= words_buf_s[pc_pos].b[ 4];
1876    k05 |= words_buf_s[pc_pos].b[ 5];
1877    k06 |= words_buf_s[pc_pos].b[ 6];
1878    k07 |= words_buf_s[pc_pos].b[ 7];
1879    k08 |= words_buf_s[pc_pos].b[ 8];
1880    k09 |= words_buf_s[pc_pos].b[ 9];
1881    k10 |= words_buf_s[pc_pos].b[10];
1882    k11 |= words_buf_s[pc_pos].b[11];
1883    k12 |= words_buf_s[pc_pos].b[12];
1884    k13 |= words_buf_s[pc_pos].b[13];
1885    k14 |= words_buf_s[pc_pos].b[14];
1886    k15 |= words_buf_s[pc_pos].b[15];
1887    k16 |= words_buf_s[pc_pos].b[16];
1888    k17 |= words_buf_s[pc_pos].b[17];
1889    k18 |= words_buf_s[pc_pos].b[18];
1890    k19 |= words_buf_s[pc_pos].b[19];
1891    k20 |= words_buf_s[pc_pos].b[20];
1892    k21 |= words_buf_s[pc_pos].b[21];
1893    k22 |= words_buf_s[pc_pos].b[22];
1894    k23 |= words_buf_s[pc_pos].b[23];
1895    k24 |= words_buf_s[pc_pos].b[24];
1896    k25 |= words_buf_s[pc_pos].b[25];
1897    k26 |= words_buf_s[pc_pos].b[26];
1898    k27 |= words_buf_s[pc_pos].b[27];
1899    k28 |= words_buf_s[pc_pos].b[28];
1900    k29 |= words_buf_s[pc_pos].b[29];
1901    k30 |= words_buf_s[pc_pos].b[30];
1902    k31 |= words_buf_s[pc_pos].b[31];
1903
1904    // KGS!@#$% including IP
1905
1906    u32 D00 = 0;
1907    u32 D01 = 0;
1908    u32 D02 = 0;
1909    u32 D03 = 0xffffffff;
1910    u32 D04 = 0;
1911    u32 D05 = 0xffffffff;
1912    u32 D06 = 0xffffffff;
1913    u32 D07 = 0xffffffff;
1914    u32 D08 = 0;
1915    u32 D09 = 0;
1916    u32 D10 = 0;
1917    u32 D11 = 0;
1918    u32 D12 = 0;
1919    u32 D13 = 0xffffffff;
1920    u32 D14 = 0;
1921    u32 D15 = 0;
1922    u32 D16 = 0xffffffff;
1923    u32 D17 = 0xffffffff;
1924    u32 D18 = 0;
1925    u32 D19 = 0;
1926    u32 D20 = 0;
1927    u32 D21 = 0;
1928    u32 D22 = 0xffffffff;
1929    u32 D23 = 0;
1930    u32 D24 = 0xffffffff;
1931    u32 D25 = 0;
1932    u32 D26 = 0xffffffff;
1933    u32 D27 = 0;
1934    u32 D28 = 0xffffffff;
1935    u32 D29 = 0xffffffff;
1936    u32 D30 = 0xffffffff;
1937    u32 D31 = 0xffffffff;
1938    u32 D32 = 0;
1939    u32 D33 = 0;
1940    u32 D34 = 0;
1941    u32 D35 = 0;
1942    u32 D36 = 0;
1943    u32 D37 = 0;
1944    u32 D38 = 0;
1945    u32 D39 = 0;
1946    u32 D40 = 0xffffffff;
1947    u32 D41 = 0xffffffff;
1948    u32 D42 = 0xffffffff;
1949    u32 D43 = 0;
1950    u32 D44 = 0xffffffff;
1951    u32 D45 = 0;
1952    u32 D46 = 0;
1953    u32 D47 = 0;
1954    u32 D48 = 0;
1955    u32 D49 = 0;
1956    u32 D50 = 0;
1957    u32 D51 = 0;
1958    u32 D52 = 0;
1959    u32 D53 = 0;
1960    u32 D54 = 0;
1961    u32 D55 = 0xffffffff;
1962    u32 D56 = 0;
1963    u32 D57 = 0;
1964    u32 D58 = 0xffffffff;
1965    u32 D59 = 0;
1966    u32 D60 = 0;
1967    u32 D61 = 0xffffffff;
1968    u32 D62 = 0xffffffff;
1969    u32 D63 = 0xffffffff;
1970
1971    DES
1972    (
1973      k00, k01, k02, k03, k04, k05, k06,
1974      k07, k08, k09, k10, k11, k12, k13,
1975      k14, k15, k16, k17, k18, k19, k20,
1976      k21, k22, k23, k24, k25, k26, k27,
1977      k28, k29, k30, k31, K32, K33, K34,
1978      K35, K36, K37, K38, K39, K40, K41,
1979      K42, K43, K44, K45, K46, K47, K48,
1980      K49, K50, K51, K52, K53, K54, K55,
1981      &D00, &D01, &D02, &D03, &D04, &D05, &D06, &D07,
1982      &D08, &D09, &D10, &D11, &D12, &D13, &D14, &D15,
1983      &D16, &D17, &D18, &D19, &D20, &D21, &D22, &D23,
1984      &D24, &D25, &D26, &D27, &D28, &D29, &D30, &D31,
1985      &D32, &D33, &D34, &D35, &D36, &D37, &D38, &D39,
1986      &D40, &D41, &D42, &D43, &D44, &D45, &D46, &D47,
1987      &D48, &D49, &D50, &D51, &D52, &D53, &D54, &D55,
1988      &D56, &D57, &D58, &D59, &D60, &D61, &D62, &D63
1989    );
1990
1991    u32 out[64];
1992
1993    out[ 0] = D00;
1994    out[ 1] = D01;
1995    out[ 2] = D02;
1996    out[ 3] = D03;
1997    out[ 4] = D04;
1998    out[ 5] = D05;
1999    out[ 6] = D06;
2000    out[ 7] = D07;
2001    out[ 8] = D08;
2002    out[ 9] = D09;
2003    out[10] = D10;
2004    out[11] = D11;
2005    out[12] = D12;
2006    out[13] = D13;
2007    out[14] = D14;
2008    out[15] = D15;
2009    out[16] = D16;
2010    out[17] = D17;
2011    out[18] = D18;
2012    out[19] = D19;
2013    out[20] = D20;
2014    out[21] = D21;
2015    out[22] = D22;
2016    out[23] = D23;
2017    out[24] = D24;
2018    out[25] = D25;
2019    out[26] = D26;
2020    out[27] = D27;
2021    out[28] = D28;
2022    out[29] = D29;
2023    out[30] = D30;
2024    out[31] = D31;
2025    out[32] = D32;
2026    out[33] = D33;
2027    out[34] = D34;
2028    out[35] = D35;
2029    out[36] = D36;
2030    out[37] = D37;
2031    out[38] = D38;
2032    out[39] = D39;
2033    out[40] = D40;
2034    out[41] = D41;
2035    out[42] = D42;
2036    out[43] = D43;
2037    out[44] = D44;
2038    out[45] = D45;
2039    out[46] = D46;
2040    out[47] = D47;
2041    out[48] = D48;
2042    out[49] = D49;
2043    out[50] = D50;
2044    out[51] = D51;
2045    out[52] = D52;
2046    out[53] = D53;
2047    out[54] = D54;
2048    out[55] = D55;
2049    out[56] = D56;
2050    out[57] = D57;
2051    out[58] = D58;
2052    out[59] = D59;
2053    out[60] = D60;
2054    out[61] = D61;
2055    out[62] = D62;
2056    out[63] = D63;
2057
2058    if (digests_cnt < 16)
2059    {
2060      for (u32 d = 0; d < digests_cnt; d++)
2061      {
2062        const u32 final_hash_pos = DIGESTS_OFFSET + d;
2063
2064        if (hashes_shown[final_hash_pos]) continue;
2065
2066        u32 search[2];
2067
2068        search[0] = digests_buf[final_hash_pos].digest_buf[DGST_R0];
2069        search[1] = digests_buf[final_hash_pos].digest_buf[DGST_R1];
2070
2071        u32 tmpResult = 0;
2072
2073        #pragma unroll
2074        for (int i = 0; i < 32; i++)
2075        {
2076          const u32 b0 = -((search[0] >> i) & 1);
2077          const u32 b1 = -((search[1] >> i) & 1);
2078
2079          tmpResult |= out[ 0 + i] ^ b0;
2080          tmpResult |= out[32 + i] ^ b1;
2081        }
2082
2083        if (tmpResult == 0xffffffff) continue;
2084
2085        const u32 slice = ffz (tmpResult);
2086
2087        const u32 r0 = search[0];
2088        const u32 r1 = search[1];
2089        #ifdef KERNEL_STATIC
2090        const u32 r2 = 0;
2091        const u32 r3 = 0;
2092        #endif
2093
2094        #include COMPARE_M
2095      }
2096    }
2097    else
2098    {
2099      u32 out0[32];
2100      u32 out1[32];
2101
2102      #pragma unroll
2103      for (int i = 0; i < 32; i++)
2104      {
2105        out0[i] = out[ 0 + 31 - i];
2106        out1[i] = out[32 + 31 - i];
2107      }
2108
2109      transpose32c (out0);
2110      transpose32c (out1);
2111
2112      #pragma unroll
2113      for (int slice = 0; slice < 32; slice++)
2114      {
2115        const u32 r0 = out0[31 - slice];
2116        const u32 r1 = out1[31 - slice];
2117        #ifdef KERNEL_STATIC
2118        const u32 r2 = 0;
2119        const u32 r3 = 0;
2120        #endif
2121
2122        #include COMPARE_M
2123      }
2124    }
2125  }
2126}
2127
2128KERNEL_FQ void m03000_sxx (KERN_ATTR_BITSLICE ())
2129{
2130  /**
2131   * base
2132   */
2133
2134  const u64 gid = get_global_id (0);
2135  const u64 lid = get_local_id (0);
2136
2137  if (gid >= gid_max) return;
2138
2139  /**
2140   * digest
2141   */
2142
2143  const u32 s0 = digests_buf[0].digest_buf[0];
2144  const u32 s1 = digests_buf[0].digest_buf[1];
2145
2146  const u32 S00 = (((s0 >>  0) & 1) ? -1 : 0);
2147  const u32 S01 = (((s0 >>  1) & 1) ? -1 : 0);
2148  const u32 S02 = (((s0 >>  2) & 1) ? -1 : 0);
2149  const u32 S03 = (((s0 >>  3) & 1) ? -1 : 0);
2150  const u32 S04 = (((s0 >>  4) & 1) ? -1 : 0);
2151  const u32 S05 = (((s0 >>  5) & 1) ? -1 : 0);
2152  const u32 S06 = (((s0 >>  6) & 1) ? -1 : 0);
2153  const u32 S07 = (((s0 >>  7) & 1) ? -1 : 0);
2154  const u32 S08 = (((s0 >>  8) & 1) ? -1 : 0);
2155  const u32 S09 = (((s0 >>  9) & 1) ? -1 : 0);
2156  const u32 S10 = (((s0 >> 10) & 1) ? -1 : 0);
2157  const u32 S11 = (((s0 >> 11) & 1) ? -1 : 0);
2158  const u32 S12 = (((s0 >> 12) & 1) ? -1 : 0);
2159  const u32 S13 = (((s0 >> 13) & 1) ? -1 : 0);
2160  const u32 S14 = (((s0 >> 14) & 1) ? -1 : 0);
2161  const u32 S15 = (((s0 >> 15) & 1) ? -1 : 0);
2162  const u32 S16 = (((s0 >> 16) & 1) ? -1 : 0);
2163  const u32 S17 = (((s0 >> 17) & 1) ? -1 : 0);
2164  const u32 S18 = (((s0 >> 18) & 1) ? -1 : 0);
2165  const u32 S19 = (((s0 >> 19) & 1) ? -1 : 0);
2166  const u32 S20 = (((s0 >> 20) & 1) ? -1 : 0);
2167  const u32 S21 = (((s0 >> 21) & 1) ? -1 : 0);
2168  const u32 S22 = (((s0 >> 22) & 1) ? -1 : 0);
2169  const u32 S23 = (((s0 >> 23) & 1) ? -1 : 0);
2170  const u32 S24 = (((s0 >> 24) & 1) ? -1 : 0);
2171  const u32 S25 = (((s0 >> 25) & 1) ? -1 : 0);
2172  const u32 S26 = (((s0 >> 26) & 1) ? -1 : 0);
2173  const u32 S27 = (((s0 >> 27) & 1) ? -1 : 0);
2174  const u32 S28 = (((s0 >> 28) & 1) ? -1 : 0);
2175  const u32 S29 = (((s0 >> 29) & 1) ? -1 : 0);
2176  const u32 S30 = (((s0 >> 30) & 1) ? -1 : 0);
2177  const u32 S31 = (((s0 >> 31) & 1) ? -1 : 0);
2178  const u32 S32 = (((s1 >>  0) & 1) ? -1 : 0);
2179  const u32 S33 = (((s1 >>  1) & 1) ? -1 : 0);
2180  const u32 S34 = (((s1 >>  2) & 1) ? -1 : 0);
2181  const u32 S35 = (((s1 >>  3) & 1) ? -1 : 0);
2182  const u32 S36 = (((s1 >>  4) & 1) ? -1 : 0);
2183  const u32 S37 = (((s1 >>  5) & 1) ? -1 : 0);
2184  const u32 S38 = (((s1 >>  6) & 1) ? -1 : 0);
2185  const u32 S39 = (((s1 >>  7) & 1) ? -1 : 0);
2186  const u32 S40 = (((s1 >>  8) & 1) ? -1 : 0);
2187  const u32 S41 = (((s1 >>  9) & 1) ? -1 : 0);
2188  const u32 S42 = (((s1 >> 10) & 1) ? -1 : 0);
2189  const u32 S43 = (((s1 >> 11) & 1) ? -1 : 0);
2190  const u32 S44 = (((s1 >> 12) & 1) ? -1 : 0);
2191  const u32 S45 = (((s1 >> 13) & 1) ? -1 : 0);
2192  const u32 S46 = (((s1 >> 14) & 1) ? -1 : 0);
2193  const u32 S47 = (((s1 >> 15) & 1) ? -1 : 0);
2194  const u32 S48 = (((s1 >> 16) & 1) ? -1 : 0);
2195  const u32 S49 = (((s1 >> 17) & 1) ? -1 : 0);
2196  const u32 S50 = (((s1 >> 18) & 1) ? -1 : 0);
2197  const u32 S51 = (((s1 >> 19) & 1) ? -1 : 0);
2198  const u32 S52 = (((s1 >> 20) & 1) ? -1 : 0);
2199  const u32 S53 = (((s1 >> 21) & 1) ? -1 : 0);
2200  const u32 S54 = (((s1 >> 22) & 1) ? -1 : 0);
2201  const u32 S55 = (((s1 >> 23) & 1) ? -1 : 0);
2202  const u32 S56 = (((s1 >> 24) & 1) ? -1 : 0);
2203  const u32 S57 = (((s1 >> 25) & 1) ? -1 : 0);
2204  const u32 S58 = (((s1 >> 26) & 1) ? -1 : 0);
2205  const u32 S59 = (((s1 >> 27) & 1) ? -1 : 0);
2206  const u32 S60 = (((s1 >> 28) & 1) ? -1 : 0);
2207  const u32 S61 = (((s1 >> 29) & 1) ? -1 : 0);
2208  const u32 S62 = (((s1 >> 30) & 1) ? -1 : 0);
2209  const u32 S63 = (((s1 >> 31) & 1) ? -1 : 0);
2210
2211  /**
2212   * base
2213   */
2214
2215  const u32 w0s = pws[gid].i[0];
2216  const u32 w1s = pws[gid].i[1];
2217
2218  #define K00 (((w0s >> ( 0 + 7)) & 1) ? -1 : 0)
2219  #define K01 (((w0s >> ( 0 + 6)) & 1) ? -1 : 0)
2220  #define K02 (((w0s >> ( 0 + 5)) & 1) ? -1 : 0)
2221  #define K03 (((w0s >> ( 0 + 4)) & 1) ? -1 : 0)
2222  #define K04 (((w0s >> ( 0 + 3)) & 1) ? -1 : 0)
2223  #define K05 (((w0s >> ( 0 + 2)) & 1) ? -1 : 0)
2224  #define K06 (((w0s >> ( 0 + 1)) & 1) ? -1 : 0)
2225  #define K07 (((w0s >> ( 0 + 0)) & 1) ? -1 : 0)
2226  #define K08 (((w0s >> ( 8 + 7)) & 1) ? -1 : 0)
2227  #define K09 (((w0s >> ( 8 + 6)) & 1) ? -1 : 0)
2228  #define K10 (((w0s >> ( 8 + 5)) & 1) ? -1 : 0)
2229  #define K11 (((w0s >> ( 8 + 4)) & 1) ? -1 : 0)
2230  #define K12 (((w0s >> ( 8 + 3)) & 1) ? -1 : 0)
2231  #define K13 (((w0s >> ( 8 + 2)) & 1) ? -1 : 0)
2232  #define K14 (((w0s >> ( 8 + 1)) & 1) ? -1 : 0)
2233  #define K15 (((w0s >> ( 8 + 0)) & 1) ? -1 : 0)
2234  #define K16 (((w0s >> (16 + 7)) & 1) ? -1 : 0)
2235  #define K17 (((w0s >> (16 + 6)) & 1) ? -1 : 0)
2236  #define K18 (((w0s >> (16 + 5)) & 1) ? -1 : 0)
2237  #define K19 (((w0s >> (16 + 4)) & 1) ? -1 : 0)
2238  #define K20 (((w0s >> (16 + 3)) & 1) ? -1 : 0)
2239  #define K21 (((w0s >> (16 + 2)) & 1) ? -1 : 0)
2240  #define K22 (((w0s >> (16 + 1)) & 1) ? -1 : 0)
2241  #define K23 (((w0s >> (16 + 0)) & 1) ? -1 : 0)
2242  #define K24 (((w0s >> (24 + 7)) & 1) ? -1 : 0)
2243  #define K25 (((w0s >> (24 + 6)) & 1) ? -1 : 0)
2244  #define K26 (((w0s >> (24 + 5)) & 1) ? -1 : 0)
2245  #define K27 (((w0s >> (24 + 4)) & 1) ? -1 : 0)
2246  #define K28 (((w0s >> (24 + 3)) & 1) ? -1 : 0)
2247  #define K29 (((w0s >> (24 + 2)) & 1) ? -1 : 0)
2248  #define K30 (((w0s >> (24 + 1)) & 1) ? -1 : 0)
2249  #define K31 (((w0s >> (24 + 0)) & 1) ? -1 : 0)
2250  #define K32 (((w1s >> ( 0 + 7)) & 1) ? -1 : 0)
2251  #define K33 (((w1s >> ( 0 + 6)) & 1) ? -1 : 0)
2252  #define K34 (((w1s >> ( 0 + 5)) & 1) ? -1 : 0)
2253  #define K35 (((w1s >> ( 0 + 4)) & 1) ? -1 : 0)
2254  #define K36 (((w1s >> ( 0 + 3)) & 1) ? -1 : 0)
2255  #define K37 (((w1s >> ( 0 + 2)) & 1) ? -1 : 0)
2256  #define K38 (((w1s >> ( 0 + 1)) & 1) ? -1 : 0)
2257  #define K39 (((w1s >> ( 0 + 0)) & 1) ? -1 : 0)
2258  #define K40 (((w1s >> ( 8 + 7)) & 1) ? -1 : 0)
2259  #define K41 (((w1s >> ( 8 + 6)) & 1) ? -1 : 0)
2260  #define K42 (((w1s >> ( 8 + 5)) & 1) ? -1 : 0)
2261  #define K43 (((w1s >> ( 8 + 4)) & 1) ? -1 : 0)
2262  #define K44 (((w1s >> ( 8 + 3)) & 1) ? -1 : 0)
2263  #define K45 (((w1s >> ( 8 + 2)) & 1) ? -1 : 0)
2264  #define K46 (((w1s >> ( 8 + 1)) & 1) ? -1 : 0)
2265  #define K47 (((w1s >> ( 8 + 0)) & 1) ? -1 : 0)
2266  #define K48 (((w1s >> (16 + 7)) & 1) ? -1 : 0)
2267  #define K49 (((w1s >> (16 + 6)) & 1) ? -1 : 0)
2268  #define K50 (((w1s >> (16 + 5)) & 1) ? -1 : 0)
2269  #define K51 (((w1s >> (16 + 4)) & 1) ? -1 : 0)
2270  #define K52 (((w1s >> (16 + 3)) & 1) ? -1 : 0)
2271  #define K53 (((w1s >> (16 + 2)) & 1) ? -1 : 0)
2272  #define K54 (((w1s >> (16 + 1)) & 1) ? -1 : 0)
2273  #define K55 (((w1s >> (16 + 0)) & 1) ? -1 : 0)
2274
2275  /**
2276   * inner loop
2277   */
2278
2279  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 32)
2280  {
2281    u32 k00 = K00;
2282    u32 k01 = K01;
2283    u32 k02 = K02;
2284    u32 k03 = K03;
2285    u32 k04 = K04;
2286    u32 k05 = K05;
2287    u32 k06 = K06;
2288    u32 k07 = K07;
2289    u32 k08 = K08;
2290    u32 k09 = K09;
2291    u32 k10 = K10;
2292    u32 k11 = K11;
2293    u32 k12 = K12;
2294    u32 k13 = K13;
2295    u32 k14 = K14;
2296    u32 k15 = K15;
2297    u32 k16 = K16;
2298    u32 k17 = K17;
2299    u32 k18 = K18;
2300    u32 k19 = K19;
2301    u32 k20 = K20;
2302    u32 k21 = K21;
2303    u32 k22 = K22;
2304    u32 k23 = K23;
2305    u32 k24 = K24;
2306    u32 k25 = K25;
2307    u32 k26 = K26;
2308    u32 k27 = K27;
2309    u32 k28 = K28;
2310    u32 k29 = K29;
2311    u32 k30 = K30;
2312    u32 k31 = K31;
2313
2314    const u32 pc_pos = il_pos / 32;
2315
2316    k00 |= words_buf_s[pc_pos].b[ 0];
2317    k01 |= words_buf_s[pc_pos].b[ 1];
2318    k02 |= words_buf_s[pc_pos].b[ 2];
2319    k03 |= words_buf_s[pc_pos].b[ 3];
2320    k04 |= words_buf_s[pc_pos].b[ 4];
2321    k05 |= words_buf_s[pc_pos].b[ 5];
2322    k06 |= words_buf_s[pc_pos].b[ 6];
2323    k07 |= words_buf_s[pc_pos].b[ 7];
2324    k08 |= words_buf_s[pc_pos].b[ 8];
2325    k09 |= words_buf_s[pc_pos].b[ 9];
2326    k10 |= words_buf_s[pc_pos].b[10];
2327    k11 |= words_buf_s[pc_pos].b[11];
2328    k12 |= words_buf_s[pc_pos].b[12];
2329    k13 |= words_buf_s[pc_pos].b[13];
2330    k14 |= words_buf_s[pc_pos].b[14];
2331    k15 |= words_buf_s[pc_pos].b[15];
2332    k16 |= words_buf_s[pc_pos].b[16];
2333    k17 |= words_buf_s[pc_pos].b[17];
2334    k18 |= words_buf_s[pc_pos].b[18];
2335    k19 |= words_buf_s[pc_pos].b[19];
2336    k20 |= words_buf_s[pc_pos].b[20];
2337    k21 |= words_buf_s[pc_pos].b[21];
2338    k22 |= words_buf_s[pc_pos].b[22];
2339    k23 |= words_buf_s[pc_pos].b[23];
2340    k24 |= words_buf_s[pc_pos].b[24];
2341    k25 |= words_buf_s[pc_pos].b[25];
2342    k26 |= words_buf_s[pc_pos].b[26];
2343    k27 |= words_buf_s[pc_pos].b[27];
2344    k28 |= words_buf_s[pc_pos].b[28];
2345    k29 |= words_buf_s[pc_pos].b[29];
2346    k30 |= words_buf_s[pc_pos].b[30];
2347    k31 |= words_buf_s[pc_pos].b[31];
2348
2349    // KGS!@#$% including IP
2350
2351    u32 D00 = 0;
2352    u32 D01 = 0;
2353    u32 D02 = 0;
2354    u32 D03 = 0xffffffff;
2355    u32 D04 = 0;
2356    u32 D05 = 0xffffffff;
2357    u32 D06 = 0xffffffff;
2358    u32 D07 = 0xffffffff;
2359    u32 D08 = 0;
2360    u32 D09 = 0;
2361    u32 D10 = 0;
2362    u32 D11 = 0;
2363    u32 D12 = 0;
2364    u32 D13 = 0xffffffff;
2365    u32 D14 = 0;
2366    u32 D15 = 0;
2367    u32 D16 = 0xffffffff;
2368    u32 D17 = 0xffffffff;
2369    u32 D18 = 0;
2370    u32 D19 = 0;
2371    u32 D20 = 0;
2372    u32 D21 = 0;
2373    u32 D22 = 0xffffffff;
2374    u32 D23 = 0;
2375    u32 D24 = 0xffffffff;
2376    u32 D25 = 0;
2377    u32 D26 = 0xffffffff;
2378    u32 D27 = 0;
2379    u32 D28 = 0xffffffff;
2380    u32 D29 = 0xffffffff;
2381    u32 D30 = 0xffffffff;
2382    u32 D31 = 0xffffffff;
2383    u32 D32 = 0;
2384    u32 D33 = 0;
2385    u32 D34 = 0;
2386    u32 D35 = 0;
2387    u32 D36 = 0;
2388    u32 D37 = 0;
2389    u32 D38 = 0;
2390    u32 D39 = 0;
2391    u32 D40 = 0xffffffff;
2392    u32 D41 = 0xffffffff;
2393    u32 D42 = 0xffffffff;
2394    u32 D43 = 0;
2395    u32 D44 = 0xffffffff;
2396    u32 D45 = 0;
2397    u32 D46 = 0;
2398    u32 D47 = 0;
2399    u32 D48 = 0;
2400    u32 D49 = 0;
2401    u32 D50 = 0;
2402    u32 D51 = 0;
2403    u32 D52 = 0;
2404    u32 D53 = 0;
2405    u32 D54 = 0;
2406    u32 D55 = 0xffffffff;
2407    u32 D56 = 0;
2408    u32 D57 = 0;
2409    u32 D58 = 0xffffffff;
2410    u32 D59 = 0;
2411    u32 D60 = 0;
2412    u32 D61 = 0xffffffff;
2413    u32 D62 = 0xffffffff;
2414    u32 D63 = 0xffffffff;
2415
2416    DES
2417    (
2418      k00, k01, k02, k03, k04, k05, k06,
2419      k07, k08, k09, k10, k11, k12, k13,
2420      k14, k15, k16, k17, k18, k19, k20,
2421      k21, k22, k23, k24, k25, k26, k27,
2422      k28, k29, k30, k31, K32, K33, K34,
2423      K35, K36, K37, K38, K39, K40, K41,
2424      K42, K43, K44, K45, K46, K47, K48,
2425      K49, K50, K51, K52, K53, K54, K55,
2426      &D00, &D01, &D02, &D03, &D04, &D05, &D06, &D07,
2427      &D08, &D09, &D10, &D11, &D12, &D13, &D14, &D15,
2428      &D16, &D17, &D18, &D19, &D20, &D21, &D22, &D23,
2429      &D24, &D25, &D26, &D27, &D28, &D29, &D30, &D31,
2430      &D32, &D33, &D34, &D35, &D36, &D37, &D38, &D39,
2431      &D40, &D41, &D42, &D43, &D44, &D45, &D46, &D47,
2432      &D48, &D49, &D50, &D51, &D52, &D53, &D54, &D55,
2433      &D56, &D57, &D58, &D59, &D60, &D61, &D62, &D63
2434    );
2435
2436    u32 tmpResult = 0;
2437
2438    tmpResult |= D00 ^ S00;
2439    tmpResult |= D01 ^ S01;
2440    tmpResult |= D02 ^ S02;
2441    tmpResult |= D03 ^ S03;
2442    tmpResult |= D04 ^ S04;
2443    tmpResult |= D05 ^ S05;
2444    tmpResult |= D06 ^ S06;
2445    tmpResult |= D07 ^ S07;
2446    tmpResult |= D08 ^ S08;
2447    tmpResult |= D09 ^ S09;
2448    tmpResult |= D10 ^ S10;
2449    tmpResult |= D11 ^ S11;
2450    tmpResult |= D12 ^ S12;
2451    tmpResult |= D13 ^ S13;
2452    tmpResult |= D14 ^ S14;
2453    tmpResult |= D15 ^ S15;
2454
2455    if (tmpResult == 0xffffffff) continue;
2456
2457    tmpResult |= D16 ^ S16;
2458    tmpResult |= D17 ^ S17;
2459    tmpResult |= D18 ^ S18;
2460    tmpResult |= D19 ^ S19;
2461    tmpResult |= D20 ^ S20;
2462    tmpResult |= D21 ^ S21;
2463    tmpResult |= D22 ^ S22;
2464    tmpResult |= D23 ^ S23;
2465    tmpResult |= D24 ^ S24;
2466    tmpResult |= D25 ^ S25;
2467    tmpResult |= D26 ^ S26;
2468    tmpResult |= D27 ^ S27;
2469    tmpResult |= D28 ^ S28;
2470    tmpResult |= D29 ^ S29;
2471    tmpResult |= D30 ^ S30;
2472    tmpResult |= D31 ^ S31;
2473
2474    if (tmpResult == 0xffffffff) continue;
2475
2476    tmpResult |= D32 ^ S32;
2477    tmpResult |= D33 ^ S33;
2478    tmpResult |= D34 ^ S34;
2479    tmpResult |= D35 ^ S35;
2480    tmpResult |= D36 ^ S36;
2481    tmpResult |= D37 ^ S37;
2482    tmpResult |= D38 ^ S38;
2483    tmpResult |= D39 ^ S39;
2484    tmpResult |= D40 ^ S40;
2485    tmpResult |= D41 ^ S41;
2486    tmpResult |= D42 ^ S42;
2487    tmpResult |= D43 ^ S43;
2488    tmpResult |= D44 ^ S44;
2489    tmpResult |= D45 ^ S45;
2490    tmpResult |= D46 ^ S46;
2491    tmpResult |= D47 ^ S47;
2492
2493    if (tmpResult == 0xffffffff) continue;
2494
2495    tmpResult |= D48 ^ S48;
2496    tmpResult |= D49 ^ S49;
2497    tmpResult |= D50 ^ S50;
2498    tmpResult |= D51 ^ S51;
2499    tmpResult |= D52 ^ S52;
2500    tmpResult |= D53 ^ S53;
2501    tmpResult |= D54 ^ S54;
2502    tmpResult |= D55 ^ S55;
2503    tmpResult |= D56 ^ S56;
2504    tmpResult |= D57 ^ S57;
2505    tmpResult |= D58 ^ S58;
2506    tmpResult |= D59 ^ S59;
2507    tmpResult |= D60 ^ S60;
2508    tmpResult |= D61 ^ S61;
2509    tmpResult |= D62 ^ S62;
2510    tmpResult |= D63 ^ S63;
2511
2512    if (tmpResult == 0xffffffff) continue;
2513
2514    const u32 slice = ffz (tmpResult);
2515
2516    #ifdef KERNEL_STATIC
2517    #include COMPARE_S
2518    #endif
2519  }
2520}
2521