1/**
2 * Author......: See docs/credits.txt
3 * License.....: MIT
4 * NOTE........: sboxes for maxwell were taken from DeepLearningJohnDoe, license below
 *             : sboxes for others were taken from JtR, license below
6 */
7
8#ifdef KERNEL_STATIC
9#include "inc_vendor.h"
10#include "inc_types.h"
11#include "inc_platform.cl"
12#include "inc_common.cl"
13#endif
14
// File names of the single-hash / multi-hash bitslice comparison snippets.
// NOTE(review): presumably consumed by an #include further down - confirm.
#define COMPARE_S "inc_comp_single_bs.cl"
#define COMPARE_M "inc_comp_multi_bs.cl"

// KXX_DECL expands to nothing for every vendor class handled here
// (NVIDIA, AMD/HIP and generic devices).
#if defined IS_NV || defined IS_AMD || defined IS_HIP || defined IS_GENERIC
#define KXX_DECL
#endif
29
30#ifdef IS_NV
31
32#if CUDA_ARCH >= 500
33
34//
35// Bitslice DES S-boxes with LOP3.LUT instructions
36// For NVIDIA Maxwell architecture and CUDA 7.5 RC
37// by DeepLearningJohnDoe, version 0.1.6, 2015/07/19
38//
39// Gate counts: 25 24 25 18 25 24 24 23
40// Average: 23.5
41// Depth: 8 7 7 6 8 10 10 8
42// Average: 8
43//
44// Note that same S-box function with a lower gate count isn't necessarily faster.
45//
46// These Boolean expressions corresponding to DES S-boxes were
47// discovered by <deeplearningjohndoe at gmail.com>
48//
49// This file itself is Copyright (c) 2015 by <deeplearningjohndoe at gmail.com>
50// Redistribution and use in source and binary forms, with or without
51// modification, are permitted.
52//
53// The underlying mathematical formulas are NOT copyrighted.
54//
55
// LUT(a, b, c, d, e): declares a fresh u32 named `a` and fills it with the
// result of a single PTX lop3.b32 instruction applied to the operands b, c
// and d. The immediate `e` is stringified into the instruction text as its
// lookup-table operand. The expansion already ends in ';', so call sites do
// not add one.
#define LUT(a,b,c,d,e)                                                       \
  u32 a;                                                                     \
  asm ("lop3.b32 %0, %1, %2, %3, "#e";" : "=r"(a) : "r"(b), "r"(c), "r"(d));
57
58DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
59{
60  LUT(xAA55AA5500550055, a1, a4, a6, 0xC1)
61  LUT(xA55AA55AF0F5F0F5, a3, a6, xAA55AA5500550055, 0x9E)
62  LUT(x5F5F5F5FA5A5A5A5, a1, a3, a6, 0xD6)
63  LUT(xF5A0F5A0A55AA55A, a4, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x56)
64  LUT(x947A947AD1E7D1E7, a2, xA55AA55AF0F5F0F5, xF5A0F5A0A55AA55A, 0x6C)
65  LUT(x5FFF5FFFFFFAFFFA, a6, xAA55AA5500550055, x5F5F5F5FA5A5A5A5, 0x7B)
66  LUT(xB96CB96C69936993, a2, xF5A0F5A0A55AA55A, x5FFF5FFFFFFAFFFA, 0xD6)
67  LUT(x3, a5, x947A947AD1E7D1E7, xB96CB96C69936993, 0x6A)
68  LUT(x55EE55EE55EE55EE, a1, a2, a4, 0x7A)
69  LUT(x084C084CB77BB77B, a2, a6, xF5A0F5A0A55AA55A, 0xC9)
70  LUT(x9C329C32E295E295, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x72)
71  LUT(xA51EA51E50E050E0, a3, a6, x55EE55EE55EE55EE, 0x29)
72  LUT(x4AD34AD3BE3CBE3C, a2, x947A947AD1E7D1E7, xA51EA51E50E050E0, 0x95)
73  LUT(x2, a5, x9C329C32E295E295, x4AD34AD3BE3CBE3C, 0xC6)
74  LUT(xD955D95595D195D1, a1, a2, x9C329C32E295E295, 0xD2)
75  LUT(x8058805811621162, x947A947AD1E7D1E7, x55EE55EE55EE55EE, x084C084CB77BB77B, 0x90)
76  LUT(x7D0F7D0FC4B3C4B3, xA51EA51E50E050E0, xD955D95595D195D1, x8058805811621162, 0x76)
77  LUT(x0805080500010001, a3, xAA55AA5500550055, xD955D95595D195D1, 0x80)
78  LUT(x4A964A96962D962D, xB96CB96C69936993, x4AD34AD3BE3CBE3C, x0805080500010001, 0xA6)
79  LUT(x4, a5, x7D0F7D0FC4B3C4B3, x4A964A96962D962D, 0xA6)
80  LUT(x148014807B087B08, a1, xAA55AA5500550055, x947A947AD1E7D1E7, 0x21)
81  LUT(x94D894D86B686B68, xA55AA55AF0F5F0F5, x8058805811621162, x148014807B087B08, 0x6A)
82  LUT(x5555555540044004, a1, a6, x084C084CB77BB77B, 0x70)
83  LUT(xAFB4AFB4BF5BBF5B, x5F5F5F5FA5A5A5A5, xA51EA51E50E050E0, x5555555540044004, 0x97)
84  LUT(x1, a5, x94D894D86B686B68, xAFB4AFB4BF5BBF5B, 0x6C)
85
86  *out1 ^= x1;
87  *out2 ^= x2;
88  *out3 ^= x3;
89  *out4 ^= x4;
90}
91
92DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
93{
94  LUT(xEEEEEEEE99999999, a1, a2, a6, 0x97)
95  LUT(xFFFFEEEE66666666, a5, a6, xEEEEEEEE99999999, 0x67)
96  LUT(x5555FFFFFFFF0000, a1, a5, a6, 0x76)
97  LUT(x6666DDDD5555AAAA, a2, xFFFFEEEE66666666, x5555FFFFFFFF0000, 0x69)
98  LUT(x6969D3D35353ACAC, a3, xFFFFEEEE66666666, x6666DDDD5555AAAA, 0x6A)
99  LUT(xCFCF3030CFCF3030, a2, a3, a5, 0x65)
100  LUT(xE4E4EEEE9999F0F0, a3, xEEEEEEEE99999999, x5555FFFFFFFF0000, 0x8D)
101  LUT(xE5E5BABACDCDB0B0, a1, xCFCF3030CFCF3030, xE4E4EEEE9999F0F0, 0xCA)
102  LUT(x3, a4, x6969D3D35353ACAC, xE5E5BABACDCDB0B0, 0xC6)
103  LUT(x3333CCCC00000000, a2, a5, a6, 0x14)
104  LUT(xCCCCDDDDFFFF0F0F, a5, xE4E4EEEE9999F0F0, x3333CCCC00000000, 0xB5)
105  LUT(x00000101F0F0F0F0, a3, a6, xFFFFEEEE66666666, 0x1C)
106  LUT(x9A9A64646A6A9595, a1, xCFCF3030CFCF3030, x00000101F0F0F0F0, 0x96)
107  LUT(x2, a4, xCCCCDDDDFFFF0F0F, x9A9A64646A6A9595, 0x6A)
108  LUT(x3333BBBB3333FFFF, a1, a2, x6666DDDD5555AAAA, 0xDE)
109  LUT(x1414141441410000, a1, a3, xE4E4EEEE9999F0F0, 0x90)
110  LUT(x7F7FF3F3F5F53939, x6969D3D35353ACAC, x9A9A64646A6A9595, x3333BBBB3333FFFF, 0x79)
111  LUT(x9494E3E34B4B3939, a5, x1414141441410000, x7F7FF3F3F5F53939, 0x29)
112  LUT(x1, a4, x3333BBBB3333FFFF, x9494E3E34B4B3939, 0xA6)
113  LUT(xB1B1BBBBCCCCA5A5, a1, a1, xE4E4EEEE9999F0F0, 0x4A)
114  LUT(xFFFFECECEEEEDDDD, a2, x3333CCCC00000000, x9A9A64646A6A9595, 0xEF)
115  LUT(xB1B1A9A9DCDC8787, xE5E5BABACDCDB0B0, xB1B1BBBBCCCCA5A5, xFFFFECECEEEEDDDD, 0x8D)
116  LUT(xFFFFCCCCEEEE4444, a2, a5, xFFFFEEEE66666666, 0x2B)
117  LUT(x4, a4, xB1B1A9A9DCDC8787, xFFFFCCCCEEEE4444, 0x6C)
118
119  *out1 ^= x1;
120  *out2 ^= x2;
121  *out3 ^= x3;
122  *out4 ^= x4;
123}
124
125DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
126{
127  LUT(xA50FA50FA50FA50F, a1, a3, a4, 0xC9)
128  LUT(xF0F00F0FF0F0F0F0, a3, a5, a6, 0x4B)
129  LUT(xAF0FA0AAAF0FAF0F, a1, xA50FA50FA50FA50F, xF0F00F0FF0F0F0F0, 0x4D)
130  LUT(x5AA5A55A5AA55AA5, a1, a4, xF0F00F0FF0F0F0F0, 0x69)
131  LUT(xAA005FFFAA005FFF, a3, a5, xA50FA50FA50FA50F, 0xD6)
132  LUT(x5AA5A55A0F5AFAA5, a6, x5AA5A55A5AA55AA5, xAA005FFFAA005FFF, 0x9C)
133  LUT(x1, a2, xAF0FA0AAAF0FAF0F, x5AA5A55A0F5AFAA5, 0xA6)
134  LUT(xAA55AA5500AA00AA, a1, a4, a6, 0x49)
135  LUT(xFAFAA50FFAFAA50F, a1, a5, xA50FA50FA50FA50F, 0x9B)
136  LUT(x50AF0F5AFA50A5A5, a1, xAA55AA5500AA00AA, xFAFAA50FFAFAA50F, 0x66)
137  LUT(xAFAFAFAFFAFAFAFA, a1, a3, a6, 0x6F)
138  LUT(xAFAFFFFFFFFAFAFF, a4, x50AF0F5AFA50A5A5, xAFAFAFAFFAFAFAFA, 0xEB)
139  LUT(x4, a2, x50AF0F5AFA50A5A5, xAFAFFFFFFFFAFAFF, 0x6C)
140  LUT(x500F500F500F500F, a1, a3, a4, 0x98)
141  LUT(xF0505A0505A5050F, x5AA5A55A0F5AFAA5, xAA55AA5500AA00AA, xAFAFAFAFFAFAFAFA, 0x1D)
142  LUT(xF0505A05AA55AAFF, a6, x500F500F500F500F, xF0505A0505A5050F, 0x9A)
143  LUT(xFF005F55FF005F55, a1, a4, xAA005FFFAA005FFF, 0xB2)
144  LUT(xA55F5AF0A55F5AF0, a5, xA50FA50FA50FA50F, x5AA5A55A5AA55AA5, 0x3D)
145  LUT(x5A5F05A5A55F5AF0, a6, xFF005F55FF005F55, xA55F5AF0A55F5AF0, 0xA6)
146  LUT(x3, a2, xF0505A05AA55AAFF, x5A5F05A5A55F5AF0, 0xA6)
147  LUT(x0F0F0F0FA5A5A5A5, a1, a3, a6, 0xC6)
148  LUT(x5FFFFF5FFFA0FFA0, x5AA5A55A5AA55AA5, xAFAFAFAFFAFAFAFA, x0F0F0F0FA5A5A5A5, 0xDB)
149  LUT(xF5555AF500A05FFF, a5, xFAFAA50FFAFAA50F, xF0505A0505A5050F, 0xB9)
150  LUT(x05A5AAF55AFA55A5, xF0505A05AA55AAFF, x0F0F0F0FA5A5A5A5, xF5555AF500A05FFF, 0x9B)
151  LUT(x2, a2, x5FFFFF5FFFA0FFA0, x05A5AAF55AFA55A5, 0xA6)
152
153  *out1 ^= x1;
154  *out2 ^= x2;
155  *out3 ^= x3;
156  *out4 ^= x4;
157}
158
159DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
160{
161  LUT(x55F055F055F055F0, a1, a3, a4, 0x72)
162  LUT(xA500F5F0A500F5F0, a3, a5, x55F055F055F055F0, 0xAD)
163  LUT(xF50AF50AF50AF50A, a1, a3, a4, 0x59)
164  LUT(xF5FA0FFFF5FA0FFF, a3, a5, xF50AF50AF50AF50A, 0xE7)
165  LUT(x61C8F93C61C8F93C, a2, xA500F5F0A500F5F0, xF5FA0FFFF5FA0FFF, 0xC6)
166  LUT(x9999666699996666, a1, a2, a5, 0x69)
167  LUT(x22C022C022C022C0, a2, a4, x55F055F055F055F0, 0x18)
168  LUT(xB35C94A6B35C94A6, xF5FA0FFFF5FA0FFF, x9999666699996666, x22C022C022C022C0, 0x63)
169  LUT(x4, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x6A)
170  LUT(x4848484848484848, a1, a2, a3, 0x12)
171  LUT(x55500AAA55500AAA, a1, a5, xF5FA0FFFF5FA0FFF, 0x28)
172  LUT(x3C90B3D63C90B3D6, x61C8F93C61C8F93C, x4848484848484848, x55500AAA55500AAA, 0x1E)
173  LUT(x8484333384843333, a1, x9999666699996666, x4848484848484848, 0x14)
174  LUT(x4452F1AC4452F1AC, xF50AF50AF50AF50A, xF5FA0FFFF5FA0FFF, xB35C94A6B35C94A6, 0x78)
175  LUT(x9586CA379586CA37, x55500AAA55500AAA, x8484333384843333, x4452F1AC4452F1AC, 0xD6)
176  LUT(x2, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0x6A)
177  LUT(x1, a6, x3C90B3D63C90B3D6, x9586CA379586CA37, 0xA9)
178  LUT(x3, a6, x61C8F93C61C8F93C, xB35C94A6B35C94A6, 0x56)
179
180  *out1 ^= x1;
181  *out2 ^= x2;
182  *out3 ^= x3;
183  *out4 ^= x4;
184}
185
186DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
187{
188  LUT(xA0A0A0A0FFFFFFFF, a1, a3, a6, 0xAB)
189  LUT(xFFFF00005555FFFF, a1, a5, a6, 0xB9)
190  LUT(xB3B320207777FFFF, a2, xA0A0A0A0FFFFFFFF, xFFFF00005555FFFF, 0xE8)
191  LUT(x50505A5A5A5A5050, a1, a3, xFFFF00005555FFFF, 0x34)
192  LUT(xA2A2FFFF2222FFFF, a1, a5, xB3B320207777FFFF, 0xCE)
193  LUT(x2E2E6969A4A46363, a2, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, 0x29)
194  LUT(x3, a4, xB3B320207777FFFF, x2E2E6969A4A46363, 0xA6)
195  LUT(xA5A50A0AA5A50A0A, a1, a3, a5, 0x49)
196  LUT(x969639396969C6C6, a2, a6, xA5A50A0AA5A50A0A, 0x96)
197  LUT(x1B1B1B1B1B1B1B1B, a1, a2, a3, 0xCA)
198  LUT(xBFBFBFBFF6F6F9F9, a3, xA0A0A0A0FFFFFFFF, x969639396969C6C6, 0x7E)
199  LUT(x5B5BA4A4B8B81D1D, xFFFF00005555FFFF, x1B1B1B1B1B1B1B1B, xBFBFBFBFF6F6F9F9, 0x96)
200  LUT(x2, a4, x969639396969C6C6, x5B5BA4A4B8B81D1D, 0xCA)
201  LUT(x5555BBBBFFFF5555, a1, a2, xFFFF00005555FFFF, 0xE5)
202  LUT(x6D6D9C9C95956969, x50505A5A5A5A5050, xA2A2FFFF2222FFFF, x969639396969C6C6, 0x97)
203  LUT(x1A1A67676A6AB4B4, xA5A50A0AA5A50A0A, x5555BBBBFFFF5555, x6D6D9C9C95956969, 0x47)
204  LUT(xA0A0FFFFAAAA0000, a3, xFFFF00005555FFFF, xA5A50A0AA5A50A0A, 0x3B)
205  LUT(x36369C9CC1C1D6D6, x969639396969C6C6, x6D6D9C9C95956969, xA0A0FFFFAAAA0000, 0xD9)
206  LUT(x1, a4, x1A1A67676A6AB4B4, x36369C9CC1C1D6D6, 0xCA)
207  LUT(x5555F0F0F5F55555, a1, a3, xFFFF00005555FFFF, 0xB1)
208  LUT(x79790202DCDC0808, xA2A2FFFF2222FFFF, xA5A50A0AA5A50A0A, x969639396969C6C6, 0x47)
209  LUT(x6C6CF2F229295D5D, xBFBFBFBFF6F6F9F9, x5555F0F0F5F55555, x79790202DCDC0808, 0x6E)
210  LUT(xA3A3505010101A1A, a2, xA2A2FFFF2222FFFF, x36369C9CC1C1D6D6, 0x94)
211  LUT(x7676C7C74F4FC7C7, a1, x2E2E6969A4A46363, xA3A3505010101A1A, 0xD9)
212  LUT(x4, a4, x6C6CF2F229295D5D, x7676C7C74F4FC7C7, 0xC6)
213
214  *out1 ^= x1;
215  *out2 ^= x2;
216  *out3 ^= x3;
217  *out4 ^= x4;
218}
219
220DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
221{
222  LUT(x5050F5F55050F5F5, a1, a3, a5, 0xB2)
223  LUT(x6363C6C66363C6C6, a1, a2, x5050F5F55050F5F5, 0x66)
224  LUT(xAAAA5555AAAA5555, a1, a1, a5, 0xA9)
225  LUT(x3A3A65653A3A6565, a3, x6363C6C66363C6C6, xAAAA5555AAAA5555, 0xA9)
226  LUT(x5963A3C65963A3C6, a4, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xC6)
227  LUT(xE7E76565E7E76565, a5, x6363C6C66363C6C6, x3A3A65653A3A6565, 0xAD)
228  LUT(x455D45DF455D45DF, a1, a4, xE7E76565E7E76565, 0xE4)
229  LUT(x4, a6, x5963A3C65963A3C6, x455D45DF455D45DF, 0x6C)
230  LUT(x1101220211012202, a2, xAAAA5555AAAA5555, x5963A3C65963A3C6, 0x20)
231  LUT(xF00F0FF0F00F0FF0, a3, a4, a5, 0x69)
232  LUT(x16E94A9716E94A97, xE7E76565E7E76565, x1101220211012202, xF00F0FF0F00F0FF0, 0x9E)
233  LUT(x2992922929929229, a1, a2, xF00F0FF0F00F0FF0, 0x49)
234  LUT(xAFAF9823AFAF9823, a5, x5050F5F55050F5F5, x2992922929929229, 0x93)
235  LUT(x3, a6, x16E94A9716E94A97, xAFAF9823AFAF9823, 0x6C)
236  LUT(x4801810248018102, a4, x5963A3C65963A3C6, x1101220211012202, 0xA4)
237  LUT(x5EE8FFFD5EE8FFFD, a5, x16E94A9716E94A97, x4801810248018102, 0x76)
238  LUT(xF0FF00FFF0FF00FF, a3, a4, a5, 0xCD)
239  LUT(x942D9A67942D9A67, x3A3A65653A3A6565, x5EE8FFFD5EE8FFFD, xF0FF00FFF0FF00FF, 0x86)
240  LUT(x1, a6, x5EE8FFFD5EE8FFFD, x942D9A67942D9A67, 0xA6)
241  LUT(x6A40D4ED6F4DD4EE, a2, x4, xAFAF9823AFAF9823, 0x2D)
242  LUT(x6CA89C7869A49C79, x1101220211012202, x16E94A9716E94A97, x6A40D4ED6F4DD4EE, 0x26)
243  LUT(xD6DE73F9D6DE73F9, a3, x6363C6C66363C6C6, x455D45DF455D45DF, 0x6B)
244  LUT(x925E63E1965A63E1, x3A3A65653A3A6565, x6CA89C7869A49C79, xD6DE73F9D6DE73F9, 0xA2)
245  LUT(x2, a6, x6CA89C7869A49C79, x925E63E1965A63E1, 0xCA)
246
247  *out1 ^= x1;
248  *out2 ^= x2;
249  *out3 ^= x3;
250  *out4 ^= x4;
251}
252
253DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
254{
255  LUT(x88AA88AA88AA88AA, a1, a2, a4, 0x0B)
256  LUT(xAAAAFF00AAAAFF00, a1, a4, a5, 0x27)
257  LUT(xADAFF8A5ADAFF8A5, a3, x88AA88AA88AA88AA, xAAAAFF00AAAAFF00, 0x9E)
258  LUT(x0A0AF5F50A0AF5F5, a1, a3, a5, 0xA6)
259  LUT(x6B69C5DC6B69C5DC, a2, xADAFF8A5ADAFF8A5, x0A0AF5F50A0AF5F5, 0x6B)
260  LUT(x1C69B2DC1C69B2DC, a4, x88AA88AA88AA88AA, x6B69C5DC6B69C5DC, 0xA9)
261  LUT(x1, a6, xADAFF8A5ADAFF8A5, x1C69B2DC1C69B2DC, 0x6A)
262  LUT(x9C9C9C9C9C9C9C9C, a1, a2, a3, 0x63)
263  LUT(xE6E63BFDE6E63BFD, a2, xAAAAFF00AAAAFF00, x0A0AF5F50A0AF5F5, 0xE7)
264  LUT(x6385639E6385639E, a4, x9C9C9C9C9C9C9C9C, xE6E63BFDE6E63BFD, 0x93)
265  LUT(x5959C4CE5959C4CE, a2, x6B69C5DC6B69C5DC, xE6E63BFDE6E63BFD, 0x5D)
266  LUT(x5B53F53B5B53F53B, a4, x0A0AF5F50A0AF5F5, x5959C4CE5959C4CE, 0x6E)
267  LUT(x3, a6, x6385639E6385639E, x5B53F53B5B53F53B, 0xC6)
268  LUT(xFAF505FAFAF505FA, a3, a4, x0A0AF5F50A0AF5F5, 0x6D)
269  LUT(x6A65956A6A65956A, a3, x9C9C9C9C9C9C9C9C, xFAF505FAFAF505FA, 0xA6)
270  LUT(x8888CCCC8888CCCC, a1, a2, a5, 0x23)
271  LUT(x94E97A9494E97A94, x1C69B2DC1C69B2DC, x6A65956A6A65956A, x8888CCCC8888CCCC, 0x72)
272  LUT(x4, a6, x6A65956A6A65956A, x94E97A9494E97A94, 0xAC)
273  LUT(xA050A050A050A050, a1, a3, a4, 0x21)
274  LUT(xC1B87A2BC1B87A2B, xAAAAFF00AAAAFF00, x5B53F53B5B53F53B, x94E97A9494E97A94, 0xA4)
275  LUT(xE96016B7E96016B7, x8888CCCC8888CCCC, xA050A050A050A050, xC1B87A2BC1B87A2B, 0x96)
276  LUT(xE3CF1FD5E3CF1FD5, x88AA88AA88AA88AA, x6A65956A6A65956A, xE96016B7E96016B7, 0x3E)
277  LUT(x6776675B6776675B, xADAFF8A5ADAFF8A5, x94E97A9494E97A94, xE3CF1FD5E3CF1FD5, 0x6B)
278  LUT(x2, a6, xE96016B7E96016B7, x6776675B6776675B, 0xC6)
279
280  *out1 ^= x1;
281  *out2 ^= x2;
282  *out3 ^= x3;
283  *out4 ^= x4;
284}
285
286DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
287{
288  LUT(xEEEE3333EEEE3333, a1, a2, a5, 0x9D)
289  LUT(xBBBBBBBBBBBBBBBB, a1, a1, a2, 0x83)
290  LUT(xDDDDAAAADDDDAAAA, a1, a2, a5, 0x5B)
291  LUT(x29295A5A29295A5A, a3, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0x85)
292  LUT(xC729695AC729695A, a4, xEEEE3333EEEE3333, x29295A5A29295A5A, 0xA6)
293  LUT(x3BF77B7B3BF77B7B, a2, a5, xC729695AC729695A, 0xF9)
294  LUT(x2900FF002900FF00, a4, a5, x29295A5A29295A5A, 0x0E)
295  LUT(x56B3803F56B3803F, xBBBBBBBBBBBBBBBB, x3BF77B7B3BF77B7B, x2900FF002900FF00, 0x61)
296  LUT(x4, a6, xC729695AC729695A, x56B3803F56B3803F, 0x6C)
297  LUT(xFBFBFBFBFBFBFBFB, a1, a2, a3, 0xDF)
298  LUT(x3012B7B73012B7B7, a2, a5, xC729695AC729695A, 0xD4)
299  LUT(x34E9B34C34E9B34C, a4, xFBFBFBFBFBFBFBFB, x3012B7B73012B7B7, 0x69)
300  LUT(xBFEAEBBEBFEAEBBE, a1, x29295A5A29295A5A, x34E9B34C34E9B34C, 0x6F)
301  LUT(xFFAEAFFEFFAEAFFE, a3, xBBBBBBBBBBBBBBBB, xBFEAEBBEBFEAEBBE, 0xB9)
302  LUT(x2, a6, x34E9B34C34E9B34C, xFFAEAFFEFFAEAFFE, 0xC6)
303  LUT(xCFDE88BBCFDE88BB, a2, xDDDDAAAADDDDAAAA, x34E9B34C34E9B34C, 0x5C)
304  LUT(x3055574530555745, a1, xC729695AC729695A, xCFDE88BBCFDE88BB, 0x71)
305  LUT(x99DDEEEE99DDEEEE, a4, xBBBBBBBBBBBBBBBB, xDDDDAAAADDDDAAAA, 0xB9)
306  LUT(x693CD926693CD926, x3BF77B7B3BF77B7B, x34E9B34C34E9B34C, x99DDEEEE99DDEEEE, 0x69)
307  LUT(x3, a6, x3055574530555745, x693CD926693CD926, 0x6A)
308  LUT(x9955EE559955EE55, a1, a4, x99DDEEEE99DDEEEE, 0xE2)
309  LUT(x9D48FA949D48FA94, x3BF77B7B3BF77B7B, xBFEAEBBEBFEAEBBE, x9955EE559955EE55, 0x9C)
310  LUT(x1, a6, xC729695AC729695A, x9D48FA949D48FA94, 0x39)
311
312  *out1 ^= x1;
313  *out2 ^= x2;
314  *out3 ^= x3;
315  *out4 ^= x4;
316}
317
318#else
319
320/*
321 * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC
322 * architectures.  These use AND, OR, XOR, NOT, and AND-NOT gates.
323 *
324 * Gate counts: 49 44 46 33 48 46 46 41
325 * Average: 44.125
326 *
327 * Several same-gate-count expressions for each S-box are included (for use on
328 * different CPUs/GPUs).
329 *
330 * These Boolean expressions corresponding to DES S-boxes have been generated
331 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
332 * John the Ripper password cracker: http://www.openwall.com/john/
333 * Being mathematical formulas, they are not copyrighted and are free for reuse
334 * by anyone.
335 *
336 * This file (a specific representation of the S-box expressions, surrounding
337 * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
338 * Redistribution and use in source and binary forms, with or without
339 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
340 *
341 * The effort has been sponsored by Rapid7: http://www.rapid7.com
342 */
343
344DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
345{
346    u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969,
347        x25202160;
348    u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
349    u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
350    u32 x0A0A0000, x0AD80096, x00999900, x0AD99996;
351    u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
352    u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
353    u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
354    u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
355    u32 x00, x01, x10, x11, x20, x21, x30, x31;
356
357    x55005500 = a1 & ~a5;
358    x5A0F5A0F = a4 ^ x55005500;
359    x3333FFFF = a3 | a6;
360    x66666666 = a1 ^ a3;
361    x22226666 = x3333FFFF & x66666666;
362    x2D2D6969 = a4 ^ x22226666;
363    x25202160 = x2D2D6969 & ~x5A0F5A0F;
364
365    x00FFFF00 = a5 ^ a6;
366    x33CCCC33 = a3 ^ x00FFFF00;
367    x4803120C = x5A0F5A0F & ~x33CCCC33;
368    x2222FFFF = a6 | x22226666;
369    x6A21EDF3 = x4803120C ^ x2222FFFF;
370    x4A01CC93 = x6A21EDF3 & ~x25202160;
371
372    x5555FFFF = a1 | a6;
373    x7F75FFFF = x6A21EDF3 | x5555FFFF;
374    x00D20096 = a5 & ~x2D2D6969;
375    x7FA7FF69 = x7F75FFFF ^ x00D20096;
376
377    x0A0A0000 = a4 & ~x5555FFFF;
378    x0AD80096 = x00D20096 ^ x0A0A0000;
379    x00999900 = x00FFFF00 & ~x66666666;
380    x0AD99996 = x0AD80096 | x00999900;
381
382    x22332233 = a3 & ~x55005500;
383    x257AA5F0 = x5A0F5A0F ^ x7F75FFFF;
384    x054885C0 = x257AA5F0 & ~x22332233;
385    xFAB77A3F = ~x054885C0;
386    x2221EDF3 = x3333FFFF & x6A21EDF3;
387    xD89697CC = xFAB77A3F ^ x2221EDF3;
388    x20 = x7FA7FF69 & ~a2;
389    x21 = x20 ^ xD89697CC;
390    *out3 ^= x21;
391
392    x05B77AC0 = x00FFFF00 ^ x054885C0;
393    x05F77AD6 = x00D20096 | x05B77AC0;
394    x36C48529 = x3333FFFF ^ x05F77AD6;
395    x6391D07C = a1 ^ x36C48529;
396    xBB0747B0 = xD89697CC ^ x6391D07C;
397    x00 = x25202160 | a2;
398    x01 = x00 ^ xBB0747B0;
399    *out1 ^= x01;
400
401    x4C460000 = x3333FFFF ^ x7F75FFFF;
402    x4EDF9996 = x0AD99996 | x4C460000;
403    x2D4E49EA = x6391D07C ^ x4EDF9996;
404    xBBFFFFB0 = x00FFFF00 | xBB0747B0;
405    x96B1B65A = x2D4E49EA ^ xBBFFFFB0;
406    x10 = x4A01CC93 | a2;
407    x11 = x10 ^ x96B1B65A;
408    *out2 ^= x11;
409
410    x5AFF5AFF = a5 | x5A0F5A0F;
411    x52B11215 = x5AFF5AFF & ~x2D4E49EA;
412    x4201C010 = x4A01CC93 & x6391D07C;
413    x10B0D205 = x52B11215 ^ x4201C010;
414    x30 = x10B0D205 | a2;
415    x31 = x30 ^ x0AD99996;
416    *out4 ^= x31;
417}
418
419DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
420{
421    u32 x33CC33CC;
422    u32 x55550000, x00AA00FF, x33BB33FF;
423    u32 x33CC0000, x11441144, x11BB11BB, x003311BB;
424    u32 x00000F0F, x336600FF, x332200FF, x332200F0;
425    u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
426    u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
427    u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
428    u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
429    u32 x0A451047, xBBDFDD7B, xB19ACD3C;
430    u32 x00, x01, x10, x11, x20, x21, x30, x31;
431
432    x33CC33CC = a2 ^ a5;
433
434    x55550000 = a1 & ~a6;
435    x00AA00FF = a5 & ~x55550000;
436    x33BB33FF = a2 | x00AA00FF;
437
438    x33CC0000 = x33CC33CC & ~a6;
439    x11441144 = a1 & x33CC33CC;
440    x11BB11BB = a5 ^ x11441144;
441    x003311BB = x11BB11BB & ~x33CC0000;
442
443    x00000F0F = a3 & a6;
444    x336600FF = x00AA00FF ^ x33CC0000;
445    x332200FF = x33BB33FF & x336600FF;
446    x332200F0 = x332200FF & ~x00000F0F;
447
448    x0302000F = a3 & x332200FF;
449    xAAAAAAAA = ~a1;
450    xA9A8AAA5 = x0302000F ^ xAAAAAAAA;
451    x33CCCC33 = a6 ^ x33CC33CC;
452    x33CCC030 = x33CCCC33 & ~x00000F0F;
453    x9A646A95 = xA9A8AAA5 ^ x33CCC030;
454    x10 = a4 & ~x332200F0;
455    x11 = x10 ^ x9A646A95;
456    *out2 ^= x11;
457
458    x00333303 = a2 & ~x33CCC030;
459    x118822B8 = x11BB11BB ^ x00333303;
460    xA8208805 = xA9A8AAA5 & ~x118822B8;
461    x3CC3C33C = a3 ^ x33CCCC33;
462    x94E34B39 = xA8208805 ^ x3CC3C33C;
463    x00 = x33BB33FF & ~a4;
464    x01 = x00 ^ x94E34B39;
465    *out1 ^= x01;
466
467    x0331330C = x0302000F ^ x00333303;
468    x3FF3F33C = x3CC3C33C | x0331330C;
469    xA9DF596A = x33BB33FF ^ x9A646A95;
470    xA9DF5F6F = x00000F0F | xA9DF596A;
471    x962CAC53 = x3FF3F33C ^ xA9DF5F6F;
472
473    xA9466A6A = x332200FF ^ x9A646A95;
474    x3DA52153 = x94E34B39 ^ xA9466A6A;
475    x29850143 = xA9DF5F6F & x3DA52153;
476    x33C0330C = x33CC33CC & x3FF3F33C;
477    x1A45324F = x29850143 ^ x33C0330C;
478    x20 = x1A45324F | a4;
479    x21 = x20 ^ x962CAC53;
480    *out3 ^= x21;
481
482    x0A451047 = x1A45324F & ~x118822B8;
483    xBBDFDD7B = x33CCCC33 | xA9DF596A;
484    xB19ACD3C = x0A451047 ^ xBBDFDD7B;
485    x30 = x003311BB | a4;
486    x31 = x30 ^ xB19ACD3C;
487    *out4 ^= x31;
488}
489
490DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
491{
492    u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4;
493    u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00;
494    u32 x00005EF4, x00FF5EFF, x00555455, x3C699796;
495    u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F;
496    u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8;
497    u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A;
498    u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356;
499    u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B;
500    u32 x00, x01, x10, x11, x20, x21, x30, x31;
501
502    x44444444 = a1 & ~a2;
503    x0F0FF0F0 = a3 ^ a6;
504    x4F4FF4F4 = x44444444 | x0F0FF0F0;
505    x00FFFF00 = a4 ^ a6;
506    x00AAAA00 = x00FFFF00 & ~a1;
507    x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00;
508
509    x3C3CC3C3 = a2 ^ x0F0FF0F0;
510    x3C3C0000 = x3C3CC3C3 & ~a6;
511    x7373F4F4 = x4F4FF4F4 ^ x3C3C0000;
512    x0C840A00 = x4FE55EF4 & ~x7373F4F4;
513
514    x00005EF4 = a6 & x4FE55EF4;
515    x00FF5EFF = a4 | x00005EF4;
516    x00555455 = a1 & x00FF5EFF;
517    x3C699796 = x3C3CC3C3 ^ x00555455;
518    x30 = x4FE55EF4 & ~a5;
519    x31 = x30 ^ x3C699796;
520    *out4 ^= x31;
521
522    x000FF000 = x0F0FF0F0 & x00FFFF00;
523    x55AA55AA = a1 ^ a4;
524    x26D9A15E = x7373F4F4 ^ x55AA55AA;
525    x2FDFAF5F = a3 | x26D9A15E;
526    x2FD00F5F = x2FDFAF5F & ~x000FF000;
527
528    x55AAFFAA = x00AAAA00 | x55AA55AA;
529    x28410014 = x3C699796 & ~x55AAFFAA;
530    x000000FF = a4 & a6;
531    x000000CC = x000000FF & ~a2;
532    x284100D8 = x28410014 ^ x000000CC;
533
534    x204100D0 = x7373F4F4 & x284100D8;
535    x3C3CC3FF = x3C3CC3C3 | x000000FF;
536    x1C3CC32F = x3C3CC3FF & ~x204100D0;
537    x4969967A = a1 ^ x1C3CC32F;
538    x10 = x2FD00F5F & a5;
539    x11 = x10 ^ x4969967A;
540    *out2 ^= x11;
541
542    x4CC44CC4 = x4FE55EF4 & ~a2;
543    x40C040C0 = x4CC44CC4 & ~a3;
544    xC3C33C3C = ~x3C3CC3C3;
545    x9669C396 = x55AAFFAA ^ xC3C33C3C;
546    xD6A98356 = x40C040C0 ^ x9669C396;
547    x00 = a5 & ~x0C840A00;
548    x01 = x00 ^ xD6A98356;
549    *out1 ^= x01;
550
551    xD6E9C3D6 = x40C040C0 | x9669C396;
552    x4CEEEEC4 = x00AAAA00 | x4CC44CC4;
553    x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4;
554    x001A000B = a4 & ~x4FE55EF4;
555    x9A1F2D1B = x9A072D12 | x001A000B;
556    x20 = a5 & ~x284100D8;
557    x21 = x20 ^ x9A1F2D1B;
558    *out3 ^= x21;
559}
560
561DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
562{
563    u32 x5A5A5A5A, x0F0FF0F0;
564    u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F,
565        x52FBCA0F, x61C8F93C;
566    u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6;
567    u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1;
568    u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6;
569    u32 x00, x01, x10, x11, x20, x21, x30, x31;
570
571    x5A5A5A5A = a1 ^ a3;
572    x0F0FF0F0 = a3 ^ a5;
573    x33FF33FF = a2 | a4;
574    x33FFCC00 = a5 ^ x33FF33FF;
575    x0C0030F0 = x0F0FF0F0 & ~x33FFCC00;
576    x0C0CC0C0 = x0F0FF0F0 & ~a2;
577    x0CF3C03F = a4 ^ x0C0CC0C0;
578    x5EFBDA7F = x5A5A5A5A | x0CF3C03F;
579    x52FBCA0F = x5EFBDA7F & ~x0C0030F0;
580    x61C8F93C = a2 ^ x52FBCA0F;
581
582    x00C0C03C = x0CF3C03F & x61C8F93C;
583    x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C;
584    x3B92A366 = x5A5A5A5A ^ x61C8F93C;
585    x30908326 = x3B92A366 & ~x0F0F30C0;
586    x3C90B3D6 = x0C0030F0 ^ x30908326;
587
588    x33CC33CC = a2 ^ a4;
589    x0C0CFFFF = a5 | x0C0CC0C0;
590    x379E5C99 = x3B92A366 ^ x0C0CFFFF;
591    x04124C11 = x379E5C99 & ~x33CC33CC;
592    x56E9861E = x52FBCA0F ^ x04124C11;
593    x00 = a6 & ~x3C90B3D6;
594    x01 = x00 ^ x56E9861E;
595    *out1 ^= x01;
596
597    xA91679E1 = ~x56E9861E;
598    x10 = x3C90B3D6 & ~a6;
599    x11 = x10 ^ xA91679E1;
600    *out2 ^= x11;
601
602    x9586CA37 = x3C90B3D6 ^ xA91679E1;
603    x8402C833 = x9586CA37 & ~x33CC33CC;
604    x84C2C83F = x00C0C03C | x8402C833;
605    xB35C94A6 = x379E5C99 ^ x84C2C83F;
606    x20 = x61C8F93C | a6;
607    x21 = x20 ^ xB35C94A6;
608    *out3 ^= x21;
609
610    x30 = a6 & x61C8F93C;
611    x31 = x30 ^ xB35C94A6;
612    *out4 ^= x31;
613}
614
615DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
616{
617    u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F;
618    u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B;
619    u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7;
620    u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF;
621    u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A;
622    u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2;
623    u32 x22222222, x16BCEE97, x0F080B04, x19B4E593;
624    u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D;
625    u32 x00, x01, x10, x11, x20, x21, x30, x31;
626
627    x77777777 = a1 | a3;
628    x77770000 = x77777777 & ~a6;
629    x22225555 = a1 ^ x77770000;
630    x11116666 = a3 ^ x22225555;
631    x1F1F6F6F = a4 | x11116666;
632
633    x70700000 = x77770000 & ~a4;
634    x43433333 = a3 ^ x70700000;
635    x00430033 = a5 & x43433333;
636    x55557777 = a1 | x11116666;
637    x55167744 = x00430033 ^ x55557777;
638    x5A19784B = a4 ^ x55167744;
639
640    x5A1987B4 = a6 ^ x5A19784B;
641    x7A3BD7F5 = x22225555 | x5A1987B4;
642    x003B00F5 = a5 & x7A3BD7F5;
643    x221955A0 = x22225555 ^ x003B00F5;
644    x05050707 = a4 & x55557777;
645    x271C52A7 = x221955A0 ^ x05050707;
646
647    x2A2A82A0 = x7A3BD7F5 & ~a1;
648    x6969B193 = x43433333 ^ x2A2A82A0;
649    x1FE06F90 = a5 ^ x1F1F6F6F;
650    x16804E00 = x1FE06F90 & ~x6969B193;
651    xE97FB1FF = ~x16804E00;
652    x20 = xE97FB1FF & ~a2;
653    x21 = x20 ^ x5A19784B;
654    *out3 ^= x21;
655
656    x43403302 = x43433333 & ~x003B00F5;
657    x35CAED30 = x2A2A82A0 ^ x1FE06F90;
658    x37DEFFB7 = x271C52A7 | x35CAED30;
659    x349ECCB5 = x37DEFFB7 & ~x43403302;
660    x0B01234A = x1F1F6F6F & ~x349ECCB5;
661
662    x101884B4 = x5A1987B4 & x349ECCB5;
663    x0FF8EB24 = x1FE06F90 ^ x101884B4;
664    x41413333 = x43433333 & x55557777;
665    x4FF9FB37 = x0FF8EB24 | x41413333;
666    x4FC2FBC2 = x003B00F5 ^ x4FF9FB37;
667    x30 = x4FC2FBC2 & a2;
668    x31 = x30 ^ x271C52A7;
669    *out4 ^= x31;
670
671    x22222222 = a1 ^ x77777777;
672    x16BCEE97 = x349ECCB5 ^ x22222222;
673    x0F080B04 = a4 & x0FF8EB24;
674    x19B4E593 = x16BCEE97 ^ x0F080B04;
675    x00 = x0B01234A | a2;
676    x01 = x00 ^ x19B4E593;
677    *out1 ^= x01;
678
679    x5C5C5C5C = x1F1F6F6F ^ x43433333;
680    x4448184C = x5C5C5C5C & ~x19B4E593;
681    x2DDABE71 = x22225555 ^ x0FF8EB24;
682    x6992A63D = x4448184C ^ x2DDABE71;
683    x10 = x1F1F6F6F & a2;
684    x11 = x10 ^ x6992A63D;
685    *out2 ^= x11;
686}
687
688DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
689{
690    u32 x33CC33CC;
691    u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099;
692    u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6;
693    u32 x09030C06, x09030000, x336622FF, x3A6522FF;
694    u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD;
695    u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B;
696    u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479;
697    u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5;
698    u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67;
699    u32 x00, x01, x10, x11, x20, x21, x30, x31;
700
701    x33CC33CC = a2 ^ a5;
702
703    x3333FFFF = a2 | a6;
704    x11115555 = a1 & x3333FFFF;
705    x22DD6699 = x33CC33CC ^ x11115555;
706    x22DD9966 = a6 ^ x22DD6699;
707    x00220099 = a5 & ~x22DD9966;
708
709    x00551144 = a1 & x22DD9966;
710    x33662277 = a2 ^ x00551144;
711    x5A5A5A5A = a1 ^ a3;
712    x7B7E7A7F = x33662277 | x5A5A5A5A;
713    x59A31CE6 = x22DD6699 ^ x7B7E7A7F;
714
715    x09030C06 = a3 & x59A31CE6;
716    x09030000 = x09030C06 & ~a6;
717    x336622FF = x00220099 | x33662277;
718    x3A6522FF = x09030000 ^ x336622FF;
719    x30 = x3A6522FF & a4;
720    x31 = x30 ^ x59A31CE6;
721    *out4 ^= x31;
722
723    x484D494C = a2 ^ x7B7E7A7F;
724    x0000B6B3 = a6 & ~x484D494C;
725    x0F0FB9BC = a3 ^ x0000B6B3;
726    x00FC00F9 = a5 & ~x09030C06;
727    x0FFFB9FD = x0F0FB9BC | x00FC00F9;
728
729    x5DF75DF7 = a1 | x59A31CE6;
730    x116600F7 = x336622FF & x5DF75DF7;
731    x1E69B94B = x0F0FB9BC ^ x116600F7;
732    x1668B94B = x1E69B94B & ~x09030000;
733    x20 = x00220099 | a4;
734    x21 = x20 ^ x1668B94B;
735    *out3 ^= x21;
736
737    x7B7B7B7B = a2 | x5A5A5A5A;
738    x411E5984 = x3A6522FF ^ x7B7B7B7B;
739    x1FFFFDFD = x11115555 | x0FFFB9FD;
740    x5EE1A479 = x411E5984 ^ x1FFFFDFD;
741
742    x3CB4DFD2 = x22DD6699 ^ x1E69B94B;
743    x004B002D = a5 & ~x3CB4DFD2;
744    xB7B2B6B3 = ~x484D494C;
745    xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3;
746    xCC82CDE5 = x004B002D ^ xCCC9CDC8;
747    x10 = xCC82CDE5 & ~a4;
748    x11 = x10 ^ x5EE1A479;
749    *out2 ^= x11;
750
751    x0055EEBB = a6 ^ x00551144;
752    x5A5AECE9 = a1 ^ x0F0FB9BC;
753    x0050ECA9 = x0055EEBB & x5A5AECE9;
754    xC5CAC1CE = x09030C06 ^ xCCC9CDC8;
755    xC59A2D67 = x0050ECA9 ^ xC5CAC1CE;
756    x00 = x0FFFB9FD & ~a4;
757    x01 = x00 ^ xC59A2D67;
758    *out1 ^= x01;
759}
760
761DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
762{
763    u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841;
764    u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78;
765    u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D;
766    u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B;
767    u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB;
768    u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867;
769    u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD;
770    u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A;
771    u32 x00, x01, x10, x11, x20, x21, x30, x31;
772
773    x0FF00FF0 = a4 ^ a5;
774    x3CC33CC3 = a3 ^ x0FF00FF0;
775    x00003CC3 = a6 & x3CC33CC3;
776    x0F000F00 = a4 & x0FF00FF0;
777    x5A555A55 = a2 ^ x0F000F00;
778    x00001841 = x00003CC3 & x5A555A55;
779
780    x00000F00 = a6 & x0F000F00;
781    x33333C33 = a3 ^ x00000F00;
782    x7B777E77 = x5A555A55 | x33333C33;
783    x0FF0F00F = a6 ^ x0FF00FF0;
784    x74878E78 = x7B777E77 ^ x0FF0F00F;
785    x30 = a1 & ~x00001841;
786    x31 = x30 ^ x74878E78;
787    *out4 ^= x31;
788
789    x003C003C = a5 & ~x3CC33CC3;
790    x5A7D5A7D = x5A555A55 | x003C003C;
791    x333300F0 = x00003CC3 ^ x33333C33;
792    x694E5A8D = x5A7D5A7D ^ x333300F0;
793
794    x0FF0CCCC = x00003CC3 ^ x0FF0F00F;
795    x000F0303 = a4 & ~x0FF0CCCC;
796    x5A505854 = x5A555A55 & ~x000F0303;
797    x33CC000F = a5 ^ x333300F0;
798    x699C585B = x5A505854 ^ x33CC000F;
799
800    x7F878F78 = x0F000F00 | x74878E78;
801    x21101013 = a3 & x699C585B;
802    x7F979F7B = x7F878F78 | x21101013;
803    x30030CC0 = x3CC33CC3 & ~x0FF0F00F;
804    x4F9493BB = x7F979F7B ^ x30030CC0;
805    x00 = x4F9493BB & ~a1;
806    x01 = x00 ^ x694E5A8D;
807    *out1 ^= x01;
808
809    x6F9CDBFB = x699C585B | x4F9493BB;
810    x0000DBFB = a6 & x6F9CDBFB;
811    x00005151 = a2 & x0000DBFB;
812    x26DAC936 = x694E5A8D ^ x4F9493BB;
813    x26DA9867 = x00005151 ^ x26DAC936;
814
815    x27DA9877 = x21101013 | x26DA9867;
816    x27DA438C = x0000DBFB ^ x27DA9877;
817    x2625C9C9 = a5 ^ x26DAC936;
818    x27FFCBCD = x27DA438C | x2625C9C9;
819    x20 = x27FFCBCD & a1;
820    x21 = x20 ^ x699C585B;
821    *out3 ^= x21;
822
823    x27FF1036 = x0000DBFB ^ x27FFCBCD;
824    x27FF103E = x003C003C | x27FF1036;
825    xB06B6C44 = ~x4F9493BB;
826    x97947C7A = x27FF103E ^ xB06B6C44;
827    x10 = x97947C7A & ~a1;
828    x11 = x10 ^ x26DA9867;
829    *out2 ^= x11;
830}
831
832DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
833{
834    u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
835    u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
836    u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
837    u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
838    u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
839    u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
840    u32 xF700A600, x61008000, x03B7856B, x62B7056B;
841    u32 x00, x01, x10, x11, x20, x21, x30, x31;
842
843    x0C0C0C0C = a3 & ~a2;
844    x0000F0F0 = a5 & ~a3;
845    x00FFF00F = a4 ^ x0000F0F0;
846    x00555005 = a1 & x00FFF00F;
847    x00515001 = x00555005 & ~x0C0C0C0C;
848
849    x33000330 = a2 & ~x00FFF00F;
850    x77555775 = a1 | x33000330;
851    x30303030 = a2 & ~a3;
852    x3030CFCF = a5 ^ x30303030;
853    x30104745 = x77555775 & x3030CFCF;
854    x30555745 = x00555005 | x30104745;
855
856    xFF000FF0 = ~x00FFF00F;
857    xCF1048B5 = x30104745 ^ xFF000FF0;
858    x080A080A = a3 & ~x77555775;
859    xC71A40BF = xCF1048B5 ^ x080A080A;
860    xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
861    x10 = x00515001 | a6;
862    x11 = x10 ^ xCB164CB3;
863    *out2 ^= x11;
864
865    x9E4319E6 = a1 ^ xCB164CB3;
866    x000019E6 = a5 & x9E4319E6;
867    xF429738C = a2 ^ xC71A40BF;
868    xF4296A6A = x000019E6 ^ xF429738C;
869    xC729695A = x33000330 ^ xF4296A6A;
870
871    xC47C3D2F = x30555745 ^ xF4296A6A;
872    xF77F3F3F = a2 | xC47C3D2F;
873    x9E43E619 = a5 ^ x9E4319E6;
874    x693CD926 = xF77F3F3F ^ x9E43E619;
875    x20 = x30555745 & a6;
876    x21 = x20 ^ x693CD926;
877    *out3 ^= x21;
878
879    xF719A695 = x3030CFCF ^ xC729695A;
880    xF4FF73FF = a4 | xF429738C;
881    x03E6D56A = xF719A695 ^ xF4FF73FF;
882    x56B3803F = a1 ^ x03E6D56A;
883    x30 = x56B3803F & a6;
884    x31 = x30 ^ xC729695A;
885    *out4 ^= x31;
886
887    xF700A600 = xF719A695 & ~a4;
888    x61008000 = x693CD926 & xF700A600;
889    x03B7856B = x00515001 ^ x03E6D56A;
890    x62B7056B = x61008000 ^ x03B7856B;
891    x00 = x62B7056B | a6;
892    x01 = x00 ^ xC729695A;
893    *out1 ^= x01;
894}
895
896#endif
897#endif
898
899#if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC
900
901/*
902 * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC
903 * architectures.  These use AND, OR, XOR, NOT, and AND-NOT gates.
904 *
905 * Gate counts: 49 44 46 33 48 46 46 41
906 * Average: 44.125
907 *
908 * Several same-gate-count expressions for each S-box are included (for use on
909 * different CPUs/GPUs).
910 *
911 * These Boolean expressions corresponding to DES S-boxes have been generated
912 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
913 * John the Ripper password cracker: http://www.openwall.com/john/
914 * Being mathematical formulas, they are not copyrighted and are free for reuse
915 * by anyone.
916 *
917 * This file (a specific representation of the S-box expressions, surrounding
918 * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
919 * Redistribution and use in source and binary forms, with or without
920 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
921 *
922 * The effort has been sponsored by Rapid7: http://www.rapid7.com
923 */
924
925DECLSPEC void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
926{
927    u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969,
928        x25202160;
929    u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93;
930    u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69;
931    u32 x0A0A0000, x0AD80096, x00999900, x0AD99996;
932    u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC;
933    u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0;
934    u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A;
935    u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205;
936    u32 x00, x01, x10, x11, x20, x21, x30, x31;
937
938    x55005500 = a1 & ~a5;
939    x5A0F5A0F = a4 ^ x55005500;
940    x3333FFFF = a3 | a6;
941    x66666666 = a1 ^ a3;
942    x22226666 = x3333FFFF & x66666666;
943    x2D2D6969 = a4 ^ x22226666;
944    x25202160 = x2D2D6969 & ~x5A0F5A0F;
945
946    x00FFFF00 = a5 ^ a6;
947    x33CCCC33 = a3 ^ x00FFFF00;
948    x4803120C = x5A0F5A0F & ~x33CCCC33;
949    x2222FFFF = a6 | x22226666;
950    x6A21EDF3 = x4803120C ^ x2222FFFF;
951    x4A01CC93 = x6A21EDF3 & ~x25202160;
952
953    x5555FFFF = a1 | a6;
954    x7F75FFFF = x6A21EDF3 | x5555FFFF;
955    x00D20096 = a5 & ~x2D2D6969;
956    x7FA7FF69 = x7F75FFFF ^ x00D20096;
957
958    x0A0A0000 = a4 & ~x5555FFFF;
959    x0AD80096 = x00D20096 ^ x0A0A0000;
960    x00999900 = x00FFFF00 & ~x66666666;
961    x0AD99996 = x0AD80096 | x00999900;
962
963    x22332233 = a3 & ~x55005500;
964    x257AA5F0 = x5A0F5A0F ^ x7F75FFFF;
965    x054885C0 = x257AA5F0 & ~x22332233;
966    xFAB77A3F = ~x054885C0;
967    x2221EDF3 = x3333FFFF & x6A21EDF3;
968    xD89697CC = xFAB77A3F ^ x2221EDF3;
969    x20 = x7FA7FF69 & ~a2;
970    x21 = x20 ^ xD89697CC;
971    *out3 ^= x21;
972
973    x05B77AC0 = x00FFFF00 ^ x054885C0;
974    x05F77AD6 = x00D20096 | x05B77AC0;
975    x36C48529 = x3333FFFF ^ x05F77AD6;
976    x6391D07C = a1 ^ x36C48529;
977    xBB0747B0 = xD89697CC ^ x6391D07C;
978    x00 = x25202160 | a2;
979    x01 = x00 ^ xBB0747B0;
980    *out1 ^= x01;
981
982    x4C460000 = x3333FFFF ^ x7F75FFFF;
983    x4EDF9996 = x0AD99996 | x4C460000;
984    x2D4E49EA = x6391D07C ^ x4EDF9996;
985    xBBFFFFB0 = x00FFFF00 | xBB0747B0;
986    x96B1B65A = x2D4E49EA ^ xBBFFFFB0;
987    x10 = x4A01CC93 | a2;
988    x11 = x10 ^ x96B1B65A;
989    *out2 ^= x11;
990
991    x5AFF5AFF = a5 | x5A0F5A0F;
992    x52B11215 = x5AFF5AFF & ~x2D4E49EA;
993    x4201C010 = x4A01CC93 & x6391D07C;
994    x10B0D205 = x52B11215 ^ x4201C010;
995    x30 = x10B0D205 | a2;
996    x31 = x30 ^ x0AD99996;
997    *out4 ^= x31;
998}
999
1000DECLSPEC void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1001{
1002    u32 x33CC33CC;
1003    u32 x55550000, x00AA00FF, x33BB33FF;
1004    u32 x33CC0000, x11441144, x11BB11BB, x003311BB;
1005    u32 x00000F0F, x336600FF, x332200FF, x332200F0;
1006    u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95;
1007    u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39;
1008    u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53;
1009    u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F;
1010    u32 x0A451047, xBBDFDD7B, xB19ACD3C;
1011    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1012
1013    x33CC33CC = a2 ^ a5;
1014
1015    x55550000 = a1 & ~a6;
1016    x00AA00FF = a5 & ~x55550000;
1017    x33BB33FF = a2 | x00AA00FF;
1018
1019    x33CC0000 = x33CC33CC & ~a6;
1020    x11441144 = a1 & x33CC33CC;
1021    x11BB11BB = a5 ^ x11441144;
1022    x003311BB = x11BB11BB & ~x33CC0000;
1023
1024    x00000F0F = a3 & a6;
1025    x336600FF = x00AA00FF ^ x33CC0000;
1026    x332200FF = x33BB33FF & x336600FF;
1027    x332200F0 = x332200FF & ~x00000F0F;
1028
1029    x0302000F = a3 & x332200FF;
1030    xAAAAAAAA = ~a1;
1031    xA9A8AAA5 = x0302000F ^ xAAAAAAAA;
1032    x33CCCC33 = a6 ^ x33CC33CC;
1033    x33CCC030 = x33CCCC33 & ~x00000F0F;
1034    x9A646A95 = xA9A8AAA5 ^ x33CCC030;
1035    x10 = a4 & ~x332200F0;
1036    x11 = x10 ^ x9A646A95;
1037    *out2 ^= x11;
1038
1039    x00333303 = a2 & ~x33CCC030;
1040    x118822B8 = x11BB11BB ^ x00333303;
1041    xA8208805 = xA9A8AAA5 & ~x118822B8;
1042    x3CC3C33C = a3 ^ x33CCCC33;
1043    x94E34B39 = xA8208805 ^ x3CC3C33C;
1044    x00 = x33BB33FF & ~a4;
1045    x01 = x00 ^ x94E34B39;
1046    *out1 ^= x01;
1047
1048    x0331330C = x0302000F ^ x00333303;
1049    x3FF3F33C = x3CC3C33C | x0331330C;
1050    xA9DF596A = x33BB33FF ^ x9A646A95;
1051    xA9DF5F6F = x00000F0F | xA9DF596A;
1052    x962CAC53 = x3FF3F33C ^ xA9DF5F6F;
1053
1054    xA9466A6A = x332200FF ^ x9A646A95;
1055    x3DA52153 = x94E34B39 ^ xA9466A6A;
1056    x29850143 = xA9DF5F6F & x3DA52153;
1057    x33C0330C = x33CC33CC & x3FF3F33C;
1058    x1A45324F = x29850143 ^ x33C0330C;
1059    x20 = x1A45324F | a4;
1060    x21 = x20 ^ x962CAC53;
1061    *out3 ^= x21;
1062
1063    x0A451047 = x1A45324F & ~x118822B8;
1064    xBBDFDD7B = x33CCCC33 | xA9DF596A;
1065    xB19ACD3C = x0A451047 ^ xBBDFDD7B;
1066    x30 = x003311BB | a4;
1067    x31 = x30 ^ xB19ACD3C;
1068    *out4 ^= x31;
1069}
1070
1071DECLSPEC void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1072{
1073    u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4;
1074    u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00;
1075    u32 x00005EF4, x00FF5EFF, x00555455, x3C699796;
1076    u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F;
1077    u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8;
1078    u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A;
1079    u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356;
1080    u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B;
1081    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1082
1083    x44444444 = a1 & ~a2;
1084    x0F0FF0F0 = a3 ^ a6;
1085    x4F4FF4F4 = x44444444 | x0F0FF0F0;
1086    x00FFFF00 = a4 ^ a6;
1087    x00AAAA00 = x00FFFF00 & ~a1;
1088    x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00;
1089
1090    x3C3CC3C3 = a2 ^ x0F0FF0F0;
1091    x3C3C0000 = x3C3CC3C3 & ~a6;
1092    x7373F4F4 = x4F4FF4F4 ^ x3C3C0000;
1093    x0C840A00 = x4FE55EF4 & ~x7373F4F4;
1094
1095    x00005EF4 = a6 & x4FE55EF4;
1096    x00FF5EFF = a4 | x00005EF4;
1097    x00555455 = a1 & x00FF5EFF;
1098    x3C699796 = x3C3CC3C3 ^ x00555455;
1099    x30 = x4FE55EF4 & ~a5;
1100    x31 = x30 ^ x3C699796;
1101    *out4 ^= x31;
1102
1103    x000FF000 = x0F0FF0F0 & x00FFFF00;
1104    x55AA55AA = a1 ^ a4;
1105    x26D9A15E = x7373F4F4 ^ x55AA55AA;
1106    x2FDFAF5F = a3 | x26D9A15E;
1107    x2FD00F5F = x2FDFAF5F & ~x000FF000;
1108
1109    x55AAFFAA = x00AAAA00 | x55AA55AA;
1110    x28410014 = x3C699796 & ~x55AAFFAA;
1111    x000000FF = a4 & a6;
1112    x000000CC = x000000FF & ~a2;
1113    x284100D8 = x28410014 ^ x000000CC;
1114
1115    x204100D0 = x7373F4F4 & x284100D8;
1116    x3C3CC3FF = x3C3CC3C3 | x000000FF;
1117    x1C3CC32F = x3C3CC3FF & ~x204100D0;
1118    x4969967A = a1 ^ x1C3CC32F;
1119    x10 = x2FD00F5F & a5;
1120    x11 = x10 ^ x4969967A;
1121    *out2 ^= x11;
1122
1123    x4CC44CC4 = x4FE55EF4 & ~a2;
1124    x40C040C0 = x4CC44CC4 & ~a3;
1125    xC3C33C3C = ~x3C3CC3C3;
1126    x9669C396 = x55AAFFAA ^ xC3C33C3C;
1127    xD6A98356 = x40C040C0 ^ x9669C396;
1128    x00 = a5 & ~x0C840A00;
1129    x01 = x00 ^ xD6A98356;
1130    *out1 ^= x01;
1131
1132    xD6E9C3D6 = x40C040C0 | x9669C396;
1133    x4CEEEEC4 = x00AAAA00 | x4CC44CC4;
1134    x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4;
1135    x001A000B = a4 & ~x4FE55EF4;
1136    x9A1F2D1B = x9A072D12 | x001A000B;
1137    x20 = a5 & ~x284100D8;
1138    x21 = x20 ^ x9A1F2D1B;
1139    *out3 ^= x21;
1140}
1141
1142DECLSPEC void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1143{
1144    u32 x5A5A5A5A, x0F0FF0F0;
1145    u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F,
1146        x52FBCA0F, x61C8F93C;
1147    u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6;
1148    u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1;
1149    u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6;
1150    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1151
1152    x5A5A5A5A = a1 ^ a3;
1153    x0F0FF0F0 = a3 ^ a5;
1154    x33FF33FF = a2 | a4;
1155    x33FFCC00 = a5 ^ x33FF33FF;
1156    x0C0030F0 = x0F0FF0F0 & ~x33FFCC00;
1157    x0C0CC0C0 = x0F0FF0F0 & ~a2;
1158    x0CF3C03F = a4 ^ x0C0CC0C0;
1159    x5EFBDA7F = x5A5A5A5A | x0CF3C03F;
1160    x52FBCA0F = x5EFBDA7F & ~x0C0030F0;
1161    x61C8F93C = a2 ^ x52FBCA0F;
1162
1163    x00C0C03C = x0CF3C03F & x61C8F93C;
1164    x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C;
1165    x3B92A366 = x5A5A5A5A ^ x61C8F93C;
1166    x30908326 = x3B92A366 & ~x0F0F30C0;
1167    x3C90B3D6 = x0C0030F0 ^ x30908326;
1168
1169    x33CC33CC = a2 ^ a4;
1170    x0C0CFFFF = a5 | x0C0CC0C0;
1171    x379E5C99 = x3B92A366 ^ x0C0CFFFF;
1172    x04124C11 = x379E5C99 & ~x33CC33CC;
1173    x56E9861E = x52FBCA0F ^ x04124C11;
1174    x00 = a6 & ~x3C90B3D6;
1175    x01 = x00 ^ x56E9861E;
1176    *out1 ^= x01;
1177
1178    xA91679E1 = ~x56E9861E;
1179    x10 = x3C90B3D6 & ~a6;
1180    x11 = x10 ^ xA91679E1;
1181    *out2 ^= x11;
1182
1183    x9586CA37 = x3C90B3D6 ^ xA91679E1;
1184    x8402C833 = x9586CA37 & ~x33CC33CC;
1185    x84C2C83F = x00C0C03C | x8402C833;
1186    xB35C94A6 = x379E5C99 ^ x84C2C83F;
1187    x20 = x61C8F93C | a6;
1188    x21 = x20 ^ xB35C94A6;
1189    *out3 ^= x21;
1190
1191    x30 = a6 & x61C8F93C;
1192    x31 = x30 ^ xB35C94A6;
1193    *out4 ^= x31;
1194}
1195
1196DECLSPEC void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1197{
1198    u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F;
1199    u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B;
1200    u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7;
1201    u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF;
1202    u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A;
1203    u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2;
1204    u32 x22222222, x16BCEE97, x0F080B04, x19B4E593;
1205    u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D;
1206    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1207
1208    x77777777 = a1 | a3;
1209    x77770000 = x77777777 & ~a6;
1210    x22225555 = a1 ^ x77770000;
1211    x11116666 = a3 ^ x22225555;
1212    x1F1F6F6F = a4 | x11116666;
1213
1214    x70700000 = x77770000 & ~a4;
1215    x43433333 = a3 ^ x70700000;
1216    x00430033 = a5 & x43433333;
1217    x55557777 = a1 | x11116666;
1218    x55167744 = x00430033 ^ x55557777;
1219    x5A19784B = a4 ^ x55167744;
1220
1221    x5A1987B4 = a6 ^ x5A19784B;
1222    x7A3BD7F5 = x22225555 | x5A1987B4;
1223    x003B00F5 = a5 & x7A3BD7F5;
1224    x221955A0 = x22225555 ^ x003B00F5;
1225    x05050707 = a4 & x55557777;
1226    x271C52A7 = x221955A0 ^ x05050707;
1227
1228    x2A2A82A0 = x7A3BD7F5 & ~a1;
1229    x6969B193 = x43433333 ^ x2A2A82A0;
1230    x1FE06F90 = a5 ^ x1F1F6F6F;
1231    x16804E00 = x1FE06F90 & ~x6969B193;
1232    xE97FB1FF = ~x16804E00;
1233    x20 = xE97FB1FF & ~a2;
1234    x21 = x20 ^ x5A19784B;
1235    *out3 ^= x21;
1236
1237    x43403302 = x43433333 & ~x003B00F5;
1238    x35CAED30 = x2A2A82A0 ^ x1FE06F90;
1239    x37DEFFB7 = x271C52A7 | x35CAED30;
1240    x349ECCB5 = x37DEFFB7 & ~x43403302;
1241    x0B01234A = x1F1F6F6F & ~x349ECCB5;
1242
1243    x101884B4 = x5A1987B4 & x349ECCB5;
1244    x0FF8EB24 = x1FE06F90 ^ x101884B4;
1245    x41413333 = x43433333 & x55557777;
1246    x4FF9FB37 = x0FF8EB24 | x41413333;
1247    x4FC2FBC2 = x003B00F5 ^ x4FF9FB37;
1248    x30 = x4FC2FBC2 & a2;
1249    x31 = x30 ^ x271C52A7;
1250    *out4 ^= x31;
1251
1252    x22222222 = a1 ^ x77777777;
1253    x16BCEE97 = x349ECCB5 ^ x22222222;
1254    x0F080B04 = a4 & x0FF8EB24;
1255    x19B4E593 = x16BCEE97 ^ x0F080B04;
1256    x00 = x0B01234A | a2;
1257    x01 = x00 ^ x19B4E593;
1258    *out1 ^= x01;
1259
1260    x5C5C5C5C = x1F1F6F6F ^ x43433333;
1261    x4448184C = x5C5C5C5C & ~x19B4E593;
1262    x2DDABE71 = x22225555 ^ x0FF8EB24;
1263    x6992A63D = x4448184C ^ x2DDABE71;
1264    x10 = x1F1F6F6F & a2;
1265    x11 = x10 ^ x6992A63D;
1266    *out2 ^= x11;
1267}
1268
1269DECLSPEC void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1270{
1271    u32 x33CC33CC;
1272    u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099;
1273    u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6;
1274    u32 x09030C06, x09030000, x336622FF, x3A6522FF;
1275    u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD;
1276    u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B;
1277    u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479;
1278    u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5;
1279    u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67;
1280    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1281
1282    x33CC33CC = a2 ^ a5;
1283
1284    x3333FFFF = a2 | a6;
1285    x11115555 = a1 & x3333FFFF;
1286    x22DD6699 = x33CC33CC ^ x11115555;
1287    x22DD9966 = a6 ^ x22DD6699;
1288    x00220099 = a5 & ~x22DD9966;
1289
1290    x00551144 = a1 & x22DD9966;
1291    x33662277 = a2 ^ x00551144;
1292    x5A5A5A5A = a1 ^ a3;
1293    x7B7E7A7F = x33662277 | x5A5A5A5A;
1294    x59A31CE6 = x22DD6699 ^ x7B7E7A7F;
1295
1296    x09030C06 = a3 & x59A31CE6;
1297    x09030000 = x09030C06 & ~a6;
1298    x336622FF = x00220099 | x33662277;
1299    x3A6522FF = x09030000 ^ x336622FF;
1300    x30 = x3A6522FF & a4;
1301    x31 = x30 ^ x59A31CE6;
1302    *out4 ^= x31;
1303
1304    x484D494C = a2 ^ x7B7E7A7F;
1305    x0000B6B3 = a6 & ~x484D494C;
1306    x0F0FB9BC = a3 ^ x0000B6B3;
1307    x00FC00F9 = a5 & ~x09030C06;
1308    x0FFFB9FD = x0F0FB9BC | x00FC00F9;
1309
1310    x5DF75DF7 = a1 | x59A31CE6;
1311    x116600F7 = x336622FF & x5DF75DF7;
1312    x1E69B94B = x0F0FB9BC ^ x116600F7;
1313    x1668B94B = x1E69B94B & ~x09030000;
1314    x20 = x00220099 | a4;
1315    x21 = x20 ^ x1668B94B;
1316    *out3 ^= x21;
1317
1318    x7B7B7B7B = a2 | x5A5A5A5A;
1319    x411E5984 = x3A6522FF ^ x7B7B7B7B;
1320    x1FFFFDFD = x11115555 | x0FFFB9FD;
1321    x5EE1A479 = x411E5984 ^ x1FFFFDFD;
1322
1323    x3CB4DFD2 = x22DD6699 ^ x1E69B94B;
1324    x004B002D = a5 & ~x3CB4DFD2;
1325    xB7B2B6B3 = ~x484D494C;
1326    xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3;
1327    xCC82CDE5 = x004B002D ^ xCCC9CDC8;
1328    x10 = xCC82CDE5 & ~a4;
1329    x11 = x10 ^ x5EE1A479;
1330    *out2 ^= x11;
1331
1332    x0055EEBB = a6 ^ x00551144;
1333    x5A5AECE9 = a1 ^ x0F0FB9BC;
1334    x0050ECA9 = x0055EEBB & x5A5AECE9;
1335    xC5CAC1CE = x09030C06 ^ xCCC9CDC8;
1336    xC59A2D67 = x0050ECA9 ^ xC5CAC1CE;
1337    x00 = x0FFFB9FD & ~a4;
1338    x01 = x00 ^ xC59A2D67;
1339    *out1 ^= x01;
1340}
1341
1342DECLSPEC void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1343{
1344    u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841;
1345    u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78;
1346    u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D;
1347    u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B;
1348    u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB;
1349    u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867;
1350    u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD;
1351    u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A;
1352    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1353
1354    x0FF00FF0 = a4 ^ a5;
1355    x3CC33CC3 = a3 ^ x0FF00FF0;
1356    x00003CC3 = a6 & x3CC33CC3;
1357    x0F000F00 = a4 & x0FF00FF0;
1358    x5A555A55 = a2 ^ x0F000F00;
1359    x00001841 = x00003CC3 & x5A555A55;
1360
1361    x00000F00 = a6 & x0F000F00;
1362    x33333C33 = a3 ^ x00000F00;
1363    x7B777E77 = x5A555A55 | x33333C33;
1364    x0FF0F00F = a6 ^ x0FF00FF0;
1365    x74878E78 = x7B777E77 ^ x0FF0F00F;
1366    x30 = a1 & ~x00001841;
1367    x31 = x30 ^ x74878E78;
1368    *out4 ^= x31;
1369
1370    x003C003C = a5 & ~x3CC33CC3;
1371    x5A7D5A7D = x5A555A55 | x003C003C;
1372    x333300F0 = x00003CC3 ^ x33333C33;
1373    x694E5A8D = x5A7D5A7D ^ x333300F0;
1374
1375    x0FF0CCCC = x00003CC3 ^ x0FF0F00F;
1376    x000F0303 = a4 & ~x0FF0CCCC;
1377    x5A505854 = x5A555A55 & ~x000F0303;
1378    x33CC000F = a5 ^ x333300F0;
1379    x699C585B = x5A505854 ^ x33CC000F;
1380
1381    x7F878F78 = x0F000F00 | x74878E78;
1382    x21101013 = a3 & x699C585B;
1383    x7F979F7B = x7F878F78 | x21101013;
1384    x30030CC0 = x3CC33CC3 & ~x0FF0F00F;
1385    x4F9493BB = x7F979F7B ^ x30030CC0;
1386    x00 = x4F9493BB & ~a1;
1387    x01 = x00 ^ x694E5A8D;
1388    *out1 ^= x01;
1389
1390    x6F9CDBFB = x699C585B | x4F9493BB;
1391    x0000DBFB = a6 & x6F9CDBFB;
1392    x00005151 = a2 & x0000DBFB;
1393    x26DAC936 = x694E5A8D ^ x4F9493BB;
1394    x26DA9867 = x00005151 ^ x26DAC936;
1395
1396    x27DA9877 = x21101013 | x26DA9867;
1397    x27DA438C = x0000DBFB ^ x27DA9877;
1398    x2625C9C9 = a5 ^ x26DAC936;
1399    x27FFCBCD = x27DA438C | x2625C9C9;
1400    x20 = x27FFCBCD & a1;
1401    x21 = x20 ^ x699C585B;
1402    *out3 ^= x21;
1403
1404    x27FF1036 = x0000DBFB ^ x27FFCBCD;
1405    x27FF103E = x003C003C | x27FF1036;
1406    xB06B6C44 = ~x4F9493BB;
1407    x97947C7A = x27FF103E ^ xB06B6C44;
1408    x10 = x97947C7A & ~a1;
1409    x11 = x10 ^ x26DA9867;
1410    *out2 ^= x11;
1411}
1412
1413DECLSPEC void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4)
1414{
1415    u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
1416    u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
1417    u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
1418    u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
1419    u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
1420    u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
1421    u32 xF700A600, x61008000, x03B7856B, x62B7056B;
1422    u32 x00, x01, x10, x11, x20, x21, x30, x31;
1423
1424    x0C0C0C0C = a3 & ~a2;
1425    x0000F0F0 = a5 & ~a3;
1426    x00FFF00F = a4 ^ x0000F0F0;
1427    x00555005 = a1 & x00FFF00F;
1428    x00515001 = x00555005 & ~x0C0C0C0C;
1429
1430    x33000330 = a2 & ~x00FFF00F;
1431    x77555775 = a1 | x33000330;
1432    x30303030 = a2 & ~a3;
1433    x3030CFCF = a5 ^ x30303030;
1434    x30104745 = x77555775 & x3030CFCF;
1435    x30555745 = x00555005 | x30104745;
1436
1437    xFF000FF0 = ~x00FFF00F;
1438    xCF1048B5 = x30104745 ^ xFF000FF0;
1439    x080A080A = a3 & ~x77555775;
1440    xC71A40BF = xCF1048B5 ^ x080A080A;
1441    xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
1442    x10 = x00515001 | a6;
1443    x11 = x10 ^ xCB164CB3;
1444    *out2 ^= x11;
1445
1446    x9E4319E6 = a1 ^ xCB164CB3;
1447    x000019E6 = a5 & x9E4319E6;
1448    xF429738C = a2 ^ xC71A40BF;
1449    xF4296A6A = x000019E6 ^ xF429738C;
1450    xC729695A = x33000330 ^ xF4296A6A;
1451
1452    xC47C3D2F = x30555745 ^ xF4296A6A;
1453    xF77F3F3F = a2 | xC47C3D2F;
1454    x9E43E619 = a5 ^ x9E4319E6;
1455    x693CD926 = xF77F3F3F ^ x9E43E619;
1456    x20 = x30555745 & a6;
1457    x21 = x20 ^ x693CD926;
1458    *out3 ^= x21;
1459
1460    xF719A695 = x3030CFCF ^ xC729695A;
1461    xF4FF73FF = a4 | xF429738C;
1462    x03E6D56A = xF719A695 ^ xF4FF73FF;
1463    x56B3803F = a1 ^ x03E6D56A;
1464    x30 = x56B3803F & a6;
1465    x31 = x30 ^ xC729695A;
1466    *out4 ^= x31;
1467
1468    xF700A600 = xF719A695 & ~a4;
1469    x61008000 = x693CD926 & xF700A600;
1470    x03B7856B = x00515001 ^ x03E6D56A;
1471    x62B7056B = x61008000 ^ x03B7856B;
1472    x00 = x62B7056B | a6;
1473    x01 = x00 ^ xC729695A;
1474    *out1 ^= x01;
1475}
1476
1477#endif
1478
// Exchange the two u32 values that pointers a and b refer to.
// Both parameters are parenthesized so that pointer expressions such as
// (p + 1) dereference correctly. Each argument is evaluated twice, so it
// must be free of side effects, and must not itself be named `tmp`.
#define SWAP(a, b) { u32 tmp = *(a); *(a) = *(b); *(b) = tmp; }
1480
// Swap *Dnn with *D(nn+32) for nn = 00..31, in ascending order.
// D00..D63 must be u32 pointers visible at the expansion site. The expansion
// already ends in a semicolon.
// NOTE(review): presumably this exchanges the two 32-word halves of the
// bitsliced DES data block - confirm against the caller.
#define DATASWAP                    \
  SWAP (D00, D32); SWAP (D01, D33); \
  SWAP (D02, D34); SWAP (D03, D35); \
  SWAP (D04, D36); SWAP (D05, D37); \
  SWAP (D06, D38); SWAP (D07, D39); \
  SWAP (D08, D40); SWAP (D09, D41); \
  SWAP (D10, D42); SWAP (D11, D43); \
  SWAP (D12, D44); SWAP (D13, D45); \
  SWAP (D14, D46); SWAP (D15, D47); \
  SWAP (D16, D48); SWAP (D17, D49); \
  SWAP (D18, D50); SWAP (D19, D51); \
  SWAP (D20, D52); SWAP (D21, D53); \
  SWAP (D22, D54); SWAP (D23, D55); \
  SWAP (D24, D56); SWAP (D25, D57); \
  SWAP (D26, D58); SWAP (D27, D59); \
  SWAP (D28, D60); SWAP (D29, D61); \
  SWAP (D30, D62); SWAP (D31, D63);
1514
// KEYSETxy: each macro expands to one brace-enclosed block that assigns the
// 48 names k00..k47 from a fixed selection of the names K00..K55, all of which
// must be visible at the expansion site.
// NOTE(review): presumably one macro per DES round sub-key selection - confirm
// against the code that expands them.

#define KEYSET00 { \
  k00 = K08; k01 = K44; k02 = K29; k03 = K52; k04 = K42; k05 = K14; k06 = K28; k07 = K49; \
  k08 = K01; k09 = K07; k10 = K16; k11 = K36; k12 = K02; k13 = K30; k14 = K22; k15 = K21; \
  k16 = K38; k17 = K50; k18 = K51; k19 = K00; k20 = K31; k21 = K23; k22 = K15; k23 = K35; \
  k24 = K19; k25 = K24; k26 = K34; k27 = K47; k28 = K32; k29 = K03; k30 = K41; k31 = K26; \
  k32 = K04; k33 = K46; k34 = K20; k35 = K25; k36 = K53; k37 = K18; k38 = K33; k39 = K55; \
  k40 = K13; k41 = K17; k42 = K39; k43 = K12; k44 = K11; k45 = K54; k46 = K48; k47 = K27; \
}

#define KEYSET10 { \
  k00 = K49; k01 = K28; k02 = K45; k03 = K36; k04 = K01; k05 = K30; k06 = K44; k07 = K08; \
  k08 = K42; k09 = K23; k10 = K00; k11 = K52; k12 = K43; k13 = K14; k14 = K38; k15 = K37; \
  k16 = K22; k17 = K09; k18 = K35; k19 = K16; k20 = K15; k21 = K07; k22 = K31; k23 = K51; \
  k24 = K03; k25 = K40; k26 = K46; k27 = K04; k28 = K20; k29 = K19; k30 = K53; k31 = K10; \
  k32 = K47; k33 = K34; k34 = K32; k35 = K13; k36 = K41; k37 = K06; k38 = K17; k39 = K12; \
  k40 = K25; k41 = K33; k42 = K27; k43 = K55; k44 = K54; k45 = K11; k46 = K05; k47 = K39; \
}

#define KEYSET01 { \
  k00 = K01; k01 = K37; k02 = K22; k03 = K45; k04 = K35; k05 = K07; k06 = K21; k07 = K42; \
  k08 = K51; k09 = K00; k10 = K09; k11 = K29; k12 = K52; k13 = K23; k14 = K15; k15 = K14; \
  k16 = K31; k17 = K43; k18 = K44; k19 = K50; k20 = K49; k21 = K16; k22 = K08; k23 = K28; \
  k24 = K12; k25 = K17; k26 = K27; k27 = K40; k28 = K25; k29 = K55; k30 = K34; k31 = K19; \
  k32 = K24; k33 = K39; k34 = K13; k35 = K18; k36 = K46; k37 = K11; k38 = K26; k39 = K48; \
  k40 = K06; k41 = K10; k42 = K32; k43 = K05; k44 = K04; k45 = K47; k46 = K41; k47 = K20; \
}

#define KEYSET11 { \
  k00 = K35; k01 = K14; k02 = K31; k03 = K22; k04 = K44; k05 = K16; k06 = K30; k07 = K51; \
  k08 = K28; k09 = K09; k10 = K43; k11 = K38; k12 = K29; k13 = K00; k14 = K49; k15 = K23; \
  k16 = K08; k17 = K52; k18 = K21; k19 = K02; k20 = K01; k21 = K50; k22 = K42; k23 = K37; \
  k24 = K48; k25 = K26; k26 = K32; k27 = K17; k28 = K06; k29 = K05; k30 = K39; k31 = K55; \
  k32 = K33; k33 = K20; k34 = K18; k35 = K54; k36 = K27; k37 = K47; k38 = K03; k39 = K53; \
  k40 = K11; k41 = K19; k42 = K13; k43 = K41; k44 = K40; k45 = K24; k46 = K46; k47 = K25; \
}

#define KEYSET02 { \
  k00 = K44; k01 = K23; k02 = K08; k03 = K31; k04 = K21; k05 = K50; k06 = K07; k07 = K28; \
  k08 = K37; k09 = K43; k10 = K52; k11 = K15; k12 = K38; k13 = K09; k14 = K01; k15 = K00; \
  k16 = K42; k17 = K29; k18 = K30; k19 = K36; k20 = K35; k21 = K02; k22 = K51; k23 = K14; \
  k24 = K53; k25 = K03; k26 = K13; k27 = K26; k28 = K11; k29 = K41; k30 = K20; k31 = K05; \
  k32 = K10; k33 = K25; k34 = K54; k35 = K04; k36 = K32; k37 = K24; k38 = K12; k39 = K34; \
  k40 = K47; k41 = K55; k42 = K18; k43 = K46; k44 = K17; k45 = K33; k46 = K27; k47 = K06; \
}

#define KEYSET12 { \
  k00 = K21; k01 = K00; k02 = K42; k03 = K08; k04 = K30; k05 = K02; k06 = K16; k07 = K37; \
  k08 = K14; k09 = K52; k10 = K29; k11 = K49; k12 = K15; k13 = K43; k14 = K35; k15 = K09; \
  k16 = K51; k17 = K38; k18 = K07; k19 = K45; k20 = K44; k21 = K36; k22 = K28; k23 = K23; \
  k24 = K34; k25 = K12; k26 = K18; k27 = K03; k28 = K47; k29 = K46; k30 = K25; k31 = K41; \
  k32 = K19; k33 = K06; k34 = K04; k35 = K40; k36 = K13; k37 = K33; k38 = K48; k39 = K39; \
  k40 = K24; k41 = K05; k42 = K54; k43 = K27; k44 = K26; k45 = K10; k46 = K32; k47 = K11; \
}

#define KEYSET03 { \
  k00 = K30; k01 = K09; k02 = K51; k03 = K42; k04 = K07; k05 = K36; k06 = K50; k07 = K14; \
  k08 = K23; k09 = K29; k10 = K38; k11 = K01; k12 = K49; k13 = K52; k14 = K44; k15 = K43; \
  k16 = K28; k17 = K15; k18 = K16; k19 = K22; k20 = K21; k21 = K45; k22 = K37; k23 = K00; \
  k24 = K39; k25 = K48; k26 = K54; k27 = K12; k28 = K24; k29 = K27; k30 = K06; k31 = K46; \
  k32 = K55; k33 = K11; k34 = K40; k35 = K17; k36 = K18; k37 = K10; k38 = K53; k39 = K20; \
  k40 = K33; k41 = K41; k42 = K04; k43 = K32; k44 = K03; k45 = K19; k46 = K13; k47 = K47; \
}

#define KEYSET13 { \
  k00 = K07; k01 = K43; k02 = K28; k03 = K51; k04 = K16; k05 = K45; k06 = K02; k07 = K23; \
  k08 = K00; k09 = K38; k10 = K15; k11 = K35; k12 = K01; k13 = K29; k14 = K21; k15 = K52; \
  k16 = K37; k17 = K49; k18 = K50; k19 = K31; k20 = K30; k21 = K22; k22 = K14; k23 = K09; \
  k24 = K20; k25 = K53; k26 = K04; k27 = K48; k28 = K33; k29 = K32; k30 = K11; k31 = K27; \
  k32 = K05; k33 = K47; k34 = K17; k35 = K26; k36 = K54; k37 = K19; k38 = K34; k39 = K25; \
  k40 = K10; k41 = K46; k42 = K40; k43 = K13; k44 = K12; k45 = K55; k46 = K18; k47 = K24; \
}

#define KEYSET04 { \
  k00 = K16; k01 = K52; k02 = K37; k03 = K28; k04 = K50; k05 = K22; k06 = K36; k07 = K00; \
  k08 = K09; k09 = K15; k10 = K49; k11 = K44; k12 = K35; k13 = K38; k14 = K30; k15 = K29; \
  k16 = K14; k17 = K01; k18 = K02; k19 = K08; k20 = K07; k21 = K31; k22 = K23; k23 = K43; \
  k24 = K25; k25 = K34; k26 = K40; k27 = K53; k28 = K10; k29 = K13; k30 = K47; k31 = K32; \
  k32 = K41; k33 = K24; k34 = K26; k35 = K03; k36 = K04; k37 = K55; k38 = K39; k39 = K06; \
  k40 = K19; k41 = K27; k42 = K17; k43 = K18; k44 = K48; k45 = K05; k46 = K54; k47 = K33; \
}

#define KEYSET14 { \
  k00 = K50; k01 = K29; k02 = K14; k03 = K37; k04 = K02; k05 = K31; k06 = K45; k07 = K09; \
  k08 = K43; k09 = K49; k10 = K01; k11 = K21; k12 = K44; k13 = K15; k14 = K07; k15 = K38; \
  k16 = K23; k17 = K35; k18 = K36; k19 = K42; k20 = K16; k21 = K08; k22 = K00; k23 = K52; \
  k24 = K06; k25 = K39; k26 = K17; k27 = K34; k28 = K19; k29 = K18; k30 = K24; k31 = K13; \
  k32 = K46; k33 = K33; k34 = K03; k35 = K12; k36 = K40; k37 = K05; k38 = K20; k39 = K11; \
  k40 = K55; k41 = K32; k42 = K26; k43 = K54; k44 = K53; k45 = K41; k46 = K04; k47 = K10; \
}

#define KEYSET05 { \
  k00 = K02; k01 = K38; k02 = K23; k03 = K14; k04 = K36; k05 = K08; k06 = K22; k07 = K43; \
  k08 = K52; k09 = K01; k10 = K35; k11 = K30; k12 = K21; k13 = K49; k14 = K16; k15 = K15; \
  k16 = K00; k17 = K44; k18 = K45; k19 = K51; k20 = K50; k21 = K42; k22 = K09; k23 = K29; \
  k24 = K11; k25 = K20; k26 = K26; k27 = K39; k28 = K55; k29 = K54; k30 = K33; k31 = K18; \
  k32 = K27; k33 = K10; k34 = K12; k35 = K48; k36 = K17; k37 = K41; k38 = K25; k39 = K47; \
  k40 = K05; k41 = K13; k42 = K03; k43 = K04; k44 = K34; k45 = K46; k46 = K40; k47 = K19; \
}

#define KEYSET15 { \
  k00 = K36; k01 = K15; k02 = K00; k03 = K23; k04 = K45; k05 = K42; k06 = K31; k07 = K52; \
  k08 = K29; k09 = K35; k10 = K44; k11 = K07; k12 = K30; k13 = K01; k14 = K50; k15 = K49; \
  k16 = K09; k17 = K21; k18 = K22; k19 = K28; k20 = K02; k21 = K51; k22 = K43; k23 = K38; \
  k24 = K47; k25 = K25; k26 = K03; k27 = K20; k28 = K05; k29 = K04; k30 = K10; k31 = K54; \
  k32 = K32; k33 = K19; k34 = K48; k35 = K53; k36 = K26; k37 = K46; k38 = K06; k39 = K24; \
  k40 = K41; k41 = K18; k42 = K12; k43 = K40; k44 = K39; k45 = K27; k46 = K17; k47 = K55; \
}

#define KEYSET06 { \
  k00 = K45; k01 = K49; k02 = K09; k03 = K00; k04 = K22; k05 = K51; k06 = K08; k07 = K29; \
  k08 = K38; k09 = K44; k10 = K21; k11 = K16; k12 = K07; k13 = K35; k14 = K02; k15 = K01; \
  k16 = K43; k17 = K30; k18 = K31; k19 = K37; k20 = K36; k21 = K28; k22 = K52; k23 = K15; \
  k24 = K24; k25 = K06; k26 = K12; k27 = K25; k28 = K41; k29 = K40; k30 = K19; k31 = K04; \
  k32 = K13; k33 = K55; k34 = K53; k35 = K34; k36 = K03; k37 = K27; k38 = K11; k39 = K33; \
  k40 = K46; k41 = K54; k42 = K48; k43 = K17; k44 = K20; k45 = K32; k46 = K26; k47 = K05; \
}

#define KEYSET16 { \
  k00 = K22; k01 = K01; k02 = K43; k03 = K09; k04 = K31; k05 = K28; k06 = K42; k07 = K38; \
  k08 = K15; k09 = K21; k10 = K30; k11 = K50; k12 = K16; k13 = K44; k14 = K36; k15 = K35; \
  k16 = K52; k17 = K07; k18 = K08; k19 = K14; k20 = K45; k21 = K37; k22 = K29; k23 = K49; \
  k24 = K33; k25 = K11; k26 = K48; k27 = K06; k28 = K46; k29 = K17; k30 = K55; k31 = K40; \
  k32 = K18; k33 = K05; k34 = K34; k35 = K39; k36 = K12; k37 = K32; k38 = K47; k39 = K10; \
  k40 = K27; k41 = K04; k42 = K53; k43 = K26; k44 = K25; k45 = K13; k46 = K03; k47 = K41; \
}
1529#define KEYSET07 { k00 = K31; k01 = K35; k02 = K52; k03 = K43; k04 = K08; k05 = K37; k06 = K51; k07 = K15; k08 = K49; k09 = K30; k10 = K07; k11 = K02; k12 = K50; k13 = K21; k14 = K45; k15 = K44; k16 = K29; k17 = K16; k18 = K42; k19 = K23; k20 = K22; k21 = K14; k22 = K38; k23 = K01; k24 = K10; k25 = K47; k26 = K53; k27 = K11; k28 = K27; k29 = K26; k30 = K05; k31 = K17; k32 = K54; k33 = K41; k34 = K39; k35 = K20; k36 = K48; k37 = K13; k38 = K24; k39 = K19; k40 = K32; k41 = K40; k42 = K34; k43 = K03; k44 = K06; k45 = K18; k46 = K12; k47 = K46; }
1530#define KEYSET17 { k00 = K15; k01 = K51; k02 = K36; k03 = K02; k04 = K49; k05 = K21; k06 = K35; k07 = K31; k08 = K08; k09 = K14; k10 = K23; k11 = K43; k12 = K09; k13 = K37; k14 = K29; k15 = K28; k16 = K45; k17 = K00; k18 = K01; k19 = K07; k20 = K38; k21 = K30; k22 = K22; k23 = K42; k24 = K26; k25 = K04; k26 = K41; k27 = K54; k28 = K39; k29 = K10; k30 = K48; k31 = K33; k32 = K11; k33 = K53; k34 = K27; k35 = K32; k36 = K05; k37 = K25; k38 = K40; k39 = K03; k40 = K20; k41 = K24; k42 = K46; k43 = K19; k44 = K18; k45 = K06; k46 = K55; k47 = K34; }
1531
/**
 * Bitsliced DES over the 64 data words D00..D63, keyed by the 56 key words K00..K55.
 *
 * K00..K55 : key words (passed by value); every round copies 48 of them into the
 *            locals k00..k47 through one of the KEYSETxy macros.
 * D00..D63 : data words, read and updated in place through the pointers.
 *
 * The loop body holds eight rounds and is executed twice (i = 0, 1), i.e. 16 rounds
 * in total. Rounds alternate between two fixed wirings:
 *   - rounds 0, 2, 4, 6 of an iteration read D32..D63 and hand D00..D31 to the S-boxes as outputs
 *   - rounds 1, 3, 5, 7 of an iteration read D00..D31 and hand D32..D63 to the S-boxes as outputs
 * Each S-box call receives six (data ^ key) inputs and four output pointers; neighbouring
 * S-boxes share two data words (e.g. D35/D36 feed both s1 and s2).
 *
 * NOTE(review): how s1..s8 combine their result with the previous content of the output
 * words (overwrite vs. xor) is defined in the S-box implementations, not here - confirm there.
 */
DECLSPEC void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63)
{
  // per-round key words; (re)loaded by the KEYSETxy macro in front of every round
  KXX_DECL u32 k00, k01, k02, k03, k04, k05;
  KXX_DECL u32 k06, k07, k08, k09, k10, k11;
  KXX_DECL u32 k12, k13, k14, k15, k16, k17;
  KXX_DECL u32 k18, k19, k20, k21, k22, k23;
  KXX_DECL u32 k24, k25, k26, k27, k28, k29;
  KXX_DECL u32 k30, k31, k32, k33, k34, k35;
  KXX_DECL u32 k36, k37, k38, k39, k40, k41;
  KXX_DECL u32 k42, k43, k44, k45, k46, k47;

  // two passes of eight rounds each; `i` only selects which KEYSET family is used
  #ifdef _unroll
  #pragma unroll
  #endif
  for (u32 i = 0; i < 2; i++)
  {
    // round 0 of this pass: inputs D32..D63, outputs D00..D31
    if (i) KEYSET10 else KEYSET00

    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);

    // round 1 of this pass: inputs D00..D31, outputs D32..D63
    if (i) KEYSET11 else KEYSET01

    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);

    // round 2 of this pass: same wiring as round 0
    if (i) KEYSET12 else KEYSET02

    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);

    // round 3 of this pass: same wiring as round 1
    if (i) KEYSET13 else KEYSET03

    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);

    // round 4 of this pass: same wiring as round 0
    if (i) KEYSET14 else KEYSET04

    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);

    // round 5 of this pass: same wiring as round 1
    if (i) KEYSET15 else KEYSET05

    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);

    // round 6 of this pass: same wiring as round 0
    if (i) KEYSET16 else KEYSET06

    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);

    // round 7 of this pass: same wiring as round 1
    if (i) KEYSET17 else KEYSET07

    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
  }
}
1637
1638DECLSPEC void transpose32c (u32 *data)
1639{
1640  #define swap(x,y,j,m)               \
1641     t  = ((x) ^ ((y) >> (j))) & (m); \
1642    (x) = (x) ^ t;                    \
1643    (y) = (y) ^ (t << (j));
1644
1645  u32 t;
1646
1647  swap (data[ 0], data[16], 16, 0x0000ffff);
1648  swap (data[ 1], data[17], 16, 0x0000ffff);
1649  swap (data[ 2], data[18], 16, 0x0000ffff);
1650  swap (data[ 3], data[19], 16, 0x0000ffff);
1651  swap (data[ 4], data[20], 16, 0x0000ffff);
1652  swap (data[ 5], data[21], 16, 0x0000ffff);
1653  swap (data[ 6], data[22], 16, 0x0000ffff);
1654  swap (data[ 7], data[23], 16, 0x0000ffff);
1655  swap (data[ 8], data[24], 16, 0x0000ffff);
1656  swap (data[ 9], data[25], 16, 0x0000ffff);
1657  swap (data[10], data[26], 16, 0x0000ffff);
1658  swap (data[11], data[27], 16, 0x0000ffff);
1659  swap (data[12], data[28], 16, 0x0000ffff);
1660  swap (data[13], data[29], 16, 0x0000ffff);
1661  swap (data[14], data[30], 16, 0x0000ffff);
1662  swap (data[15], data[31], 16, 0x0000ffff);
1663  swap (data[ 0], data[ 8],  8, 0x00ff00ff);
1664  swap (data[ 1], data[ 9],  8, 0x00ff00ff);
1665  swap (data[ 2], data[10],  8, 0x00ff00ff);
1666  swap (data[ 3], data[11],  8, 0x00ff00ff);
1667  swap (data[ 4], data[12],  8, 0x00ff00ff);
1668  swap (data[ 5], data[13],  8, 0x00ff00ff);
1669  swap (data[ 6], data[14],  8, 0x00ff00ff);
1670  swap (data[ 7], data[15],  8, 0x00ff00ff);
1671  swap (data[ 0], data[ 4],  4, 0x0f0f0f0f);
1672  swap (data[ 1], data[ 5],  4, 0x0f0f0f0f);
1673  swap (data[ 2], data[ 6],  4, 0x0f0f0f0f);
1674  swap (data[ 3], data[ 7],  4, 0x0f0f0f0f);
1675  swap (data[ 0], data[ 2],  2, 0x33333333);
1676  swap (data[ 1], data[ 3],  2, 0x33333333);
1677  swap (data[ 0], data[ 1],  1, 0x55555555);
1678  swap (data[ 2], data[ 3],  1, 0x55555555);
1679  swap (data[ 4], data[ 6],  2, 0x33333333);
1680  swap (data[ 5], data[ 7],  2, 0x33333333);
1681  swap (data[ 4], data[ 5],  1, 0x55555555);
1682  swap (data[ 6], data[ 7],  1, 0x55555555);
1683  swap (data[ 8], data[12],  4, 0x0f0f0f0f);
1684  swap (data[ 9], data[13],  4, 0x0f0f0f0f);
1685  swap (data[10], data[14],  4, 0x0f0f0f0f);
1686  swap (data[11], data[15],  4, 0x0f0f0f0f);
1687  swap (data[ 8], data[10],  2, 0x33333333);
1688  swap (data[ 9], data[11],  2, 0x33333333);
1689  swap (data[ 8], data[ 9],  1, 0x55555555);
1690  swap (data[10], data[11],  1, 0x55555555);
1691  swap (data[12], data[14],  2, 0x33333333);
1692  swap (data[13], data[15],  2, 0x33333333);
1693  swap (data[12], data[13],  1, 0x55555555);
1694  swap (data[14], data[15],  1, 0x55555555);
1695  swap (data[16], data[24],  8, 0x00ff00ff);
1696  swap (data[17], data[25],  8, 0x00ff00ff);
1697  swap (data[18], data[26],  8, 0x00ff00ff);
1698  swap (data[19], data[27],  8, 0x00ff00ff);
1699  swap (data[20], data[28],  8, 0x00ff00ff);
1700  swap (data[21], data[29],  8, 0x00ff00ff);
1701  swap (data[22], data[30],  8, 0x00ff00ff);
1702  swap (data[23], data[31],  8, 0x00ff00ff);
1703  swap (data[16], data[20],  4, 0x0f0f0f0f);
1704  swap (data[17], data[21],  4, 0x0f0f0f0f);
1705  swap (data[18], data[22],  4, 0x0f0f0f0f);
1706  swap (data[19], data[23],  4, 0x0f0f0f0f);
1707  swap (data[16], data[18],  2, 0x33333333);
1708  swap (data[17], data[19],  2, 0x33333333);
1709  swap (data[16], data[17],  1, 0x55555555);
1710  swap (data[18], data[19],  1, 0x55555555);
1711  swap (data[20], data[22],  2, 0x33333333);
1712  swap (data[21], data[23],  2, 0x33333333);
1713  swap (data[20], data[21],  1, 0x55555555);
1714  swap (data[22], data[23],  1, 0x55555555);
1715  swap (data[24], data[28],  4, 0x0f0f0f0f);
1716  swap (data[25], data[29],  4, 0x0f0f0f0f);
1717  swap (data[26], data[30],  4, 0x0f0f0f0f);
1718  swap (data[27], data[31],  4, 0x0f0f0f0f);
1719  swap (data[24], data[26],  2, 0x33333333);
1720  swap (data[25], data[27],  2, 0x33333333);
1721  swap (data[24], data[25],  1, 0x55555555);
1722  swap (data[26], data[27],  1, 0x55555555);
1723  swap (data[28], data[30],  2, 0x33333333);
1724  swap (data[29], data[31],  2, 0x33333333);
1725  swap (data[28], data[29],  1, 0x55555555);
1726  swap (data[30], data[31],  1, 0x55555555);
1727}
1728
1729//
1730// transpose bitslice mod : attention race conditions, need different buffers for *in and *out
1731//
1732
1733KERNEL_FQ void m14000_tm (GLOBAL_AS u32 *mod, GLOBAL_AS bs_word_t *words_buf_b)
1734{
1735  const u64 gid = get_global_id (0);
1736
1737  const u32 block = gid / 32;
1738  const u32 slice = gid % 32;
1739
1740  const u32 w0 = mod[gid];
1741
1742  #ifdef _unroll
1743  #pragma unroll
1744  #endif
1745  for (int i = 0, j = 0; i < 32; i += 8, j += 7)
1746  {
1747    hc_atomic_or (&words_buf_b[block].b[j + 0], (((w0 >> (i + 7)) & 1) << slice));
1748    hc_atomic_or (&words_buf_b[block].b[j + 1], (((w0 >> (i + 6)) & 1) << slice));
1749    hc_atomic_or (&words_buf_b[block].b[j + 2], (((w0 >> (i + 5)) & 1) << slice));
1750    hc_atomic_or (&words_buf_b[block].b[j + 3], (((w0 >> (i + 4)) & 1) << slice));
1751    hc_atomic_or (&words_buf_b[block].b[j + 4], (((w0 >> (i + 3)) & 1) << slice));
1752    hc_atomic_or (&words_buf_b[block].b[j + 5], (((w0 >> (i + 2)) & 1) << slice));
1753    hc_atomic_or (&words_buf_b[block].b[j + 6], (((w0 >> (i + 1)) & 1) << slice));
1754  }
1755}
1756
1757KERNEL_FQ void m14000_mxx (KERN_ATTR_BITSLICE ())
1758{
1759  /**
1760   * base
1761   */
1762
1763  const u64 gid = get_global_id (0);
1764  const u64 lid = get_local_id (0);
1765
1766  /**
1767   * salt
1768   */
1769
1770  const u32 salt0 = salt_bufs[SALT_POS].salt_buf_pc[0];
1771  const u32 salt1 = salt_bufs[SALT_POS].salt_buf_pc[1];
1772
1773  // salt1 first, because this is a 64 bit value actually
1774
1775
1776  const u32 d00 = (((salt1 >>  0) & 1) ? -1 : 0);
1777  const u32 d01 = (((salt1 >>  1) & 1) ? -1 : 0);
1778  const u32 d02 = (((salt1 >>  2) & 1) ? -1 : 0);
1779  const u32 d03 = (((salt1 >>  3) & 1) ? -1 : 0);
1780  const u32 d04 = (((salt1 >>  4) & 1) ? -1 : 0);
1781  const u32 d05 = (((salt1 >>  5) & 1) ? -1 : 0);
1782  const u32 d06 = (((salt1 >>  6) & 1) ? -1 : 0);
1783  const u32 d07 = (((salt1 >>  7) & 1) ? -1 : 0);
1784  const u32 d08 = (((salt1 >>  8) & 1) ? -1 : 0);
1785  const u32 d09 = (((salt1 >>  9) & 1) ? -1 : 0);
1786  const u32 d10 = (((salt1 >> 10) & 1) ? -1 : 0);
1787  const u32 d11 = (((salt1 >> 11) & 1) ? -1 : 0);
1788  const u32 d12 = (((salt1 >> 12) & 1) ? -1 : 0);
1789  const u32 d13 = (((salt1 >> 13) & 1) ? -1 : 0);
1790  const u32 d14 = (((salt1 >> 14) & 1) ? -1 : 0);
1791  const u32 d15 = (((salt1 >> 15) & 1) ? -1 : 0);
1792  const u32 d16 = (((salt1 >> 16) & 1) ? -1 : 0);
1793  const u32 d17 = (((salt1 >> 17) & 1) ? -1 : 0);
1794  const u32 d18 = (((salt1 >> 18) & 1) ? -1 : 0);
1795  const u32 d19 = (((salt1 >> 19) & 1) ? -1 : 0);
1796  const u32 d20 = (((salt1 >> 20) & 1) ? -1 : 0);
1797  const u32 d21 = (((salt1 >> 21) & 1) ? -1 : 0);
1798  const u32 d22 = (((salt1 >> 22) & 1) ? -1 : 0);
1799  const u32 d23 = (((salt1 >> 23) & 1) ? -1 : 0);
1800  const u32 d24 = (((salt1 >> 24) & 1) ? -1 : 0);
1801  const u32 d25 = (((salt1 >> 25) & 1) ? -1 : 0);
1802  const u32 d26 = (((salt1 >> 26) & 1) ? -1 : 0);
1803  const u32 d27 = (((salt1 >> 27) & 1) ? -1 : 0);
1804  const u32 d28 = (((salt1 >> 28) & 1) ? -1 : 0);
1805  const u32 d29 = (((salt1 >> 29) & 1) ? -1 : 0);
1806  const u32 d30 = (((salt1 >> 30) & 1) ? -1 : 0);
1807  const u32 d31 = (((salt1 >> 31) & 1) ? -1 : 0);
1808  const u32 d32 = (((salt0 >>  0) & 1) ? -1 : 0);
1809  const u32 d33 = (((salt0 >>  1) & 1) ? -1 : 0);
1810  const u32 d34 = (((salt0 >>  2) & 1) ? -1 : 0);
1811  const u32 d35 = (((salt0 >>  3) & 1) ? -1 : 0);
1812  const u32 d36 = (((salt0 >>  4) & 1) ? -1 : 0);
1813  const u32 d37 = (((salt0 >>  5) & 1) ? -1 : 0);
1814  const u32 d38 = (((salt0 >>  6) & 1) ? -1 : 0);
1815  const u32 d39 = (((salt0 >>  7) & 1) ? -1 : 0);
1816  const u32 d40 = (((salt0 >>  8) & 1) ? -1 : 0);
1817  const u32 d41 = (((salt0 >>  9) & 1) ? -1 : 0);
1818  const u32 d42 = (((salt0 >> 10) & 1) ? -1 : 0);
1819  const u32 d43 = (((salt0 >> 11) & 1) ? -1 : 0);
1820  const u32 d44 = (((salt0 >> 12) & 1) ? -1 : 0);
1821  const u32 d45 = (((salt0 >> 13) & 1) ? -1 : 0);
1822  const u32 d46 = (((salt0 >> 14) & 1) ? -1 : 0);
1823  const u32 d47 = (((salt0 >> 15) & 1) ? -1 : 0);
1824  const u32 d48 = (((salt0 >> 16) & 1) ? -1 : 0);
1825  const u32 d49 = (((salt0 >> 17) & 1) ? -1 : 0);
1826  const u32 d50 = (((salt0 >> 18) & 1) ? -1 : 0);
1827  const u32 d51 = (((salt0 >> 19) & 1) ? -1 : 0);
1828  const u32 d52 = (((salt0 >> 20) & 1) ? -1 : 0);
1829  const u32 d53 = (((salt0 >> 21) & 1) ? -1 : 0);
1830  const u32 d54 = (((salt0 >> 22) & 1) ? -1 : 0);
1831  const u32 d55 = (((salt0 >> 23) & 1) ? -1 : 0);
1832  const u32 d56 = (((salt0 >> 24) & 1) ? -1 : 0);
1833  const u32 d57 = (((salt0 >> 25) & 1) ? -1 : 0);
1834  const u32 d58 = (((salt0 >> 26) & 1) ? -1 : 0);
1835  const u32 d59 = (((salt0 >> 27) & 1) ? -1 : 0);
1836  const u32 d60 = (((salt0 >> 28) & 1) ? -1 : 0);
1837  const u32 d61 = (((salt0 >> 29) & 1) ? -1 : 0);
1838  const u32 d62 = (((salt0 >> 30) & 1) ? -1 : 0);
1839  const u32 d63 = (((salt0 >> 31) & 1) ? -1 : 0);
1840
1841  /**
1842   * base
1843   */
1844
1845  const u32 w0 = pws[gid].i[0];
1846  const u32 w1 = pws[gid].i[1];
1847
1848  #define K00 (((w0 >> ( 0 + 7)) & 1) ? -1 : 0)
1849  #define K01 (((w0 >> ( 0 + 6)) & 1) ? -1 : 0)
1850  #define K02 (((w0 >> ( 0 + 5)) & 1) ? -1 : 0)
1851  #define K03 (((w0 >> ( 0 + 4)) & 1) ? -1 : 0)
1852  #define K04 (((w0 >> ( 0 + 3)) & 1) ? -1 : 0)
1853  #define K05 (((w0 >> ( 0 + 2)) & 1) ? -1 : 0)
1854  #define K06 (((w0 >> ( 0 + 1)) & 1) ? -1 : 0)
1855  #define K07 (((w0 >> ( 8 + 7)) & 1) ? -1 : 0)
1856  #define K08 (((w0 >> ( 8 + 6)) & 1) ? -1 : 0)
1857  #define K09 (((w0 >> ( 8 + 5)) & 1) ? -1 : 0)
1858  #define K10 (((w0 >> ( 8 + 4)) & 1) ? -1 : 0)
1859  #define K11 (((w0 >> ( 8 + 3)) & 1) ? -1 : 0)
1860  #define K12 (((w0 >> ( 8 + 2)) & 1) ? -1 : 0)
1861  #define K13 (((w0 >> ( 8 + 1)) & 1) ? -1 : 0)
1862  #define K14 (((w0 >> (16 + 7)) & 1) ? -1 : 0)
1863  #define K15 (((w0 >> (16 + 6)) & 1) ? -1 : 0)
1864  #define K16 (((w0 >> (16 + 5)) & 1) ? -1 : 0)
1865  #define K17 (((w0 >> (16 + 4)) & 1) ? -1 : 0)
1866  #define K18 (((w0 >> (16 + 3)) & 1) ? -1 : 0)
1867  #define K19 (((w0 >> (16 + 2)) & 1) ? -1 : 0)
1868  #define K20 (((w0 >> (16 + 1)) & 1) ? -1 : 0)
1869  #define K21 (((w0 >> (24 + 7)) & 1) ? -1 : 0)
1870  #define K22 (((w0 >> (24 + 6)) & 1) ? -1 : 0)
1871  #define K23 (((w0 >> (24 + 5)) & 1) ? -1 : 0)
1872  #define K24 (((w0 >> (24 + 4)) & 1) ? -1 : 0)
1873  #define K25 (((w0 >> (24 + 3)) & 1) ? -1 : 0)
1874  #define K26 (((w0 >> (24 + 2)) & 1) ? -1 : 0)
1875  #define K27 (((w0 >> (24 + 1)) & 1) ? -1 : 0)
1876  #define K28 (((w1 >> ( 0 + 7)) & 1) ? -1 : 0)
1877  #define K29 (((w1 >> ( 0 + 6)) & 1) ? -1 : 0)
1878  #define K30 (((w1 >> ( 0 + 5)) & 1) ? -1 : 0)
1879  #define K31 (((w1 >> ( 0 + 4)) & 1) ? -1 : 0)
1880  #define K32 (((w1 >> ( 0 + 3)) & 1) ? -1 : 0)
1881  #define K33 (((w1 >> ( 0 + 2)) & 1) ? -1 : 0)
1882  #define K34 (((w1 >> ( 0 + 1)) & 1) ? -1 : 0)
1883  #define K35 (((w1 >> ( 8 + 7)) & 1) ? -1 : 0)
1884  #define K36 (((w1 >> ( 8 + 6)) & 1) ? -1 : 0)
1885  #define K37 (((w1 >> ( 8 + 5)) & 1) ? -1 : 0)
1886  #define K38 (((w1 >> ( 8 + 4)) & 1) ? -1 : 0)
1887  #define K39 (((w1 >> ( 8 + 3)) & 1) ? -1 : 0)
1888  #define K40 (((w1 >> ( 8 + 2)) & 1) ? -1 : 0)
1889  #define K41 (((w1 >> ( 8 + 1)) & 1) ? -1 : 0)
1890  #define K42 (((w1 >> (16 + 7)) & 1) ? -1 : 0)
1891  #define K43 (((w1 >> (16 + 6)) & 1) ? -1 : 0)
1892  #define K44 (((w1 >> (16 + 5)) & 1) ? -1 : 0)
1893  #define K45 (((w1 >> (16 + 4)) & 1) ? -1 : 0)
1894  #define K46 (((w1 >> (16 + 3)) & 1) ? -1 : 0)
1895  #define K47 (((w1 >> (16 + 2)) & 1) ? -1 : 0)
1896  #define K48 (((w1 >> (16 + 1)) & 1) ? -1 : 0)
1897  #define K49 (((w1 >> (24 + 7)) & 1) ? -1 : 0)
1898  #define K50 (((w1 >> (24 + 6)) & 1) ? -1 : 0)
1899  #define K51 (((w1 >> (24 + 5)) & 1) ? -1 : 0)
1900  #define K52 (((w1 >> (24 + 4)) & 1) ? -1 : 0)
1901  #define K53 (((w1 >> (24 + 3)) & 1) ? -1 : 0)
1902  #define K54 (((w1 >> (24 + 2)) & 1) ? -1 : 0)
1903  #define K55 (((w1 >> (24 + 1)) & 1) ? -1 : 0)
1904
1905  /**
1906   * inner loop
1907   */
1908
1909  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 32)
1910  {
1911    u32 k00 = K00;
1912    u32 k01 = K01;
1913    u32 k02 = K02;
1914    u32 k03 = K03;
1915    u32 k04 = K04;
1916    u32 k05 = K05;
1917    u32 k06 = K06;
1918    u32 k07 = K07;
1919    u32 k08 = K08;
1920    u32 k09 = K09;
1921    u32 k10 = K10;
1922    u32 k11 = K11;
1923    u32 k12 = K12;
1924    u32 k13 = K13;
1925    u32 k14 = K14;
1926    u32 k15 = K15;
1927    u32 k16 = K16;
1928    u32 k17 = K17;
1929    u32 k18 = K18;
1930    u32 k19 = K19;
1931    u32 k20 = K20;
1932    u32 k21 = K21;
1933    u32 k22 = K22;
1934    u32 k23 = K23;
1935    u32 k24 = K24;
1936    u32 k25 = K25;
1937    u32 k26 = K26;
1938    u32 k27 = K27;
1939
1940    const u32 pc_pos = il_pos / 32;
1941
1942    k00 |= words_buf_s[pc_pos].b[ 0];
1943    k01 |= words_buf_s[pc_pos].b[ 1];
1944    k02 |= words_buf_s[pc_pos].b[ 2];
1945    k03 |= words_buf_s[pc_pos].b[ 3];
1946    k04 |= words_buf_s[pc_pos].b[ 4];
1947    k05 |= words_buf_s[pc_pos].b[ 5];
1948    k06 |= words_buf_s[pc_pos].b[ 6];
1949    k07 |= words_buf_s[pc_pos].b[ 7];
1950    k08 |= words_buf_s[pc_pos].b[ 8];
1951    k09 |= words_buf_s[pc_pos].b[ 9];
1952    k10 |= words_buf_s[pc_pos].b[10];
1953    k11 |= words_buf_s[pc_pos].b[11];
1954    k12 |= words_buf_s[pc_pos].b[12];
1955    k13 |= words_buf_s[pc_pos].b[13];
1956    k14 |= words_buf_s[pc_pos].b[14];
1957    k15 |= words_buf_s[pc_pos].b[15];
1958    k16 |= words_buf_s[pc_pos].b[16];
1959    k17 |= words_buf_s[pc_pos].b[17];
1960    k18 |= words_buf_s[pc_pos].b[18];
1961    k19 |= words_buf_s[pc_pos].b[19];
1962    k20 |= words_buf_s[pc_pos].b[20];
1963    k21 |= words_buf_s[pc_pos].b[21];
1964    k22 |= words_buf_s[pc_pos].b[22];
1965    k23 |= words_buf_s[pc_pos].b[23];
1966    k24 |= words_buf_s[pc_pos].b[24];
1967    k25 |= words_buf_s[pc_pos].b[25];
1968    k26 |= words_buf_s[pc_pos].b[26];
1969    k27 |= words_buf_s[pc_pos].b[27];
1970
1971    u32 D00 = d00;
1972    u32 D01 = d01;
1973    u32 D02 = d02;
1974    u32 D03 = d03;
1975    u32 D04 = d04;
1976    u32 D05 = d05;
1977    u32 D06 = d06;
1978    u32 D07 = d07;
1979    u32 D08 = d08;
1980    u32 D09 = d09;
1981    u32 D10 = d10;
1982    u32 D11 = d11;
1983    u32 D12 = d12;
1984    u32 D13 = d13;
1985    u32 D14 = d14;
1986    u32 D15 = d15;
1987    u32 D16 = d16;
1988    u32 D17 = d17;
1989    u32 D18 = d18;
1990    u32 D19 = d19;
1991    u32 D20 = d20;
1992    u32 D21 = d21;
1993    u32 D22 = d22;
1994    u32 D23 = d23;
1995    u32 D24 = d24;
1996    u32 D25 = d25;
1997    u32 D26 = d26;
1998    u32 D27 = d27;
1999    u32 D28 = d28;
2000    u32 D29 = d29;
2001    u32 D30 = d30;
2002    u32 D31 = d31;
2003    u32 D32 = d32;
2004    u32 D33 = d33;
2005    u32 D34 = d34;
2006    u32 D35 = d35;
2007    u32 D36 = d36;
2008    u32 D37 = d37;
2009    u32 D38 = d38;
2010    u32 D39 = d39;
2011    u32 D40 = d40;
2012    u32 D41 = d41;
2013    u32 D42 = d42;
2014    u32 D43 = d43;
2015    u32 D44 = d44;
2016    u32 D45 = d45;
2017    u32 D46 = d46;
2018    u32 D47 = d47;
2019    u32 D48 = d48;
2020    u32 D49 = d49;
2021    u32 D50 = d50;
2022    u32 D51 = d51;
2023    u32 D52 = d52;
2024    u32 D53 = d53;
2025    u32 D54 = d54;
2026    u32 D55 = d55;
2027    u32 D56 = d56;
2028    u32 D57 = d57;
2029    u32 D58 = d58;
2030    u32 D59 = d59;
2031    u32 D60 = d60;
2032    u32 D61 = d61;
2033    u32 D62 = d62;
2034    u32 D63 = d63;
2035
2036    DES
2037    (
2038      k00, k01, k02, k03, k04, k05, k06,
2039      k07, k08, k09, k10, k11, k12, k13,
2040      k14, k15, k16, k17, k18, k19, k20,
2041      k21, k22, k23, k24, k25, k26, k27,
2042      K28, K29, K30, K31, K32, K33, K34,
2043      K35, K36, K37, K38, K39, K40, K41,
2044      K42, K43, K44, K45, K46, K47, K48,
2045      K49, K50, K51, K52, K53, K54, K55,
2046      &D00, &D01, &D02, &D03, &D04, &D05, &D06, &D07,
2047      &D08, &D09, &D10, &D11, &D12, &D13, &D14, &D15,
2048      &D16, &D17, &D18, &D19, &D20, &D21, &D22, &D23,
2049      &D24, &D25, &D26, &D27, &D28, &D29, &D30, &D31,
2050      &D32, &D33, &D34, &D35, &D36, &D37, &D38, &D39,
2051      &D40, &D41, &D42, &D43, &D44, &D45, &D46, &D47,
2052      &D48, &D49, &D50, &D51, &D52, &D53, &D54, &D55,
2053      &D56, &D57, &D58, &D59, &D60, &D61, &D62, &D63
2054    );
2055
2056    u32 out[64];
2057
2058    out[ 0] = D00;
2059    out[ 1] = D01;
2060    out[ 2] = D02;
2061    out[ 3] = D03;
2062    out[ 4] = D04;
2063    out[ 5] = D05;
2064    out[ 6] = D06;
2065    out[ 7] = D07;
2066    out[ 8] = D08;
2067    out[ 9] = D09;
2068    out[10] = D10;
2069    out[11] = D11;
2070    out[12] = D12;
2071    out[13] = D13;
2072    out[14] = D14;
2073    out[15] = D15;
2074    out[16] = D16;
2075    out[17] = D17;
2076    out[18] = D18;
2077    out[19] = D19;
2078    out[20] = D20;
2079    out[21] = D21;
2080    out[22] = D22;
2081    out[23] = D23;
2082    out[24] = D24;
2083    out[25] = D25;
2084    out[26] = D26;
2085    out[27] = D27;
2086    out[28] = D28;
2087    out[29] = D29;
2088    out[30] = D30;
2089    out[31] = D31;
2090    out[32] = D32;
2091    out[33] = D33;
2092    out[34] = D34;
2093    out[35] = D35;
2094    out[36] = D36;
2095    out[37] = D37;
2096    out[38] = D38;
2097    out[39] = D39;
2098    out[40] = D40;
2099    out[41] = D41;
2100    out[42] = D42;
2101    out[43] = D43;
2102    out[44] = D44;
2103    out[45] = D45;
2104    out[46] = D46;
2105    out[47] = D47;
2106    out[48] = D48;
2107    out[49] = D49;
2108    out[50] = D50;
2109    out[51] = D51;
2110    out[52] = D52;
2111    out[53] = D53;
2112    out[54] = D54;
2113    out[55] = D55;
2114    out[56] = D56;
2115    out[57] = D57;
2116    out[58] = D58;
2117    out[59] = D59;
2118    out[60] = D60;
2119    out[61] = D61;
2120    out[62] = D62;
2121    out[63] = D63;
2122
2123    if (digests_cnt < 16)
2124    {
2125      for (u32 d = 0; d < digests_cnt; d++)
2126      {
2127        const u32 final_hash_pos = DIGESTS_OFFSET + d;
2128
2129        if (hashes_shown[final_hash_pos]) continue;
2130
2131        u32 search[2];
2132
2133        search[0] = digests_buf[final_hash_pos].digest_buf[DGST_R0];
2134        search[1] = digests_buf[final_hash_pos].digest_buf[DGST_R1];
2135
2136        u32 tmpResult = 0;
2137
2138        #ifdef _unroll
2139        #pragma unroll
2140        #endif
2141        for (int i = 0; i < 32; i++)
2142        {
2143          const u32 b0 = -((search[0] >> i) & 1);
2144          const u32 b1 = -((search[1] >> i) & 1);
2145
2146          tmpResult |= out[ 0 + i] ^ b0;
2147          tmpResult |= out[32 + i] ^ b1;
2148        }
2149
2150        if (tmpResult == 0xffffffff) continue;
2151
2152        const u32 slice = ffz (tmpResult);
2153
2154        const u32 r0 = search[0];
2155        const u32 r1 = search[1];
2156        const u32 r2 = 0;
2157        const u32 r3 = 0;
2158
2159        #ifdef KERNEL_STATIC
2160        #include COMPARE_M
2161        #endif
2162      }
2163    }
2164    else
2165    {
2166      u32 out0[32];
2167      u32 out1[32];
2168
2169      #ifdef _unroll
2170      #pragma unroll
2171      #endif
2172      for (int i = 0; i < 32; i++)
2173      {
2174        out0[i] = out[ 0 + i];
2175        out1[i] = out[32 + i];
2176      }
2177
2178      transpose32c (out0);
2179      transpose32c (out1);
2180
2181      #ifdef _unroll
2182      #pragma unroll
2183      #endif
2184      for (int slice = 0; slice < 32; slice++)
2185      {
2186        const u32 r0 = out0[slice];
2187        const u32 r1 = out1[slice];
2188        const u32 r2 = 0;
2189        const u32 r3 = 0;
2190
2191        #ifdef KERNEL_STATIC
2192        #include COMPARE_M
2193        #endif
2194      }
2195    }
2196  }
2197}
2198
2199KERNEL_FQ void m14000_sxx (KERN_ATTR_BITSLICE ())
2200{
2201  /**
2202   * base
2203   */
2204
2205  const u64 gid = get_global_id (0);
2206  const u64 lid = get_local_id (0);
2207
2208  /**
2209   * salt
2210   */
2211
2212  const u32 salt0 = salt_bufs[SALT_POS].salt_buf_pc[0];
2213  const u32 salt1 = salt_bufs[SALT_POS].salt_buf_pc[1];
2214
2215  // salt1 first, because this is a 64 bit value actually
2216
2217  const u32 d00 = (((salt1 >>  0) & 1) ? -1 : 0);
2218  const u32 d01 = (((salt1 >>  1) & 1) ? -1 : 0);
2219  const u32 d02 = (((salt1 >>  2) & 1) ? -1 : 0);
2220  const u32 d03 = (((salt1 >>  3) & 1) ? -1 : 0);
2221  const u32 d04 = (((salt1 >>  4) & 1) ? -1 : 0);
2222  const u32 d05 = (((salt1 >>  5) & 1) ? -1 : 0);
2223  const u32 d06 = (((salt1 >>  6) & 1) ? -1 : 0);
2224  const u32 d07 = (((salt1 >>  7) & 1) ? -1 : 0);
2225  const u32 d08 = (((salt1 >>  8) & 1) ? -1 : 0);
2226  const u32 d09 = (((salt1 >>  9) & 1) ? -1 : 0);
2227  const u32 d10 = (((salt1 >> 10) & 1) ? -1 : 0);
2228  const u32 d11 = (((salt1 >> 11) & 1) ? -1 : 0);
2229  const u32 d12 = (((salt1 >> 12) & 1) ? -1 : 0);
2230  const u32 d13 = (((salt1 >> 13) & 1) ? -1 : 0);
2231  const u32 d14 = (((salt1 >> 14) & 1) ? -1 : 0);
2232  const u32 d15 = (((salt1 >> 15) & 1) ? -1 : 0);
2233  const u32 d16 = (((salt1 >> 16) & 1) ? -1 : 0);
2234  const u32 d17 = (((salt1 >> 17) & 1) ? -1 : 0);
2235  const u32 d18 = (((salt1 >> 18) & 1) ? -1 : 0);
2236  const u32 d19 = (((salt1 >> 19) & 1) ? -1 : 0);
2237  const u32 d20 = (((salt1 >> 20) & 1) ? -1 : 0);
2238  const u32 d21 = (((salt1 >> 21) & 1) ? -1 : 0);
2239  const u32 d22 = (((salt1 >> 22) & 1) ? -1 : 0);
2240  const u32 d23 = (((salt1 >> 23) & 1) ? -1 : 0);
2241  const u32 d24 = (((salt1 >> 24) & 1) ? -1 : 0);
2242  const u32 d25 = (((salt1 >> 25) & 1) ? -1 : 0);
2243  const u32 d26 = (((salt1 >> 26) & 1) ? -1 : 0);
2244  const u32 d27 = (((salt1 >> 27) & 1) ? -1 : 0);
2245  const u32 d28 = (((salt1 >> 28) & 1) ? -1 : 0);
2246  const u32 d29 = (((salt1 >> 29) & 1) ? -1 : 0);
2247  const u32 d30 = (((salt1 >> 30) & 1) ? -1 : 0);
2248  const u32 d31 = (((salt1 >> 31) & 1) ? -1 : 0);
2249  const u32 d32 = (((salt0 >>  0) & 1) ? -1 : 0);
2250  const u32 d33 = (((salt0 >>  1) & 1) ? -1 : 0);
2251  const u32 d34 = (((salt0 >>  2) & 1) ? -1 : 0);
2252  const u32 d35 = (((salt0 >>  3) & 1) ? -1 : 0);
2253  const u32 d36 = (((salt0 >>  4) & 1) ? -1 : 0);
2254  const u32 d37 = (((salt0 >>  5) & 1) ? -1 : 0);
2255  const u32 d38 = (((salt0 >>  6) & 1) ? -1 : 0);
2256  const u32 d39 = (((salt0 >>  7) & 1) ? -1 : 0);
2257  const u32 d40 = (((salt0 >>  8) & 1) ? -1 : 0);
2258  const u32 d41 = (((salt0 >>  9) & 1) ? -1 : 0);
2259  const u32 d42 = (((salt0 >> 10) & 1) ? -1 : 0);
2260  const u32 d43 = (((salt0 >> 11) & 1) ? -1 : 0);
2261  const u32 d44 = (((salt0 >> 12) & 1) ? -1 : 0);
2262  const u32 d45 = (((salt0 >> 13) & 1) ? -1 : 0);
2263  const u32 d46 = (((salt0 >> 14) & 1) ? -1 : 0);
2264  const u32 d47 = (((salt0 >> 15) & 1) ? -1 : 0);
2265  const u32 d48 = (((salt0 >> 16) & 1) ? -1 : 0);
2266  const u32 d49 = (((salt0 >> 17) & 1) ? -1 : 0);
2267  const u32 d50 = (((salt0 >> 18) & 1) ? -1 : 0);
2268  const u32 d51 = (((salt0 >> 19) & 1) ? -1 : 0);
2269  const u32 d52 = (((salt0 >> 20) & 1) ? -1 : 0);
2270  const u32 d53 = (((salt0 >> 21) & 1) ? -1 : 0);
2271  const u32 d54 = (((salt0 >> 22) & 1) ? -1 : 0);
2272  const u32 d55 = (((salt0 >> 23) & 1) ? -1 : 0);
2273  const u32 d56 = (((salt0 >> 24) & 1) ? -1 : 0);
2274  const u32 d57 = (((salt0 >> 25) & 1) ? -1 : 0);
2275  const u32 d58 = (((salt0 >> 26) & 1) ? -1 : 0);
2276  const u32 d59 = (((salt0 >> 27) & 1) ? -1 : 0);
2277  const u32 d60 = (((salt0 >> 28) & 1) ? -1 : 0);
2278  const u32 d61 = (((salt0 >> 29) & 1) ? -1 : 0);
2279  const u32 d62 = (((salt0 >> 30) & 1) ? -1 : 0);
2280  const u32 d63 = (((salt0 >> 31) & 1) ? -1 : 0);
2281
2282  /**
2283   * digest
2284   */
2285
2286  const u32 s0 = digests_buf[0].digest_buf[0];
2287  const u32 s1 = digests_buf[0].digest_buf[1];
2288
2289  const u32 S00 = (((s0 >>  0) & 1) ? -1 : 0);
2290  const u32 S01 = (((s0 >>  1) & 1) ? -1 : 0);
2291  const u32 S02 = (((s0 >>  2) & 1) ? -1 : 0);
2292  const u32 S03 = (((s0 >>  3) & 1) ? -1 : 0);
2293  const u32 S04 = (((s0 >>  4) & 1) ? -1 : 0);
2294  const u32 S05 = (((s0 >>  5) & 1) ? -1 : 0);
2295  const u32 S06 = (((s0 >>  6) & 1) ? -1 : 0);
2296  const u32 S07 = (((s0 >>  7) & 1) ? -1 : 0);
2297  const u32 S08 = (((s0 >>  8) & 1) ? -1 : 0);
2298  const u32 S09 = (((s0 >>  9) & 1) ? -1 : 0);
2299  const u32 S10 = (((s0 >> 10) & 1) ? -1 : 0);
2300  const u32 S11 = (((s0 >> 11) & 1) ? -1 : 0);
2301  const u32 S12 = (((s0 >> 12) & 1) ? -1 : 0);
2302  const u32 S13 = (((s0 >> 13) & 1) ? -1 : 0);
2303  const u32 S14 = (((s0 >> 14) & 1) ? -1 : 0);
2304  const u32 S15 = (((s0 >> 15) & 1) ? -1 : 0);
2305  const u32 S16 = (((s0 >> 16) & 1) ? -1 : 0);
2306  const u32 S17 = (((s0 >> 17) & 1) ? -1 : 0);
2307  const u32 S18 = (((s0 >> 18) & 1) ? -1 : 0);
2308  const u32 S19 = (((s0 >> 19) & 1) ? -1 : 0);
2309  const u32 S20 = (((s0 >> 20) & 1) ? -1 : 0);
2310  const u32 S21 = (((s0 >> 21) & 1) ? -1 : 0);
2311  const u32 S22 = (((s0 >> 22) & 1) ? -1 : 0);
2312  const u32 S23 = (((s0 >> 23) & 1) ? -1 : 0);
2313  const u32 S24 = (((s0 >> 24) & 1) ? -1 : 0);
2314  const u32 S25 = (((s0 >> 25) & 1) ? -1 : 0);
2315  const u32 S26 = (((s0 >> 26) & 1) ? -1 : 0);
2316  const u32 S27 = (((s0 >> 27) & 1) ? -1 : 0);
2317  const u32 S28 = (((s0 >> 28) & 1) ? -1 : 0);
2318  const u32 S29 = (((s0 >> 29) & 1) ? -1 : 0);
2319  const u32 S30 = (((s0 >> 30) & 1) ? -1 : 0);
2320  const u32 S31 = (((s0 >> 31) & 1) ? -1 : 0);
2321  const u32 S32 = (((s1 >>  0) & 1) ? -1 : 0);
2322  const u32 S33 = (((s1 >>  1) & 1) ? -1 : 0);
2323  const u32 S34 = (((s1 >>  2) & 1) ? -1 : 0);
2324  const u32 S35 = (((s1 >>  3) & 1) ? -1 : 0);
2325  const u32 S36 = (((s1 >>  4) & 1) ? -1 : 0);
2326  const u32 S37 = (((s1 >>  5) & 1) ? -1 : 0);
2327  const u32 S38 = (((s1 >>  6) & 1) ? -1 : 0);
2328  const u32 S39 = (((s1 >>  7) & 1) ? -1 : 0);
2329  const u32 S40 = (((s1 >>  8) & 1) ? -1 : 0);
2330  const u32 S41 = (((s1 >>  9) & 1) ? -1 : 0);
2331  const u32 S42 = (((s1 >> 10) & 1) ? -1 : 0);
2332  const u32 S43 = (((s1 >> 11) & 1) ? -1 : 0);
2333  const u32 S44 = (((s1 >> 12) & 1) ? -1 : 0);
2334  const u32 S45 = (((s1 >> 13) & 1) ? -1 : 0);
2335  const u32 S46 = (((s1 >> 14) & 1) ? -1 : 0);
2336  const u32 S47 = (((s1 >> 15) & 1) ? -1 : 0);
2337  const u32 S48 = (((s1 >> 16) & 1) ? -1 : 0);
2338  const u32 S49 = (((s1 >> 17) & 1) ? -1 : 0);
2339  const u32 S50 = (((s1 >> 18) & 1) ? -1 : 0);
2340  const u32 S51 = (((s1 >> 19) & 1) ? -1 : 0);
2341  const u32 S52 = (((s1 >> 20) & 1) ? -1 : 0);
2342  const u32 S53 = (((s1 >> 21) & 1) ? -1 : 0);
2343  const u32 S54 = (((s1 >> 22) & 1) ? -1 : 0);
2344  const u32 S55 = (((s1 >> 23) & 1) ? -1 : 0);
2345  const u32 S56 = (((s1 >> 24) & 1) ? -1 : 0);
2346  const u32 S57 = (((s1 >> 25) & 1) ? -1 : 0);
2347  const u32 S58 = (((s1 >> 26) & 1) ? -1 : 0);
2348  const u32 S59 = (((s1 >> 27) & 1) ? -1 : 0);
2349  const u32 S60 = (((s1 >> 28) & 1) ? -1 : 0);
2350  const u32 S61 = (((s1 >> 29) & 1) ? -1 : 0);
2351  const u32 S62 = (((s1 >> 30) & 1) ? -1 : 0);
2352  const u32 S63 = (((s1 >> 31) & 1) ? -1 : 0);
2353
2354  /**
2355   * base
2356   */
2357
2358  const u32 w0 = pws[gid].i[0];
2359  const u32 w1 = pws[gid].i[1];
2360
2361  #define K00 (((w0 >> ( 0 + 7)) & 1) ? -1 : 0)
2362  #define K01 (((w0 >> ( 0 + 6)) & 1) ? -1 : 0)
2363  #define K02 (((w0 >> ( 0 + 5)) & 1) ? -1 : 0)
2364  #define K03 (((w0 >> ( 0 + 4)) & 1) ? -1 : 0)
2365  #define K04 (((w0 >> ( 0 + 3)) & 1) ? -1 : 0)
2366  #define K05 (((w0 >> ( 0 + 2)) & 1) ? -1 : 0)
2367  #define K06 (((w0 >> ( 0 + 1)) & 1) ? -1 : 0)
2368  #define K07 (((w0 >> ( 8 + 7)) & 1) ? -1 : 0)
2369  #define K08 (((w0 >> ( 8 + 6)) & 1) ? -1 : 0)
2370  #define K09 (((w0 >> ( 8 + 5)) & 1) ? -1 : 0)
2371  #define K10 (((w0 >> ( 8 + 4)) & 1) ? -1 : 0)
2372  #define K11 (((w0 >> ( 8 + 3)) & 1) ? -1 : 0)
2373  #define K12 (((w0 >> ( 8 + 2)) & 1) ? -1 : 0)
2374  #define K13 (((w0 >> ( 8 + 1)) & 1) ? -1 : 0)
2375  #define K14 (((w0 >> (16 + 7)) & 1) ? -1 : 0)
2376  #define K15 (((w0 >> (16 + 6)) & 1) ? -1 : 0)
2377  #define K16 (((w0 >> (16 + 5)) & 1) ? -1 : 0)
2378  #define K17 (((w0 >> (16 + 4)) & 1) ? -1 : 0)
2379  #define K18 (((w0 >> (16 + 3)) & 1) ? -1 : 0)
2380  #define K19 (((w0 >> (16 + 2)) & 1) ? -1 : 0)
2381  #define K20 (((w0 >> (16 + 1)) & 1) ? -1 : 0)
2382  #define K21 (((w0 >> (24 + 7)) & 1) ? -1 : 0)
2383  #define K22 (((w0 >> (24 + 6)) & 1) ? -1 : 0)
2384  #define K23 (((w0 >> (24 + 5)) & 1) ? -1 : 0)
2385  #define K24 (((w0 >> (24 + 4)) & 1) ? -1 : 0)
2386  #define K25 (((w0 >> (24 + 3)) & 1) ? -1 : 0)
2387  #define K26 (((w0 >> (24 + 2)) & 1) ? -1 : 0)
2388  #define K27 (((w0 >> (24 + 1)) & 1) ? -1 : 0)
2389  #define K28 (((w1 >> ( 0 + 7)) & 1) ? -1 : 0)
2390  #define K29 (((w1 >> ( 0 + 6)) & 1) ? -1 : 0)
2391  #define K30 (((w1 >> ( 0 + 5)) & 1) ? -1 : 0)
2392  #define K31 (((w1 >> ( 0 + 4)) & 1) ? -1 : 0)
2393  #define K32 (((w1 >> ( 0 + 3)) & 1) ? -1 : 0)
2394  #define K33 (((w1 >> ( 0 + 2)) & 1) ? -1 : 0)
2395  #define K34 (((w1 >> ( 0 + 1)) & 1) ? -1 : 0)
2396  #define K35 (((w1 >> ( 8 + 7)) & 1) ? -1 : 0)
2397  #define K36 (((w1 >> ( 8 + 6)) & 1) ? -1 : 0)
2398  #define K37 (((w1 >> ( 8 + 5)) & 1) ? -1 : 0)
2399  #define K38 (((w1 >> ( 8 + 4)) & 1) ? -1 : 0)
2400  #define K39 (((w1 >> ( 8 + 3)) & 1) ? -1 : 0)
2401  #define K40 (((w1 >> ( 8 + 2)) & 1) ? -1 : 0)
2402  #define K41 (((w1 >> ( 8 + 1)) & 1) ? -1 : 0)
2403  #define K42 (((w1 >> (16 + 7)) & 1) ? -1 : 0)
2404  #define K43 (((w1 >> (16 + 6)) & 1) ? -1 : 0)
2405  #define K44 (((w1 >> (16 + 5)) & 1) ? -1 : 0)
2406  #define K45 (((w1 >> (16 + 4)) & 1) ? -1 : 0)
2407  #define K46 (((w1 >> (16 + 3)) & 1) ? -1 : 0)
2408  #define K47 (((w1 >> (16 + 2)) & 1) ? -1 : 0)
2409  #define K48 (((w1 >> (16 + 1)) & 1) ? -1 : 0)
2410  #define K49 (((w1 >> (24 + 7)) & 1) ? -1 : 0)
2411  #define K50 (((w1 >> (24 + 6)) & 1) ? -1 : 0)
2412  #define K51 (((w1 >> (24 + 5)) & 1) ? -1 : 0)
2413  #define K52 (((w1 >> (24 + 4)) & 1) ? -1 : 0)
2414  #define K53 (((w1 >> (24 + 3)) & 1) ? -1 : 0)
2415  #define K54 (((w1 >> (24 + 2)) & 1) ? -1 : 0)
2416  #define K55 (((w1 >> (24 + 1)) & 1) ? -1 : 0)
2417
2418  /**
2419   * inner loop
2420   */
2421
2422  for (u32 il_pos = 0; il_pos < il_cnt; il_pos += 32)
2423  {
2424    u32 k00 = K00;
2425    u32 k01 = K01;
2426    u32 k02 = K02;
2427    u32 k03 = K03;
2428    u32 k04 = K04;
2429    u32 k05 = K05;
2430    u32 k06 = K06;
2431    u32 k07 = K07;
2432    u32 k08 = K08;
2433    u32 k09 = K09;
2434    u32 k10 = K10;
2435    u32 k11 = K11;
2436    u32 k12 = K12;
2437    u32 k13 = K13;
2438    u32 k14 = K14;
2439    u32 k15 = K15;
2440    u32 k16 = K16;
2441    u32 k17 = K17;
2442    u32 k18 = K18;
2443    u32 k19 = K19;
2444    u32 k20 = K20;
2445    u32 k21 = K21;
2446    u32 k22 = K22;
2447    u32 k23 = K23;
2448    u32 k24 = K24;
2449    u32 k25 = K25;
2450    u32 k26 = K26;
2451    u32 k27 = K27;
2452
2453    const u32 pc_pos = il_pos / 32;
2454
2455    k00 |= words_buf_s[pc_pos].b[ 0];
2456    k01 |= words_buf_s[pc_pos].b[ 1];
2457    k02 |= words_buf_s[pc_pos].b[ 2];
2458    k03 |= words_buf_s[pc_pos].b[ 3];
2459    k04 |= words_buf_s[pc_pos].b[ 4];
2460    k05 |= words_buf_s[pc_pos].b[ 5];
2461    k06 |= words_buf_s[pc_pos].b[ 6];
2462    k07 |= words_buf_s[pc_pos].b[ 7];
2463    k08 |= words_buf_s[pc_pos].b[ 8];
2464    k09 |= words_buf_s[pc_pos].b[ 9];
2465    k10 |= words_buf_s[pc_pos].b[10];
2466    k11 |= words_buf_s[pc_pos].b[11];
2467    k12 |= words_buf_s[pc_pos].b[12];
2468    k13 |= words_buf_s[pc_pos].b[13];
2469    k14 |= words_buf_s[pc_pos].b[14];
2470    k15 |= words_buf_s[pc_pos].b[15];
2471    k16 |= words_buf_s[pc_pos].b[16];
2472    k17 |= words_buf_s[pc_pos].b[17];
2473    k18 |= words_buf_s[pc_pos].b[18];
2474    k19 |= words_buf_s[pc_pos].b[19];
2475    k20 |= words_buf_s[pc_pos].b[20];
2476    k21 |= words_buf_s[pc_pos].b[21];
2477    k22 |= words_buf_s[pc_pos].b[22];
2478    k23 |= words_buf_s[pc_pos].b[23];
2479    k24 |= words_buf_s[pc_pos].b[24];
2480    k25 |= words_buf_s[pc_pos].b[25];
2481    k26 |= words_buf_s[pc_pos].b[26];
2482    k27 |= words_buf_s[pc_pos].b[27];
2483
2484    u32 D00 = d00;
2485    u32 D01 = d01;
2486    u32 D02 = d02;
2487    u32 D03 = d03;
2488    u32 D04 = d04;
2489    u32 D05 = d05;
2490    u32 D06 = d06;
2491    u32 D07 = d07;
2492    u32 D08 = d08;
2493    u32 D09 = d09;
2494    u32 D10 = d10;
2495    u32 D11 = d11;
2496    u32 D12 = d12;
2497    u32 D13 = d13;
2498    u32 D14 = d14;
2499    u32 D15 = d15;
2500    u32 D16 = d16;
2501    u32 D17 = d17;
2502    u32 D18 = d18;
2503    u32 D19 = d19;
2504    u32 D20 = d20;
2505    u32 D21 = d21;
2506    u32 D22 = d22;
2507    u32 D23 = d23;
2508    u32 D24 = d24;
2509    u32 D25 = d25;
2510    u32 D26 = d26;
2511    u32 D27 = d27;
2512    u32 D28 = d28;
2513    u32 D29 = d29;
2514    u32 D30 = d30;
2515    u32 D31 = d31;
2516    u32 D32 = d32;
2517    u32 D33 = d33;
2518    u32 D34 = d34;
2519    u32 D35 = d35;
2520    u32 D36 = d36;
2521    u32 D37 = d37;
2522    u32 D38 = d38;
2523    u32 D39 = d39;
2524    u32 D40 = d40;
2525    u32 D41 = d41;
2526    u32 D42 = d42;
2527    u32 D43 = d43;
2528    u32 D44 = d44;
2529    u32 D45 = d45;
2530    u32 D46 = d46;
2531    u32 D47 = d47;
2532    u32 D48 = d48;
2533    u32 D49 = d49;
2534    u32 D50 = d50;
2535    u32 D51 = d51;
2536    u32 D52 = d52;
2537    u32 D53 = d53;
2538    u32 D54 = d54;
2539    u32 D55 = d55;
2540    u32 D56 = d56;
2541    u32 D57 = d57;
2542    u32 D58 = d58;
2543    u32 D59 = d59;
2544    u32 D60 = d60;
2545    u32 D61 = d61;
2546    u32 D62 = d62;
2547    u32 D63 = d63;
2548
2549    DES
2550    (
2551      k00, k01, k02, k03, k04, k05, k06,
2552      k07, k08, k09, k10, k11, k12, k13,
2553      k14, k15, k16, k17, k18, k19, k20,
2554      k21, k22, k23, k24, k25, k26, k27,
2555      K28, K29, K30, K31, K32, K33, K34,
2556      K35, K36, K37, K38, K39, K40, K41,
2557      K42, K43, K44, K45, K46, K47, K48,
2558      K49, K50, K51, K52, K53, K54, K55,
2559      &D00, &D01, &D02, &D03, &D04, &D05, &D06, &D07,
2560      &D08, &D09, &D10, &D11, &D12, &D13, &D14, &D15,
2561      &D16, &D17, &D18, &D19, &D20, &D21, &D22, &D23,
2562      &D24, &D25, &D26, &D27, &D28, &D29, &D30, &D31,
2563      &D32, &D33, &D34, &D35, &D36, &D37, &D38, &D39,
2564      &D40, &D41, &D42, &D43, &D44, &D45, &D46, &D47,
2565      &D48, &D49, &D50, &D51, &D52, &D53, &D54, &D55,
2566      &D56, &D57, &D58, &D59, &D60, &D61, &D62, &D63
2567    );
2568
2569    u32 tmpResult = 0;
2570
2571    tmpResult |= D00 ^ S00;
2572    tmpResult |= D01 ^ S01;
2573    tmpResult |= D02 ^ S02;
2574    tmpResult |= D03 ^ S03;
2575    tmpResult |= D04 ^ S04;
2576    tmpResult |= D05 ^ S05;
2577    tmpResult |= D06 ^ S06;
2578    tmpResult |= D07 ^ S07;
2579    tmpResult |= D08 ^ S08;
2580    tmpResult |= D09 ^ S09;
2581    tmpResult |= D10 ^ S10;
2582    tmpResult |= D11 ^ S11;
2583    tmpResult |= D12 ^ S12;
2584    tmpResult |= D13 ^ S13;
2585    tmpResult |= D14 ^ S14;
2586    tmpResult |= D15 ^ S15;
2587
2588    if (tmpResult == 0xffffffff) continue;
2589
2590    tmpResult |= D16 ^ S16;
2591    tmpResult |= D17 ^ S17;
2592    tmpResult |= D18 ^ S18;
2593    tmpResult |= D19 ^ S19;
2594    tmpResult |= D20 ^ S20;
2595    tmpResult |= D21 ^ S21;
2596    tmpResult |= D22 ^ S22;
2597    tmpResult |= D23 ^ S23;
2598    tmpResult |= D24 ^ S24;
2599    tmpResult |= D25 ^ S25;
2600    tmpResult |= D26 ^ S26;
2601    tmpResult |= D27 ^ S27;
2602    tmpResult |= D28 ^ S28;
2603    tmpResult |= D29 ^ S29;
2604    tmpResult |= D30 ^ S30;
2605    tmpResult |= D31 ^ S31;
2606
2607    if (tmpResult == 0xffffffff) continue;
2608
2609    tmpResult |= D32 ^ S32;
2610    tmpResult |= D33 ^ S33;
2611    tmpResult |= D34 ^ S34;
2612    tmpResult |= D35 ^ S35;
2613    tmpResult |= D36 ^ S36;
2614    tmpResult |= D37 ^ S37;
2615    tmpResult |= D38 ^ S38;
2616    tmpResult |= D39 ^ S39;
2617    tmpResult |= D40 ^ S40;
2618    tmpResult |= D41 ^ S41;
2619    tmpResult |= D42 ^ S42;
2620    tmpResult |= D43 ^ S43;
2621    tmpResult |= D44 ^ S44;
2622    tmpResult |= D45 ^ S45;
2623    tmpResult |= D46 ^ S46;
2624    tmpResult |= D47 ^ S47;
2625
2626    if (tmpResult == 0xffffffff) continue;
2627
2628    tmpResult |= D48 ^ S48;
2629    tmpResult |= D49 ^ S49;
2630    tmpResult |= D50 ^ S50;
2631    tmpResult |= D51 ^ S51;
2632    tmpResult |= D52 ^ S52;
2633    tmpResult |= D53 ^ S53;
2634    tmpResult |= D54 ^ S54;
2635    tmpResult |= D55 ^ S55;
2636    tmpResult |= D56 ^ S56;
2637    tmpResult |= D57 ^ S57;
2638    tmpResult |= D58 ^ S58;
2639    tmpResult |= D59 ^ S59;
2640    tmpResult |= D60 ^ S60;
2641    tmpResult |= D61 ^ S61;
2642    tmpResult |= D62 ^ S62;
2643    tmpResult |= D63 ^ S63;
2644
2645    if (tmpResult == 0xffffffff) continue;
2646
2647    const u32 slice = ffz (tmpResult);
2648
2649    #ifdef KERNEL_STATIC
2650    #include COMPARE_S
2651    #endif
2652  }
2653}
2654