/*
 * Bitslice DES S-boxes making use of a vector conditional select operation
 * (e.g., vsel on PowerPC with AltiVec).
 *
 * Gate counts: 36 33 33 26 35 34 34 32
 * Average: 32.875
 *
 * Several same-gate-count expressions for each S-box are included (for use on
 * different CPUs/GPUs).
 *
 * These Boolean expressions corresponding to DES S-boxes have been generated
 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
 * John the Ripper password cracker: http://www.openwall.com/john/
 * Being mathematical formulas, they are not copyrighted and are free for reuse
 * by anyone.
 *
 * This file (a specific representation of the S-box expressions, surrounding
 * logic) is Copyright (c) 2011 by Solar Designer <solar at openwall.com>.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
 *
 * The effort has been sponsored by Rapid7: http://www.rapid7.com
 */
24
/*
 * Variant-selection knobs.
 *
 * The #if conditions in front of each S-box variant below compare "regs" and
 * "latency" with the register count and latency the variant was tuned for
 * (each variant's comment quotes its "NN regs" figure).  "regs" is presumably
 * the number of vector registers the target offers; it is picked here from
 * the target architecture.  "latency" is a fixed setting.
 */
#undef regs
#if defined(__x86_64__) && defined(__XOP__)
#define regs 16
#elif defined(__x86_64__)
#define regs 15
#elif defined(__i386__)
/* Hopefully, AMD XOP (but in 32-bit mode) */
#define regs 8
#else
/* PowerPC with AltiVec, etc. */
/*
 * This branch is also taken by every other non-x86 target, so only pull in
 * the AltiVec header when the compiler says AltiVec is actually enabled;
 * elsewhere the header is missing or refuses to compile.
 */
#if defined(__ALTIVEC__)
#include <altivec.h>
#endif
#define regs 32
#endif

#undef latency
/* Latency 2 may also mean dual-issue with latency 1 */
#define latency 2
42
43 #if regs >= 17 || latency >= 3
44 /* s1-000010, 36 gates, 17 regs, 8/28/65/102/139 stall cycles */
45 MAYBE_INLINE static void
s1(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)46 s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
47 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
48 {
49 vtype x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0,
50 x5CA9E295;
51 vtype x55AFD1B7, x3C3C69C3, x6993B874;
52 vtype x5CEDE59F, x09FCE295, x5D91A51E, x529E962D;
53 vtype x29EEADC0, x4B8771A3, x428679F3, x6B68D433;
54 vtype x5BA7E193, x026F12F3, x6B27C493, x94D83B6C;
55 vtype x965E0B0F, x3327A113, x847F0A1F, xD6E19C32;
56 vtype x0DBCE883, x3A25A215, x37994A96;
57 vtype xC9C93B62, x89490F02, xB96C2D16;
58 vtype x0, x1, x2, x3;
59
60 vsel(x0F0F3333, a3, a2, a5);
61 vxor(x3C3C3C3C, a2, a3);
62 vor(x55FF55FF, a1, a4);
63 vxor(x69C369C3, x3C3C3C3C, x55FF55FF);
64 vsel(x0903B73F, a5, x0F0F3333, x69C369C3);
65 vxor(x09FCB7C0, a4, x0903B73F);
66 vxor(x5CA9E295, a1, x09FCB7C0);
67
68 vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333);
69 vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5);
70 vxor(x6993B874, x55AFD1B7, x3C3C69C3);
71
72 vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874);
73 vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5);
74 vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295);
75 vxor(x529E962D, x0F0F3333, x5D91A51E);
76
77 vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F);
78 vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295);
79 vsel(x428679F3, a5, x4B8771A3, x529E962D);
80 vxor(x6B68D433, x29EEADC0, x428679F3);
81
82 vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3);
83 vsel(x026F12F3, a4, x0F0F3333, x529E962D);
84 vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3);
85 vnot(x94D83B6C, x6B27C493);
86 vsel(x0, x94D83B6C, x6B68D433, a6);
87 vxor(*out1, *out1, x0);
88
89 vsel(x965E0B0F, x94D83B6C, a3, x428679F3);
90 vsel(x3327A113, x5BA7E193, a2, x69C369C3);
91 vsel(x847F0A1F, x965E0B0F, a4, x3327A113);
92 vxor(xD6E19C32, x529E962D, x847F0A1F);
93 vsel(x1, xD6E19C32, x5CA9E295, a6);
94 vxor(*out2, *out2, x1);
95
96 vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F);
97 vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F);
98 vxor(x37994A96, x0DBCE883, x3A25A215);
99 vsel(x3, x37994A96, x529E962D, a6);
100 vxor(*out4, *out4, x3);
101
102 vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E);
103 vsel(x89490F02, a3, xC9C93B62, x965E0B0F);
104 vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215);
105 vsel(x2, xB96C2D16, x6993B874, a6);
106 vxor(*out3, *out3, x2);
107 }
108 #else
109 /* s1-000011, 36 gates, 16 regs, 10/37/74/111/148 stall cycles */
110 MAYBE_INLINE static void
s1(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)111 s1(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
112 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
113 {
114 vtype x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0,
115 x5CA9E295;
116 vtype x55AFD1B7, x3C3C69C3, x6993B874;
117 vtype x5CEDE59F, x09FCE295, x5D91A51E, x529E962D;
118 vtype x29EEADC0, x4B8771A3, x428679F3, x6B68D433;
119 vtype x5BA7E193, x026F12F3, x6B27C493, x94D83B6C;
120 vtype x965E0B0F, x3327A113, x847F0A1F, xD6E19C32;
121 vtype x0DBCE883, x3A25A215, x37994A96;
122 vtype x8A487EA7, x8B480F07, xB96C2D16;
123 vtype x0, x1, x2, x3;
124
125 vsel(x0F0F3333, a3, a2, a5);
126 vxor(x3C3C3C3C, a2, a3);
127 vor(x55FF55FF, a1, a4);
128 vxor(x69C369C3, x3C3C3C3C, x55FF55FF);
129 vsel(x0903B73F, a5, x0F0F3333, x69C369C3);
130 vxor(x09FCB7C0, a4, x0903B73F);
131 vxor(x5CA9E295, a1, x09FCB7C0);
132
133 vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333);
134 vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5);
135 vxor(x6993B874, x55AFD1B7, x3C3C69C3);
136
137 vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874);
138 vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5);
139 vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295);
140 vxor(x529E962D, x0F0F3333, x5D91A51E);
141
142 vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F);
143 vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295);
144 vsel(x428679F3, a5, x4B8771A3, x529E962D);
145 vxor(x6B68D433, x29EEADC0, x428679F3);
146
147 vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3);
148 vsel(x026F12F3, a4, x0F0F3333, x529E962D);
149 vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3);
150 vnot(x94D83B6C, x6B27C493);
151 vsel(x0, x94D83B6C, x6B68D433, a6);
152 vxor(*out1, *out1, x0);
153
154 vsel(x965E0B0F, x94D83B6C, a3, x428679F3);
155 vsel(x3327A113, x5BA7E193, a2, x69C369C3);
156 vsel(x847F0A1F, x965E0B0F, a4, x3327A113);
157 vxor(xD6E19C32, x529E962D, x847F0A1F);
158 vsel(x1, xD6E19C32, x5CA9E295, a6);
159 vxor(*out2, *out2, x1);
160
161 vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F);
162 vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F);
163 vxor(x37994A96, x0DBCE883, x3A25A215);
164 vsel(x3, x37994A96, x529E962D, a6);
165 vxor(*out4, *out4, x3);
166
167 vxor(x8A487EA7, x5CA9E295, xD6E19C32);
168 vsel(x8B480F07, a3, x8A487EA7, x847F0A1F);
169 vsel(xB96C2D16, x8B480F07, x3C3C3C3C, x3A25A215);
170 vsel(x2, xB96C2D16, x6993B874, a6);
171 vxor(*out3, *out3, x2);
172 }
173 #endif
174
175 #if regs >= 18 && latency <= 2
176 /* s2-000000, 33 gates, 18 regs, 3/26/57/90/125 stall cycles */
177 MAYBE_INLINE static void
s2(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)178 s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
179 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
180 {
181 vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556;
182 vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A;
183 vtype x0F5AF03C, x6600FF56, x87A5F09C;
184 vtype xA55A963C, x3C69C30F, xB44BC32D;
185 vtype x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2;
186 vtype xB46C662D, x278DB412, xB66CB43B;
187 vtype xD2DC4E52, x27993333, xD2994E33;
188 vtype x278D0F2D, x2E0E547B, x09976748;
189 vtype x0, x1, x2, x3;
190
191 vsel(x55553333, a1, a3, a6);
192 vsel(x0055FF33, a6, x55553333, a5);
193 vsel(x33270F03, a3, a4, x0055FF33);
194 vxor(x66725A56, a1, x33270F03);
195 vxor(x00FFFF00, a5, a6);
196 vxor(x668DA556, x66725A56, x00FFFF00);
197
198 vsel(x0F0F5A56, a4, x66725A56, a6);
199 vnot(xF0F0A5A9, x0F0F5A56);
200 vxor(xA5A5969A, x55553333, xF0F0A5A9);
201 vxor(xA55A699A, x00FFFF00, xA5A5969A);
202 vsel(x1, xA55A699A, x668DA556, a2);
203 vxor(*out2, *out2, x1);
204
205 vxor(x0F5AF03C, a4, x0055FF33);
206 vsel(x6600FF56, x66725A56, a6, x00FFFF00);
207 vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56);
208
209 vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5);
210 vxor(x3C69C30F, a3, x0F5AF03C);
211 vsel(xB44BC32D, xA55A963C, x3C69C30F, a1);
212
213 vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A);
214 vsel(x0F4B0F2D, a4, xB44BC32D, a5);
215 vxor(x699CC37B, x66D7CC56, x0F4B0F2D);
216 vxor(x996C66D2, xF0F0A5A9, x699CC37B);
217 vsel(x0, x996C66D2, xB44BC32D, a2);
218 vxor(*out1, *out1, x0);
219
220 vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00);
221 vsel(x278DB412, x668DA556, xA5A5969A, a1);
222 vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56);
223
224 vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D);
225 vsel(x27993333, x278DB412, a3, x0055FF33);
226 vsel(xD2994E33, xD2DC4E52, x27993333, a5);
227 vsel(x3, x87A5F09C, xD2994E33, a2);
228 vxor(*out4, *out4, x3);
229
230 vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6);
231 vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D);
232 vxor(x09976748, x27993333, x2E0E547B);
233 vsel(x2, xB66CB43B, x09976748, a2);
234 vxor(*out3, *out3, x2);
235 }
236 #elif regs >= 18 && latency >= 4
237 /* s2-000002, 33 gates, 18 regs, 4/22/49/82/117 stall cycles */
238 MAYBE_INLINE static void
s2(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)239 s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
240 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
241 {
242 vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556;
243 vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A;
244 vtype x0F5AF03C, x6600FF56, x87A5F09C;
245 vtype xA55A963C, x3C69C30F, xB44BC32D;
246 vtype x0F4B0F2D, x66D7CC56, x962769FF, x996C66D2;
247 vtype xB46C662D, x278DB412, xB66CB43B;
248 vtype xD2DC4E52, x27993333, xD2994E33;
249 vtype x278D0F2D, x2E0E547B, x09976748;
250 vtype x0, x1, x2, x3;
251
252 vsel(x55553333, a1, a3, a6);
253 vsel(x0055FF33, a6, x55553333, a5);
254 vsel(x33270F03, a3, a4, x0055FF33);
255 vxor(x66725A56, a1, x33270F03);
256 vxor(x00FFFF00, a5, a6);
257 vxor(x668DA556, x66725A56, x00FFFF00);
258
259 vsel(x0F0F5A56, a4, x66725A56, a6);
260 vnot(xF0F0A5A9, x0F0F5A56);
261 vxor(xA5A5969A, x55553333, xF0F0A5A9);
262 vxor(xA55A699A, x00FFFF00, xA5A5969A);
263 vsel(x1, xA55A699A, x668DA556, a2);
264 vxor(*out2, *out2, x1);
265
266 vxor(x0F5AF03C, a4, x0055FF33);
267 vsel(x6600FF56, x66725A56, a6, x00FFFF00);
268 vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56);
269
270 vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5);
271 vxor(x3C69C30F, a3, x0F5AF03C);
272 vsel(xB44BC32D, xA55A963C, x3C69C30F, a1);
273
274 vsel(x0F4B0F2D, a4, xB44BC32D, a5);
275 vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A);
276 vxor(x962769FF, xF0F0A5A9, x66D7CC56);
277 vxor(x996C66D2, x0F4B0F2D, x962769FF);
278 vsel(x0, x996C66D2, xB44BC32D, a2);
279 vxor(*out1, *out1, x0);
280
281 vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00);
282 vsel(x278DB412, x668DA556, xA5A5969A, a1);
283 vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56);
284
285 vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D);
286 vsel(x27993333, x278DB412, a3, x0055FF33);
287 vsel(xD2994E33, xD2DC4E52, x27993333, a5);
288 vsel(x3, x87A5F09C, xD2994E33, a2);
289 vxor(*out4, *out4, x3);
290
291 vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6);
292 vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D);
293 vxor(x09976748, x27993333, x2E0E547B);
294 vsel(x2, xB66CB43B, x09976748, a2);
295 vxor(*out3, *out3, x2);
296 }
297 #else
298 /* s2-000012, 33 gates, 17 regs, 5/17/51/86/121 stall cycles */
299 MAYBE_INLINE static void
s2(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)300 s2(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
301 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
302 {
303 vtype x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556;
304 vtype x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A;
305 vtype x0F5AF03C, x6600FF56, x87A5F09C;
306 vtype x875AF03C, xF00F0FA9, xB44BC32D;
307 vtype x6627A556, xD26C667B, x278DB412, xB66CB43B;
308 vtype x668DC32D, x99723CD2, x996C66D2;
309 vtype xD20E4EA9, x27993333, xD2994E33;
310 vtype x9927C3E1, x089F3F0C, x09976748;
311 vtype x0, x1, x2, x3;
312
313 vsel(x55553333, a1, a3, a6);
314 vsel(x0055FF33, a6, x55553333, a5);
315 vsel(x33270F03, a3, a4, x0055FF33);
316 vxor(x66725A56, a1, x33270F03);
317 vxor(x00FFFF00, a5, a6);
318 vxor(x668DA556, x66725A56, x00FFFF00);
319
320 vsel(x0F0F5A56, a4, x66725A56, a6);
321 vnot(xF0F0A5A9, x0F0F5A56);
322 vxor(xA5A5969A, x55553333, xF0F0A5A9);
323 vxor(xA55A699A, x00FFFF00, xA5A5969A);
324 vsel(x1, xA55A699A, x668DA556, a2);
325 vxor(*out2, *out2, x1);
326
327 vxor(x0F5AF03C, a4, x0055FF33);
328 vsel(x6600FF56, x66725A56, a6, x00FFFF00);
329 vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56);
330
331 vsel(x875AF03C, x87A5F09C, x0F5AF03C, a5);
332 vsel(xF00F0FA9, xF0F0A5A9, a4, x00FFFF00);
333 vsel(xB44BC32D, x875AF03C, xF00F0FA9, a3);
334
335 vsel(x6627A556, x66725A56, x668DA556, x0055FF33);
336 vxor(xD26C667B, xB44BC32D, x6627A556);
337 vsel(x278DB412, x668DA556, xA5A5969A, a1);
338 vsel(xB66CB43B, xD26C667B, x278DB412, x6600FF56);
339
340 vsel(x668DC32D, x668DA556, xB44BC32D, a6);
341 vnot(x99723CD2, x668DC32D);
342 vsel(x996C66D2, x99723CD2, xD26C667B, x00FFFF00);
343 vsel(x0, x996C66D2, xB44BC32D, a2);
344 vxor(*out1, *out1, x0);
345
346 vsel(xD20E4EA9, xF00F0FA9, xD26C667B, x668DC32D);
347 vsel(x27993333, x278DB412, a3, x0055FF33);
348 vsel(xD2994E33, xD20E4EA9, x27993333, a5);
349 vsel(x3, x87A5F09C, xD2994E33, a2);
350 vxor(*out4, *out4, x3);
351
352 vxor(x9927C3E1, x0055FF33, x99723CD2);
353 vsel(x089F3F0C, a4, x00FFFF00, x27993333);
354 vsel(x09976748, x089F3F0C, x9927C3E1, x0F0F5A56);
355 vsel(x2, xB66CB43B, x09976748, a2);
356 vxor(*out3, *out3, x2);
357 }
358 #endif
359
360 #if latency >= 3
361 /* s3-000000, 33 gates, 17 regs, 6/10/33/66/102 stall cycles */
362 MAYBE_INLINE static void
s3(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)363 s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
364 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
365 {
366 vtype x0F330F33, x0F33F0CC, x5A66A599;
367 vtype x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4;
368 vtype x556BA09E, x665A93AC, x99A56C53;
369 vtype x25A1A797, x5713754C, x66559355, x47B135C6;
370 vtype x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1;
371 vtype x9E48CDE4, x655B905E, x00A55CFF, x9E49915E;
372 vtype xD6599874, x05330022, xD2699876;
373 vtype x665F9364, xD573F0F2, xB32C6396;
374 vtype x0, x1, x2, x3;
375
376 vsel(x0F330F33, a4, a3, a5);
377 vxor(x0F33F0CC, a6, x0F330F33);
378 vxor(x5A66A599, a2, x0F33F0CC);
379
380 vsel(x2111B7BB, a3, a6, x5A66A599);
381 vsel(x03FF3033, a5, a3, x0F33F0CC);
382 vsel(x05BB50EE, a5, x0F33F0CC, a2);
383 vsel(x074F201F, x03FF3033, a4, x05BB50EE);
384 vxor(x265E97A4, x2111B7BB, x074F201F);
385
386 vsel(x556BA09E, x5A66A599, x05BB50EE, a4);
387 vsel(x665A93AC, x556BA09E, x265E97A4, a3);
388 vnot(x99A56C53, x665A93AC);
389 vsel(x1, x265E97A4, x99A56C53, a1);
390 vxor(*out2, *out2, x1);
391
392 vxor(x25A1A797, x03FF3033, x265E97A4);
393 vsel(x5713754C, a2, x0F33F0CC, x074F201F);
394 vsel(x66559355, x665A93AC, a2, a5);
395 vsel(x47B135C6, x25A1A797, x5713754C, x66559355);
396
397 vxor(x9A5A5C60, x03FF3033, x99A56C53);
398 vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599);
399 vxor(x87698DB4, x5713754C, xD07AF8F8);
400 vxor(xE13C1EE1, x66559355, x87698DB4);
401
402 vsel(x9E48CDE4, x9A5A5C60, x87698DB4, x265E97A4);
403 vsel(x655B905E, x66559355, x05BB50EE, a4);
404 vsel(x00A55CFF, a5, a6, x9A5A5C60);
405 vsel(x9E49915E, x9E48CDE4, x655B905E, x00A55CFF);
406 vsel(x0, x9E49915E, xE13C1EE1, a1);
407 vxor(*out1, *out1, x0);
408
409 vsel(xD6599874, xD07AF8F8, x66559355, x0F33F0CC);
410 vand(x05330022, x0F330F33, x05BB50EE);
411 vsel(xD2699876, xD6599874, x00A55CFF, x05330022);
412 vsel(x3, x5A66A599, xD2699876, a1);
413 vxor(*out4, *out4, x3);
414
415 vsel(x665F9364, x265E97A4, x66559355, x47B135C6);
416 vsel(xD573F0F2, xD07AF8F8, x05330022, a4);
417 vxor(xB32C6396, x665F9364, xD573F0F2);
418 vsel(x2, xB32C6396, x47B135C6, a1);
419 vxor(*out3, *out3, x2);
420 }
421 #else
422 /* s3-000004, 33 gates, 17 regs, 3/13/41/77/113 stall cycles */
423 MAYBE_INLINE static void
s3(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)424 s3(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
425 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
426 {
427 vtype x0F330F33, x0F33F0CC, x5A66A599;
428 vtype x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4;
429 vtype x556BA09E, x665A93AC, x99A56C53;
430 vtype x25A1A797, x5713754C, x66559355, x47B135C6;
431 vtype x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1;
432 vtype x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E;
433 vtype xD069F8B4, x030FF0C3, xD2699876;
434 vtype xD579DDF4, xD579F0C3, xB32C6396;
435 vtype x0, x1, x2, x3;
436
437 vsel(x0F330F33, a4, a3, a5);
438 vxor(x0F33F0CC, a6, x0F330F33);
439 vxor(x5A66A599, a2, x0F33F0CC);
440
441 vsel(x2111B7BB, a3, a6, x5A66A599);
442 vsel(x03FF3033, a5, a3, x0F33F0CC);
443 vsel(x05BB50EE, a5, x0F33F0CC, a2);
444 vsel(x074F201F, x03FF3033, a4, x05BB50EE);
445 vxor(x265E97A4, x2111B7BB, x074F201F);
446
447 vsel(x556BA09E, x5A66A599, x05BB50EE, a4);
448 vsel(x665A93AC, x556BA09E, x265E97A4, a3);
449 vnot(x99A56C53, x665A93AC);
450 vsel(x1, x265E97A4, x99A56C53, a1);
451 vxor(*out2, *out2, x1);
452
453 vxor(x25A1A797, x03FF3033, x265E97A4);
454 vsel(x5713754C, a2, x0F33F0CC, x074F201F);
455 vsel(x66559355, x665A93AC, a2, a5);
456 vsel(x47B135C6, x25A1A797, x5713754C, x66559355);
457
458 vxor(x9A5A5C60, x03FF3033, x99A56C53);
459 vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599);
460 vxor(x87698DB4, x5713754C, xD07AF8F8);
461 vxor(xE13C1EE1, x66559355, x87698DB4);
462
463 vsel(x000CFFCF, a4, a6, x0F33F0CC);
464 vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE);
465 vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60);
466 vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4);
467 vsel(x0, x9E49915E, xE13C1EE1, a1);
468 vxor(*out1, *out1, x0);
469
470 vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5);
471 vsel(x030FF0C3, x000CFFCF, x03FF3033, a4);
472 vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3);
473 vsel(x3, x5A66A599, xD2699876, a1);
474 vxor(*out4, *out4, x3);
475
476 vsel(xD579DDF4, xD07AF8F8, a2, x5713754C);
477 vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6);
478 vxor(xB32C6396, x66559355, xD579F0C3);
479 vsel(x2, xB32C6396, x47B135C6, a1);
480 vxor(*out3, *out3, x2);
481 }
482 #endif
483
484 #if regs >= 13
485 /* s4-000014, 26 gates, 13 regs, 2/17/42/70/98 stall cycles */
486 MAYBE_INLINE static void
s4(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)487 s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
488 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
489 {
490 vtype x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0,
491 x0AF50F0F, x4CA36B59;
492 vtype xB35C94A6;
493 vtype x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1;
494 vtype x56E9861E;
495 vtype x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A;
496 vtype x31F720B3, x11FB21B3, x4712A7AD, x9586CA37;
497 vtype x0, x1, x2, x3;
498
499 vsel(x0505AFAF, a5, a3, a1);
500 vsel(x0555AF55, x0505AFAF, a1, a4);
501 vxor(x0A5AA05A, a3, x0555AF55);
502 vsel(x46566456, a1, x0A5AA05A, a2);
503 vsel(x0A0A5F5F, a3, a5, a1);
504 vxor(x0AF55FA0, a4, x0A0A5F5F);
505 vsel(x0AF50F0F, x0AF55FA0, a3, a5);
506 vxor(x4CA36B59, x46566456, x0AF50F0F);
507
508 vnot(xB35C94A6, x4CA36B59);
509
510 vsel(x01BB23BB, a4, a2, x0555AF55);
511 vxor(x5050FAFA, a1, x0505AFAF);
512 vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA);
513 vxor(xA91679E1, x0A0A5F5F, xA31C26BE);
514
515 vnot(x56E9861E, xA91679E1);
516
517 vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4);
518 vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F);
519 vsel(x827D9784, xB35C94A6, x0AF55F00, a2);
520 vxor(xD2946D9A, x50E9FA1E, x827D9784);
521 vsel(x2, xD2946D9A, x4CA36B59, a6);
522 vxor(*out3, *out3, x2);
523 vsel(x3, xB35C94A6, xD2946D9A, a6);
524 vxor(*out4, *out4, x3);
525
526 vsel(x31F720B3, a2, a4, x0AF55FA0);
527 vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA);
528 vxor(x4712A7AD, x56E9861E, x11FB21B3);
529 vxor(x9586CA37, xD2946D9A, x4712A7AD);
530 vsel(x0, x56E9861E, x9586CA37, a6);
531 vxor(*out1, *out1, x0);
532 vsel(x1, x9586CA37, xA91679E1, a6);
533 vxor(*out2, *out2, x1);
534 }
535 #else
536 /* s4-000033, 26 gates, 12 regs, 4/22/48/76/104 stall cycles */
537 MAYBE_INLINE static void
s4(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)538 s4(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
539 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
540 {
541 vtype x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0,
542 x0AF50F0F, x4CA36B59;
543 vtype xB35C94A6;
544 vtype x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1;
545 vtype x56E9861E;
546 vtype x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A;
547 vtype xD2F56D00, x46F9870F, x4773A737, x9586CA37;
548 vtype x0, x1, x2, x3;
549
550 vsel(x0505AFAF, a5, a3, a1);
551 vsel(x0555AF55, x0505AFAF, a1, a4);
552 vxor(x0A5AA05A, a3, x0555AF55);
553 vsel(x46566456, a1, x0A5AA05A, a2);
554 vsel(x0A0A5F5F, a3, a5, a1);
555 vxor(x0AF55FA0, a4, x0A0A5F5F);
556 vsel(x0AF50F0F, x0AF55FA0, a3, a5);
557 vxor(x4CA36B59, x46566456, x0AF50F0F);
558
559 vnot(xB35C94A6, x4CA36B59);
560
561 vsel(x01BB23BB, a4, a2, x0555AF55);
562 vxor(x5050FAFA, a1, x0505AFAF);
563 vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA);
564 vxor(xA91679E1, x0A0A5F5F, xA31C26BE);
565
566 vnot(x56E9861E, xA91679E1);
567
568 vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4);
569 vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F);
570 vsel(x827D9784, xB35C94A6, x0AF55F00, a2);
571 vxor(xD2946D9A, x50E9FA1E, x827D9784);
572 vsel(x2, xD2946D9A, x4CA36B59, a6);
573 vxor(*out3, *out3, x2);
574 vsel(x3, xB35C94A6, xD2946D9A, a6);
575 vxor(*out4, *out4, x3);
576
577 vsel(xD2F56D00, xD2946D9A, x0AF55F00, a4);
578 vsel(x46F9870F, x56E9861E, x0AF50F0F, a2);
579 vsel(x4773A737, x46F9870F, a2, x01BB23BB);
580 vxor(x9586CA37, xD2F56D00, x4773A737);
581 vsel(x0, x56E9861E, x9586CA37, a6);
582 vxor(*out1, *out1, x0);
583 vsel(x1, x9586CA37, xA91679E1, a6);
584 vxor(*out2, *out2, x1);
585 }
586 #endif
587
588 #if regs <= 18 && latency <= 2
589 /* s5-000000, 35 gates, 18 regs, 7/33/68/105/142 stall cycles */
590 MAYBE_INLINE static void
s5(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)591 s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
592 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
593 {
594 vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6;
595 vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463;
596 vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C;
597 vtype xAC81CFB2, xF72577AF, x5BA4B81D;
598 vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A;
599 vtype x12E6283D, x9E47D3D4, x1A676AB4;
600 vtype x2E3C69C6, x92C7C296, x369CC1D6;
601 vtype x891556DF, xE5E77F82, x6CF2295D;
602 vtype x0, x1, x2, x3;
603
604 vsel(x550F550F, a1, a3, a5);
605 vnot(xAAF0AAF0, x550F550F);
606 vsel(xA5F5A5F5, xAAF0AAF0, a1, a3);
607 vxor(x96C696C6, a2, xA5F5A5F5);
608 vxor(x00FFFF00, a5, a6);
609 vxor(x963969C6, x96C696C6, x00FFFF00);
610
611 vsel(x2E3C2E3C, a3, xAAF0AAF0, a2);
612 vsel(xB73121F7, a2, x963969C6, x96C696C6);
613 vsel(x1501DF0F, a6, x550F550F, xB73121F7);
614 vsel(x00558A5F, x1501DF0F, a5, a1);
615 vxor(x2E69A463, x2E3C2E3C, x00558A5F);
616
617 vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6);
618 vsel(x045157FD, a6, a1, x0679ED42);
619 vsel(xB32077FF, xB73121F7, a6, x045157FD);
620 vxor(x9D49D39C, x2E69A463, xB32077FF);
621 vsel(x2, x9D49D39C, x2E69A463, a4);
622 vxor(*out3, *out3, x2);
623
624 vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42);
625 vsel(xF72577AF, xB32077FF, x550F550F, a1);
626 vxor(x5BA4B81D, xAC81CFB2, xF72577AF);
627 vsel(x1, x5BA4B81D, x963969C6, a4);
628 vxor(*out2, *out2, x1);
629
630 vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6);
631 vsel(x4895469F, x5BA477AF, x00558A5F, a2);
632 vsel(x3A35273A, x2E3C2E3C, a2, x963969C6);
633 vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D);
634
635 vsel(x12E6283D, a5, x5BA4B81D, x963969C6);
636 vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2);
637 vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F);
638
639 vsel(x2E3C69C6, x2E3C2E3C, x963969C6, a6);
640 vsel(x92C7C296, x96C696C6, x1A676AB4, a1);
641 vsel(x369CC1D6, x2E3C69C6, x92C7C296, x5BA4B81D);
642 vsel(x0, x369CC1D6, x1A676AB4, a4);
643 vxor(*out1, *out1, x0);
644
645 vsel(x891556DF, xB32077FF, x4895469F, x3A35273A);
646 vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D);
647 vxor(x6CF2295D, x891556DF, xE5E77F82);
648 vsel(x3, x1A35669A, x6CF2295D, a4);
649 vxor(*out4, *out4, x3);
650 }
651 #elif regs == 19 || (regs >= 20 && latency >= 3)
652 /* s5-000005, 35 gates, 19 regs, 7/29/60/95/132 stall cycles */
653 MAYBE_INLINE static void
s5(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)654 s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
655 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
656 {
657 vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6;
658 vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463;
659 vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C;
660 vtype xAC81CFB2, xF72577AF, x5BA4B81D;
661 vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A;
662 vtype x12E6283D, x9E47D3D4, x1A676AB4;
663 vtype x2E3CD3D4, x9697C1C6, x369CC1D6;
664 vtype x891556DF, xE5E77F82, x6CF2295D;
665 vtype x0, x1, x2, x3;
666
667 vsel(x550F550F, a1, a3, a5);
668 vnot(xAAF0AAF0, x550F550F);
669 vsel(xA5F5A5F5, xAAF0AAF0, a1, a3);
670 vxor(x96C696C6, a2, xA5F5A5F5);
671 vxor(x00FFFF00, a5, a6);
672 vxor(x963969C6, x96C696C6, x00FFFF00);
673
674 vsel(x2E3C2E3C, a3, xAAF0AAF0, a2);
675 vsel(xB73121F7, a2, x963969C6, x96C696C6);
676 vsel(x1501DF0F, a6, x550F550F, xB73121F7);
677 vsel(x00558A5F, x1501DF0F, a5, a1);
678 vxor(x2E69A463, x2E3C2E3C, x00558A5F);
679
680 vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6);
681 vsel(x045157FD, a6, a1, x0679ED42);
682 vsel(xB32077FF, xB73121F7, a6, x045157FD);
683 vxor(x9D49D39C, x2E69A463, xB32077FF);
684 vsel(x2, x9D49D39C, x2E69A463, a4);
685 vxor(*out3, *out3, x2);
686
687 vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42);
688 vsel(xF72577AF, xB32077FF, x550F550F, a1);
689 vxor(x5BA4B81D, xAC81CFB2, xF72577AF);
690 vsel(x1, x5BA4B81D, x963969C6, a4);
691 vxor(*out2, *out2, x1);
692
693 vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6);
694 vsel(x4895469F, x5BA477AF, x00558A5F, a2);
695 vsel(x3A35273A, x2E3C2E3C, a2, x963969C6);
696 vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D);
697
698 vsel(x12E6283D, a5, x5BA4B81D, x963969C6);
699 vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2);
700 vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F);
701
702 vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6);
703 vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD);
704 vsel(x369CC1D6, x2E3CD3D4, x9697C1C6, x5BA477AF);
705 vsel(x0, x369CC1D6, x1A676AB4, a4);
706 vxor(*out1, *out1, x0);
707
708 vsel(x891556DF, xB32077FF, x4895469F, x3A35273A);
709 vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D);
710 vxor(x6CF2295D, x891556DF, xE5E77F82);
711 vsel(x3, x1A35669A, x6CF2295D, a4);
712 vxor(*out4, *out4, x3);
713 }
714 #elif regs <= 18 && latency >= 5
715 /* s5-000011, 35 gates, 18 regs, 9/31/62/95/132 stall cycles */
716 MAYBE_INLINE static void
s5(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)717 s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
718 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
719 {
720 vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6;
721 vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463;
722 vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C;
723 vtype xAC81CFB2, xF72577AF, x5BA4B81D;
724 vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A;
725 vtype x12E6283D, x9E47D3D4, x1A676AB4;
726 vtype x2E3CD3D4, x96DF41C6, x369CC1D6;
727 vtype x891556DF, xE5E77F82, x6CF2295D;
728 vtype x0, x1, x2, x3;
729
730 vsel(x550F550F, a1, a3, a5);
731 vnot(xAAF0AAF0, x550F550F);
732 vsel(xA5F5A5F5, xAAF0AAF0, a1, a3);
733 vxor(x96C696C6, a2, xA5F5A5F5);
734 vxor(x00FFFF00, a5, a6);
735 vxor(x963969C6, x96C696C6, x00FFFF00);
736
737 vsel(x2E3C2E3C, a3, xAAF0AAF0, a2);
738 vsel(xB73121F7, a2, x963969C6, x96C696C6);
739 vsel(x1501DF0F, a6, x550F550F, xB73121F7);
740 vsel(x00558A5F, x1501DF0F, a5, a1);
741 vxor(x2E69A463, x2E3C2E3C, x00558A5F);
742
743 vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6);
744 vsel(x045157FD, a6, a1, x0679ED42);
745 vsel(xB32077FF, xB73121F7, a6, x045157FD);
746 vxor(x9D49D39C, x2E69A463, xB32077FF);
747 vsel(x2, x9D49D39C, x2E69A463, a4);
748 vxor(*out3, *out3, x2);
749
750 vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42);
751 vsel(xF72577AF, xB32077FF, x550F550F, a1);
752 vxor(x5BA4B81D, xAC81CFB2, xF72577AF);
753 vsel(x1, x5BA4B81D, x963969C6, a4);
754 vxor(*out2, *out2, x1);
755
756 vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6);
757 vsel(x4895469F, x5BA477AF, x00558A5F, a2);
758 vsel(x3A35273A, x2E3C2E3C, a2, x963969C6);
759 vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D);
760
761 vsel(x12E6283D, a5, x5BA4B81D, x963969C6);
762 vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2);
763 vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F);
764
765 vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6);
766 vsel(x96DF41C6, x963969C6, x96C696C6, x12E6283D);
767 vsel(x369CC1D6, x2E3CD3D4, x96DF41C6, x5BA477AF);
768 vsel(x0, x369CC1D6, x1A676AB4, a4);
769 vxor(*out1, *out1, x0);
770
771 vsel(x891556DF, xB32077FF, x4895469F, x3A35273A);
772 vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D);
773 vxor(x6CF2295D, x891556DF, xE5E77F82);
774 vsel(x3, x1A35669A, x6CF2295D, a4);
775 vxor(*out4, *out4, x3);
776 }
777 #elif regs >= 20
778 /* s5-000016, 35 gates, 20 regs, 6/30/62/98/135 stall cycles */
779 MAYBE_INLINE static void
s5(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)780 s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
781 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
782 {
783 vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6;
784 vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463;
785 vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C;
786 vtype xAC81CFB2, xF72577AF, x5BA4B81D;
787 vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A;
788 vtype x12E6283D, x9E47D3D4, x1A676AB4;
789 vtype x891556DF, xE5E77F82, x6CF2295D;
790 vtype x2E3CA5F5, x9697C1C6, x369CC1D6;
791 vtype x0, x1, x2, x3;
792
793 vsel(x550F550F, a1, a3, a5);
794 vnot(xAAF0AAF0, x550F550F);
795 vsel(xA5F5A5F5, xAAF0AAF0, a1, a3);
796 vxor(x96C696C6, a2, xA5F5A5F5);
797 vxor(x00FFFF00, a5, a6);
798 vxor(x963969C6, x96C696C6, x00FFFF00);
799
800 vsel(x2E3C2E3C, a3, xAAF0AAF0, a2);
801 vsel(xB73121F7, a2, x963969C6, x96C696C6);
802 vsel(x1501DF0F, a6, x550F550F, xB73121F7);
803 vsel(x00558A5F, x1501DF0F, a5, a1);
804 vxor(x2E69A463, x2E3C2E3C, x00558A5F);
805
806 vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6);
807 vsel(x045157FD, a6, a1, x0679ED42);
808 vsel(xB32077FF, xB73121F7, a6, x045157FD);
809 vxor(x9D49D39C, x2E69A463, xB32077FF);
810 vsel(x2, x9D49D39C, x2E69A463, a4);
811 vxor(*out3, *out3, x2);
812
813 vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42);
814 vsel(xF72577AF, xB32077FF, x550F550F, a1);
815 vxor(x5BA4B81D, xAC81CFB2, xF72577AF);
816 vsel(x1, x5BA4B81D, x963969C6, a4);
817 vxor(*out2, *out2, x1);
818
819 vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6);
820 vsel(x4895469F, x5BA477AF, x00558A5F, a2);
821 vsel(x3A35273A, x2E3C2E3C, a2, x963969C6);
822 vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D);
823
824 vsel(x12E6283D, a5, x5BA4B81D, x963969C6);
825 vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2);
826 vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F);
827
828 vsel(x891556DF, xB32077FF, x4895469F, x3A35273A);
829 vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D);
830 vxor(x6CF2295D, x891556DF, xE5E77F82);
831 vsel(x3, x1A35669A, x6CF2295D, a4);
832 vxor(*out4, *out4, x3);
833
834 vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6);
835 vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD);
836 vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF);
837 vsel(x0, x369CC1D6, x1A676AB4, a4);
838 vxor(*out1, *out1, x0);
839 }
840 #else
841 /* s5-000023, 35 gates, 18 regs, 9/30/61/96/133 stall cycles */
842 MAYBE_INLINE static void
s5(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)843 s5(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
844 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
845 {
846 vtype x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6;
847 vtype x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463;
848 vtype x0679ED42, x045157FD, xB32077FF, x9D49D39C;
849 vtype xAC81CFB2, xF72577AF, x5BA4B81D;
850 vtype x5BA477AF, x4895469F, x3A35273A, x1A35669A;
851 vtype x12E6283D, x9E47D3D4, x1A676AB4;
852 vtype x891556DF, xE5E77F82, x6CF2295D;
853 vtype x2E3CD3D4, x96DF41C6, x369CC1D6;
854 vtype x0, x1, x2, x3;
855
856 vsel(x550F550F, a1, a3, a5);
857 vnot(xAAF0AAF0, x550F550F);
858 vsel(xA5F5A5F5, xAAF0AAF0, a1, a3);
859 vxor(x96C696C6, a2, xA5F5A5F5);
860 vxor(x00FFFF00, a5, a6);
861 vxor(x963969C6, x96C696C6, x00FFFF00);
862
863 vsel(x2E3C2E3C, a3, xAAF0AAF0, a2);
864 vsel(xB73121F7, a2, x963969C6, x96C696C6);
865 vsel(x1501DF0F, a6, x550F550F, xB73121F7);
866 vsel(x00558A5F, x1501DF0F, a5, a1);
867 vxor(x2E69A463, x2E3C2E3C, x00558A5F);
868
869 vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6);
870 vsel(x045157FD, a6, a1, x0679ED42);
871 vsel(xB32077FF, xB73121F7, a6, x045157FD);
872 vxor(x9D49D39C, x2E69A463, xB32077FF);
873 vsel(x2, x9D49D39C, x2E69A463, a4);
874 vxor(*out3, *out3, x2);
875
876 vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42);
877 vsel(xF72577AF, xB32077FF, x550F550F, a1);
878 vxor(x5BA4B81D, xAC81CFB2, xF72577AF);
879 vsel(x1, x5BA4B81D, x963969C6, a4);
880 vxor(*out2, *out2, x1);
881
882 vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6);
883 vsel(x4895469F, x5BA477AF, x00558A5F, a2);
884 vsel(x3A35273A, x2E3C2E3C, a2, x963969C6);
885 vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D);
886
887 vsel(x12E6283D, a5, x5BA4B81D, x963969C6);
888 vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2);
889 vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F);
890
891 vsel(x891556DF, xB32077FF, x4895469F, x3A35273A);
892 vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D);
893 vxor(x6CF2295D, x891556DF, xE5E77F82);
894 vsel(x3, x1A35669A, x6CF2295D, a4);
895 vxor(*out4, *out4, x3);
896
897 vsel(x2E3CD3D4, x2E3C2E3C, x9E47D3D4, a6);
898 vsel(x96DF41C6, x963969C6, x96C696C6, x12E6283D);
899 vsel(x369CC1D6, x2E3CD3D4, x96DF41C6, x5BA477AF);
900 vsel(x0, x369CC1D6, x1A676AB4, a4);
901 vxor(*out1, *out1, x0);
902 }
903 #endif
904
905 #if regs >= 16 && latency <= 2
906 /* s6-000000, 34 gates, 16 regs, 5/34/70/107/144 stall cycles */
907 MAYBE_INLINE static void
s6(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)908 s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
909 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
910 {
911 vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A;
912 vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6;
913 vtype x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97;
914 vtype x86CD4C9B, x12E0FFFD, x942D9A67;
915 vtype x142956AB, x455D45DF, x1C3EE619;
916 vtype x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79;
917 vtype x840DBB67, x6DA19C1E, x925E63E1;
918 vtype x9C3CA761, x257A75D5, xB946D2B4;
919 vtype x0, x1, x2, x3;
920
921 vsel(x555500FF, a1, a4, a5);
922 vxor(x666633CC, a2, x555500FF);
923 vsel(x606F30CF, x666633CC, a4, a3);
924 vxor(x353A659A, a1, x606F30CF);
925 vxor(x353A9A65, a5, x353A659A);
926 vnot(xCAC5659A, x353A9A65);
927
928 vsel(x353A6565, x353A659A, x353A9A65, a4);
929 vsel(x0A3F0A6F, a3, a4, x353A6565);
930 vxor(x6C5939A3, x666633CC, x0A3F0A6F);
931 vxor(x5963A3C6, x353A9A65, x6C5939A3);
932
933 vsel(x35FF659A, a4, x353A659A, x353A6565);
934 vxor(x3AF06A95, a3, x35FF659A);
935 vsel(x05CF0A9F, a4, a3, x353A9A65);
936 vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3);
937
938 vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3);
939 vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97);
940 vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD);
941 vsel(x0, xCAC5659A, x942D9A67, a6);
942 vxor(*out1, *out1, x0);
943
944 vsel(x142956AB, x353A659A, x942D9A67, a2);
945 vsel(x455D45DF, a1, x86CD4C9B, x142956AB);
946 vxor(x1C3EE619, x5963A3C6, x455D45DF);
947 vsel(x3, x5963A3C6, x1C3EE619, a6);
948 vxor(*out4, *out4, x3);
949
950 vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65);
951 vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F);
952 vxor(x3CF19C86, x1C3EE619, x20CF7A9F);
953 vxor(x69A49C79, x555500FF, x3CF19C86);
954
955 vsel(x840DBB67, a5, x942D9A67, x86CD4C9B);
956 vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67);
957 vnot(x925E63E1, x6DA19C1E);
958 vsel(x1, x925E63E1, x69A49C79, a6);
959 vxor(*out2, *out2, x1);
960
961 vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86);
962 vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF);
963 vxor(xB946D2B4, x9C3CA761, x257A75D5);
964 vsel(x2, x16E94A97, xB946D2B4, a6);
965 vxor(*out3, *out3, x2);
966 }
967 #elif regs == 15
968 /* s6-000008, 34 gates, 15 regs, 6/25/57/94/131 stall cycles */
969 MAYBE_INLINE static void
s6(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)970 s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
971 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
972 {
973 vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A;
974 vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6;
975 vtype x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97;
976 vtype x86CD4C9B, x12E0FFFD, x942D9A67;
977 vtype x142956AB, x455D45DF, x1C3EE619;
978 vtype xC3C36393, x2D1B471E, xC70B631E, x925E63E1;
979 vtype x8C2F1A67, x965B6386, x69A49C79;
980 vtype x1C2E8201, xA56850B5, xB946D2B4;
981 vtype x0, x1, x2, x3;
982
983 vsel(x555500FF, a1, a4, a5);
984 vxor(x666633CC, a2, x555500FF);
985 vsel(x606F30CF, x666633CC, a4, a3);
986 vxor(x353A659A, a1, x606F30CF);
987 vxor(x353A9A65, a5, x353A659A);
988 vnot(xCAC5659A, x353A9A65);
989
990 vsel(x353A6565, x353A659A, x353A9A65, a4);
991 vsel(x0A3F0A6F, a3, a4, x353A6565);
992 vxor(x6C5939A3, x666633CC, x0A3F0A6F);
993 vxor(x5963A3C6, x353A9A65, x6C5939A3);
994
995 vsel(x35FF659A, a4, x353A659A, x353A6565);
996 vxor(x3AF06A95, a3, x35FF659A);
997 vsel(x05CF0A9F, a4, a3, x353A9A65);
998 vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3);
999
1000 vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3);
1001 vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97);
1002 vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD);
1003 vsel(x0, xCAC5659A, x942D9A67, a6);
1004 vxor(*out1, *out1, x0);
1005
1006 vsel(x142956AB, x353A659A, x942D9A67, a2);
1007 vsel(x455D45DF, a1, x86CD4C9B, x142956AB);
1008 vxor(x1C3EE619, x5963A3C6, x455D45DF);
1009 vsel(x3, x5963A3C6, x1C3EE619, a6);
1010 vxor(*out4, *out4, x3);
1011
1012 vsel(xC3C36393, xCAC5659A, a2, a3);
1013 vsel(x2D1B471E, x353A659A, a3, x5963A3C6);
1014 vsel(xC70B631E, xC3C36393, x2D1B471E, x05CF0A9F);
1015 vxor(x925E63E1, x555500FF, xC70B631E);
1016
1017 vsel(x8C2F1A67, x942D9A67, x0A3F0A6F, x5963A3C6);
1018 vsel(x965B6386, x925E63E1, xC70B631E, x8C2F1A67);
1019 vnot(x69A49C79, x965B6386);
1020 vsel(x1, x925E63E1, x69A49C79, a6);
1021 vxor(*out2, *out2, x1);
1022
1023 vsel(x1C2E8201, x942D9A67, x1C3EE619, x8C2F1A67);
1024 vxor(xA56850B5, a2, x965B6386);
1025 vxor(xB946D2B4, x1C2E8201, xA56850B5);
1026 vsel(x2, x16E94A97, xB946D2B4, a6);
1027 vxor(*out3, *out3, x2);
1028 }
1029 #elif regs <= 14
1030 /* s6-000082, 34 gates, 14 regs, 8/31/65/102/139 stall cycles */
1031 MAYBE_INLINE static void
s6(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1032 s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1033 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1034 {
1035 vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A;
1036 vtype x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6;
1037 vtype x35FF659A, x3AF06A95, x066F0CCF, x16E94A97;
1038 vtype x1872E297, x35BE6539, x1C3EE619;
1039 vtype x86CD4C9B, x12E0FFFD, x942D9A67;
1040 vtype x0A63C087, x9E4E5AE0, x02FA65FD, x925E63E1;
1041 vtype xAB756193, x8A75E187, xB946D2B4;
1042 vtype x375A7BA0, x965B6386, x69A49C79;
1043 vtype x0, x1, x2, x3;
1044
1045 vsel(x555500FF, a1, a4, a5);
1046 vxor(x666633CC, a2, x555500FF);
1047 vsel(x606F30CF, x666633CC, a4, a3);
1048 vxor(x353A659A, a1, x606F30CF);
1049 vxor(x353A9A65, a5, x353A659A);
1050 vnot(xCAC5659A, x353A9A65);
1051
1052 vsel(x353A6565, x353A659A, x353A9A65, a4);
1053 vsel(x0A3F0A6F, a3, a4, x353A6565);
1054 vxor(x6C5939A3, x666633CC, x0A3F0A6F);
1055 vxor(x5963A3C6, x353A9A65, x6C5939A3);
1056
1057 vsel(x35FF659A, a4, x353A659A, x353A6565);
1058 vxor(x3AF06A95, a3, x35FF659A);
1059 vsel(x066F0CCF, a3, a4, x5963A3C6);
1060 vsel(x16E94A97, x3AF06A95, x066F0CCF, x6C5939A3);
1061
1062 vsel(x1872E297, x5963A3C6, x3AF06A95, a1);
1063 vsel(x35BE6539, x35FF659A, x353A6565, x6C5939A3);
1064 vsel(x1C3EE619, x1872E297, x35BE6539, x066F0CCF);
1065 vsel(x3, x5963A3C6, x1C3EE619, a6);
1066 vxor(*out4, *out4, x3);
1067
1068 vsel(x86CD4C9B, xCAC5659A, x066F0CCF, x6C5939A3);
1069 vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97);
1070 vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD);
1071 vsel(x0, xCAC5659A, x942D9A67, a6);
1072 vxor(*out1, *out1, x0);
1073
1074 vsel(x0A63C087, x1872E297, x066F0CCF, a2);
1075 vxor(x9E4E5AE0, x942D9A67, x0A63C087);
1076 vsel(x02FA65FD, x12E0FFFD, a4, x353A9A65);
1077 vsel(x925E63E1, x9E4E5AE0, x02FA65FD, x6C5939A3);
1078
1079 vsel(xAB756193, a2, xCAC5659A, x9E4E5AE0);
1080 vsel(x8A75E187, x0A63C087, xAB756193, x925E63E1);
1081 vxor(xB946D2B4, a2, x8A75E187);
1082 vsel(x2, x16E94A97, xB946D2B4, a6);
1083 vxor(*out3, *out3, x2);
1084
1085 vsel(x375A7BA0, a2, x9E4E5AE0, x16E94A97);
1086 vsel(x965B6386, x8A75E187, x375A7BA0, x1C3EE619);
1087 vnot(x69A49C79, x965B6386);
1088 vsel(x1, x925E63E1, x69A49C79, a6);
1089 vxor(*out2, *out2, x1);
1090 }
1091 #else
1092 /* s6-000461, 34 gates, 16 regs, 7/23/48/82/118 stall cycles */
1093 MAYBE_INLINE static void
s6(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1094 s6(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1095 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1096 {
1097 vtype x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A;
1098 vtype x553A5565, x0A3F0A6F, x6C5939A3, x5963A3C6;
1099 vtype x15FF459A, x1AF04A95, x066F0CCF, x16E94A97;
1100 vtype x1872E297, x55BE5539, x1C3EE619;
1101 vtype x86CD4C9B, x12E0FFFD, x942D9A67;
1102 vtype x2FCAD0F0, x1BF21BB1, x466E4C89, x69A49C79;
1103 vtype x965B6386, x12769BE1, x925E63E1;
1104 vtype x9867CA97, x69339C33, xB946D2B4;
1105 vtype x0, x1, x2, x3;
1106
1107 vsel(x555500FF, a1, a4, a5);
1108 vxor(x666633CC, a2, x555500FF);
1109 vsel(x606F30CF, x666633CC, a4, a3);
1110 vxor(x353A659A, a1, x606F30CF);
1111 vxor(x353A9A65, a5, x353A659A);
1112 vnot(xCAC5659A, x353A9A65);
1113
1114 vsel(x553A5565, a1, x353A9A65, a4);
1115 vsel(x0A3F0A6F, a3, a4, x553A5565);
1116 vxor(x6C5939A3, x666633CC, x0A3F0A6F);
1117 vxor(x5963A3C6, x353A9A65, x6C5939A3);
1118
1119 vsel(x15FF459A, a4, x353A659A, x553A5565);
1120 vxor(x1AF04A95, a3, x15FF459A);
1121 vsel(x066F0CCF, a3, a4, x5963A3C6);
1122 vsel(x16E94A97, x1AF04A95, x066F0CCF, x6C5939A3);
1123
1124 vsel(x1872E297, x5963A3C6, x1AF04A95, a1);
1125 vsel(x55BE5539, x15FF459A, x553A5565, x6C5939A3);
1126 vsel(x1C3EE619, x1872E297, x55BE5539, x066F0CCF);
1127 vsel(x3, x5963A3C6, x1C3EE619, a6);
1128 vxor(*out4, *out4, x3);
1129
1130 vsel(x86CD4C9B, xCAC5659A, x066F0CCF, x6C5939A3);
1131 vsel(x12E0FFFD, a5, x1AF04A95, x16E94A97);
1132 vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD);
1133 vsel(x0, xCAC5659A, x942D9A67, a6);
1134 vxor(*out1, *out1, x0);
1135
1136 vxor(x2FCAD0F0, x353A9A65, x1AF04A95);
1137 vsel(x1BF21BB1, x1AF04A95, a2, x553A5565);
1138 vsel(x466E4C89, x55BE5539, x066F0CCF, x1BF21BB1);
1139 vxor(x69A49C79, x2FCAD0F0, x466E4C89);
1140
1141 vnot(x965B6386, x69A49C79);
1142 vsel(x12769BE1, x1BF21BB1, x942D9A67, x69A49C79);
1143 vsel(x925E63E1, x965B6386, x12769BE1, x555500FF);
1144 vsel(x1, x925E63E1, x69A49C79, a6);
1145 vxor(*out2, *out2, x1);
1146
1147 vsel(x9867CA97, x942D9A67, x1872E297, x2FCAD0F0);
1148 vsel(x69339C33, x69A49C79, a2, a4);
1149 vsel(xB946D2B4, x9867CA97, x2FCAD0F0, x69339C33);
1150 vsel(x2, x16E94A97, xB946D2B4, a6);
1151 vxor(*out3, *out3, x2);
1152 }
1153 #endif
1154
1155 #if regs <= 16 || latency >= 3
1156 /* s7-000013, 34 gates, 15 regs, 9/27/56/88/119 stall cycles */
1157 MAYBE_INLINE static void
s7(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1158 s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1159 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1160 {
1161 vtype x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D;
1162 vtype x00FFFF00, x66666666, x32353235, x26253636, x26DAC936;
1163 vtype x738F9C63, x11EF9867, x26DA9867;
1164 vtype x4B4B9C63, x4B666663, x4E639396;
1165 vtype x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D;
1166 vtype xD728827B, x6698807B, x699C585B;
1167 vtype x738C847B, xA4A71E18, x74878E78;
1168 vtype x333D9639, x74879639, x8B7869C6;
1169 vtype x0, x1, x2, x3;
1170
1171 vsel(x44447777, a2, a6, a3);
1172 vxor(x4B4B7878, a4, x44447777);
1173 vsel(x22772277, a3, a5, a2);
1174 vsel(x0505F5F5, a6, a2, a4);
1175 vsel(x220522F5, x22772277, x0505F5F5, a5);
1176 vxor(x694E5A8D, x4B4B7878, x220522F5);
1177
1178 vxor(x00FFFF00, a5, a6);
1179 vxor(x66666666, a2, a3);
1180 vsel(x32353235, a3, x220522F5, a4);
1181 vsel(x26253636, x66666666, x32353235, x4B4B7878);
1182 vxor(x26DAC936, x00FFFF00, x26253636);
1183 vsel(x0, x26DAC936, x694E5A8D, a1);
1184 vxor(*out1, *out1, x0);
1185
1186 vxor(x738F9C63, a2, x26DAC936);
1187 vsel(x11EF9867, x738F9C63, a5, x66666666);
1188 vsel(x26DA9867, x26DAC936, x11EF9867, a6);
1189
1190 vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6);
1191 vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00);
1192 vxor(x4E639396, x0505F5F5, x4B666663);
1193
1194 vsel(x4E4B393C, x4B4B7878, x4E639396, a2);
1195 vnot(xFF00FF00, a5);
1196 vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235);
1197 vxor(xB14EE41D, x4E4B393C, xFF05DD21);
1198 vsel(x1, xB14EE41D, x26DA9867, a1);
1199 vxor(*out2, *out2, x1);
1200
1201 vxor(xD728827B, x66666666, xB14EE41D);
1202 vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C);
1203 vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21);
1204 vsel(x2, x699C585B, x4E639396, a1);
1205 vxor(*out3, *out3, x2);
1206
1207 vsel(x738C847B, x738F9C63, xD728827B, x4B4B7878);
1208 vxor(xA4A71E18, x738F9C63, xD728827B);
1209 vsel(x74878E78, x738C847B, xA4A71E18, a4);
1210
1211 vsel(x333D9639, x32353235, x738C847B, xB14EE41D);
1212 vsel(x74879639, x74878E78, x333D9639, a6);
1213 vnot(x8B7869C6, x74879639);
1214 vsel(x3, x74878E78, x8B7869C6, a1);
1215 vxor(*out4, *out4, x3);
1216 }
1217 #else
1218 /* s7-000019, 34 gates, 17 regs, 5/28/57/88/119 stall cycles */
1219 MAYBE_INLINE static void
s7(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1220 s7(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1221 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1222 {
1223 vtype x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D;
1224 vtype x00FFFF00, x66666666, x32353235, x26253636, x26DAC936;
1225 vtype x738F9C63, x11EF9867, x26DA9867;
1226 vtype x4B4B9C63, x4B666663, x4E639396;
1227 vtype x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D;
1228 vtype xD728827B, x6698807B, x699C585B;
1229 vtype x778A8877, xA4A71E18, x74878E78;
1230 vtype x204A5845, x74879639, x8B7869C6;
1231 vtype x0, x1, x2, x3;
1232
1233 vsel(x44447777, a2, a6, a3);
1234 vxor(x4B4B7878, a4, x44447777);
1235 vsel(x22772277, a3, a5, a2);
1236 vsel(x0505F5F5, a6, a2, a4);
1237 vsel(x220522F5, x22772277, x0505F5F5, a5);
1238 vxor(x694E5A8D, x4B4B7878, x220522F5);
1239
1240 vxor(x00FFFF00, a5, a6);
1241 vxor(x66666666, a2, a3);
1242 vsel(x32353235, a3, x220522F5, a4);
1243 vsel(x26253636, x66666666, x32353235, x4B4B7878);
1244 vxor(x26DAC936, x00FFFF00, x26253636);
1245 vsel(x0, x26DAC936, x694E5A8D, a1);
1246 vxor(*out1, *out1, x0);
1247
1248 vxor(x738F9C63, a2, x26DAC936);
1249 vsel(x11EF9867, x738F9C63, a5, x66666666);
1250 vsel(x26DA9867, x26DAC936, x11EF9867, a6);
1251
1252 vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6);
1253 vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00);
1254 vxor(x4E639396, x0505F5F5, x4B666663);
1255
1256 vsel(x4E4B393C, x4B4B7878, x4E639396, a2);
1257 vnot(xFF00FF00, a5);
1258 vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235);
1259 vxor(xB14EE41D, x4E4B393C, xFF05DD21);
1260 vsel(x1, xB14EE41D, x26DA9867, a1);
1261 vxor(*out2, *out2, x1);
1262
1263 vxor(xD728827B, x66666666, xB14EE41D);
1264 vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C);
1265 vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21);
1266 vsel(x2, x699C585B, x4E639396, a1);
1267 vxor(*out3, *out3, x2);
1268
1269 vsel(x778A8877, x738F9C63, x26DAC936, x26253636);
1270 vxor(xA4A71E18, x738F9C63, xD728827B);
1271 vsel(x74878E78, x778A8877, xA4A71E18, a4);
1272
1273 vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936);
1274 vsel(x74879639, x74878E78, a3, x204A5845);
1275 vnot(x8B7869C6, x74879639);
1276 vsel(x3, x74878E78, x8B7869C6, a1);
1277 vxor(*out4, *out4, x3);
1278 }
1279 #endif
1280
1281 #if latency >= 3
1282 /* s8-000035, 32 gates, 15 regs, 6/15/47/79/111 stall cycles */
1283 MAYBE_INLINE static void
s8(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1284 s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1285 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1286 {
1287 vtype x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C;
1288 vtype x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63;
1289 vtype x3001F74E, x30555745, x693CD926;
1290 vtype x0C0CD926, x0C3F25E9, x38D696A5;
1291 vtype xC729695A;
1292 vtype x03D2117B, xC778395B, xCB471CB2;
1293 vtype x5425B13F, x56B3803F, x919AE965;
1294 vtype x03DA807F, x613CD515, x62E6556A, xA59E6C31;
1295 vtype x0, x1, x2, x3;
1296
1297 vsel(x0505F5F5, a5, a1, a3);
1298 vxor(x05FAF50A, a4, x0505F5F5);
1299 vsel(x0F0F00FF, a3, a4, a5);
1300 vsel(x22227777, a2, a5, a1);
1301 vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777);
1302 vxor(x34E9B34C, a2, x07DA807F);
1303
1304 vsel(x00FFF00F, x05FAF50A, a4, a3);
1305 vsel(x0033FCCF, a5, x00FFF00F, a2);
1306 vsel(x5565B15C, a1, x34E9B34C, x0033FCCF);
1307 vsel(x0C0C3F3F, a3, a5, a2);
1308 vxor(x59698E63, x5565B15C, x0C0C3F3F);
1309
1310 vsel(x3001F74E, x34E9B34C, a5, x05FAF50A);
1311 vsel(x30555745, x3001F74E, a1, x00FFF00F);
1312 vxor(x693CD926, x59698E63, x30555745);
1313 vsel(x2, x693CD926, x59698E63, a6);
1314 vxor(*out3, *out3, x2);
1315
1316 vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5);
1317 vxor(x0C3F25E9, x0033FCCF, x0C0CD926);
1318 vxor(x38D696A5, x34E9B34C, x0C3F25E9);
1319
1320 vnot(xC729695A, x38D696A5);
1321
1322 vsel(x03D2117B, x07DA807F, a2, x0C0CD926);
1323 vsel(xC778395B, xC729695A, x03D2117B, x30555745);
1324 vxor(xCB471CB2, x0C3F25E9, xC778395B);
1325 vsel(x1, xCB471CB2, x34E9B34C, a6);
1326 vxor(*out2, *out2, x1);
1327
1328 vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B);
1329 vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63);
1330 vxor(x919AE965, xC729695A, x56B3803F);
1331 vsel(x3, xC729695A, x919AE965, a6);
1332 vxor(*out4, *out4, x3);
1333
1334 vsel(x03DA807F, x03D2117B, x07DA807F, x693CD926);
1335 vsel(x613CD515, a1, x693CD926, x34E9B34C);
1336 vxor(x62E6556A, x03DA807F, x613CD515);
1337 vxor(xA59E6C31, xC778395B, x62E6556A);
1338 vsel(x0, xA59E6C31, x38D696A5, a6);
1339 vxor(*out1, *out1, x0);
1340 }
1341 #else
1342 /* s8-000037, 32 gates, 15 regs, 3/17/49/81/113 stall cycles */
1343 MAYBE_INLINE static void
s8(vtype a1,vtype a2,vtype a3,vtype a4,vtype a5,vtype a6,vtype * out1,vtype * out2,vtype * out3,vtype * out4)1344 s8(vtype a1, vtype a2, vtype a3, vtype a4, vtype a5, vtype a6,
1345 vtype * out1, vtype * out2, vtype * out3, vtype * out4)
1346 {
1347 vtype x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C;
1348 vtype x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63;
1349 vtype x3001F74E, x30555745, x693CD926;
1350 vtype x0C0CD926, x0C3F25E9, x38D696A5;
1351 vtype xC729695A;
1352 vtype x03D2117B, xC778395B, xCB471CB2;
1353 vtype x5425B13F, x56B3803F, x919AE965;
1354 vtype x17B3023F, x75555755, x62E6556A, xA59E6C31;
1355 vtype x0, x1, x2, x3;
1356
1357 vsel(x0505F5F5, a5, a1, a3);
1358 vxor(x05FAF50A, a4, x0505F5F5);
1359 vsel(x0F0F00FF, a3, a4, a5);
1360 vsel(x22227777, a2, a5, a1);
1361 vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777);
1362 vxor(x34E9B34C, a2, x07DA807F);
1363
1364 vsel(x00FFF00F, x05FAF50A, a4, a3);
1365 vsel(x0033FCCF, a5, x00FFF00F, a2);
1366 vsel(x5565B15C, a1, x34E9B34C, x0033FCCF);
1367 vsel(x0C0C3F3F, a3, a5, a2);
1368 vxor(x59698E63, x5565B15C, x0C0C3F3F);
1369
1370 vsel(x3001F74E, x34E9B34C, a5, x05FAF50A);
1371 vsel(x30555745, x3001F74E, a1, x00FFF00F);
1372 vxor(x693CD926, x59698E63, x30555745);
1373 vsel(x2, x693CD926, x59698E63, a6);
1374 vxor(*out3, *out3, x2);
1375
1376 vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5);
1377 vxor(x0C3F25E9, x0033FCCF, x0C0CD926);
1378 vxor(x38D696A5, x34E9B34C, x0C3F25E9);
1379
1380 vnot(xC729695A, x38D696A5);
1381
1382 vsel(x03D2117B, x07DA807F, a2, x0C0CD926);
1383 vsel(xC778395B, xC729695A, x03D2117B, x30555745);
1384 vxor(xCB471CB2, x0C3F25E9, xC778395B);
1385 vsel(x1, xCB471CB2, x34E9B34C, a6);
1386 vxor(*out2, *out2, x1);
1387
1388 vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B);
1389 vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63);
1390 vxor(x919AE965, xC729695A, x56B3803F);
1391 vsel(x3, xC729695A, x919AE965, a6);
1392 vxor(*out4, *out4, x3);
1393
1394 vsel(x17B3023F, x07DA807F, a2, x59698E63);
1395 vor(x75555755, a1, x30555745);
1396 vxor(x62E6556A, x17B3023F, x75555755);
1397 vxor(xA59E6C31, xC778395B, x62E6556A);
1398 vsel(x0, xA59E6C31, x38D696A5, a6);
1399 vxor(*out1, *out1, x0);
1400 }
1401 #endif
1402