1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:05:33 EST 2020 */
23 
24 #include "dft/codelet-dft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include dft/simd/t2b.h -sign 1 */
29 
30 /*
31  * This function contains 519 FP additions, 384 FP multiplications,
32  * (or, 261 additions, 126 multiplications, 258 fused multiply/add),
33  * 107 stack variables, 15 constants, and 128 memory accesses
34  */
35 #include "dft/simd/t2b.h"
36 
37 static void t2bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
39      DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
40      DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
41      DVK(KP820678790, +0.820678790828660330972281985331011598767386482);
42      DVK(KP098491403, +0.098491403357164253077197521291327432293052451);
43      DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
44      DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
45      DVK(KP303346683, +0.303346683607342391675883946941299872384187453);
46      DVK(KP534511135, +0.534511135950791641089685961295362908582039528);
47      DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
48      DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
49      DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
50      DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
51      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
52      DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
53      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
54      {
55 	  INT m;
56 	  R *x;
57 	  x = ii;
58 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
59 	       V Ta, T3U, T6l, T7B, T37, T3V, T58, T7a, T1v, T24, T43, T4F, T5F, T7l, T5Q;
60 	       V T7o, T2i, T2R, T4a, T4I, T60, T7s, T6b, T7v, T4k, T4l, T4C, T5x, T7g, T1i;
61 	       V T3b, T5u, T7h, T4h, T4i, T4B, T5o, T7d, TV, T3a, T5l, T7e, T3X, T3Y, Tx;
62 	       V T38, T5f, T7C, T6o, T7b, T1S, T25, T5T, T7m, T46, T4G, T5M, T7p, T2F, T2S;
63 	       V T6e, T7t, T4d, T4J, T67, T7w;
64 	       {
65 		    V T1, T3, T8, T6, T33, T35, T56, T2Y, T30, T55, T2, T7, T5;
66 		    T1 = LD(&(x[0]), ms, &(x[0]));
67 		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
68 		    T3 = BYTW(&(W[TWVL * 62]), T2);
69 		    T7 = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
70 		    T8 = BYTW(&(W[TWVL * 94]), T7);
71 		    T5 = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
72 		    T6 = BYTW(&(W[TWVL * 30]), T5);
73 		    {
74 			 V T32, T34, T2X, T2Z;
75 			 T32 = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
76 			 T33 = BYTW(&(W[TWVL * 110]), T32);
77 			 T34 = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
78 			 T35 = BYTW(&(W[TWVL * 46]), T34);
79 			 T56 = VSUB(T33, T35);
80 			 T2X = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
81 			 T2Y = BYTW(&(W[TWVL * 14]), T2X);
82 			 T2Z = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
83 			 T30 = BYTW(&(W[TWVL * 78]), T2Z);
84 			 T55 = VSUB(T2Y, T30);
85 		    }
86 		    {
87 			 V T4, T9, T6j, T6k;
88 			 T4 = VADD(T1, T3);
89 			 T9 = VADD(T6, T8);
90 			 Ta = VSUB(T4, T9);
91 			 T3U = VADD(T4, T9);
92 			 T6j = VSUB(T6, T8);
93 			 T6k = VSUB(T55, T56);
94 			 T6l = VFMA(LDK(KP707106781), T6k, T6j);
95 			 T7B = VFNMS(LDK(KP707106781), T6k, T6j);
96 		    }
97 		    {
98 			 V T31, T36, T54, T57;
99 			 T31 = VADD(T2Y, T30);
100 			 T36 = VADD(T33, T35);
101 			 T37 = VSUB(T31, T36);
102 			 T3V = VADD(T31, T36);
103 			 T54 = VSUB(T1, T3);
104 			 T57 = VADD(T55, T56);
105 			 T58 = VFMA(LDK(KP707106781), T57, T54);
106 			 T7a = VFNMS(LDK(KP707106781), T57, T54);
107 		    }
108 	       }
109 	       {
110 		    V T1m, T1o, T1p, T1r, T1t, T1u, T1Y, T5C, T23, T5D, T41, T42;
111 		    {
112 			 V T1l, T1n, T1q, T1s;
113 			 T1l = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
114 			 T1m = BYTW(&(W[0]), T1l);
115 			 T1n = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
116 			 T1o = BYTW(&(W[TWVL * 64]), T1n);
117 			 T1p = VADD(T1m, T1o);
118 			 T1q = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
119 			 T1r = BYTW(&(W[TWVL * 32]), T1q);
120 			 T1s = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
121 			 T1t = BYTW(&(W[TWVL * 96]), T1s);
122 			 T1u = VADD(T1r, T1t);
123 		    }
124 		    {
125 			 V T1V, T1X, T1U, T1W;
126 			 T1U = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
127 			 T1V = BYTW(&(W[TWVL * 16]), T1U);
128 			 T1W = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
129 			 T1X = BYTW(&(W[TWVL * 80]), T1W);
130 			 T1Y = VADD(T1V, T1X);
131 			 T5C = VSUB(T1V, T1X);
132 		    }
133 		    {
134 			 V T20, T22, T1Z, T21;
135 			 T1Z = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
136 			 T20 = BYTW(&(W[TWVL * 112]), T1Z);
137 			 T21 = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
138 			 T22 = BYTW(&(W[TWVL * 48]), T21);
139 			 T23 = VADD(T20, T22);
140 			 T5D = VSUB(T20, T22);
141 		    }
142 		    T1v = VSUB(T1p, T1u);
143 		    T24 = VSUB(T1Y, T23);
144 		    T41 = VADD(T1p, T1u);
145 		    T42 = VADD(T1Y, T23);
146 		    T43 = VADD(T41, T42);
147 		    T4F = VSUB(T41, T42);
148 		    {
149 			 V T5B, T5E, T5O, T5P;
150 			 T5B = VSUB(T1m, T1o);
151 			 T5E = VADD(T5C, T5D);
152 			 T5F = VFMA(LDK(KP707106781), T5E, T5B);
153 			 T7l = VFNMS(LDK(KP707106781), T5E, T5B);
154 			 T5O = VSUB(T1r, T1t);
155 			 T5P = VSUB(T5C, T5D);
156 			 T5Q = VFMA(LDK(KP707106781), T5P, T5O);
157 			 T7o = VFNMS(LDK(KP707106781), T5P, T5O);
158 		    }
159 	       }
160 	       {
161 		    V T29, T2b, T2c, T2e, T2g, T2h, T2L, T5Y, T2Q, T5X, T48, T49;
162 		    {
163 			 V T28, T2a, T2d, T2f;
164 			 T28 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
165 			 T29 = BYTW(&(W[TWVL * 124]), T28);
166 			 T2a = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
167 			 T2b = BYTW(&(W[TWVL * 60]), T2a);
168 			 T2c = VADD(T29, T2b);
169 			 T2d = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
170 			 T2e = BYTW(&(W[TWVL * 28]), T2d);
171 			 T2f = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
172 			 T2g = BYTW(&(W[TWVL * 92]), T2f);
173 			 T2h = VADD(T2e, T2g);
174 		    }
175 		    {
176 			 V T2I, T2K, T2H, T2J;
177 			 T2H = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
178 			 T2I = BYTW(&(W[TWVL * 108]), T2H);
179 			 T2J = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
180 			 T2K = BYTW(&(W[TWVL * 44]), T2J);
181 			 T2L = VADD(T2I, T2K);
182 			 T5Y = VSUB(T2I, T2K);
183 		    }
184 		    {
185 			 V T2N, T2P, T2M, T2O;
186 			 T2M = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
187 			 T2N = BYTW(&(W[TWVL * 12]), T2M);
188 			 T2O = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
189 			 T2P = BYTW(&(W[TWVL * 76]), T2O);
190 			 T2Q = VADD(T2N, T2P);
191 			 T5X = VSUB(T2N, T2P);
192 		    }
193 		    T2i = VSUB(T2c, T2h);
194 		    T2R = VSUB(T2L, T2Q);
195 		    T48 = VADD(T2c, T2h);
196 		    T49 = VADD(T2Q, T2L);
197 		    T4a = VADD(T48, T49);
198 		    T4I = VSUB(T48, T49);
199 		    {
200 			 V T5W, T5Z, T69, T6a;
201 			 T5W = VSUB(T29, T2b);
202 			 T5Z = VADD(T5X, T5Y);
203 			 T60 = VFMA(LDK(KP707106781), T5Z, T5W);
204 			 T7s = VFNMS(LDK(KP707106781), T5Z, T5W);
205 			 T69 = VSUB(T2g, T2e);
206 			 T6a = VSUB(T5Y, T5X);
207 			 T6b = VFMA(LDK(KP707106781), T6a, T69);
208 			 T7v = VFNMS(LDK(KP707106781), T6a, T69);
209 		    }
210 	       }
211 	       {
212 		    V TX, TZ, T10, T12, T14, T15, T1b, T5s, T1g, T5r, T5v, T5w;
213 		    {
214 			 V TW, TY, T11, T13;
215 			 TW = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
216 			 TX = BYTW(&(W[TWVL * 122]), TW);
217 			 TY = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
218 			 TZ = BYTW(&(W[TWVL * 58]), TY);
219 			 T10 = VADD(TX, TZ);
220 			 T11 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
221 			 T12 = BYTW(&(W[TWVL * 26]), T11);
222 			 T13 = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
223 			 T14 = BYTW(&(W[TWVL * 90]), T13);
224 			 T15 = VADD(T12, T14);
225 		    }
226 		    {
227 			 V T18, T1a, T17, T19;
228 			 T17 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
229 			 T18 = BYTW(&(W[TWVL * 106]), T17);
230 			 T19 = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
231 			 T1a = BYTW(&(W[TWVL * 42]), T19);
232 			 T1b = VADD(T18, T1a);
233 			 T5s = VSUB(T18, T1a);
234 		    }
235 		    {
236 			 V T1d, T1f, T1c, T1e;
237 			 T1c = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
238 			 T1d = BYTW(&(W[TWVL * 10]), T1c);
239 			 T1e = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
240 			 T1f = BYTW(&(W[TWVL * 74]), T1e);
241 			 T1g = VADD(T1d, T1f);
242 			 T5r = VSUB(T1d, T1f);
243 		    }
244 		    T4k = VADD(T10, T15);
245 		    T4l = VADD(T1g, T1b);
246 		    T4C = VSUB(T4k, T4l);
247 		    T5v = VSUB(T14, T12);
248 		    T5w = VSUB(T5s, T5r);
249 		    T5x = VFMA(LDK(KP707106781), T5w, T5v);
250 		    T7g = VFNMS(LDK(KP707106781), T5w, T5v);
251 		    {
252 			 V T16, T1h, T5q, T5t;
253 			 T16 = VSUB(T10, T15);
254 			 T1h = VSUB(T1b, T1g);
255 			 T1i = VFNMS(LDK(KP414213562), T1h, T16);
256 			 T3b = VFMA(LDK(KP414213562), T16, T1h);
257 			 T5q = VSUB(TX, TZ);
258 			 T5t = VADD(T5r, T5s);
259 			 T5u = VFMA(LDK(KP707106781), T5t, T5q);
260 			 T7h = VFNMS(LDK(KP707106781), T5t, T5q);
261 		    }
262 	       }
263 	       {
264 		    V TA, TC, TD, TF, TH, TI, TO, T5i, TT, T5j, T5m, T5n;
265 		    {
266 			 V Tz, TB, TE, TG;
267 			 Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
268 			 TA = BYTW(&(W[TWVL * 2]), Tz);
269 			 TB = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
270 			 TC = BYTW(&(W[TWVL * 66]), TB);
271 			 TD = VADD(TA, TC);
272 			 TE = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
273 			 TF = BYTW(&(W[TWVL * 34]), TE);
274 			 TG = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
275 			 TH = BYTW(&(W[TWVL * 98]), TG);
276 			 TI = VADD(TF, TH);
277 		    }
278 		    {
279 			 V TL, TN, TK, TM;
280 			 TK = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
281 			 TL = BYTW(&(W[TWVL * 18]), TK);
282 			 TM = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
283 			 TN = BYTW(&(W[TWVL * 82]), TM);
284 			 TO = VADD(TL, TN);
285 			 T5i = VSUB(TL, TN);
286 		    }
287 		    {
288 			 V TQ, TS, TP, TR;
289 			 TP = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
290 			 TQ = BYTW(&(W[TWVL * 114]), TP);
291 			 TR = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
292 			 TS = BYTW(&(W[TWVL * 50]), TR);
293 			 TT = VADD(TQ, TS);
294 			 T5j = VSUB(TQ, TS);
295 		    }
296 		    T4h = VADD(TD, TI);
297 		    T4i = VADD(TO, TT);
298 		    T4B = VSUB(T4h, T4i);
299 		    T5m = VSUB(TF, TH);
300 		    T5n = VSUB(T5i, T5j);
301 		    T5o = VFMA(LDK(KP707106781), T5n, T5m);
302 		    T7d = VFNMS(LDK(KP707106781), T5n, T5m);
303 		    {
304 			 V TJ, TU, T5h, T5k;
305 			 TJ = VSUB(TD, TI);
306 			 TU = VSUB(TO, TT);
307 			 TV = VFNMS(LDK(KP414213562), TU, TJ);
308 			 T3a = VFMA(LDK(KP414213562), TJ, TU);
309 			 T5h = VSUB(TA, TC);
310 			 T5k = VADD(T5i, T5j);
311 			 T5l = VFMA(LDK(KP707106781), T5k, T5h);
312 			 T7e = VFNMS(LDK(KP707106781), T5k, T5h);
313 		    }
314 	       }
315 	       {
316 		    V Tf, T59, Tv, T5d, Tk, T5a, Tq, T5c, Tl, Tw;
317 		    {
318 			 V Tc, Te, Tb, Td;
319 			 Tb = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
320 			 Tc = BYTW(&(W[TWVL * 6]), Tb);
321 			 Td = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
322 			 Te = BYTW(&(W[TWVL * 70]), Td);
323 			 Tf = VADD(Tc, Te);
324 			 T59 = VSUB(Tc, Te);
325 		    }
326 		    {
327 			 V Ts, Tu, Tr, Tt;
328 			 Tr = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
329 			 Ts = BYTW(&(W[TWVL * 22]), Tr);
330 			 Tt = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
331 			 Tu = BYTW(&(W[TWVL * 86]), Tt);
332 			 Tv = VADD(Ts, Tu);
333 			 T5d = VSUB(Tu, Ts);
334 		    }
335 		    {
336 			 V Th, Tj, Tg, Ti;
337 			 Tg = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
338 			 Th = BYTW(&(W[TWVL * 38]), Tg);
339 			 Ti = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
340 			 Tj = BYTW(&(W[TWVL * 102]), Ti);
341 			 Tk = VADD(Th, Tj);
342 			 T5a = VSUB(Th, Tj);
343 		    }
344 		    {
345 			 V Tn, Tp, Tm, To;
346 			 Tm = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
347 			 Tn = BYTW(&(W[TWVL * 118]), Tm);
348 			 To = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
349 			 Tp = BYTW(&(W[TWVL * 54]), To);
350 			 Tq = VADD(Tn, Tp);
351 			 T5c = VSUB(Tn, Tp);
352 		    }
353 		    T3X = VADD(Tf, Tk);
354 		    T3Y = VADD(Tq, Tv);
355 		    Tl = VSUB(Tf, Tk);
356 		    Tw = VSUB(Tq, Tv);
357 		    Tx = VADD(Tl, Tw);
358 		    T38 = VSUB(Tl, Tw);
359 		    {
360 			 V T5b, T5e, T6m, T6n;
361 			 T5b = VFNMS(LDK(KP414213562), T5a, T59);
362 			 T5e = VFNMS(LDK(KP414213562), T5d, T5c);
363 			 T5f = VADD(T5b, T5e);
364 			 T7C = VSUB(T5b, T5e);
365 			 T6m = VFMA(LDK(KP414213562), T59, T5a);
366 			 T6n = VFMA(LDK(KP414213562), T5c, T5d);
367 			 T6o = VSUB(T6m, T6n);
368 			 T7b = VADD(T6m, T6n);
369 		    }
370 	       }
371 	       {
372 		    V T1A, T5G, T1Q, T5K, T1F, T5H, T1L, T5J;
373 		    {
374 			 V T1x, T1z, T1w, T1y;
375 			 T1w = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
376 			 T1x = BYTW(&(W[TWVL * 8]), T1w);
377 			 T1y = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
378 			 T1z = BYTW(&(W[TWVL * 72]), T1y);
379 			 T1A = VADD(T1x, T1z);
380 			 T5G = VSUB(T1x, T1z);
381 		    }
382 		    {
383 			 V T1N, T1P, T1M, T1O;
384 			 T1M = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
385 			 T1N = BYTW(&(W[TWVL * 24]), T1M);
386 			 T1O = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
387 			 T1P = BYTW(&(W[TWVL * 88]), T1O);
388 			 T1Q = VADD(T1N, T1P);
389 			 T5K = VSUB(T1N, T1P);
390 		    }
391 		    {
392 			 V T1C, T1E, T1B, T1D;
393 			 T1B = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
394 			 T1C = BYTW(&(W[TWVL * 40]), T1B);
395 			 T1D = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
396 			 T1E = BYTW(&(W[TWVL * 104]), T1D);
397 			 T1F = VADD(T1C, T1E);
398 			 T5H = VSUB(T1C, T1E);
399 		    }
400 		    {
401 			 V T1I, T1K, T1H, T1J;
402 			 T1H = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
403 			 T1I = BYTW(&(W[TWVL * 120]), T1H);
404 			 T1J = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
405 			 T1K = BYTW(&(W[TWVL * 56]), T1J);
406 			 T1L = VADD(T1I, T1K);
407 			 T5J = VSUB(T1I, T1K);
408 		    }
409 		    {
410 			 V T1G, T1R, T5R, T5S;
411 			 T1G = VSUB(T1A, T1F);
412 			 T1R = VSUB(T1L, T1Q);
413 			 T1S = VADD(T1G, T1R);
414 			 T25 = VSUB(T1G, T1R);
415 			 T5R = VFMA(LDK(KP414213562), T5G, T5H);
416 			 T5S = VFNMS(LDK(KP414213562), T5J, T5K);
417 			 T5T = VADD(T5R, T5S);
418 			 T7m = VSUB(T5R, T5S);
419 		    }
420 		    {
421 			 V T44, T45, T5I, T5L;
422 			 T44 = VADD(T1A, T1F);
423 			 T45 = VADD(T1L, T1Q);
424 			 T46 = VADD(T44, T45);
425 			 T4G = VSUB(T44, T45);
426 			 T5I = VFNMS(LDK(KP414213562), T5H, T5G);
427 			 T5L = VFMA(LDK(KP414213562), T5K, T5J);
428 			 T5M = VADD(T5I, T5L);
429 			 T7p = VSUB(T5I, T5L);
430 		    }
431 	       }
432 	       {
433 		    V T2n, T61, T2D, T65, T2s, T62, T2y, T64;
434 		    {
435 			 V T2k, T2m, T2j, T2l;
436 			 T2j = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
437 			 T2k = BYTW(&(W[TWVL * 4]), T2j);
438 			 T2l = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
439 			 T2m = BYTW(&(W[TWVL * 68]), T2l);
440 			 T2n = VADD(T2k, T2m);
441 			 T61 = VSUB(T2k, T2m);
442 		    }
443 		    {
444 			 V T2A, T2C, T2z, T2B;
445 			 T2z = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
446 			 T2A = BYTW(&(W[TWVL * 20]), T2z);
447 			 T2B = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
448 			 T2C = BYTW(&(W[TWVL * 84]), T2B);
449 			 T2D = VADD(T2A, T2C);
450 			 T65 = VSUB(T2C, T2A);
451 		    }
452 		    {
453 			 V T2p, T2r, T2o, T2q;
454 			 T2o = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
455 			 T2p = BYTW(&(W[TWVL * 36]), T2o);
456 			 T2q = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
457 			 T2r = BYTW(&(W[TWVL * 100]), T2q);
458 			 T2s = VADD(T2p, T2r);
459 			 T62 = VSUB(T2r, T2p);
460 		    }
461 		    {
462 			 V T2v, T2x, T2u, T2w;
463 			 T2u = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
464 			 T2v = BYTW(&(W[TWVL * 116]), T2u);
465 			 T2w = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
466 			 T2x = BYTW(&(W[TWVL * 52]), T2w);
467 			 T2y = VADD(T2v, T2x);
468 			 T64 = VSUB(T2v, T2x);
469 		    }
470 		    {
471 			 V T2t, T2E, T6c, T6d;
472 			 T2t = VSUB(T2n, T2s);
473 			 T2E = VSUB(T2y, T2D);
474 			 T2F = VADD(T2t, T2E);
475 			 T2S = VSUB(T2E, T2t);
476 			 T6c = VFNMS(LDK(KP414213562), T61, T62);
477 			 T6d = VFMA(LDK(KP414213562), T64, T65);
478 			 T6e = VADD(T6c, T6d);
479 			 T7t = VSUB(T6d, T6c);
480 		    }
481 		    {
482 			 V T4b, T4c, T63, T66;
483 			 T4b = VADD(T2n, T2s);
484 			 T4c = VADD(T2y, T2D);
485 			 T4d = VADD(T4b, T4c);
486 			 T4J = VSUB(T4c, T4b);
487 			 T63 = VFMA(LDK(KP414213562), T62, T61);
488 			 T66 = VFNMS(LDK(KP414213562), T65, T64);
489 			 T67 = VADD(T63, T66);
490 			 T7w = VSUB(T66, T63);
491 		    }
492 	       }
493 	       {
494 		    V T40, T4s, T4x, T4z, T4f, T4o, T4n, T4t, T4u, T4y;
495 		    {
496 			 V T3W, T3Z, T4v, T4w;
497 			 T3W = VADD(T3U, T3V);
498 			 T3Z = VADD(T3X, T3Y);
499 			 T40 = VSUB(T3W, T3Z);
500 			 T4s = VADD(T3W, T3Z);
501 			 T4v = VADD(T43, T46);
502 			 T4w = VADD(T4a, T4d);
503 			 T4x = VSUB(T4v, T4w);
504 			 T4z = VADD(T4v, T4w);
505 		    }
506 		    {
507 			 V T47, T4e, T4j, T4m;
508 			 T47 = VSUB(T43, T46);
509 			 T4e = VSUB(T4a, T4d);
510 			 T4f = VADD(T47, T4e);
511 			 T4o = VSUB(T47, T4e);
512 			 T4j = VADD(T4h, T4i);
513 			 T4m = VADD(T4k, T4l);
514 			 T4n = VSUB(T4j, T4m);
515 			 T4t = VADD(T4j, T4m);
516 		    }
517 		    T4u = VSUB(T4s, T4t);
518 		    ST(&(x[WS(rs, 48)]), VFNMSI(T4x, T4u), ms, &(x[0]));
519 		    ST(&(x[WS(rs, 16)]), VFMAI(T4x, T4u), ms, &(x[0]));
520 		    T4y = VADD(T4s, T4t);
521 		    ST(&(x[WS(rs, 32)]), VSUB(T4y, T4z), ms, &(x[0]));
522 		    ST(&(x[0]), VADD(T4y, T4z), ms, &(x[0]));
523 		    {
524 			 V T4g, T4p, T4q, T4r;
525 			 T4g = VFNMS(LDK(KP707106781), T4f, T40);
526 			 T4p = VFNMS(LDK(KP707106781), T4o, T4n);
527 			 ST(&(x[WS(rs, 24)]), VFNMSI(T4p, T4g), ms, &(x[0]));
528 			 ST(&(x[WS(rs, 40)]), VFMAI(T4p, T4g), ms, &(x[0]));
529 			 T4q = VFMA(LDK(KP707106781), T4f, T40);
530 			 T4r = VFMA(LDK(KP707106781), T4o, T4n);
531 			 ST(&(x[WS(rs, 8)]), VFMAI(T4r, T4q), ms, &(x[0]));
532 			 ST(&(x[WS(rs, 56)]), VFNMSI(T4r, T4q), ms, &(x[0]));
533 		    }
534 	       }
535 	       {
536 		    V T4E, T4W, T4S, T4X, T4L, T50, T4P, T4Z;
537 		    {
538 			 V T4A, T4D, T4Q, T4R;
539 			 T4A = VSUB(T3U, T3V);
540 			 T4D = VADD(T4B, T4C);
541 			 T4E = VFMA(LDK(KP707106781), T4D, T4A);
542 			 T4W = VFNMS(LDK(KP707106781), T4D, T4A);
543 			 T4Q = VFMA(LDK(KP414213562), T4F, T4G);
544 			 T4R = VFMA(LDK(KP414213562), T4I, T4J);
545 			 T4S = VSUB(T4Q, T4R);
546 			 T4X = VADD(T4Q, T4R);
547 		    }
548 		    {
549 			 V T4H, T4K, T4N, T4O;
550 			 T4H = VFNMS(LDK(KP414213562), T4G, T4F);
551 			 T4K = VFNMS(LDK(KP414213562), T4J, T4I);
552 			 T4L = VADD(T4H, T4K);
553 			 T50 = VSUB(T4H, T4K);
554 			 T4N = VSUB(T3X, T3Y);
555 			 T4O = VSUB(T4B, T4C);
556 			 T4P = VFMA(LDK(KP707106781), T4O, T4N);
557 			 T4Z = VFNMS(LDK(KP707106781), T4O, T4N);
558 		    }
559 		    {
560 			 V T4M, T4T, T52, T53;
561 			 T4M = VFNMS(LDK(KP923879532), T4L, T4E);
562 			 T4T = VFNMS(LDK(KP923879532), T4S, T4P);
563 			 ST(&(x[WS(rs, 28)]), VFNMSI(T4T, T4M), ms, &(x[0]));
564 			 ST(&(x[WS(rs, 36)]), VFMAI(T4T, T4M), ms, &(x[0]));
565 			 T52 = VFMA(LDK(KP923879532), T4X, T4W);
566 			 T53 = VFNMS(LDK(KP923879532), T50, T4Z);
567 			 ST(&(x[WS(rs, 12)]), VFNMSI(T53, T52), ms, &(x[0]));
568 			 ST(&(x[WS(rs, 52)]), VFMAI(T53, T52), ms, &(x[0]));
569 		    }
570 		    {
571 			 V T4U, T4V, T4Y, T51;
572 			 T4U = VFMA(LDK(KP923879532), T4L, T4E);
573 			 T4V = VFMA(LDK(KP923879532), T4S, T4P);
574 			 ST(&(x[WS(rs, 60)]), VFNMSI(T4V, T4U), ms, &(x[0]));
575 			 ST(&(x[WS(rs, 4)]), VFMAI(T4V, T4U), ms, &(x[0]));
576 			 T4Y = VFNMS(LDK(KP923879532), T4X, T4W);
577 			 T51 = VFMA(LDK(KP923879532), T50, T4Z);
578 			 ST(&(x[WS(rs, 20)]), VFMAI(T51, T4Y), ms, &(x[0]));
579 			 ST(&(x[WS(rs, 44)]), VFNMSI(T51, T4Y), ms, &(x[0]));
580 		    }
581 	       }
582 	       {
583 		    V T1k, T3k, T3d, T3n, T2V, T3o, T3g, T3l;
584 		    {
585 			 V Ty, T1j, T39, T3c;
586 			 Ty = VFMA(LDK(KP707106781), Tx, Ta);
587 			 T1j = VADD(TV, T1i);
588 			 T1k = VFMA(LDK(KP923879532), T1j, Ty);
589 			 T3k = VFNMS(LDK(KP923879532), T1j, Ty);
590 			 T39 = VFMA(LDK(KP707106781), T38, T37);
591 			 T3c = VSUB(T3a, T3b);
592 			 T3d = VFMA(LDK(KP923879532), T3c, T39);
593 			 T3n = VFNMS(LDK(KP923879532), T3c, T39);
594 			 {
595 			      V T27, T3e, T2U, T3f;
596 			      {
597 				   V T1T, T26, T2G, T2T;
598 				   T1T = VFMA(LDK(KP707106781), T1S, T1v);
599 				   T26 = VFMA(LDK(KP707106781), T25, T24);
600 				   T27 = VFNMS(LDK(KP198912367), T26, T1T);
601 				   T3e = VFMA(LDK(KP198912367), T1T, T26);
602 				   T2G = VFMA(LDK(KP707106781), T2F, T2i);
603 				   T2T = VFMA(LDK(KP707106781), T2S, T2R);
604 				   T2U = VFNMS(LDK(KP198912367), T2T, T2G);
605 				   T3f = VFMA(LDK(KP198912367), T2G, T2T);
606 			      }
607 			      T2V = VADD(T27, T2U);
608 			      T3o = VSUB(T27, T2U);
609 			      T3g = VSUB(T3e, T3f);
610 			      T3l = VADD(T3e, T3f);
611 			 }
612 		    }
613 		    {
614 			 V T2W, T3h, T3q, T3r;
615 			 T2W = VFNMS(LDK(KP980785280), T2V, T1k);
616 			 T3h = VFNMS(LDK(KP980785280), T3g, T3d);
617 			 ST(&(x[WS(rs, 30)]), VFNMSI(T3h, T2W), ms, &(x[0]));
618 			 ST(&(x[WS(rs, 34)]), VFMAI(T3h, T2W), ms, &(x[0]));
619 			 T3q = VFMA(LDK(KP980785280), T3l, T3k);
620 			 T3r = VFNMS(LDK(KP980785280), T3o, T3n);
621 			 ST(&(x[WS(rs, 14)]), VFNMSI(T3r, T3q), ms, &(x[0]));
622 			 ST(&(x[WS(rs, 50)]), VFMAI(T3r, T3q), ms, &(x[0]));
623 		    }
624 		    {
625 			 V T3i, T3j, T3m, T3p;
626 			 T3i = VFMA(LDK(KP980785280), T2V, T1k);
627 			 T3j = VFMA(LDK(KP980785280), T3g, T3d);
628 			 ST(&(x[WS(rs, 62)]), VFNMSI(T3j, T3i), ms, &(x[0]));
629 			 ST(&(x[WS(rs, 2)]), VFMAI(T3j, T3i), ms, &(x[0]));
630 			 T3m = VFNMS(LDK(KP980785280), T3l, T3k);
631 			 T3p = VFMA(LDK(KP980785280), T3o, T3n);
632 			 ST(&(x[WS(rs, 18)]), VFMAI(T3p, T3m), ms, &(x[0]));
633 			 ST(&(x[WS(rs, 46)]), VFNMSI(T3p, T3m), ms, &(x[0]));
634 		    }
635 	       }
636 	       {
637 		    V T3u, T3M, T3F, T3P, T3B, T3Q, T3I, T3N;
638 		    {
639 			 V T3s, T3t, T3D, T3E;
640 			 T3s = VFNMS(LDK(KP707106781), Tx, Ta);
641 			 T3t = VADD(T3a, T3b);
642 			 T3u = VFMA(LDK(KP923879532), T3t, T3s);
643 			 T3M = VFNMS(LDK(KP923879532), T3t, T3s);
644 			 T3D = VFNMS(LDK(KP707106781), T38, T37);
645 			 T3E = VSUB(TV, T1i);
646 			 T3F = VFNMS(LDK(KP923879532), T3E, T3D);
647 			 T3P = VFMA(LDK(KP923879532), T3E, T3D);
648 			 {
649 			      V T3x, T3G, T3A, T3H;
650 			      {
651 				   V T3v, T3w, T3y, T3z;
652 				   T3v = VFNMS(LDK(KP707106781), T1S, T1v);
653 				   T3w = VFNMS(LDK(KP707106781), T25, T24);
654 				   T3x = VFMA(LDK(KP668178637), T3w, T3v);
655 				   T3G = VFNMS(LDK(KP668178637), T3v, T3w);
656 				   T3y = VFNMS(LDK(KP707106781), T2F, T2i);
657 				   T3z = VFNMS(LDK(KP707106781), T2S, T2R);
658 				   T3A = VFMA(LDK(KP668178637), T3z, T3y);
659 				   T3H = VFNMS(LDK(KP668178637), T3y, T3z);
660 			      }
661 			      T3B = VADD(T3x, T3A);
662 			      T3Q = VSUB(T3x, T3A);
663 			      T3I = VSUB(T3G, T3H);
664 			      T3N = VADD(T3G, T3H);
665 			 }
666 		    }
667 		    {
668 			 V T3C, T3J, T3S, T3T;
669 			 T3C = VFNMS(LDK(KP831469612), T3B, T3u);
670 			 T3J = VFNMS(LDK(KP831469612), T3I, T3F);
671 			 ST(&(x[WS(rs, 38)]), VFNMSI(T3J, T3C), ms, &(x[0]));
672 			 ST(&(x[WS(rs, 26)]), VFMAI(T3J, T3C), ms, &(x[0]));
673 			 T3S = VFNMS(LDK(KP831469612), T3N, T3M);
674 			 T3T = VFMA(LDK(KP831469612), T3Q, T3P);
675 			 ST(&(x[WS(rs, 10)]), VFMAI(T3T, T3S), ms, &(x[0]));
676 			 ST(&(x[WS(rs, 54)]), VFNMSI(T3T, T3S), ms, &(x[0]));
677 		    }
678 		    {
679 			 V T3K, T3L, T3O, T3R;
680 			 T3K = VFMA(LDK(KP831469612), T3B, T3u);
681 			 T3L = VFMA(LDK(KP831469612), T3I, T3F);
682 			 ST(&(x[WS(rs, 6)]), VFNMSI(T3L, T3K), ms, &(x[0]));
683 			 ST(&(x[WS(rs, 58)]), VFMAI(T3L, T3K), ms, &(x[0]));
684 			 T3O = VFMA(LDK(KP831469612), T3N, T3M);
685 			 T3R = VFNMS(LDK(KP831469612), T3Q, T3P);
686 			 ST(&(x[WS(rs, 22)]), VFNMSI(T3R, T3O), ms, &(x[0]));
687 			 ST(&(x[WS(rs, 42)]), VFMAI(T3R, T3O), ms, &(x[0]));
688 		    }
689 	       }
690 	       {
691 		    V T7k, T8j, T7O, T89, T7H, T8g, T7R, T7Y, T7z, T7S, T7K, T7P, T85, T8k, T8c;
692 		    V T8h;
693 		    {
694 			 V T7c, T87, T7j, T88, T7f, T7i;
695 			 T7c = VFNMS(LDK(KP923879532), T7b, T7a);
696 			 T87 = VFNMS(LDK(KP923879532), T7C, T7B);
697 			 T7f = VFNMS(LDK(KP668178637), T7e, T7d);
698 			 T7i = VFNMS(LDK(KP668178637), T7h, T7g);
699 			 T7j = VADD(T7f, T7i);
700 			 T88 = VSUB(T7f, T7i);
701 			 T7k = VFNMS(LDK(KP831469612), T7j, T7c);
702 			 T8j = VFNMS(LDK(KP831469612), T88, T87);
703 			 T7O = VFMA(LDK(KP831469612), T7j, T7c);
704 			 T89 = VFMA(LDK(KP831469612), T88, T87);
705 		    }
706 		    {
707 			 V T7D, T7W, T7G, T7X, T7E, T7F;
708 			 T7D = VFMA(LDK(KP923879532), T7C, T7B);
709 			 T7W = VFMA(LDK(KP923879532), T7b, T7a);
710 			 T7E = VFMA(LDK(KP668178637), T7d, T7e);
711 			 T7F = VFMA(LDK(KP668178637), T7g, T7h);
712 			 T7G = VSUB(T7E, T7F);
713 			 T7X = VADD(T7E, T7F);
714 			 T7H = VFMA(LDK(KP831469612), T7G, T7D);
715 			 T8g = VFNMS(LDK(KP831469612), T7X, T7W);
716 			 T7R = VFNMS(LDK(KP831469612), T7G, T7D);
717 			 T7Y = VFMA(LDK(KP831469612), T7X, T7W);
718 		    }
719 		    {
720 			 V T7r, T7I, T7y, T7J;
721 			 {
722 			      V T7n, T7q, T7u, T7x;
723 			      T7n = VFNMS(LDK(KP923879532), T7m, T7l);
724 			      T7q = VFMA(LDK(KP923879532), T7p, T7o);
725 			      T7r = VFNMS(LDK(KP534511135), T7q, T7n);
726 			      T7I = VFMA(LDK(KP534511135), T7n, T7q);
727 			      T7u = VFNMS(LDK(KP923879532), T7t, T7s);
728 			      T7x = VFMA(LDK(KP923879532), T7w, T7v);
729 			      T7y = VFNMS(LDK(KP534511135), T7x, T7u);
730 			      T7J = VFMA(LDK(KP534511135), T7u, T7x);
731 			 }
732 			 T7z = VADD(T7r, T7y);
733 			 T7S = VSUB(T7r, T7y);
734 			 T7K = VSUB(T7I, T7J);
735 			 T7P = VADD(T7I, T7J);
736 		    }
737 		    {
738 			 V T81, T8a, T84, T8b;
739 			 {
740 			      V T7Z, T80, T82, T83;
741 			      T7Z = VFMA(LDK(KP923879532), T7m, T7l);
742 			      T80 = VFNMS(LDK(KP923879532), T7p, T7o);
743 			      T81 = VFMA(LDK(KP303346683), T80, T7Z);
744 			      T8a = VFNMS(LDK(KP303346683), T7Z, T80);
745 			      T82 = VFMA(LDK(KP923879532), T7t, T7s);
746 			      T83 = VFNMS(LDK(KP923879532), T7w, T7v);
747 			      T84 = VFMA(LDK(KP303346683), T83, T82);
748 			      T8b = VFNMS(LDK(KP303346683), T82, T83);
749 			 }
750 			 T85 = VADD(T81, T84);
751 			 T8k = VSUB(T81, T84);
752 			 T8c = VSUB(T8a, T8b);
753 			 T8h = VADD(T8a, T8b);
754 		    }
755 		    {
756 			 V T7A, T7L, T8i, T8l;
757 			 T7A = VFNMS(LDK(KP881921264), T7z, T7k);
758 			 T7L = VFNMS(LDK(KP881921264), T7K, T7H);
759 			 ST(&(x[WS(rs, 27)]), VFNMSI(T7L, T7A), ms, &(x[WS(rs, 1)]));
760 			 ST(&(x[WS(rs, 37)]), VFMAI(T7L, T7A), ms, &(x[WS(rs, 1)]));
761 			 T8i = VFMA(LDK(KP956940335), T8h, T8g);
762 			 T8l = VFNMS(LDK(KP956940335), T8k, T8j);
763 			 ST(&(x[WS(rs, 19)]), VFNMSI(T8l, T8i), ms, &(x[WS(rs, 1)]));
764 			 ST(&(x[WS(rs, 45)]), VFMAI(T8l, T8i), ms, &(x[WS(rs, 1)]));
765 		    }
766 		    {
767 			 V T8m, T8n, T7M, T7N;
768 			 T8m = VFNMS(LDK(KP956940335), T8h, T8g);
769 			 T8n = VFMA(LDK(KP956940335), T8k, T8j);
770 			 ST(&(x[WS(rs, 13)]), VFMAI(T8n, T8m), ms, &(x[WS(rs, 1)]));
771 			 ST(&(x[WS(rs, 51)]), VFNMSI(T8n, T8m), ms, &(x[WS(rs, 1)]));
772 			 T7M = VFMA(LDK(KP881921264), T7z, T7k);
773 			 T7N = VFMA(LDK(KP881921264), T7K, T7H);
774 			 ST(&(x[WS(rs, 59)]), VFNMSI(T7N, T7M), ms, &(x[WS(rs, 1)]));
775 			 ST(&(x[WS(rs, 5)]), VFMAI(T7N, T7M), ms, &(x[WS(rs, 1)]));
776 		    }
777 		    {
778 			 V T7Q, T7T, T86, T8d;
779 			 T7Q = VFNMS(LDK(KP881921264), T7P, T7O);
780 			 T7T = VFMA(LDK(KP881921264), T7S, T7R);
781 			 ST(&(x[WS(rs, 21)]), VFMAI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
782 			 ST(&(x[WS(rs, 43)]), VFNMSI(T7T, T7Q), ms, &(x[WS(rs, 1)]));
783 			 T86 = VFNMS(LDK(KP956940335), T85, T7Y);
784 			 T8d = VFNMS(LDK(KP956940335), T8c, T89);
785 			 ST(&(x[WS(rs, 35)]), VFNMSI(T8d, T86), ms, &(x[WS(rs, 1)]));
786 			 ST(&(x[WS(rs, 29)]), VFMAI(T8d, T86), ms, &(x[WS(rs, 1)]));
787 		    }
788 		    {
789 			 V T8e, T8f, T7U, T7V;
790 			 T8e = VFMA(LDK(KP956940335), T85, T7Y);
791 			 T8f = VFMA(LDK(KP956940335), T8c, T89);
792 			 ST(&(x[WS(rs, 3)]), VFNMSI(T8f, T8e), ms, &(x[WS(rs, 1)]));
793 			 ST(&(x[WS(rs, 61)]), VFMAI(T8f, T8e), ms, &(x[WS(rs, 1)]));
794 			 T7U = VFMA(LDK(KP881921264), T7P, T7O);
795 			 T7V = VFNMS(LDK(KP881921264), T7S, T7R);
796 			 ST(&(x[WS(rs, 11)]), VFNMSI(T7V, T7U), ms, &(x[WS(rs, 1)]));
797 			 ST(&(x[WS(rs, 53)]), VFMAI(T7V, T7U), ms, &(x[WS(rs, 1)]));
798 		    }
799 	       }
800 	       {
801 		    V T5A, T75, T6A, T6V, T6t, T72, T6D, T6K, T6h, T6E, T6w, T6B, T6R, T76, T6Y;
802 		    V T73;
803 		    {
804 			 V T5g, T6T, T5z, T6U, T5p, T5y;
805 			 T5g = VFMA(LDK(KP923879532), T5f, T58);
806 			 T6T = VFNMS(LDK(KP923879532), T6o, T6l);
807 			 T5p = VFNMS(LDK(KP198912367), T5o, T5l);
808 			 T5y = VFNMS(LDK(KP198912367), T5x, T5u);
809 			 T5z = VADD(T5p, T5y);
810 			 T6U = VSUB(T5p, T5y);
811 			 T5A = VFMA(LDK(KP980785280), T5z, T5g);
812 			 T75 = VFMA(LDK(KP980785280), T6U, T6T);
813 			 T6A = VFNMS(LDK(KP980785280), T5z, T5g);
814 			 T6V = VFNMS(LDK(KP980785280), T6U, T6T);
815 		    }
816 		    {
817 			 V T6p, T6I, T6s, T6J, T6q, T6r;
818 			 T6p = VFMA(LDK(KP923879532), T6o, T6l);
819 			 T6I = VFNMS(LDK(KP923879532), T5f, T58);
820 			 T6q = VFMA(LDK(KP198912367), T5l, T5o);
821 			 T6r = VFMA(LDK(KP198912367), T5u, T5x);
822 			 T6s = VSUB(T6q, T6r);
823 			 T6J = VADD(T6q, T6r);
824 			 T6t = VFMA(LDK(KP980785280), T6s, T6p);
825 			 T72 = VFNMS(LDK(KP980785280), T6J, T6I);
826 			 T6D = VFNMS(LDK(KP980785280), T6s, T6p);
827 			 T6K = VFMA(LDK(KP980785280), T6J, T6I);
828 		    }
829 		    {
830 			 V T5V, T6u, T6g, T6v;
831 			 {
832 			      V T5N, T5U, T68, T6f;
833 			      T5N = VFMA(LDK(KP923879532), T5M, T5F);
834 			      T5U = VFMA(LDK(KP923879532), T5T, T5Q);
835 			      T5V = VFNMS(LDK(KP098491403), T5U, T5N);
836 			      T6u = VFMA(LDK(KP098491403), T5N, T5U);
837 			      T68 = VFMA(LDK(KP923879532), T67, T60);
838 			      T6f = VFMA(LDK(KP923879532), T6e, T6b);
839 			      T6g = VFNMS(LDK(KP098491403), T6f, T68);
840 			      T6v = VFMA(LDK(KP098491403), T68, T6f);
841 			 }
842 			 T6h = VADD(T5V, T6g);
843 			 T6E = VSUB(T5V, T6g);
844 			 T6w = VSUB(T6u, T6v);
845 			 T6B = VADD(T6u, T6v);
846 		    }
847 		    {
848 			 V T6N, T6W, T6Q, T6X;
849 			 {
850 			      V T6L, T6M, T6O, T6P;
851 			      T6L = VFNMS(LDK(KP923879532), T5M, T5F);
852 			      T6M = VFNMS(LDK(KP923879532), T5T, T5Q);
853 			      T6N = VFMA(LDK(KP820678790), T6M, T6L);
854 			      T6W = VFNMS(LDK(KP820678790), T6L, T6M);
855 			      T6O = VFNMS(LDK(KP923879532), T67, T60);
856 			      T6P = VFNMS(LDK(KP923879532), T6e, T6b);
857 			      T6Q = VFMA(LDK(KP820678790), T6P, T6O);
858 			      T6X = VFNMS(LDK(KP820678790), T6O, T6P);
859 			 }
860 			 T6R = VADD(T6N, T6Q);
861 			 T76 = VSUB(T6N, T6Q);
862 			 T6Y = VSUB(T6W, T6X);
863 			 T73 = VADD(T6W, T6X);
864 		    }
865 		    {
866 			 V T6i, T6x, T74, T77;
867 			 T6i = VFNMS(LDK(KP995184726), T6h, T5A);
868 			 T6x = VFNMS(LDK(KP995184726), T6w, T6t);
869 			 ST(&(x[WS(rs, 31)]), VFNMSI(T6x, T6i), ms, &(x[WS(rs, 1)]));
870 			 ST(&(x[WS(rs, 33)]), VFMAI(T6x, T6i), ms, &(x[WS(rs, 1)]));
871 			 T74 = VFMA(LDK(KP773010453), T73, T72);
872 			 T77 = VFNMS(LDK(KP773010453), T76, T75);
873 			 ST(&(x[WS(rs, 23)]), VFNMSI(T77, T74), ms, &(x[WS(rs, 1)]));
874 			 ST(&(x[WS(rs, 41)]), VFMAI(T77, T74), ms, &(x[WS(rs, 1)]));
875 		    }
876 		    {
877 			 V T78, T79, T6y, T6z;
878 			 T78 = VFNMS(LDK(KP773010453), T73, T72);
879 			 T79 = VFMA(LDK(KP773010453), T76, T75);
880 			 ST(&(x[WS(rs, 9)]), VFMAI(T79, T78), ms, &(x[WS(rs, 1)]));
881 			 ST(&(x[WS(rs, 55)]), VFNMSI(T79, T78), ms, &(x[WS(rs, 1)]));
882 			 T6y = VFMA(LDK(KP995184726), T6h, T5A);
883 			 T6z = VFMA(LDK(KP995184726), T6w, T6t);
884 			 ST(&(x[WS(rs, 63)]), VFNMSI(T6z, T6y), ms, &(x[WS(rs, 1)]));
885 			 ST(&(x[WS(rs, 1)]), VFMAI(T6z, T6y), ms, &(x[WS(rs, 1)]));
886 		    }
887 		    {
888 			 V T6C, T6F, T6S, T6Z;
889 			 T6C = VFNMS(LDK(KP995184726), T6B, T6A);
890 			 T6F = VFMA(LDK(KP995184726), T6E, T6D);
891 			 ST(&(x[WS(rs, 17)]), VFMAI(T6F, T6C), ms, &(x[WS(rs, 1)]));
892 			 ST(&(x[WS(rs, 47)]), VFNMSI(T6F, T6C), ms, &(x[WS(rs, 1)]));
893 			 T6S = VFNMS(LDK(KP773010453), T6R, T6K);
894 			 T6Z = VFNMS(LDK(KP773010453), T6Y, T6V);
895 			 ST(&(x[WS(rs, 39)]), VFNMSI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
896 			 ST(&(x[WS(rs, 25)]), VFMAI(T6Z, T6S), ms, &(x[WS(rs, 1)]));
897 		    }
898 		    {
899 			 V T70, T71, T6G, T6H;
900 			 T70 = VFMA(LDK(KP773010453), T6R, T6K);
901 			 T71 = VFMA(LDK(KP773010453), T6Y, T6V);
902 			 ST(&(x[WS(rs, 7)]), VFNMSI(T71, T70), ms, &(x[WS(rs, 1)]));
903 			 ST(&(x[WS(rs, 57)]), VFMAI(T71, T70), ms, &(x[WS(rs, 1)]));
904 			 T6G = VFMA(LDK(KP995184726), T6B, T6A);
905 			 T6H = VFNMS(LDK(KP995184726), T6E, T6D);
906 			 ST(&(x[WS(rs, 15)]), VFNMSI(T6H, T6G), ms, &(x[WS(rs, 1)]));
907 			 ST(&(x[WS(rs, 49)]), VFMAI(T6H, T6G), ms, &(x[WS(rs, 1)]));
908 		    }
909 	       }
910 	  }
911      }
912      VLEAVE();
913 }
914 
/*
 * Twiddle-instruction table for this codelet; its address is stored in the
 * descriptor `desc` defined right after it.
 *
 * Contents: one VTW(0, k) entry for every k from 1 to 63, in ascending
 * order, followed by a closing { TW_NEXT, VL, 0 } entry.
 *
 * NOTE(review): presumably one entry per non-zero input index of the
 * size-64 transform, with the last entry acting as the end-of-table marker;
 * confirm against the VTW / TW_NEXT definitions in the SIMD headers.
 */
915 static const tw_instr twinstr[] = {
916      VTW(0, 1),
917      VTW(0, 2),
918      VTW(0, 3),
919      VTW(0, 4),
920      VTW(0, 5),
921      VTW(0, 6),
922      VTW(0, 7),
923      VTW(0, 8),
924      VTW(0, 9),
925      VTW(0, 10),
926      VTW(0, 11),
927      VTW(0, 12),
928      VTW(0, 13),
929      VTW(0, 14),
930      VTW(0, 15),
931      VTW(0, 16),
932      VTW(0, 17),
933      VTW(0, 18),
934      VTW(0, 19),
935      VTW(0, 20),
936      VTW(0, 21),
937      VTW(0, 22),
938      VTW(0, 23),
939      VTW(0, 24),
940      VTW(0, 25),
941      VTW(0, 26),
942      VTW(0, 27),
943      VTW(0, 28),
944      VTW(0, 29),
945      VTW(0, 30),
946      VTW(0, 31),
947      VTW(0, 32),
948      VTW(0, 33),
949      VTW(0, 34),
950      VTW(0, 35),
951      VTW(0, 36),
952      VTW(0, 37),
953      VTW(0, 38),
954      VTW(0, 39),
955      VTW(0, 40),
956      VTW(0, 41),
957      VTW(0, 42),
958      VTW(0, 43),
959      VTW(0, 44),
960      VTW(0, 45),
961      VTW(0, 46),
962      VTW(0, 47),
963      VTW(0, 48),
964      VTW(0, 49),
965      VTW(0, 50),
966      VTW(0, 51),
967      VTW(0, 52),
968      VTW(0, 53),
969      VTW(0, 54),
970      VTW(0, 55),
971      VTW(0, 56),
972      VTW(0, 57),
973      VTW(0, 58),
974      VTW(0, 59),
975      VTW(0, 60),
976      VTW(0, 61),
977      VTW(0, 62),
978      VTW(0, 63),
     /* closing entry; the only one not written with the VTW macro */
979      { TW_NEXT, VL, 0 }
980 };
981 
/*
 * Codelet descriptor passed to the planner by the registration function
 * that follows.  Fields, in the order written: 64, the name string
 * "t2bv_64" (wrapped in XSIMD_STRING), the twiddle table `twinstr`, the
 * address of GENUS, the count group { 261, 126, 258, 0 }, and three
 * trailing zeros.
 *
 * The first three counts equal the "261 additions, 126 multiplications,
 * 258 fused multiply/add" figures in the generator's header comment for
 * this FMA variant.
 *
 * NOTE(review): 64 is presumably the transform size (the generator was run
 * with -n 64).  The meaning of the fourth count and of the three trailing
 * zeros is not visible in this file; check the ct_desc declaration
 * (presumably in dft/codelet-dft.h) before changing them.
 */
982 static const ct_desc desc = { 64, XSIMD_STRING("t2bv_64"), twinstr, &GENUS, { 261, 126, 258, 0 }, 0, 0, 0 };
983 
/*
 * Registration entry point for this codelet.
 *
 * p: planner to register with.
 *
 * Makes a single call to X(kdft_dit_register), passing the planner, the
 * t2bv_64 kernel and the address of its descriptor `desc`.  Returns nothing.
 */
XSIMD(codelet_t2bv_64)984 void XSIMD(codelet_t2bv_64) (planner *p) {
985      X(kdft_dit_register) (p, t2bv_64, &desc);
986 }
987 #else
988 
989 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 64 -name t2bv_64 -include dft/simd/t2b.h -sign 1 */
990 
991 /*
992  * This function contains 519 FP additions, 250 FP multiplications,
993  * (or, 467 additions, 198 multiplications, 52 fused multiply/add),
994  * 107 stack variables, 15 constants, and 128 memory accesses
995  */
996 #include "dft/simd/t2b.h"
997 
t2bv_64(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)998 static void t2bv_64(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
999 {
1000      DVK(KP290284677, +0.290284677254462367636192375817395274691476278);
1001      DVK(KP956940335, +0.956940335732208864935797886980269969482849206);
1002      DVK(KP471396736, +0.471396736825997648556387625905254377657460319);
1003      DVK(KP881921264, +0.881921264348355029712756863660388349508442621);
1004      DVK(KP634393284, +0.634393284163645498215171613225493370675687095);
1005      DVK(KP773010453, +0.773010453362736960810906609758469800971041293);
1006      DVK(KP098017140, +0.098017140329560601994195563888641845861136673);
1007      DVK(KP995184726, +0.995184726672196886244836953109479921575474869);
1008      DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
1009      DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
1010      DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
1011      DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
1012      DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
1013      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
1014      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
1015      {
1016 	  INT m;
1017 	  R *x;
1018 	  x = ii;
1019 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 126)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 126), MAKE_VOLATILE_STRIDE(64, rs)) {
1020 	       V Tg, T4B, T6v, T7G, T3r, T4w, T5q, T7F, T5Y, T62, T28, T4d, T2g, T4a, T7g;
1021 	       V T7Y, T6f, T6j, T2Z, T4k, T37, T4h, T7n, T81, T7w, T7x, T7y, T5M, T6q, T1k;
1022 	       V T4s, T1r, T4t, T7t, T7u, T7v, T5F, T6p, TV, T4p, T12, T4q, T7A, T7B, TD;
1023 	       V T4x, T3k, T4C, T5x, T6s, T1R, T4b, T7j, T7Z, T2j, T4e, T5V, T63, T2I, T4i;
1024 	       V T7q, T82, T3a, T4l, T6c, T6k;
1025 	       {
1026 		    V T1, T3, T3p, T3n, Tb, Td, Te, T6, T8, T9, T2, T3o, T3m;
1027 		    T1 = LD(&(x[0]), ms, &(x[0]));
1028 		    T2 = LD(&(x[WS(rs, 32)]), ms, &(x[0]));
1029 		    T3 = BYTW(&(W[TWVL * 62]), T2);
1030 		    T3o = LD(&(x[WS(rs, 48)]), ms, &(x[0]));
1031 		    T3p = BYTW(&(W[TWVL * 94]), T3o);
1032 		    T3m = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
1033 		    T3n = BYTW(&(W[TWVL * 30]), T3m);
1034 		    {
1035 			 V Ta, Tc, T5, T7;
1036 			 Ta = LD(&(x[WS(rs, 56)]), ms, &(x[0]));
1037 			 Tb = BYTW(&(W[TWVL * 110]), Ta);
1038 			 Tc = LD(&(x[WS(rs, 24)]), ms, &(x[0]));
1039 			 Td = BYTW(&(W[TWVL * 46]), Tc);
1040 			 Te = VSUB(Tb, Td);
1041 			 T5 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
1042 			 T6 = BYTW(&(W[TWVL * 14]), T5);
1043 			 T7 = LD(&(x[WS(rs, 40)]), ms, &(x[0]));
1044 			 T8 = BYTW(&(W[TWVL * 78]), T7);
1045 			 T9 = VSUB(T6, T8);
1046 		    }
1047 		    {
1048 			 V T4, Tf, T6t, T6u;
1049 			 T4 = VSUB(T1, T3);
1050 			 Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
1051 			 Tg = VSUB(T4, Tf);
1052 			 T4B = VADD(T4, Tf);
1053 			 T6t = VADD(T6, T8);
1054 			 T6u = VADD(Tb, Td);
1055 			 T6v = VSUB(T6t, T6u);
1056 			 T7G = VADD(T6t, T6u);
1057 		    }
1058 		    {
1059 			 V T3l, T3q, T5o, T5p;
1060 			 T3l = VMUL(LDK(KP707106781), VSUB(T9, Te));
1061 			 T3q = VSUB(T3n, T3p);
1062 			 T3r = VSUB(T3l, T3q);
1063 			 T4w = VADD(T3q, T3l);
1064 			 T5o = VADD(T1, T3);
1065 			 T5p = VADD(T3n, T3p);
1066 			 T5q = VSUB(T5o, T5p);
1067 			 T7F = VADD(T5o, T5p);
1068 		    }
1069 	       }
1070 	       {
1071 		    V T24, T26, T61, T2b, T2d, T60, T1W, T5W, T21, T5X, T22, T27;
1072 		    {
1073 			 V T23, T25, T2a, T2c;
1074 			 T23 = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
1075 			 T24 = BYTW(&(W[TWVL * 32]), T23);
1076 			 T25 = LD(&(x[WS(rs, 49)]), ms, &(x[WS(rs, 1)]));
1077 			 T26 = BYTW(&(W[TWVL * 96]), T25);
1078 			 T61 = VADD(T24, T26);
1079 			 T2a = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
1080 			 T2b = BYTW(&(W[0]), T2a);
1081 			 T2c = LD(&(x[WS(rs, 33)]), ms, &(x[WS(rs, 1)]));
1082 			 T2d = BYTW(&(W[TWVL * 64]), T2c);
1083 			 T60 = VADD(T2b, T2d);
1084 		    }
1085 		    {
1086 			 V T1T, T1V, T1S, T1U;
1087 			 T1S = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
1088 			 T1T = BYTW(&(W[TWVL * 16]), T1S);
1089 			 T1U = LD(&(x[WS(rs, 41)]), ms, &(x[WS(rs, 1)]));
1090 			 T1V = BYTW(&(W[TWVL * 80]), T1U);
1091 			 T1W = VSUB(T1T, T1V);
1092 			 T5W = VADD(T1T, T1V);
1093 		    }
1094 		    {
1095 			 V T1Y, T20, T1X, T1Z;
1096 			 T1X = LD(&(x[WS(rs, 57)]), ms, &(x[WS(rs, 1)]));
1097 			 T1Y = BYTW(&(W[TWVL * 112]), T1X);
1098 			 T1Z = LD(&(x[WS(rs, 25)]), ms, &(x[WS(rs, 1)]));
1099 			 T20 = BYTW(&(W[TWVL * 48]), T1Z);
1100 			 T21 = VSUB(T1Y, T20);
1101 			 T5X = VADD(T1Y, T20);
1102 		    }
1103 		    T5Y = VSUB(T5W, T5X);
1104 		    T62 = VSUB(T60, T61);
1105 		    T22 = VMUL(LDK(KP707106781), VSUB(T1W, T21));
1106 		    T27 = VSUB(T24, T26);
1107 		    T28 = VSUB(T22, T27);
1108 		    T4d = VADD(T27, T22);
1109 		    {
1110 			 V T2e, T2f, T7e, T7f;
1111 			 T2e = VSUB(T2b, T2d);
1112 			 T2f = VMUL(LDK(KP707106781), VADD(T1W, T21));
1113 			 T2g = VSUB(T2e, T2f);
1114 			 T4a = VADD(T2e, T2f);
1115 			 T7e = VADD(T60, T61);
1116 			 T7f = VADD(T5W, T5X);
1117 			 T7g = VSUB(T7e, T7f);
1118 			 T7Y = VADD(T7e, T7f);
1119 		    }
1120 	       }
1121 	       {
1122 		    V T2V, T2X, T6i, T32, T34, T6h, T2N, T6d, T2S, T6e, T2T, T2Y;
1123 		    {
1124 			 V T2U, T2W, T31, T33;
1125 			 T2U = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
1126 			 T2V = BYTW(&(W[TWVL * 28]), T2U);
1127 			 T2W = LD(&(x[WS(rs, 47)]), ms, &(x[WS(rs, 1)]));
1128 			 T2X = BYTW(&(W[TWVL * 92]), T2W);
1129 			 T6i = VADD(T2V, T2X);
1130 			 T31 = LD(&(x[WS(rs, 63)]), ms, &(x[WS(rs, 1)]));
1131 			 T32 = BYTW(&(W[TWVL * 124]), T31);
1132 			 T33 = LD(&(x[WS(rs, 31)]), ms, &(x[WS(rs, 1)]));
1133 			 T34 = BYTW(&(W[TWVL * 60]), T33);
1134 			 T6h = VADD(T32, T34);
1135 		    }
1136 		    {
1137 			 V T2K, T2M, T2J, T2L;
1138 			 T2J = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
1139 			 T2K = BYTW(&(W[TWVL * 12]), T2J);
1140 			 T2L = LD(&(x[WS(rs, 39)]), ms, &(x[WS(rs, 1)]));
1141 			 T2M = BYTW(&(W[TWVL * 76]), T2L);
1142 			 T2N = VSUB(T2K, T2M);
1143 			 T6d = VADD(T2K, T2M);
1144 		    }
1145 		    {
1146 			 V T2P, T2R, T2O, T2Q;
1147 			 T2O = LD(&(x[WS(rs, 55)]), ms, &(x[WS(rs, 1)]));
1148 			 T2P = BYTW(&(W[TWVL * 108]), T2O);
1149 			 T2Q = LD(&(x[WS(rs, 23)]), ms, &(x[WS(rs, 1)]));
1150 			 T2R = BYTW(&(W[TWVL * 44]), T2Q);
1151 			 T2S = VSUB(T2P, T2R);
1152 			 T6e = VADD(T2P, T2R);
1153 		    }
1154 		    T6f = VSUB(T6d, T6e);
1155 		    T6j = VSUB(T6h, T6i);
1156 		    T2T = VMUL(LDK(KP707106781), VSUB(T2N, T2S));
1157 		    T2Y = VSUB(T2V, T2X);
1158 		    T2Z = VSUB(T2T, T2Y);
1159 		    T4k = VADD(T2Y, T2T);
1160 		    {
1161 			 V T35, T36, T7l, T7m;
1162 			 T35 = VSUB(T32, T34);
1163 			 T36 = VMUL(LDK(KP707106781), VADD(T2N, T2S));
1164 			 T37 = VSUB(T35, T36);
1165 			 T4h = VADD(T35, T36);
1166 			 T7l = VADD(T6h, T6i);
1167 			 T7m = VADD(T6d, T6e);
1168 			 T7n = VSUB(T7l, T7m);
1169 			 T81 = VADD(T7l, T7m);
1170 		    }
1171 	       }
1172 	       {
1173 		    V T1g, T1i, T5K, T1m, T1o, T5J, T18, T5G, T1d, T5H, T5I, T5L;
1174 		    {
1175 			 V T1f, T1h, T1l, T1n;
1176 			 T1f = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
1177 			 T1g = BYTW(&(W[TWVL * 26]), T1f);
1178 			 T1h = LD(&(x[WS(rs, 46)]), ms, &(x[0]));
1179 			 T1i = BYTW(&(W[TWVL * 90]), T1h);
1180 			 T5K = VADD(T1g, T1i);
1181 			 T1l = LD(&(x[WS(rs, 62)]), ms, &(x[0]));
1182 			 T1m = BYTW(&(W[TWVL * 122]), T1l);
1183 			 T1n = LD(&(x[WS(rs, 30)]), ms, &(x[0]));
1184 			 T1o = BYTW(&(W[TWVL * 58]), T1n);
1185 			 T5J = VADD(T1m, T1o);
1186 		    }
1187 		    {
1188 			 V T15, T17, T14, T16;
1189 			 T14 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
1190 			 T15 = BYTW(&(W[TWVL * 10]), T14);
1191 			 T16 = LD(&(x[WS(rs, 38)]), ms, &(x[0]));
1192 			 T17 = BYTW(&(W[TWVL * 74]), T16);
1193 			 T18 = VSUB(T15, T17);
1194 			 T5G = VADD(T15, T17);
1195 		    }
1196 		    {
1197 			 V T1a, T1c, T19, T1b;
1198 			 T19 = LD(&(x[WS(rs, 54)]), ms, &(x[0]));
1199 			 T1a = BYTW(&(W[TWVL * 106]), T19);
1200 			 T1b = LD(&(x[WS(rs, 22)]), ms, &(x[0]));
1201 			 T1c = BYTW(&(W[TWVL * 42]), T1b);
1202 			 T1d = VSUB(T1a, T1c);
1203 			 T5H = VADD(T1a, T1c);
1204 		    }
1205 		    T7w = VADD(T5J, T5K);
1206 		    T7x = VADD(T5G, T5H);
1207 		    T7y = VSUB(T7w, T7x);
1208 		    T5I = VSUB(T5G, T5H);
1209 		    T5L = VSUB(T5J, T5K);
1210 		    T5M = VFNMS(LDK(KP382683432), T5L, VMUL(LDK(KP923879532), T5I));
1211 		    T6q = VFMA(LDK(KP923879532), T5L, VMUL(LDK(KP382683432), T5I));
1212 		    {
1213 			 V T1e, T1j, T1p, T1q;
1214 			 T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
1215 			 T1j = VSUB(T1g, T1i);
1216 			 T1k = VSUB(T1e, T1j);
1217 			 T4s = VADD(T1j, T1e);
1218 			 T1p = VSUB(T1m, T1o);
1219 			 T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
1220 			 T1r = VSUB(T1p, T1q);
1221 			 T4t = VADD(T1p, T1q);
1222 		    }
1223 	       }
1224 	       {
1225 		    V TR, TT, T5A, TX, TZ, T5z, TJ, T5C, TO, T5D, T5B, T5E;
1226 		    {
1227 			 V TQ, TS, TW, TY;
1228 			 TQ = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
1229 			 TR = BYTW(&(W[TWVL * 34]), TQ);
1230 			 TS = LD(&(x[WS(rs, 50)]), ms, &(x[0]));
1231 			 TT = BYTW(&(W[TWVL * 98]), TS);
1232 			 T5A = VADD(TR, TT);
1233 			 TW = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
1234 			 TX = BYTW(&(W[TWVL * 2]), TW);
1235 			 TY = LD(&(x[WS(rs, 34)]), ms, &(x[0]));
1236 			 TZ = BYTW(&(W[TWVL * 66]), TY);
1237 			 T5z = VADD(TX, TZ);
1238 		    }
1239 		    {
1240 			 V TG, TI, TF, TH;
1241 			 TF = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
1242 			 TG = BYTW(&(W[TWVL * 18]), TF);
1243 			 TH = LD(&(x[WS(rs, 42)]), ms, &(x[0]));
1244 			 TI = BYTW(&(W[TWVL * 82]), TH);
1245 			 TJ = VSUB(TG, TI);
1246 			 T5C = VADD(TG, TI);
1247 		    }
1248 		    {
1249 			 V TL, TN, TK, TM;
1250 			 TK = LD(&(x[WS(rs, 58)]), ms, &(x[0]));
1251 			 TL = BYTW(&(W[TWVL * 114]), TK);
1252 			 TM = LD(&(x[WS(rs, 26)]), ms, &(x[0]));
1253 			 TN = BYTW(&(W[TWVL * 50]), TM);
1254 			 TO = VSUB(TL, TN);
1255 			 T5D = VADD(TL, TN);
1256 		    }
1257 		    T7t = VADD(T5z, T5A);
1258 		    T7u = VADD(T5C, T5D);
1259 		    T7v = VSUB(T7t, T7u);
1260 		    T5B = VSUB(T5z, T5A);
1261 		    T5E = VSUB(T5C, T5D);
1262 		    T5F = VFMA(LDK(KP382683432), T5B, VMUL(LDK(KP923879532), T5E));
1263 		    T6p = VFNMS(LDK(KP382683432), T5E, VMUL(LDK(KP923879532), T5B));
1264 		    {
1265 			 V TP, TU, T10, T11;
1266 			 TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
1267 			 TU = VSUB(TR, TT);
1268 			 TV = VSUB(TP, TU);
1269 			 T4p = VADD(TU, TP);
1270 			 T10 = VSUB(TX, TZ);
1271 			 T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
1272 			 T12 = VSUB(T10, T11);
1273 			 T4q = VADD(T10, T11);
1274 		    }
1275 	       }
1276 	       {
1277 		    V Tl, T5r, TB, T5u, Tq, T5s, Tw, T5v, Tr, TC;
1278 		    {
1279 			 V Ti, Tk, Th, Tj;
1280 			 Th = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
1281 			 Ti = BYTW(&(W[TWVL * 6]), Th);
1282 			 Tj = LD(&(x[WS(rs, 36)]), ms, &(x[0]));
1283 			 Tk = BYTW(&(W[TWVL * 70]), Tj);
1284 			 Tl = VSUB(Ti, Tk);
1285 			 T5r = VADD(Ti, Tk);
1286 		    }
1287 		    {
1288 			 V Ty, TA, Tx, Tz;
1289 			 Tx = LD(&(x[WS(rs, 60)]), ms, &(x[0]));
1290 			 Ty = BYTW(&(W[TWVL * 118]), Tx);
1291 			 Tz = LD(&(x[WS(rs, 28)]), ms, &(x[0]));
1292 			 TA = BYTW(&(W[TWVL * 54]), Tz);
1293 			 TB = VSUB(Ty, TA);
1294 			 T5u = VADD(Ty, TA);
1295 		    }
1296 		    {
1297 			 V Tn, Tp, Tm, To;
1298 			 Tm = LD(&(x[WS(rs, 20)]), ms, &(x[0]));
1299 			 Tn = BYTW(&(W[TWVL * 38]), Tm);
1300 			 To = LD(&(x[WS(rs, 52)]), ms, &(x[0]));
1301 			 Tp = BYTW(&(W[TWVL * 102]), To);
1302 			 Tq = VSUB(Tn, Tp);
1303 			 T5s = VADD(Tn, Tp);
1304 		    }
1305 		    {
1306 			 V Tt, Tv, Ts, Tu;
1307 			 Ts = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
1308 			 Tt = BYTW(&(W[TWVL * 22]), Ts);
1309 			 Tu = LD(&(x[WS(rs, 44)]), ms, &(x[0]));
1310 			 Tv = BYTW(&(W[TWVL * 86]), Tu);
1311 			 Tw = VSUB(Tt, Tv);
1312 			 T5v = VADD(Tt, Tv);
1313 		    }
1314 		    T7A = VADD(T5r, T5s);
1315 		    T7B = VADD(T5u, T5v);
1316 		    Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
1317 		    TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
1318 		    TD = VSUB(Tr, TC);
1319 		    T4x = VADD(Tr, TC);
1320 		    {
1321 			 V T3i, T3j, T5t, T5w;
1322 			 T3i = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
1323 			 T3j = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
1324 			 T3k = VSUB(T3i, T3j);
1325 			 T4C = VADD(T3i, T3j);
1326 			 T5t = VSUB(T5r, T5s);
1327 			 T5w = VSUB(T5u, T5v);
1328 			 T5x = VMUL(LDK(KP707106781), VADD(T5t, T5w));
1329 			 T6s = VMUL(LDK(KP707106781), VSUB(T5t, T5w));
1330 		    }
1331 	       }
1332 	       {
1333 		    V T1z, T5P, T1P, T5T, T1E, T5Q, T1K, T5S;
1334 		    {
1335 			 V T1w, T1y, T1v, T1x;
1336 			 T1v = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
1337 			 T1w = BYTW(&(W[TWVL * 8]), T1v);
1338 			 T1x = LD(&(x[WS(rs, 37)]), ms, &(x[WS(rs, 1)]));
1339 			 T1y = BYTW(&(W[TWVL * 72]), T1x);
1340 			 T1z = VSUB(T1w, T1y);
1341 			 T5P = VADD(T1w, T1y);
1342 		    }
1343 		    {
1344 			 V T1M, T1O, T1L, T1N;
1345 			 T1L = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
1346 			 T1M = BYTW(&(W[TWVL * 24]), T1L);
1347 			 T1N = LD(&(x[WS(rs, 45)]), ms, &(x[WS(rs, 1)]));
1348 			 T1O = BYTW(&(W[TWVL * 88]), T1N);
1349 			 T1P = VSUB(T1M, T1O);
1350 			 T5T = VADD(T1M, T1O);
1351 		    }
1352 		    {
1353 			 V T1B, T1D, T1A, T1C;
1354 			 T1A = LD(&(x[WS(rs, 21)]), ms, &(x[WS(rs, 1)]));
1355 			 T1B = BYTW(&(W[TWVL * 40]), T1A);
1356 			 T1C = LD(&(x[WS(rs, 53)]), ms, &(x[WS(rs, 1)]));
1357 			 T1D = BYTW(&(W[TWVL * 104]), T1C);
1358 			 T1E = VSUB(T1B, T1D);
1359 			 T5Q = VADD(T1B, T1D);
1360 		    }
1361 		    {
1362 			 V T1H, T1J, T1G, T1I;
1363 			 T1G = LD(&(x[WS(rs, 61)]), ms, &(x[WS(rs, 1)]));
1364 			 T1H = BYTW(&(W[TWVL * 120]), T1G);
1365 			 T1I = LD(&(x[WS(rs, 29)]), ms, &(x[WS(rs, 1)]));
1366 			 T1J = BYTW(&(W[TWVL * 56]), T1I);
1367 			 T1K = VSUB(T1H, T1J);
1368 			 T5S = VADD(T1H, T1J);
1369 		    }
1370 		    {
1371 			 V T1F, T1Q, T7h, T7i;
1372 			 T1F = VFNMS(LDK(KP382683432), T1E, VMUL(LDK(KP923879532), T1z));
1373 			 T1Q = VFMA(LDK(KP923879532), T1K, VMUL(LDK(KP382683432), T1P));
1374 			 T1R = VSUB(T1F, T1Q);
1375 			 T4b = VADD(T1F, T1Q);
1376 			 T7h = VADD(T5P, T5Q);
1377 			 T7i = VADD(T5S, T5T);
1378 			 T7j = VSUB(T7h, T7i);
1379 			 T7Z = VADD(T7h, T7i);
1380 		    }
1381 		    {
1382 			 V T2h, T2i, T5R, T5U;
1383 			 T2h = VFMA(LDK(KP382683432), T1z, VMUL(LDK(KP923879532), T1E));
1384 			 T2i = VFNMS(LDK(KP382683432), T1K, VMUL(LDK(KP923879532), T1P));
1385 			 T2j = VSUB(T2h, T2i);
1386 			 T4e = VADD(T2h, T2i);
1387 			 T5R = VSUB(T5P, T5Q);
1388 			 T5U = VSUB(T5S, T5T);
1389 			 T5V = VMUL(LDK(KP707106781), VSUB(T5R, T5U));
1390 			 T63 = VMUL(LDK(KP707106781), VADD(T5R, T5U));
1391 		    }
1392 	       }
1393 	       {
1394 		    V T2q, T66, T2G, T6a, T2v, T67, T2B, T69;
1395 		    {
1396 			 V T2n, T2p, T2m, T2o;
1397 			 T2m = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
1398 			 T2n = BYTW(&(W[TWVL * 4]), T2m);
1399 			 T2o = LD(&(x[WS(rs, 35)]), ms, &(x[WS(rs, 1)]));
1400 			 T2p = BYTW(&(W[TWVL * 68]), T2o);
1401 			 T2q = VSUB(T2n, T2p);
1402 			 T66 = VADD(T2n, T2p);
1403 		    }
1404 		    {
1405 			 V T2D, T2F, T2C, T2E;
1406 			 T2C = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
1407 			 T2D = BYTW(&(W[TWVL * 20]), T2C);
1408 			 T2E = LD(&(x[WS(rs, 43)]), ms, &(x[WS(rs, 1)]));
1409 			 T2F = BYTW(&(W[TWVL * 84]), T2E);
1410 			 T2G = VSUB(T2D, T2F);
1411 			 T6a = VADD(T2D, T2F);
1412 		    }
1413 		    {
1414 			 V T2s, T2u, T2r, T2t;
1415 			 T2r = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
1416 			 T2s = BYTW(&(W[TWVL * 36]), T2r);
1417 			 T2t = LD(&(x[WS(rs, 51)]), ms, &(x[WS(rs, 1)]));
1418 			 T2u = BYTW(&(W[TWVL * 100]), T2t);
1419 			 T2v = VSUB(T2s, T2u);
1420 			 T67 = VADD(T2s, T2u);
1421 		    }
1422 		    {
1423 			 V T2y, T2A, T2x, T2z;
1424 			 T2x = LD(&(x[WS(rs, 59)]), ms, &(x[WS(rs, 1)]));
1425 			 T2y = BYTW(&(W[TWVL * 116]), T2x);
1426 			 T2z = LD(&(x[WS(rs, 27)]), ms, &(x[WS(rs, 1)]));
1427 			 T2A = BYTW(&(W[TWVL * 52]), T2z);
1428 			 T2B = VSUB(T2y, T2A);
1429 			 T69 = VADD(T2y, T2A);
1430 		    }
1431 		    {
1432 			 V T2w, T2H, T7o, T7p;
1433 			 T2w = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2q));
1434 			 T2H = VFMA(LDK(KP923879532), T2B, VMUL(LDK(KP382683432), T2G));
1435 			 T2I = VSUB(T2w, T2H);
1436 			 T4i = VADD(T2w, T2H);
1437 			 T7o = VADD(T66, T67);
1438 			 T7p = VADD(T69, T6a);
1439 			 T7q = VSUB(T7o, T7p);
1440 			 T82 = VADD(T7o, T7p);
1441 		    }
1442 		    {
1443 			 V T38, T39, T68, T6b;
1444 			 T38 = VFMA(LDK(KP382683432), T2q, VMUL(LDK(KP923879532), T2v));
1445 			 T39 = VFNMS(LDK(KP382683432), T2B, VMUL(LDK(KP923879532), T2G));
1446 			 T3a = VSUB(T38, T39);
1447 			 T4l = VADD(T38, T39);
1448 			 T68 = VSUB(T66, T67);
1449 			 T6b = VSUB(T69, T6a);
1450 			 T6c = VMUL(LDK(KP707106781), VSUB(T68, T6b));
1451 			 T6k = VMUL(LDK(KP707106781), VADD(T68, T6b));
1452 		    }
1453 	       }
1454 	       {
1455 		    V T7s, T7R, T7M, T7U, T7D, T7T, T7J, T7Q;
1456 		    {
1457 			 V T7k, T7r, T7K, T7L;
1458 			 T7k = VFNMS(LDK(KP382683432), T7j, VMUL(LDK(KP923879532), T7g));
1459 			 T7r = VFMA(LDK(KP923879532), T7n, VMUL(LDK(KP382683432), T7q));
1460 			 T7s = VSUB(T7k, T7r);
1461 			 T7R = VADD(T7k, T7r);
1462 			 T7K = VFMA(LDK(KP382683432), T7g, VMUL(LDK(KP923879532), T7j));
1463 			 T7L = VFNMS(LDK(KP382683432), T7n, VMUL(LDK(KP923879532), T7q));
1464 			 T7M = VSUB(T7K, T7L);
1465 			 T7U = VADD(T7K, T7L);
1466 		    }
1467 		    {
1468 			 V T7z, T7C, T7H, T7I;
1469 			 T7z = VMUL(LDK(KP707106781), VSUB(T7v, T7y));
1470 			 T7C = VSUB(T7A, T7B);
1471 			 T7D = VSUB(T7z, T7C);
1472 			 T7T = VADD(T7C, T7z);
1473 			 T7H = VSUB(T7F, T7G);
1474 			 T7I = VMUL(LDK(KP707106781), VADD(T7v, T7y));
1475 			 T7J = VSUB(T7H, T7I);
1476 			 T7Q = VADD(T7H, T7I);
1477 		    }
1478 		    {
1479 			 V T7E, T7N, T7W, T7X;
1480 			 T7E = VBYI(VSUB(T7s, T7D));
1481 			 T7N = VSUB(T7J, T7M);
1482 			 ST(&(x[WS(rs, 20)]), VADD(T7E, T7N), ms, &(x[0]));
1483 			 ST(&(x[WS(rs, 44)]), VSUB(T7N, T7E), ms, &(x[0]));
1484 			 T7W = VSUB(T7Q, T7R);
1485 			 T7X = VBYI(VSUB(T7U, T7T));
1486 			 ST(&(x[WS(rs, 36)]), VSUB(T7W, T7X), ms, &(x[0]));
1487 			 ST(&(x[WS(rs, 28)]), VADD(T7W, T7X), ms, &(x[0]));
1488 		    }
1489 		    {
1490 			 V T7O, T7P, T7S, T7V;
1491 			 T7O = VBYI(VADD(T7D, T7s));
1492 			 T7P = VADD(T7J, T7M);
1493 			 ST(&(x[WS(rs, 12)]), VADD(T7O, T7P), ms, &(x[0]));
1494 			 ST(&(x[WS(rs, 52)]), VSUB(T7P, T7O), ms, &(x[0]));
1495 			 T7S = VADD(T7Q, T7R);
1496 			 T7V = VBYI(VADD(T7T, T7U));
1497 			 ST(&(x[WS(rs, 60)]), VSUB(T7S, T7V), ms, &(x[0]));
1498 			 ST(&(x[WS(rs, 4)]), VADD(T7S, T7V), ms, &(x[0]));
1499 		    }
1500 	       }
1501 	       {
1502 		    V T84, T8c, T8l, T8n, T87, T8h, T8b, T8g, T8i, T8m;
1503 		    {
1504 			 V T80, T83, T8j, T8k;
1505 			 T80 = VSUB(T7Y, T7Z);
1506 			 T83 = VSUB(T81, T82);
1507 			 T84 = VMUL(LDK(KP707106781), VSUB(T80, T83));
1508 			 T8c = VMUL(LDK(KP707106781), VADD(T80, T83));
1509 			 T8j = VADD(T7Y, T7Z);
1510 			 T8k = VADD(T81, T82);
1511 			 T8l = VBYI(VSUB(T8j, T8k));
1512 			 T8n = VADD(T8j, T8k);
1513 		    }
1514 		    {
1515 			 V T85, T86, T89, T8a;
1516 			 T85 = VADD(T7t, T7u);
1517 			 T86 = VADD(T7w, T7x);
1518 			 T87 = VSUB(T85, T86);
1519 			 T8h = VADD(T85, T86);
1520 			 T89 = VADD(T7F, T7G);
1521 			 T8a = VADD(T7A, T7B);
1522 			 T8b = VSUB(T89, T8a);
1523 			 T8g = VADD(T89, T8a);
1524 		    }
1525 		    T8i = VSUB(T8g, T8h);
1526 		    ST(&(x[WS(rs, 48)]), VSUB(T8i, T8l), ms, &(x[0]));
1527 		    ST(&(x[WS(rs, 16)]), VADD(T8i, T8l), ms, &(x[0]));
1528 		    T8m = VADD(T8g, T8h);
1529 		    ST(&(x[WS(rs, 32)]), VSUB(T8m, T8n), ms, &(x[0]));
1530 		    ST(&(x[0]), VADD(T8m, T8n), ms, &(x[0]));
1531 		    {
1532 			 V T88, T8d, T8e, T8f;
1533 			 T88 = VBYI(VSUB(T84, T87));
1534 			 T8d = VSUB(T8b, T8c);
1535 			 ST(&(x[WS(rs, 24)]), VADD(T88, T8d), ms, &(x[0]));
1536 			 ST(&(x[WS(rs, 40)]), VSUB(T8d, T88), ms, &(x[0]));
1537 			 T8e = VBYI(VADD(T87, T84));
1538 			 T8f = VADD(T8b, T8c);
1539 			 ST(&(x[WS(rs, 8)]), VADD(T8e, T8f), ms, &(x[0]));
1540 			 ST(&(x[WS(rs, 56)]), VSUB(T8f, T8e), ms, &(x[0]));
1541 		    }
1542 	       }
1543 	       {
1544 		    V T5O, T6H, T6x, T6F, T6n, T6I, T6A, T6E;
1545 		    {
1546 			 V T5y, T5N, T6r, T6w;
1547 			 T5y = VSUB(T5q, T5x);
1548 			 T5N = VSUB(T5F, T5M);
1549 			 T5O = VSUB(T5y, T5N);
1550 			 T6H = VADD(T5y, T5N);
1551 			 T6r = VSUB(T6p, T6q);
1552 			 T6w = VSUB(T6s, T6v);
1553 			 T6x = VSUB(T6r, T6w);
1554 			 T6F = VADD(T6w, T6r);
1555 			 {
1556 			      V T65, T6y, T6m, T6z;
1557 			      {
1558 				   V T5Z, T64, T6g, T6l;
1559 				   T5Z = VSUB(T5V, T5Y);
1560 				   T64 = VSUB(T62, T63);
1561 				   T65 = VFMA(LDK(KP831469612), T5Z, VMUL(LDK(KP555570233), T64));
1562 				   T6y = VFNMS(LDK(KP555570233), T5Z, VMUL(LDK(KP831469612), T64));
1563 				   T6g = VSUB(T6c, T6f);
1564 				   T6l = VSUB(T6j, T6k);
1565 				   T6m = VFNMS(LDK(KP555570233), T6l, VMUL(LDK(KP831469612), T6g));
1566 				   T6z = VFMA(LDK(KP555570233), T6g, VMUL(LDK(KP831469612), T6l));
1567 			      }
1568 			      T6n = VSUB(T65, T6m);
1569 			      T6I = VADD(T6y, T6z);
1570 			      T6A = VSUB(T6y, T6z);
1571 			      T6E = VADD(T65, T6m);
1572 			 }
1573 		    }
1574 		    {
1575 			 V T6o, T6B, T6K, T6L;
1576 			 T6o = VADD(T5O, T6n);
1577 			 T6B = VBYI(VADD(T6x, T6A));
1578 			 ST(&(x[WS(rs, 54)]), VSUB(T6o, T6B), ms, &(x[0]));
1579 			 ST(&(x[WS(rs, 10)]), VADD(T6o, T6B), ms, &(x[0]));
1580 			 T6K = VBYI(VADD(T6F, T6E));
1581 			 T6L = VADD(T6H, T6I);
1582 			 ST(&(x[WS(rs, 6)]), VADD(T6K, T6L), ms, &(x[0]));
1583 			 ST(&(x[WS(rs, 58)]), VSUB(T6L, T6K), ms, &(x[0]));
1584 		    }
1585 		    {
1586 			 V T6C, T6D, T6G, T6J;
1587 			 T6C = VSUB(T5O, T6n);
1588 			 T6D = VBYI(VSUB(T6A, T6x));
1589 			 ST(&(x[WS(rs, 42)]), VSUB(T6C, T6D), ms, &(x[0]));
1590 			 ST(&(x[WS(rs, 22)]), VADD(T6C, T6D), ms, &(x[0]));
1591 			 T6G = VBYI(VSUB(T6E, T6F));
1592 			 T6J = VSUB(T6H, T6I);
1593 			 ST(&(x[WS(rs, 26)]), VADD(T6G, T6J), ms, &(x[0]));
1594 			 ST(&(x[WS(rs, 38)]), VSUB(T6J, T6G), ms, &(x[0]));
1595 		    }
1596 	       }
1597 	       {
1598 		    V T6O, T79, T6Z, T77, T6V, T7a, T72, T76;
1599 		    {
1600 			 V T6M, T6N, T6X, T6Y;
1601 			 T6M = VADD(T5q, T5x);
1602 			 T6N = VADD(T6p, T6q);
1603 			 T6O = VSUB(T6M, T6N);
1604 			 T79 = VADD(T6M, T6N);
1605 			 T6X = VADD(T5F, T5M);
1606 			 T6Y = VADD(T6v, T6s);
1607 			 T6Z = VSUB(T6X, T6Y);
1608 			 T77 = VADD(T6Y, T6X);
1609 			 {
1610 			      V T6R, T70, T6U, T71;
1611 			      {
1612 				   V T6P, T6Q, T6S, T6T;
1613 				   T6P = VADD(T5Y, T5V);
1614 				   T6Q = VADD(T62, T63);
1615 				   T6R = VFMA(LDK(KP980785280), T6P, VMUL(LDK(KP195090322), T6Q));
1616 				   T70 = VFNMS(LDK(KP195090322), T6P, VMUL(LDK(KP980785280), T6Q));
1617 				   T6S = VADD(T6f, T6c);
1618 				   T6T = VADD(T6j, T6k);
1619 				   T6U = VFNMS(LDK(KP195090322), T6T, VMUL(LDK(KP980785280), T6S));
1620 				   T71 = VFMA(LDK(KP195090322), T6S, VMUL(LDK(KP980785280), T6T));
1621 			      }
1622 			      T6V = VSUB(T6R, T6U);
1623 			      T7a = VADD(T70, T71);
1624 			      T72 = VSUB(T70, T71);
1625 			      T76 = VADD(T6R, T6U);
1626 			 }
1627 		    }
1628 		    {
1629 			 V T6W, T73, T7c, T7d;
1630 			 T6W = VADD(T6O, T6V);
1631 			 T73 = VBYI(VADD(T6Z, T72));
1632 			 ST(&(x[WS(rs, 50)]), VSUB(T6W, T73), ms, &(x[0]));
1633 			 ST(&(x[WS(rs, 14)]), VADD(T6W, T73), ms, &(x[0]));
1634 			 T7c = VBYI(VADD(T77, T76));
1635 			 T7d = VADD(T79, T7a);
1636 			 ST(&(x[WS(rs, 2)]), VADD(T7c, T7d), ms, &(x[0]));
1637 			 ST(&(x[WS(rs, 62)]), VSUB(T7d, T7c), ms, &(x[0]));
1638 		    }
1639 		    {
1640 			 V T74, T75, T78, T7b;
1641 			 T74 = VSUB(T6O, T6V);
1642 			 T75 = VBYI(VSUB(T72, T6Z));
1643 			 ST(&(x[WS(rs, 46)]), VSUB(T74, T75), ms, &(x[0]));
1644 			 ST(&(x[WS(rs, 18)]), VADD(T74, T75), ms, &(x[0]));
1645 			 T78 = VBYI(VSUB(T76, T77));
1646 			 T7b = VSUB(T79, T7a);
1647 			 ST(&(x[WS(rs, 30)]), VADD(T78, T7b), ms, &(x[0]));
1648 			 ST(&(x[WS(rs, 34)]), VSUB(T7b, T78), ms, &(x[0]));
1649 		    }
1650 	       }
1651 	       {
1652 		    V T4z, T5g, T4R, T59, T4H, T5j, T4O, T55, T4o, T4S, T4K, T4P, T52, T5k, T5c;
1653 		    V T5h;
1654 		    {
1655 			 V T4y, T57, T4v, T58, T4r, T4u;
1656 			 T4y = VADD(T4w, T4x);
1657 			 T57 = VSUB(T4B, T4C);
1658 			 T4r = VFMA(LDK(KP980785280), T4p, VMUL(LDK(KP195090322), T4q));
1659 			 T4u = VFNMS(LDK(KP195090322), T4t, VMUL(LDK(KP980785280), T4s));
1660 			 T4v = VADD(T4r, T4u);
1661 			 T58 = VSUB(T4r, T4u);
1662 			 T4z = VSUB(T4v, T4y);
1663 			 T5g = VADD(T57, T58);
1664 			 T4R = VADD(T4y, T4v);
1665 			 T59 = VSUB(T57, T58);
1666 		    }
1667 		    {
1668 			 V T4D, T54, T4G, T53, T4E, T4F;
1669 			 T4D = VADD(T4B, T4C);
1670 			 T54 = VSUB(T4x, T4w);
1671 			 T4E = VFNMS(LDK(KP195090322), T4p, VMUL(LDK(KP980785280), T4q));
1672 			 T4F = VFMA(LDK(KP195090322), T4s, VMUL(LDK(KP980785280), T4t));
1673 			 T4G = VADD(T4E, T4F);
1674 			 T53 = VSUB(T4E, T4F);
1675 			 T4H = VSUB(T4D, T4G);
1676 			 T5j = VADD(T54, T53);
1677 			 T4O = VADD(T4D, T4G);
1678 			 T55 = VSUB(T53, T54);
1679 		    }
1680 		    {
1681 			 V T4g, T4I, T4n, T4J;
1682 			 {
1683 			      V T4c, T4f, T4j, T4m;
1684 			      T4c = VADD(T4a, T4b);
1685 			      T4f = VADD(T4d, T4e);
1686 			      T4g = VFNMS(LDK(KP098017140), T4f, VMUL(LDK(KP995184726), T4c));
1687 			      T4I = VFMA(LDK(KP098017140), T4c, VMUL(LDK(KP995184726), T4f));
1688 			      T4j = VADD(T4h, T4i);
1689 			      T4m = VADD(T4k, T4l);
1690 			      T4n = VFMA(LDK(KP995184726), T4j, VMUL(LDK(KP098017140), T4m));
1691 			      T4J = VFNMS(LDK(KP098017140), T4j, VMUL(LDK(KP995184726), T4m));
1692 			 }
1693 			 T4o = VSUB(T4g, T4n);
1694 			 T4S = VADD(T4I, T4J);
1695 			 T4K = VSUB(T4I, T4J);
1696 			 T4P = VADD(T4g, T4n);
1697 		    }
1698 		    {
1699 			 V T4Y, T5a, T51, T5b;
1700 			 {
1701 			      V T4W, T4X, T4Z, T50;
1702 			      T4W = VSUB(T4a, T4b);
1703 			      T4X = VSUB(T4e, T4d);
1704 			      T4Y = VFNMS(LDK(KP634393284), T4X, VMUL(LDK(KP773010453), T4W));
1705 			      T5a = VFMA(LDK(KP634393284), T4W, VMUL(LDK(KP773010453), T4X));
1706 			      T4Z = VSUB(T4h, T4i);
1707 			      T50 = VSUB(T4l, T4k);
1708 			      T51 = VFMA(LDK(KP773010453), T4Z, VMUL(LDK(KP634393284), T50));
1709 			      T5b = VFNMS(LDK(KP634393284), T4Z, VMUL(LDK(KP773010453), T50));
1710 			 }
1711 			 T52 = VSUB(T4Y, T51);
1712 			 T5k = VADD(T5a, T5b);
1713 			 T5c = VSUB(T5a, T5b);
1714 			 T5h = VADD(T4Y, T51);
1715 		    }
1716 		    {
1717 			 V T4A, T4L, T5i, T5l;
1718 			 T4A = VBYI(VSUB(T4o, T4z));
1719 			 T4L = VSUB(T4H, T4K);
1720 			 ST(&(x[WS(rs, 17)]), VADD(T4A, T4L), ms, &(x[WS(rs, 1)]));
1721 			 ST(&(x[WS(rs, 47)]), VSUB(T4L, T4A), ms, &(x[WS(rs, 1)]));
1722 			 T5i = VADD(T5g, T5h);
1723 			 T5l = VBYI(VADD(T5j, T5k));
1724 			 ST(&(x[WS(rs, 57)]), VSUB(T5i, T5l), ms, &(x[WS(rs, 1)]));
1725 			 ST(&(x[WS(rs, 7)]), VADD(T5i, T5l), ms, &(x[WS(rs, 1)]));
1726 		    }
1727 		    {
1728 			 V T5m, T5n, T4M, T4N;
1729 			 T5m = VSUB(T5g, T5h);
1730 			 T5n = VBYI(VSUB(T5k, T5j));
1731 			 ST(&(x[WS(rs, 39)]), VSUB(T5m, T5n), ms, &(x[WS(rs, 1)]));
1732 			 ST(&(x[WS(rs, 25)]), VADD(T5m, T5n), ms, &(x[WS(rs, 1)]));
1733 			 T4M = VBYI(VADD(T4z, T4o));
1734 			 T4N = VADD(T4H, T4K);
1735 			 ST(&(x[WS(rs, 15)]), VADD(T4M, T4N), ms, &(x[WS(rs, 1)]));
1736 			 ST(&(x[WS(rs, 49)]), VSUB(T4N, T4M), ms, &(x[WS(rs, 1)]));
1737 		    }
1738 		    {
1739 			 V T4Q, T4T, T56, T5d;
1740 			 T4Q = VADD(T4O, T4P);
1741 			 T4T = VBYI(VADD(T4R, T4S));
1742 			 ST(&(x[WS(rs, 63)]), VSUB(T4Q, T4T), ms, &(x[WS(rs, 1)]));
1743 			 ST(&(x[WS(rs, 1)]), VADD(T4Q, T4T), ms, &(x[WS(rs, 1)]));
1744 			 T56 = VBYI(VSUB(T52, T55));
1745 			 T5d = VSUB(T59, T5c);
1746 			 ST(&(x[WS(rs, 23)]), VADD(T56, T5d), ms, &(x[WS(rs, 1)]));
1747 			 ST(&(x[WS(rs, 41)]), VSUB(T5d, T56), ms, &(x[WS(rs, 1)]));
1748 		    }
1749 		    {
1750 			 V T5e, T5f, T4U, T4V;
1751 			 T5e = VBYI(VADD(T55, T52));
1752 			 T5f = VADD(T59, T5c);
1753 			 ST(&(x[WS(rs, 9)]), VADD(T5e, T5f), ms, &(x[WS(rs, 1)]));
1754 			 ST(&(x[WS(rs, 55)]), VSUB(T5f, T5e), ms, &(x[WS(rs, 1)]));
1755 			 T4U = VSUB(T4O, T4P);
1756 			 T4V = VBYI(VSUB(T4S, T4R));
1757 			 ST(&(x[WS(rs, 33)]), VSUB(T4U, T4V), ms, &(x[WS(rs, 1)]));
1758 			 ST(&(x[WS(rs, 31)]), VADD(T4U, T4V), ms, &(x[WS(rs, 1)]));
1759 		    }
1760 	       }
1761 	       {
1762 		    V T1u, T43, T3D, T3V, T3t, T45, T3B, T3K, T3d, T3E, T3w, T3A, T3R, T46, T3Y;
1763 		    V T42;
1764 		    {
1765 			 V TE, T3U, T1t, T3T, T13, T1s;
1766 			 TE = VSUB(Tg, TD);
1767 			 T3U = VADD(T3r, T3k);
1768 			 T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
1769 			 T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
1770 			 T1t = VSUB(T13, T1s);
1771 			 T3T = VADD(T13, T1s);
1772 			 T1u = VSUB(TE, T1t);
1773 			 T43 = VADD(T3U, T3T);
1774 			 T3D = VADD(TE, T1t);
1775 			 T3V = VSUB(T3T, T3U);
1776 		    }
1777 		    {
1778 			 V T3s, T3I, T3h, T3J, T3f, T3g;
1779 			 T3s = VSUB(T3k, T3r);
1780 			 T3I = VADD(Tg, TD);
1781 			 T3f = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
1782 			 T3g = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
1783 			 T3h = VSUB(T3f, T3g);
1784 			 T3J = VADD(T3f, T3g);
1785 			 T3t = VSUB(T3h, T3s);
1786 			 T45 = VADD(T3I, T3J);
1787 			 T3B = VADD(T3s, T3h);
1788 			 T3K = VSUB(T3I, T3J);
1789 		    }
1790 		    {
1791 			 V T2l, T3u, T3c, T3v;
1792 			 {
1793 			      V T29, T2k, T30, T3b;
1794 			      T29 = VSUB(T1R, T28);
1795 			      T2k = VSUB(T2g, T2j);
1796 			      T2l = VFMA(LDK(KP881921264), T29, VMUL(LDK(KP471396736), T2k));
1797 			      T3u = VFNMS(LDK(KP471396736), T29, VMUL(LDK(KP881921264), T2k));
1798 			      T30 = VSUB(T2I, T2Z);
1799 			      T3b = VSUB(T37, T3a);
1800 			      T3c = VFNMS(LDK(KP471396736), T3b, VMUL(LDK(KP881921264), T30));
1801 			      T3v = VFMA(LDK(KP471396736), T30, VMUL(LDK(KP881921264), T3b));
1802 			 }
1803 			 T3d = VSUB(T2l, T3c);
1804 			 T3E = VADD(T3u, T3v);
1805 			 T3w = VSUB(T3u, T3v);
1806 			 T3A = VADD(T2l, T3c);
1807 		    }
1808 		    {
1809 			 V T3N, T3W, T3Q, T3X;
1810 			 {
1811 			      V T3L, T3M, T3O, T3P;
1812 			      T3L = VADD(T28, T1R);
1813 			      T3M = VADD(T2g, T2j);
1814 			      T3N = VFMA(LDK(KP956940335), T3L, VMUL(LDK(KP290284677), T3M));
1815 			      T3W = VFNMS(LDK(KP290284677), T3L, VMUL(LDK(KP956940335), T3M));
1816 			      T3O = VADD(T2Z, T2I);
1817 			      T3P = VADD(T37, T3a);
1818 			      T3Q = VFNMS(LDK(KP290284677), T3P, VMUL(LDK(KP956940335), T3O));
1819 			      T3X = VFMA(LDK(KP290284677), T3O, VMUL(LDK(KP956940335), T3P));
1820 			 }
1821 			 T3R = VSUB(T3N, T3Q);
1822 			 T46 = VADD(T3W, T3X);
1823 			 T3Y = VSUB(T3W, T3X);
1824 			 T42 = VADD(T3N, T3Q);
1825 		    }
1826 		    {
1827 			 V T3e, T3x, T44, T47;
1828 			 T3e = VADD(T1u, T3d);
1829 			 T3x = VBYI(VADD(T3t, T3w));
1830 			 ST(&(x[WS(rs, 53)]), VSUB(T3e, T3x), ms, &(x[WS(rs, 1)]));
1831 			 ST(&(x[WS(rs, 11)]), VADD(T3e, T3x), ms, &(x[WS(rs, 1)]));
1832 			 T44 = VBYI(VSUB(T42, T43));
1833 			 T47 = VSUB(T45, T46);
1834 			 ST(&(x[WS(rs, 29)]), VADD(T44, T47), ms, &(x[WS(rs, 1)]));
1835 			 ST(&(x[WS(rs, 35)]), VSUB(T47, T44), ms, &(x[WS(rs, 1)]));
1836 		    }
1837 		    {
1838 			 V T48, T49, T3y, T3z;
1839 			 T48 = VBYI(VADD(T43, T42));
1840 			 T49 = VADD(T45, T46);
1841 			 ST(&(x[WS(rs, 3)]), VADD(T48, T49), ms, &(x[WS(rs, 1)]));
1842 			 ST(&(x[WS(rs, 61)]), VSUB(T49, T48), ms, &(x[WS(rs, 1)]));
1843 			 T3y = VSUB(T1u, T3d);
1844 			 T3z = VBYI(VSUB(T3w, T3t));
1845 			 ST(&(x[WS(rs, 43)]), VSUB(T3y, T3z), ms, &(x[WS(rs, 1)]));
1846 			 ST(&(x[WS(rs, 21)]), VADD(T3y, T3z), ms, &(x[WS(rs, 1)]));
1847 		    }
1848 		    {
1849 			 V T3C, T3F, T3S, T3Z;
1850 			 T3C = VBYI(VSUB(T3A, T3B));
1851 			 T3F = VSUB(T3D, T3E);
1852 			 ST(&(x[WS(rs, 27)]), VADD(T3C, T3F), ms, &(x[WS(rs, 1)]));
1853 			 ST(&(x[WS(rs, 37)]), VSUB(T3F, T3C), ms, &(x[WS(rs, 1)]));
1854 			 T3S = VADD(T3K, T3R);
1855 			 T3Z = VBYI(VADD(T3V, T3Y));
1856 			 ST(&(x[WS(rs, 51)]), VSUB(T3S, T3Z), ms, &(x[WS(rs, 1)]));
1857 			 ST(&(x[WS(rs, 13)]), VADD(T3S, T3Z), ms, &(x[WS(rs, 1)]));
1858 		    }
1859 		    {
1860 			 V T40, T41, T3G, T3H;
1861 			 T40 = VSUB(T3K, T3R);
1862 			 T41 = VBYI(VSUB(T3Y, T3V));
1863 			 ST(&(x[WS(rs, 45)]), VSUB(T40, T41), ms, &(x[WS(rs, 1)]));
1864 			 ST(&(x[WS(rs, 19)]), VADD(T40, T41), ms, &(x[WS(rs, 1)]));
1865 			 T3G = VBYI(VADD(T3B, T3A));
1866 			 T3H = VADD(T3D, T3E);
1867 			 ST(&(x[WS(rs, 5)]), VADD(T3G, T3H), ms, &(x[WS(rs, 1)]));
1868 			 ST(&(x[WS(rs, 59)]), VSUB(T3H, T3G), ms, &(x[WS(rs, 1)]));
1869 		    }
1870 	       }
1871 	  }
1872      }
1873      VLEAVE();
1874 }
1875 
/*
 * Twiddle-instruction table for the t2bv_64 codelet.
 *
 * The table holds one VTW(0, k) record for every k from 1 through 63, in
 * ascending order, followed by a single closing { TW_NEXT, VL, 0 } record.
 * It is referenced only through the codelet descriptor `desc` defined
 * right after it.
 *
 * NOTE(review): VTW, TW_NEXT and VL are defined outside this file
 * (presumably by the SIMD headers); the table is presumably the recipe
 * used to lay out the W array consumed by t2bv_64 -- confirm against the
 * tw_instr definition before editing by hand.  The file is marked
 * "automatically generated --- DO NOT EDIT", so changes belong in the
 * generator rather than here.
 */
1876 static const tw_instr twinstr[] = {
1877      VTW(0, 1),
1878      VTW(0, 2),
1879      VTW(0, 3),
1880      VTW(0, 4),
1881      VTW(0, 5),
1882      VTW(0, 6),
1883      VTW(0, 7),
1884      VTW(0, 8),
1885      VTW(0, 9),
1886      VTW(0, 10),
1887      VTW(0, 11),
1888      VTW(0, 12),
1889      VTW(0, 13),
1890      VTW(0, 14),
1891      VTW(0, 15),
1892      VTW(0, 16),
1893      VTW(0, 17),
1894      VTW(0, 18),
1895      VTW(0, 19),
1896      VTW(0, 20),
1897      VTW(0, 21),
1898      VTW(0, 22),
1899      VTW(0, 23),
1900      VTW(0, 24),
1901      VTW(0, 25),
1902      VTW(0, 26),
1903      VTW(0, 27),
1904      VTW(0, 28),
1905      VTW(0, 29),
1906      VTW(0, 30),
1907      VTW(0, 31),
1908      VTW(0, 32),
1909      VTW(0, 33),
1910      VTW(0, 34),
1911      VTW(0, 35),
1912      VTW(0, 36),
1913      VTW(0, 37),
1914      VTW(0, 38),
1915      VTW(0, 39),
1916      VTW(0, 40),
1917      VTW(0, 41),
1918      VTW(0, 42),
1919      VTW(0, 43),
1920      VTW(0, 44),
1921      VTW(0, 45),
1922      VTW(0, 46),
1923      VTW(0, 47),
1924      VTW(0, 48),
1925      VTW(0, 49),
1926      VTW(0, 50),
1927      VTW(0, 51),
1928      VTW(0, 52),
1929      VTW(0, 53),
1930      VTW(0, 54),
1931      VTW(0, 55),
1932      VTW(0, 56),
1933      VTW(0, 57),
1934      VTW(0, 58),
1935      VTW(0, 59),
1936      VTW(0, 60),
1937      VTW(0, 61),
1938      VTW(0, 62),
1939      VTW(0, 63),
1940      { TW_NEXT, VL, 0 }
1941 };
1942 
/*
 * Codelet descriptor handed to the planner together with t2bv_64.
 *
 * Positional initializer, in order: the integer 64, the codelet name string
 * built by XSIMD_STRING("t2bv_64"), the twiddle table `twinstr`, the address
 * of GENUS, a group of four integers { 467, 198, 52, 0 }, and three
 * trailing zeros.
 *
 * NOTE(review): ct_desc is declared outside this file, so the field meanings
 * are assumptions to confirm: 64 presumably is the transform size/radix
 * (the generator was invoked with "-n 64"), the four-integer group
 * presumably holds operation counts, and the trailing zeros presumably are
 * stride-related constraints left unconstrained.  The group does not match
 * the 261/126/258 figures quoted in the comment near the top of the file;
 * presumably this descriptor belongs to a different preprocessor branch of
 * the generated file -- confirm.
 */
1943 static const ct_desc desc = { 64, XSIMD_STRING("t2bv_64"), twinstr, &GENUS, { 467, 198, 52, 0 }, 0, 0, 0 };
1944 
/*
 * Registration entry point for this codelet.
 *
 * Passes the planner `p`, the kernel function t2bv_64 and the address of its
 * descriptor `desc` to X(kdft_dit_register).  Unlike the kernel and the
 * tables above, this function is not declared static; its final symbol name
 * is produced by the XSIMD() macro.
 *
 * NOTE(review): what the planner does with the registration is decided by
 * X(kdft_dit_register), which is defined outside this file.
 */
XSIMD(codelet_t2bv_64)1945 void XSIMD(codelet_t2bv_64) (planner *p) {
     /* Hand the kernel and its descriptor to the planner in a single call. */
1946      X(kdft_dit_register) (p, t2bv_64, &desc);
1947 }
1948 #endif
1949