1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:05:34 EST 2020 */
23 
24 #include "dft/codelet-dft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include dft/simd/t2b.h -sign 1 */
29 
30 /*
31  * This function contains 123 FP additions, 88 FP multiplications,
32  * (or, 77 additions, 42 multiplications, 46 fused multiply/add),
33  * 54 stack variables, 4 constants, and 40 memory accesses
34  */
35 #include "dft/simd/t2b.h"
36 
/*
 * t2bv_20, FMA build (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Per the generator command line recorded above (-n 20, -sign 1) this is a
 * generated size-20 twiddle codelet.  Each loop iteration loads the 20
 * values x[WS(rs, 0)] .. x[WS(rs, 19)], passes every one except index 0
 * through BYTW together with an entry of W (input k, k >= 1, uses
 * W[TWVL * 2 * (k - 1)]), combines them with vector add/sub/FMA macros, and
 * stores 20 results back to the same 20 locations.
 *
 * Parameters:
 *   ri      - not referenced anywhere in this body.
 *   ii      - starting address; every load and store goes through x, which
 *             is initialised to ii.
 *   W       - twiddle table, only read.  It is first offset by
 *             mb * ((TWVL / VL) * 38) and then advanced by TWVL * 38 per
 *             iteration.
 *   rs      - stride handed to WS() to locate element k at x[WS(rs, k)].
 *   mb, me  - loop bounds: m starts at mb and steps by VL while m < me.
 *   ms      - x advances by VL * ms per iteration; ms is also passed to
 *             every LD and ST.
 *
 * NOTE(review): the text in front of `static void` on the signature line
 * and the integer at the start of every line look like listing residue
 * rather than C -- confirm against the upstream file.
 */
t2bv_20(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)37 static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
     /* Constants declared through DVK.  Numerically: KP559016994 = sqrt(5)/4,
      * KP618033988 = (sqrt(5) - 1)/2, KP951056516 = sin(2*pi/5),
      * KP250000000 = 1/4. */
39      DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40      DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
41      DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
42      DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
43      {
44 	  INT m;
45 	  R *x;
	  /* Only ii is used as the data pointer; ri is ignored. */
46 	  x = ii;
	  /* W is pre-offset for the starting index mb.  Each pass then moves
	   * x forward by VL * ms and W forward by TWVL * 38.
	   * MAKE_VOLATILE_STRIDE is an opaque macro defined elsewhere. */
47 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
48 	       V T4, TX, T1m, T1K, TF, T14, T15, TQ, Tf, Tq, Tr, T1O, T1P, T1Q, T1w;
49 	       V T1z, T1A, TY, TZ, T10, T1L, T1M, T1N, T1p, T1s, T1t, T1i, T1j;
	       /* Inputs 0, 15, 10, 5.  T4 = in0 - in10 and TX = in5 - in15
	        * (inputs 5, 10, 15 after BYTW); T1m and T1K are the
	        * difference and sum of the two pairwise sums. */
50 	       {
51 		    V T1, TW, T3, TU, TV, T2, TT, T1k, T1l;
52 		    T1 = LD(&(x[0]), ms, &(x[0]));
53 		    TV = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
54 		    TW = BYTW(&(W[TWVL * 28]), TV);
55 		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
56 		    T3 = BYTW(&(W[TWVL * 18]), T2);
57 		    TT = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
58 		    TU = BYTW(&(W[TWVL * 8]), TT);
59 		    T4 = VSUB(T1, T3);
60 		    TX = VSUB(TU, TW);
61 		    T1k = VADD(T1, T3);
62 		    T1l = VADD(TU, TW);
63 		    T1m = VSUB(T1k, T1l);
64 		    T1K = VADD(T1k, T1l);
65 	       }
	       /* Remaining 16 inputs, handled as 8 pairs whose indices differ
	        * by 10.  Both members of a pair go through BYTW and the pair is
	        * reduced to one difference and one sum.  The third LD argument
	        * is &(x[0]) for even indices and &(x[WS(rs, 1)]) for odd ones;
	        * its meaning is defined by the LD macro elsewhere. */
66 	       {
67 		    V T9, T1n, TK, T1v, TP, T1y, Te, T1q, Tk, T1u, Tz, T1o, TE, T1r, Tp;
68 		    V T1x;
69 		    {
70 			 V T6, T8, T5, T7;
71 			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
72 			 T6 = BYTW(&(W[TWVL * 6]), T5);
73 			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
74 			 T8 = BYTW(&(W[TWVL * 26]), T7);
75 			 T9 = VSUB(T6, T8);
76 			 T1n = VADD(T6, T8);
77 		    }
78 		    {
79 			 V TH, TJ, TG, TI;
80 			 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
81 			 TH = BYTW(&(W[TWVL * 24]), TG);
82 			 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
83 			 TJ = BYTW(&(W[TWVL * 4]), TI);
84 			 TK = VSUB(TH, TJ);
85 			 T1v = VADD(TH, TJ);
86 		    }
87 		    {
88 			 V TM, TO, TL, TN;
89 			 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
90 			 TM = BYTW(&(W[TWVL * 32]), TL);
91 			 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
92 			 TO = BYTW(&(W[TWVL * 12]), TN);
93 			 TP = VSUB(TM, TO);
94 			 T1y = VADD(TM, TO);
95 		    }
96 		    {
97 			 V Tb, Td, Ta, Tc;
98 			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
99 			 Tb = BYTW(&(W[TWVL * 30]), Ta);
100 			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
101 			 Td = BYTW(&(W[TWVL * 10]), Tc);
102 			 Te = VSUB(Tb, Td);
103 			 T1q = VADD(Tb, Td);
104 		    }
105 		    {
106 			 V Th, Tj, Tg, Ti;
107 			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
108 			 Th = BYTW(&(W[TWVL * 14]), Tg);
109 			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
110 			 Tj = BYTW(&(W[TWVL * 34]), Ti);
111 			 Tk = VSUB(Th, Tj);
112 			 T1u = VADD(Th, Tj);
113 		    }
114 		    {
115 			 V Tw, Ty, Tv, Tx;
116 			 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
117 			 Tw = BYTW(&(W[TWVL * 16]), Tv);
118 			 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
119 			 Ty = BYTW(&(W[TWVL * 36]), Tx);
120 			 Tz = VSUB(Tw, Ty);
121 			 T1o = VADD(Tw, Ty);
122 		    }
123 		    {
124 			 V TB, TD, TA, TC;
125 			 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
126 			 TB = BYTW(&(W[0]), TA);
127 			 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
128 			 TD = BYTW(&(W[TWVL * 20]), TC);
129 			 TE = VSUB(TB, TD);
130 			 T1r = VADD(TB, TD);
131 		    }
132 		    {
133 			 V Tm, To, Tl, Tn;
134 			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
135 			 Tm = BYTW(&(W[TWVL * 22]), Tl);
136 			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
137 			 To = BYTW(&(W[TWVL * 2]), Tn);
138 			 Tp = VSUB(Tm, To);
139 			 T1x = VADD(Tm, To);
140 		    }
		    /* Second-level sums and differences of the eight pair
		     * results; these are the values the output stages use. */
141 		    TF = VSUB(Tz, TE);
142 		    T14 = VSUB(T9, Te);
143 		    T15 = VSUB(Tk, Tp);
144 		    TQ = VSUB(TK, TP);
145 		    Tf = VADD(T9, Te);
146 		    Tq = VADD(Tk, Tp);
147 		    Tr = VADD(Tf, Tq);
148 		    T1O = VADD(T1u, T1v);
149 		    T1P = VADD(T1x, T1y);
150 		    T1Q = VADD(T1O, T1P);
151 		    T1w = VSUB(T1u, T1v);
152 		    T1z = VSUB(T1x, T1y);
153 		    T1A = VADD(T1w, T1z);
154 		    TY = VADD(Tz, TE);
155 		    TZ = VADD(TK, TP);
156 		    T10 = VADD(TY, TZ);
157 		    T1L = VADD(T1n, T1o);
158 		    T1M = VADD(T1q, T1r);
159 		    T1N = VADD(T1L, T1M);
160 		    T1p = VSUB(T1n, T1o);
161 		    T1s = VSUB(T1q, T1r);
162 		    T1t = VADD(T1p, T1s);
163 	       }
	       /* Outputs 15 and 5, formed from T1i and T1j with VFNMSI / VFMAI
	        * (opaque macros; presumably T1i -/+ i*T1j -- confirm). */
164 	       T1i = VADD(T4, Tr);
165 	       T1j = VADD(TX, T10);
166 	       ST(&(x[WS(rs, 15)]), VFNMSI(T1j, T1i), ms, &(x[WS(rs, 1)]));
167 	       ST(&(x[WS(rs, 5)]), VFMAI(T1j, T1i), ms, &(x[WS(rs, 1)]));
	       /* Outputs 0, 8, 12, 4, 16, built from the pair sums
	        * (T1K, T1L..T1Q). */
168 	       {
169 		    V T1T, T1R, T1S, T1X, T1Z, T1V, T1W, T1Y, T1U;
170 		    T1T = VSUB(T1N, T1Q);
171 		    T1R = VADD(T1N, T1Q);
172 		    T1S = VFNMS(LDK(KP250000000), T1R, T1K);
173 		    T1V = VSUB(T1L, T1M);
174 		    T1W = VSUB(T1O, T1P);
175 		    T1X = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1W, T1V));
176 		    T1Z = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1V, T1W));
177 		    ST(&(x[0]), VADD(T1K, T1R), ms, &(x[0]));
178 		    T1Y = VFNMS(LDK(KP559016994), T1T, T1S);
179 		    ST(&(x[WS(rs, 8)]), VFMAI(T1Z, T1Y), ms, &(x[0]));
180 		    ST(&(x[WS(rs, 12)]), VFNMSI(T1Z, T1Y), ms, &(x[0]));
181 		    T1U = VFMA(LDK(KP559016994), T1T, T1S);
182 		    ST(&(x[WS(rs, 4)]), VFNMSI(T1X, T1U), ms, &(x[0]));
183 		    ST(&(x[WS(rs, 16)]), VFMAI(T1X, T1U), ms, &(x[0]));
184 	       }
	       /* Outputs 10, 6, 14, 2, 18, built from T1m and the differences
	        * of the pair sums (T1p, T1s, T1w, T1z and their sums). */
185 	       {
186 		    V T1D, T1B, T1C, T1H, T1J, T1F, T1G, T1I, T1E;
187 		    T1D = VSUB(T1t, T1A);
188 		    T1B = VADD(T1t, T1A);
189 		    T1C = VFNMS(LDK(KP250000000), T1B, T1m);
190 		    T1F = VSUB(T1w, T1z);
191 		    T1G = VSUB(T1p, T1s);
192 		    T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
193 		    T1J = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
194 		    ST(&(x[WS(rs, 10)]), VADD(T1m, T1B), ms, &(x[0]));
195 		    T1I = VFMA(LDK(KP559016994), T1D, T1C);
196 		    ST(&(x[WS(rs, 6)]), VFMAI(T1J, T1I), ms, &(x[0]));
197 		    ST(&(x[WS(rs, 14)]), VFNMSI(T1J, T1I), ms, &(x[0]));
198 		    T1E = VFNMS(LDK(KP559016994), T1D, T1C);
199 		    ST(&(x[WS(rs, 2)]), VFNMSI(T1H, T1E), ms, &(x[0]));
200 		    ST(&(x[WS(rs, 18)]), VFMAI(T1H, T1E), ms, &(x[0]));
201 	       }
	       /* Remaining odd outputs 19, 1, 7, 13, 11, 9, 3, 17, built from
	        * the pair differences (T4, TX, Tf, Tq, TY, TZ, TF, TQ, T14,
	        * T15). */
202 	       {
203 		    V TR, T16, T1e, T1b, T13, T1d, Tu, T1a;
204 		    TR = VFMA(LDK(KP618033988), TQ, TF);
205 		    T16 = VFMA(LDK(KP618033988), T15, T14);
206 		    T1e = VFNMS(LDK(KP618033988), T14, T15);
207 		    T1b = VFNMS(LDK(KP618033988), TF, TQ);
208 		    {
209 			 V T11, T12, Ts, Tt;
210 			 T11 = VFNMS(LDK(KP250000000), T10, TX);
211 			 T12 = VSUB(TY, TZ);
212 			 T13 = VFMA(LDK(KP559016994), T12, T11);
213 			 T1d = VFNMS(LDK(KP559016994), T12, T11);
214 			 Ts = VFNMS(LDK(KP250000000), Tr, T4);
215 			 Tt = VSUB(Tf, Tq);
216 			 Tu = VFMA(LDK(KP559016994), Tt, Ts);
217 			 T1a = VFNMS(LDK(KP559016994), Tt, Ts);
218 		    }
219 		    {
220 			 V TS, T17, T1g, T1h;
221 			 TS = VFNMS(LDK(KP951056516), TR, Tu);
222 			 T17 = VFMA(LDK(KP951056516), T16, T13);
223 			 ST(&(x[WS(rs, 19)]), VFNMSI(T17, TS), ms, &(x[WS(rs, 1)]));
224 			 ST(&(x[WS(rs, 1)]), VFMAI(T17, TS), ms, &(x[WS(rs, 1)]));
225 			 T1g = VFNMS(LDK(KP951056516), T1b, T1a);
226 			 T1h = VFMA(LDK(KP951056516), T1e, T1d);
227 			 ST(&(x[WS(rs, 7)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
228 			 ST(&(x[WS(rs, 13)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
229 		    }
230 		    {
231 			 V T18, T19, T1c, T1f;
232 			 T18 = VFMA(LDK(KP951056516), TR, Tu);
233 			 T19 = VFNMS(LDK(KP951056516), T16, T13);
234 			 ST(&(x[WS(rs, 11)]), VFNMSI(T19, T18), ms, &(x[WS(rs, 1)]));
235 			 ST(&(x[WS(rs, 9)]), VFMAI(T19, T18), ms, &(x[WS(rs, 1)]));
236 			 T1c = VFMA(LDK(KP951056516), T1b, T1a);
237 			 T1f = VFNMS(LDK(KP951056516), T1e, T1d);
238 			 ST(&(x[WS(rs, 3)]), VFNMSI(T1f, T1c), ms, &(x[WS(rs, 1)]));
239 			 ST(&(x[WS(rs, 17)]), VFMAI(T1f, T1c), ms, &(x[WS(rs, 1)]));
240 		    }
241 	       }
242 	  }
243      }
     /* Opaque macro defined elsewhere; invoked once after the loop. */
244      VLEAVE();
245 }
246 
/*
 * Twiddle instruction table for the FMA build: nineteen VTW(0, k) entries,
 * k = 1 .. 19 in ascending order, closed by a { TW_NEXT, VL, 0 } entry.
 * It is referenced only from the descriptor `desc` below.
 * NOTE(review): presumably this tells the planner which twiddle factors to
 * lay out in the W array consumed by t2bv_20 (19 entries here vs. the
 * 2 * 19 = 38 stride used there) -- VTW and TW_NEXT are defined elsewhere,
 * confirm.
 */
247 static const tw_instr twinstr[] = {
248      VTW(0, 1),
249      VTW(0, 2),
250      VTW(0, 3),
251      VTW(0, 4),
252      VTW(0, 5),
253      VTW(0, 6),
254      VTW(0, 7),
255      VTW(0, 8),
256      VTW(0, 9),
257      VTW(0, 10),
258      VTW(0, 11),
259      VTW(0, 12),
260      VTW(0, 13),
261      VTW(0, 14),
262      VTW(0, 15),
263      VTW(0, 16),
264      VTW(0, 17),
265      VTW(0, 18),
266      VTW(0, 19),
267      { TW_NEXT, VL, 0 }
268 };
269 
/*
 * Codelet descriptor for the FMA build, handed to the registration call
 * below.  Positional fields as written: 20, the name string "t2bv_20",
 * the twinstr table, &GENUS, and the counts { 77, 42, 46, 0 }, whose first
 * three values match the "77 additions, 42 multiplications, 46 fused
 * multiply/add" statistics in the generator comment above.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared elsewhere.
 */
270 static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, { 77, 42, 46, 0 }, 0, 0, 0 };
271 
/*
 * Registration entry point (FMA build): passes the planner p, the kernel
 * t2bv_20 and its descriptor desc to X(kdft_dit_register).  Nothing else
 * happens here and nothing is returned.
 * NOTE(review): the text in front of `void` on the first line and the
 * leading integers look like listing residue -- confirm against upstream.
 */
XSIMD(codelet_t2bv_20)272 void XSIMD(codelet_t2bv_20) (planner *p) {
273      X(kdft_dit_register) (p, t2bv_20, &desc);
274 }
275 #else
276 
277 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 20 -name t2bv_20 -include dft/simd/t2b.h -sign 1 */
278 
279 /*
280  * This function contains 123 FP additions, 62 FP multiplications,
281  * (or, 111 additions, 50 multiplications, 12 fused multiply/add),
282  * 54 stack variables, 4 constants, and 40 memory accesses
283  */
284 #include "dft/simd/t2b.h"
285 
/*
 * t2bv_20, non-FMA build (the #else branch, compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Per the generator command line recorded above (-n 20, -sign 1) this is a
 * generated size-20 twiddle codelet.  Each loop iteration loads the 20
 * values x[WS(rs, 0)] .. x[WS(rs, 19)], passes every one except index 0
 * through BYTW together with an entry of W (input k, k >= 1, uses
 * W[TWVL * 2 * (k - 1)]), combines them with vector add/sub/mul macros and
 * VBYI, and stores 20 results back to the same 20 locations.
 *
 * Parameters:
 *   ri      - not referenced anywhere in this body.
 *   ii      - starting address; every load and store goes through x, which
 *             is initialised to ii.
 *   W       - twiddle table, only read.  It is first offset by
 *             mb * ((TWVL / VL) * 38) and then advanced by TWVL * 38 per
 *             iteration.
 *   rs      - stride handed to WS() to locate element k at x[WS(rs, k)].
 *   mb, me  - loop bounds: m starts at mb and steps by VL while m < me.
 *   ms      - x advances by VL * ms per iteration; ms is also passed to
 *             every LD and ST.
 *
 * NOTE(review): the text in front of `static void` on the signature line
 * and the integer at the start of every line look like listing residue
 * rather than C -- confirm against the upstream file.
 */
t2bv_20(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)286 static void t2bv_20(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
287 {
     /* Constants declared through DVK.  Numerically: KP587785252 = sin(pi/5),
      * KP951056516 = sin(2*pi/5), KP250000000 = 1/4,
      * KP559016994 = sqrt(5)/4. */
288      DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
289      DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
290      DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
291      DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
292      {
293 	  INT m;
294 	  R *x;
	  /* Only ii is used as the data pointer; ri is ignored. */
295 	  x = ii;
	  /* W is pre-offset for the starting index mb.  Each pass then moves
	   * x forward by VL * ms and W forward by TWVL * 38.
	   * MAKE_VOLATILE_STRIDE is an opaque macro defined elsewhere. */
296 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 38)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(20, rs)) {
297 	       V T4, T10, T1B, T1R, TF, T14, T15, TQ, Tf, Tq, Tr, T1N, T1O, T1P, T1t;
298 	       V T1w, T1D, TT, TU, T11, T1K, T1L, T1M, T1m, T1p, T1C, T1i, T1j;
	       /* Inputs 0, 15, 10, 5.  T4 = in0 - in10 and T10 = in5 - in15
	        * (inputs 5, 10, 15 after BYTW); T1B and T1R are the
	        * difference and sum of the two pairwise sums. */
299 	       {
300 		    V T1, TZ, T3, TX, TY, T2, TW, T1z, T1A;
301 		    T1 = LD(&(x[0]), ms, &(x[0]));
302 		    TY = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
303 		    TZ = BYTW(&(W[TWVL * 28]), TY);
304 		    T2 = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
305 		    T3 = BYTW(&(W[TWVL * 18]), T2);
306 		    TW = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
307 		    TX = BYTW(&(W[TWVL * 8]), TW);
308 		    T4 = VSUB(T1, T3);
309 		    T10 = VSUB(TX, TZ);
310 		    T1z = VADD(T1, T3);
311 		    T1A = VADD(TX, TZ);
312 		    T1B = VSUB(T1z, T1A);
313 		    T1R = VADD(T1z, T1A);
314 	       }
	       /* Remaining 16 inputs, handled as 8 pairs whose indices differ
	        * by 10.  Both members of a pair go through BYTW and the pair is
	        * reduced to one difference and one sum.  The third LD argument
	        * is &(x[0]) for even indices and &(x[WS(rs, 1)]) for odd ones;
	        * its meaning is defined by the LD macro elsewhere. */
315 	       {
316 		    V T9, T1k, TK, T1s, TP, T1v, Te, T1n, Tk, T1r, Tz, T1l, TE, T1o, Tp;
317 		    V T1u;
318 		    {
319 			 V T6, T8, T5, T7;
320 			 T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
321 			 T6 = BYTW(&(W[TWVL * 6]), T5);
322 			 T7 = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
323 			 T8 = BYTW(&(W[TWVL * 26]), T7);
324 			 T9 = VSUB(T6, T8);
325 			 T1k = VADD(T6, T8);
326 		    }
327 		    {
328 			 V TH, TJ, TG, TI;
329 			 TG = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
330 			 TH = BYTW(&(W[TWVL * 24]), TG);
331 			 TI = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
332 			 TJ = BYTW(&(W[TWVL * 4]), TI);
333 			 TK = VSUB(TH, TJ);
334 			 T1s = VADD(TH, TJ);
335 		    }
336 		    {
337 			 V TM, TO, TL, TN;
338 			 TL = LD(&(x[WS(rs, 17)]), ms, &(x[WS(rs, 1)]));
339 			 TM = BYTW(&(W[TWVL * 32]), TL);
340 			 TN = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
341 			 TO = BYTW(&(W[TWVL * 12]), TN);
342 			 TP = VSUB(TM, TO);
343 			 T1v = VADD(TM, TO);
344 		    }
345 		    {
346 			 V Tb, Td, Ta, Tc;
347 			 Ta = LD(&(x[WS(rs, 16)]), ms, &(x[0]));
348 			 Tb = BYTW(&(W[TWVL * 30]), Ta);
349 			 Tc = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
350 			 Td = BYTW(&(W[TWVL * 10]), Tc);
351 			 Te = VSUB(Tb, Td);
352 			 T1n = VADD(Tb, Td);
353 		    }
354 		    {
355 			 V Th, Tj, Tg, Ti;
356 			 Tg = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
357 			 Th = BYTW(&(W[TWVL * 14]), Tg);
358 			 Ti = LD(&(x[WS(rs, 18)]), ms, &(x[0]));
359 			 Tj = BYTW(&(W[TWVL * 34]), Ti);
360 			 Tk = VSUB(Th, Tj);
361 			 T1r = VADD(Th, Tj);
362 		    }
363 		    {
364 			 V Tw, Ty, Tv, Tx;
365 			 Tv = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
366 			 Tw = BYTW(&(W[TWVL * 16]), Tv);
367 			 Tx = LD(&(x[WS(rs, 19)]), ms, &(x[WS(rs, 1)]));
368 			 Ty = BYTW(&(W[TWVL * 36]), Tx);
369 			 Tz = VSUB(Tw, Ty);
370 			 T1l = VADD(Tw, Ty);
371 		    }
372 		    {
373 			 V TB, TD, TA, TC;
374 			 TA = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
375 			 TB = BYTW(&(W[0]), TA);
376 			 TC = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
377 			 TD = BYTW(&(W[TWVL * 20]), TC);
378 			 TE = VSUB(TB, TD);
379 			 T1o = VADD(TB, TD);
380 		    }
381 		    {
382 			 V Tm, To, Tl, Tn;
383 			 Tl = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
384 			 Tm = BYTW(&(W[TWVL * 22]), Tl);
385 			 Tn = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
386 			 To = BYTW(&(W[TWVL * 2]), Tn);
387 			 Tp = VSUB(Tm, To);
388 			 T1u = VADD(Tm, To);
389 		    }
		    /* Second-level sums and differences of the eight pair
		     * results; these are the values the output stages use. */
390 		    TF = VSUB(Tz, TE);
391 		    T14 = VSUB(T9, Te);
392 		    T15 = VSUB(Tk, Tp);
393 		    TQ = VSUB(TK, TP);
394 		    Tf = VADD(T9, Te);
395 		    Tq = VADD(Tk, Tp);
396 		    Tr = VADD(Tf, Tq);
397 		    T1N = VADD(T1r, T1s);
398 		    T1O = VADD(T1u, T1v);
399 		    T1P = VADD(T1N, T1O);
400 		    T1t = VSUB(T1r, T1s);
401 		    T1w = VSUB(T1u, T1v);
402 		    T1D = VADD(T1t, T1w);
403 		    TT = VADD(Tz, TE);
404 		    TU = VADD(TK, TP);
405 		    T11 = VADD(TT, TU);
406 		    T1K = VADD(T1k, T1l);
407 		    T1L = VADD(T1n, T1o);
408 		    T1M = VADD(T1K, T1L);
409 		    T1m = VSUB(T1k, T1l);
410 		    T1p = VSUB(T1n, T1o);
411 		    T1C = VADD(T1m, T1p);
412 	       }
	       /* Outputs 15 and 5: T1i - T1j and T1i + T1j, where T1j has been
	        * passed through VBYI (opaque macro; presumably multiplication
	        * by i -- confirm). */
413 	       T1i = VADD(T4, Tr);
414 	       T1j = VBYI(VADD(T10, T11));
415 	       ST(&(x[WS(rs, 15)]), VSUB(T1i, T1j), ms, &(x[WS(rs, 1)]));
416 	       ST(&(x[WS(rs, 5)]), VADD(T1i, T1j), ms, &(x[WS(rs, 1)]));
	       /* Outputs 0, 8, 12, 4, 16, built from the pair sums
	        * (T1R, T1K..T1P). */
417 	       {
418 		    V T1Q, T1S, T1T, T1X, T1Z, T1V, T1W, T1Y, T1U;
419 		    T1Q = VMUL(LDK(KP559016994), VSUB(T1M, T1P));
420 		    T1S = VADD(T1M, T1P);
421 		    T1T = VFNMS(LDK(KP250000000), T1S, T1R);
422 		    T1V = VSUB(T1K, T1L);
423 		    T1W = VSUB(T1N, T1O);
424 		    T1X = VBYI(VFMA(LDK(KP951056516), T1V, VMUL(LDK(KP587785252), T1W)));
425 		    T1Z = VBYI(VFNMS(LDK(KP951056516), T1W, VMUL(LDK(KP587785252), T1V)));
426 		    ST(&(x[0]), VADD(T1R, T1S), ms, &(x[0]));
427 		    T1Y = VSUB(T1T, T1Q);
428 		    ST(&(x[WS(rs, 8)]), VSUB(T1Y, T1Z), ms, &(x[0]));
429 		    ST(&(x[WS(rs, 12)]), VADD(T1Z, T1Y), ms, &(x[0]));
430 		    T1U = VADD(T1Q, T1T);
431 		    ST(&(x[WS(rs, 4)]), VSUB(T1U, T1X), ms, &(x[0]));
432 		    ST(&(x[WS(rs, 16)]), VADD(T1X, T1U), ms, &(x[0]));
433 	       }
	       /* Outputs 10, 6, 14, 2, 18, built from T1B and the differences
	        * of the pair sums (T1m, T1p, T1t, T1w and their sums). */
434 	       {
435 		    V T1G, T1E, T1F, T1y, T1I, T1q, T1x, T1J, T1H;
436 		    T1G = VMUL(LDK(KP559016994), VSUB(T1C, T1D));
437 		    T1E = VADD(T1C, T1D);
438 		    T1F = VFNMS(LDK(KP250000000), T1E, T1B);
439 		    T1q = VSUB(T1m, T1p);
440 		    T1x = VSUB(T1t, T1w);
441 		    T1y = VBYI(VFNMS(LDK(KP951056516), T1x, VMUL(LDK(KP587785252), T1q)));
442 		    T1I = VBYI(VFMA(LDK(KP951056516), T1q, VMUL(LDK(KP587785252), T1x)));
443 		    ST(&(x[WS(rs, 10)]), VADD(T1B, T1E), ms, &(x[0]));
444 		    T1J = VADD(T1G, T1F);
445 		    ST(&(x[WS(rs, 6)]), VADD(T1I, T1J), ms, &(x[0]));
446 		    ST(&(x[WS(rs, 14)]), VSUB(T1J, T1I), ms, &(x[0]));
447 		    T1H = VSUB(T1F, T1G);
448 		    ST(&(x[WS(rs, 2)]), VADD(T1y, T1H), ms, &(x[0]));
449 		    ST(&(x[WS(rs, 18)]), VSUB(T1H, T1y), ms, &(x[0]));
450 	       }
	       /* Remaining odd outputs 17, 3, 11, 9, 13, 7, 19, 1, built from
	        * the pair differences (T4, T10, Tf, Tq, TT, TU, TF, TQ, T14,
	        * T15). */
451 	       {
452 		    V TR, T16, T1d, T1b, T13, T1e, Tu, T1a;
453 		    TR = VFNMS(LDK(KP951056516), TQ, VMUL(LDK(KP587785252), TF));
454 		    T16 = VFNMS(LDK(KP951056516), T15, VMUL(LDK(KP587785252), T14));
455 		    T1d = VFMA(LDK(KP951056516), T14, VMUL(LDK(KP587785252), T15));
456 		    T1b = VFMA(LDK(KP951056516), TF, VMUL(LDK(KP587785252), TQ));
457 		    {
458 			 V TV, T12, Ts, Tt;
459 			 TV = VMUL(LDK(KP559016994), VSUB(TT, TU));
460 			 T12 = VFNMS(LDK(KP250000000), T11, T10);
461 			 T13 = VSUB(TV, T12);
462 			 T1e = VADD(TV, T12);
463 			 Ts = VFNMS(LDK(KP250000000), Tr, T4);
464 			 Tt = VMUL(LDK(KP559016994), VSUB(Tf, Tq));
465 			 Tu = VSUB(Ts, Tt);
466 			 T1a = VADD(Tt, Ts);
467 		    }
468 		    {
469 			 V TS, T17, T1g, T1h;
470 			 TS = VSUB(Tu, TR);
471 			 T17 = VBYI(VSUB(T13, T16));
472 			 ST(&(x[WS(rs, 17)]), VSUB(TS, T17), ms, &(x[WS(rs, 1)]));
473 			 ST(&(x[WS(rs, 3)]), VADD(TS, T17), ms, &(x[WS(rs, 1)]));
474 			 T1g = VADD(T1a, T1b);
475 			 T1h = VBYI(VSUB(T1e, T1d));
476 			 ST(&(x[WS(rs, 11)]), VSUB(T1g, T1h), ms, &(x[WS(rs, 1)]));
477 			 ST(&(x[WS(rs, 9)]), VADD(T1g, T1h), ms, &(x[WS(rs, 1)]));
478 		    }
479 		    {
480 			 V T18, T19, T1c, T1f;
481 			 T18 = VADD(Tu, TR);
482 			 T19 = VBYI(VADD(T16, T13));
483 			 ST(&(x[WS(rs, 13)]), VSUB(T18, T19), ms, &(x[WS(rs, 1)]));
484 			 ST(&(x[WS(rs, 7)]), VADD(T18, T19), ms, &(x[WS(rs, 1)]));
485 			 T1c = VSUB(T1a, T1b);
486 			 T1f = VBYI(VADD(T1d, T1e));
487 			 ST(&(x[WS(rs, 19)]), VSUB(T1c, T1f), ms, &(x[WS(rs, 1)]));
488 			 ST(&(x[WS(rs, 1)]), VADD(T1c, T1f), ms, &(x[WS(rs, 1)]));
489 		    }
490 	       }
491 	  }
492      }
     /* Opaque macro defined elsewhere; invoked once after the loop. */
493      VLEAVE();
494 }
495 
/*
 * Twiddle instruction table for the non-FMA build: nineteen VTW(0, k)
 * entries, k = 1 .. 19 in ascending order, closed by a { TW_NEXT, VL, 0 }
 * entry.  It is referenced only from the descriptor `desc` below.
 * NOTE(review): presumably this tells the planner which twiddle factors to
 * lay out in the W array consumed by t2bv_20 (19 entries here vs. the
 * 2 * 19 = 38 stride used there) -- VTW and TW_NEXT are defined elsewhere,
 * confirm.
 */
496 static const tw_instr twinstr[] = {
497      VTW(0, 1),
498      VTW(0, 2),
499      VTW(0, 3),
500      VTW(0, 4),
501      VTW(0, 5),
502      VTW(0, 6),
503      VTW(0, 7),
504      VTW(0, 8),
505      VTW(0, 9),
506      VTW(0, 10),
507      VTW(0, 11),
508      VTW(0, 12),
509      VTW(0, 13),
510      VTW(0, 14),
511      VTW(0, 15),
512      VTW(0, 16),
513      VTW(0, 17),
514      VTW(0, 18),
515      VTW(0, 19),
516      { TW_NEXT, VL, 0 }
517 };
518 
/*
 * Codelet descriptor for the non-FMA build, handed to the registration call
 * below.  Positional fields as written: 20, the name string "t2bv_20",
 * the twinstr table, &GENUS, and the counts { 111, 50, 12, 0 }, whose first
 * three values match the "111 additions, 50 multiplications, 12 fused
 * multiply/add" statistics in the generator comment above.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared elsewhere.
 */
519 static const ct_desc desc = { 20, XSIMD_STRING("t2bv_20"), twinstr, &GENUS, { 111, 50, 12, 0 }, 0, 0, 0 };
520 
/*
 * Registration entry point (non-FMA build): passes the planner p, the
 * kernel t2bv_20 and its descriptor desc to X(kdft_dit_register).  Nothing
 * else happens here and nothing is returned.
 * NOTE(review): the text in front of `void` on the first line and the
 * leading integers look like listing residue -- confirm against upstream.
 */
XSIMD(codelet_t2bv_20)521 void XSIMD(codelet_t2bv_20) (planner *p) {
522      X(kdft_dit_register) (p, t2bv_20, &desc);
523 }
524 #endif
525