1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:06:57 EST 2020 */
23 
24 #include "rdft/codelet-rdft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
29 
30 /*
31  * This function contains 143 FP additions, 128 FP multiplications,
32  * (or, 77 additions, 62 multiplications, 66 fused multiply/add),
33  * 129 stack variables, 5 constants, and 40 memory accesses
34  */
35 #include "rdft/simd/hc2cfv.h"
36 
/*
 * hc2cfdftv_20 -- machine-generated SIMD codelet, FMA variant (this branch
 * is compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined; generator flags include -fma -simd -n 20 -dit).
 *
 * Parameters, as they are used in this body:
 *   Rp, Rm - arrays that are read and then overwritten in place.  Rows
 *            0..9 of each are addressed through WS(rs, k).  Rp moves
 *            forward by VL * ms per loop iteration, Rm moves backward.
 *   Ip, Im - only advanced in the loop header; never dereferenced here.
 *   W      - twiddle table.  19 entries are fetched per iteration with
 *            LDW at offsets TWVL * 0, TWVL * 2, ..., TWVL * 36, and the
 *            pointer advances by TWVL * 38 per iteration.
 *   rs     - stride passed to WS() to select a row.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration stride; also forwarded to LD / ST.
 *
 * Each iteration performs 20 LD loads and 20 ST stores.  Every stored
 * value is multiplied by KP500000000 (0.5), and every value stored to Rm
 * is additionally wrapped in VCONJ.
 *
 * NOTE(review): LD / ST / LDW, the V* arithmetic macros, DVK / LDK, WS,
 * VL and TWVL come from rdft/simd/hc2cfv.h and the SIMD support headers,
 * which are not visible here; the comments below describe the dataflow
 * only and do not restate those macros' exact semantics.
 */
hc2cfdftv_20(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)37 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
     /* Constants: 0.559... equals sqrt(5)/4, 0.618... equals (sqrt(5)-1)/2,
      * 0.951... equals sin(2*pi/5). */
39      DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
40      DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
41      DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
42      DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
43      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
44      {
45 	  INT m;
	  /* W is first offset by (mb - 1) * ((TWVL / VL) * 38) so that it
	   * lines up with the starting index mb; the comma-expression then
	   * steps all five pointers once per iteration. */
46 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
47 	       V T1O, T2j, T2c, T2b, T2i, T1X, Tx, TM, TN, T1x, T1y, T1z, T1u, T1v, T1w;
48 	       V T12, T1d, T1e, T24, T2g, Ti, T1t, T1V, T29, T26, T27, T1W, T25, T1H, T1L;
49 	       V T1B, T1K, T1E, T1F, T1G, T1D, T1A, T1C, T1N, T1I, T1J, T1M;
50 	       {
51 		    V T3, T1Y, TC, T7, Tn, T1P, Tc, Tg, Tw, T1Z, TS, T1S, TL, T21, T17;
52 		    V T1Q, T11, T22, T1c, T1T, T1, T2, Tz, T5, T6, TB, Ty, TA, T4, Ta;
53 		    V Tb, Tk, Te, Tf, Tm, Tj, Tl, T9, Td, T20, T23, T8, Th, T1R, T1U;
		    /* Input stage: for each row k the Rp[k] / Rm[k] pair is
		     * combined with VFMACONJ and VFNMSCONJ, and each
		     * combination is multiplied by its own twiddle (VZMULJ
		     * resp. VZMULIJ).  Only row 0's VFMACONJ term (T3) is
		     * used without a twiddle.  Rows 0, 5, 2, 7 come first. */
54 		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
55 		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
56 		    Ty = LDW(&(W[0]));
57 		    Tz = VZMULIJ(Ty, VFNMSCONJ(T2, T1));
58 		    T5 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
59 		    T6 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
60 		    TA = LDW(&(W[TWVL * 20]));
61 		    TB = VZMULIJ(TA, VFNMSCONJ(T6, T5));
62 		    T3 = VFMACONJ(T2, T1);
63 		    T1Y = VSUB(TB, Tz);
64 		    TC = VADD(Tz, TB);
65 		    T4 = LDW(&(W[TWVL * 18]));
66 		    T7 = VZMULJ(T4, VFMACONJ(T6, T5));
67 		    Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
68 		    Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
69 		    Tj = LDW(&(W[TWVL * 6]));
70 		    Tk = VZMULJ(Tj, VFMACONJ(Tb, Ta));
71 		    Te = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
72 		    Tf = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
73 		    Tl = LDW(&(W[TWVL * 26]));
74 		    Tm = VZMULJ(Tl, VFMACONJ(Tf, Te));
75 		    Tn = VADD(Tk, Tm);
76 		    T1P = VSUB(Tk, Tm);
77 		    T9 = LDW(&(W[TWVL * 8]));
78 		    Tc = VZMULIJ(T9, VFNMSCONJ(Tb, Ta));
79 		    Td = LDW(&(W[TWVL * 28]));
80 		    Tg = VZMULIJ(Td, VFNMSCONJ(Tf, Te));
81 		    {
82 			 V Tr, TP, Tv, TR, Tp, Tq, To, TO, Tt, Tu, Ts, TQ, TG, T14, TK;
83 			 V T16, TE, TF, TD, T13, TI, TJ, TH, T15, TW, T19, T10, T1b, TU, TV;
84 			 V TT, T18, TY, TZ, TX, T1a;
			 /* Remaining rows, loaded in the order 4, 9, 8, 3, 6, 1;
			  * each pair of rows is reduced to one sum and one
			  * difference per twiddled combination. */
85 			 Tp = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
86 			 Tq = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
87 			 To = LDW(&(W[TWVL * 16]));
88 			 Tr = VZMULIJ(To, VFNMSCONJ(Tq, Tp));
89 			 TO = LDW(&(W[TWVL * 14]));
90 			 TP = VZMULJ(TO, VFMACONJ(Tq, Tp));
91 			 Tt = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
92 			 Tu = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
93 			 Ts = LDW(&(W[TWVL * 36]));
94 			 Tv = VZMULIJ(Ts, VFNMSCONJ(Tu, Tt));
95 			 TQ = LDW(&(W[TWVL * 34]));
96 			 TR = VZMULJ(TQ, VFMACONJ(Tu, Tt));
97 			 Tw = VADD(Tr, Tv);
98 			 T1Z = VSUB(Tv, Tr);
99 			 TS = VADD(TP, TR);
100 			 T1S = VSUB(TP, TR);
101 			 TE = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
102 			 TF = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
103 			 TD = LDW(&(W[TWVL * 30]));
104 			 TG = VZMULJ(TD, VFMACONJ(TF, TE));
105 			 T13 = LDW(&(W[TWVL * 32]));
106 			 T14 = VZMULIJ(T13, VFNMSCONJ(TF, TE));
107 			 TI = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
108 			 TJ = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
109 			 TH = LDW(&(W[TWVL * 10]));
110 			 TK = VZMULJ(TH, VFMACONJ(TJ, TI));
111 			 T15 = LDW(&(W[TWVL * 12]));
112 			 T16 = VZMULIJ(T15, VFNMSCONJ(TJ, TI));
113 			 TL = VADD(TG, TK);
114 			 T21 = VSUB(T16, T14);
115 			 T17 = VADD(T14, T16);
116 			 T1Q = VSUB(TK, TG);
117 			 TU = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
118 			 TV = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
119 			 TT = LDW(&(W[TWVL * 24]));
120 			 TW = VZMULIJ(TT, VFNMSCONJ(TV, TU));
121 			 T18 = LDW(&(W[TWVL * 22]));
122 			 T19 = VZMULJ(T18, VFMACONJ(TV, TU));
123 			 TY = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
124 			 TZ = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
125 			 TX = LDW(&(W[TWVL * 4]));
126 			 T10 = VZMULIJ(TX, VFNMSCONJ(TZ, TY));
127 			 T1a = LDW(&(W[TWVL * 2]));
128 			 T1b = VZMULJ(T1a, VFMACONJ(TZ, TY));
129 			 T11 = VADD(TW, T10);
130 			 T22 = VSUB(T10, TW);
131 			 T1c = VADD(T19, T1b);
132 			 T1T = VSUB(T1b, T19);
133 		    }
		    /* All inputs are now in registers: build the second level
		     * of sums and differences that the output stages below
		     * share.  No memory is touched until the first ST. */
134 		    T1O = VSUB(T3, T7);
135 		    T2j = VADD(T1S, T1T);
136 		    T2c = VSUB(T21, T22);
137 		    T2b = VSUB(T1Y, T1Z);
138 		    T2i = VADD(T1P, T1Q);
139 		    T1X = VSUB(Tg, Tc);
140 		    Tx = VSUB(Tn, Tw);
141 		    TM = VSUB(TC, TL);
142 		    TN = VSUB(Tx, TM);
143 		    T1x = VADD(TS, T11);
144 		    T1y = VADD(T17, T1c);
145 		    T1z = VADD(T1x, T1y);
146 		    T1u = VADD(Tn, Tw);
147 		    T1v = VADD(TC, TL);
148 		    T1w = VADD(T1u, T1v);
149 		    T12 = VSUB(TS, T11);
150 		    T1d = VSUB(T17, T1c);
151 		    T1e = VSUB(T12, T1d);
152 		    T20 = VADD(T1Y, T1Z);
153 		    T23 = VADD(T21, T22);
154 		    T24 = VADD(T20, T23);
155 		    T2g = VSUB(T23, T20);
156 		    T8 = VADD(T3, T7);
157 		    Th = VADD(Tc, Tg);
158 		    Ti = VSUB(T8, Th);
159 		    T1t = VADD(T8, Th);
160 		    T1R = VSUB(T1P, T1Q);
161 		    T1U = VSUB(T1S, T1T);
162 		    T1V = VADD(T1R, T1U);
163 		    T29 = VSUB(T1R, T1U);
164 	       }
	       /* Output group 1: Rp row 5 and Rm row 4, from T1W and T25. */
165 	       T1W = VADD(T1O, T1V);
166 	       T25 = VADD(T1X, T24);
167 	       T26 = VMUL(LDK(KP500000000), VFNMSI(T25, T1W));
168 	       T27 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T25, T1W)));
169 	       ST(&(Rp[WS(rs, 5)]), T26, ms, &(Rp[WS(rs, 1)]));
170 	       ST(&(Rm[WS(rs, 4)]), T27, -ms, &(Rm[0]));
	       /* Output group 2: Rm rows 9, 5, 1 and Rp rows 2, 6, built from
	        * T1t and the T1u..T1z sums. */
171 	       T1F = VSUB(T1x, T1y);
172 	       T1G = VSUB(T1u, T1v);
173 	       T1H = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1G, T1F));
174 	       T1L = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1F, T1G));
175 	       T1D = VSUB(T1w, T1z);
176 	       T1A = VADD(T1w, T1z);
177 	       T1C = VFNMS(LDK(KP250000000), T1A, T1t);
178 	       T1B = VCONJ(VMUL(LDK(KP500000000), VADD(T1t, T1A)));
179 	       T1K = VFMA(LDK(KP559016994), T1D, T1C);
180 	       T1E = VFNMS(LDK(KP559016994), T1D, T1C);
181 	       ST(&(Rm[WS(rs, 9)]), T1B, -ms, &(Rm[WS(rs, 1)]));
182 	       T1N = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1L, T1K)));
183 	       ST(&(Rm[WS(rs, 5)]), T1N, -ms, &(Rm[WS(rs, 1)]));
184 	       T1I = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1H, T1E)));
185 	       ST(&(Rm[WS(rs, 1)]), T1I, -ms, &(Rm[WS(rs, 1)]));
186 	       T1J = VMUL(LDK(KP500000000), VFMAI(T1H, T1E));
187 	       ST(&(Rp[WS(rs, 2)]), T1J, ms, &(Rp[0]));
188 	       T1M = VMUL(LDK(KP500000000), VFNMSI(T1L, T1K));
189 	       ST(&(Rp[WS(rs, 6)]), T1M, ms, &(Rp[0]));
190 	       {
191 		    V T1m, T1q, T1g, T1p, T1j, T1k, T1l, T1i, T1f, T1h, T1s, T1n, T1o, T1r, T2e;
192 		    V T2A, T2o, T2u, T2l, T2B, T2p, T2x, T2d, T2t, T2a, T2s, T28, T2k, T2w, T2h;
193 		    V T2v, T2f, T2m, T2C, T2D, T2n, T2q, T2y, T2z, T2r;
		    /* Output group 3: Rp rows 0, 4, 8 and Rm rows 7, 3, built
		     * from Ti and the Tx / TM / T12 / T1d differences. */
194 		    T1k = VADD(Tx, TM);
195 		    T1l = VADD(T12, T1d);
196 		    T1m = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), T1l, T1k));
197 		    T1q = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), T1k, T1l));
198 		    T1i = VSUB(TN, T1e);
199 		    T1f = VADD(TN, T1e);
200 		    T1h = VFNMS(LDK(KP250000000), T1f, Ti);
201 		    T1g = VMUL(LDK(KP500000000), VADD(Ti, T1f));
202 		    T1p = VFNMS(LDK(KP559016994), T1i, T1h);
203 		    T1j = VFMA(LDK(KP559016994), T1i, T1h);
204 		    ST(&(Rp[0]), T1g, ms, &(Rp[0]));
205 		    T1s = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1q, T1p)));
206 		    ST(&(Rm[WS(rs, 7)]), T1s, -ms, &(Rm[WS(rs, 1)]));
207 		    T1n = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1m, T1j)));
208 		    ST(&(Rm[WS(rs, 3)]), T1n, -ms, &(Rm[WS(rs, 1)]));
209 		    T1o = VMUL(LDK(KP500000000), VFMAI(T1m, T1j));
210 		    ST(&(Rp[WS(rs, 4)]), T1o, ms, &(Rp[0]));
211 		    T1r = VMUL(LDK(KP500000000), VFNMSI(T1q, T1p));
212 		    ST(&(Rp[WS(rs, 8)]), T1r, ms, &(Rp[0]));
		    /* Output group 4: the last eight stores (Rp rows 9, 7, 1, 3
		     * and Rm rows 6, 8, 2, 0).  T2e/T2A/T2o/T2u derive from
		     * T1O and T1V; T2l/T2B/T2p/T2x derive from T1X and T24. */
213 		    T2d = VFMA(LDK(KP618033988), T2c, T2b);
214 		    T2t = VFNMS(LDK(KP618033988), T2b, T2c);
215 		    T28 = VFNMS(LDK(KP250000000), T1V, T1O);
216 		    T2a = VFMA(LDK(KP559016994), T29, T28);
217 		    T2s = VFNMS(LDK(KP559016994), T29, T28);
218 		    T2e = VFNMS(LDK(KP951056516), T2d, T2a);
219 		    T2A = VFMA(LDK(KP951056516), T2t, T2s);
220 		    T2o = VFMA(LDK(KP951056516), T2d, T2a);
221 		    T2u = VFNMS(LDK(KP951056516), T2t, T2s);
222 		    T2k = VFMA(LDK(KP618033988), T2j, T2i);
223 		    T2w = VFNMS(LDK(KP618033988), T2i, T2j);
224 		    T2f = VFNMS(LDK(KP250000000), T24, T1X);
225 		    T2h = VFNMS(LDK(KP559016994), T2g, T2f);
226 		    T2v = VFMA(LDK(KP559016994), T2g, T2f);
227 		    T2l = VFNMS(LDK(KP951056516), T2k, T2h);
228 		    T2B = VFMA(LDK(KP951056516), T2w, T2v);
229 		    T2p = VFMA(LDK(KP951056516), T2k, T2h);
230 		    T2x = VFNMS(LDK(KP951056516), T2w, T2v);
231 		    T2m = VMUL(LDK(KP500000000), VFNMSI(T2l, T2e));
232 		    ST(&(Rp[WS(rs, 9)]), T2m, ms, &(Rp[WS(rs, 1)]));
233 		    T2C = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2B, T2A)));
234 		    ST(&(Rm[WS(rs, 6)]), T2C, -ms, &(Rm[0]));
235 		    T2D = VMUL(LDK(KP500000000), VFMAI(T2B, T2A));
236 		    ST(&(Rp[WS(rs, 7)]), T2D, ms, &(Rp[WS(rs, 1)]));
237 		    T2n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2l, T2e)));
238 		    ST(&(Rm[WS(rs, 8)]), T2n, -ms, &(Rm[0]));
239 		    T2q = VMUL(LDK(KP500000000), VFNMSI(T2p, T2o));
240 		    ST(&(Rp[WS(rs, 1)]), T2q, ms, &(Rp[WS(rs, 1)]));
241 		    T2y = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2x, T2u)));
242 		    ST(&(Rm[WS(rs, 2)]), T2y, -ms, &(Rm[0]));
243 		    T2z = VMUL(LDK(KP500000000), VFMAI(T2x, T2u));
244 		    ST(&(Rp[WS(rs, 3)]), T2z, ms, &(Rp[WS(rs, 1)]));
245 		    T2r = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2p, T2o)));
246 		    ST(&(Rm[0]), T2r, -ms, &(Rm[0]));
247 	       }
248 	  }
249      }
     /* Invoked once after the loop, on the function's only exit path.
      * NOTE(review): VLEAVE is defined in the SIMD headers; its effect is
      * not visible from this file. */
250      VLEAVE();
251 }
252 
/*
 * Twiddle instruction table for the FMA variant: one VTW(1, k) entry for
 * each k = 1..19, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * the 19 entries correspond to the 19 LDW fetches made per iteration by
 * hc2cfdftv_20 -- confirm against the twiddle headers.
 */
253 static const tw_instr twinstr[] = {
254      VTW(1, 1),
255      VTW(1, 2),
256      VTW(1, 3),
257      VTW(1, 4),
258      VTW(1, 5),
259      VTW(1, 6),
260      VTW(1, 7),
261      VTW(1, 8),
262      VTW(1, 9),
263      VTW(1, 10),
264      VTW(1, 11),
265      VTW(1, 12),
266      VTW(1, 13),
267      VTW(1, 14),
268      VTW(1, 15),
269      VTW(1, 16),
270      VTW(1, 17),
271      VTW(1, 18),
272      VTW(1, 19),
273      { TW_NEXT, VL, 0 }
274 };
275 
/*
 * Codelet descriptor for the FMA variant: size 20, the SIMD-decorated name
 * string, the twinstr table, the GENUS object, and the operation counts
 * { 77, 62, 66, 0 }.  The first three counts equal the additions,
 * multiplications and fused multiply/adds quoted in the generator's
 * statistics comment for this variant.
 */
276 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, { 77, 62, 66, 0 } };
277 
/*
 * Registration hook (FMA variant): passes the codelet function and its
 * descriptor to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 * This is the only non-static symbol defined in this branch.
 */
XSIMD(codelet_hc2cfdftv_20)278 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
279      X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
280 }
281 #else
282 
283 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 20 -dit -name hc2cfdftv_20 -include rdft/simd/hc2cfv.h */
284 
285 /*
286  * This function contains 143 FP additions, 77 FP multiplications,
287  * (or, 131 additions, 65 multiplications, 12 fused multiply/add),
288  * 141 stack variables, 9 constants, and 40 memory accesses
289  */
290 #include "rdft/simd/hc2cfv.h"
291 
/*
 * hc2cfdftv_20 -- machine-generated SIMD codelet, non-FMA variant (this
 * branch is compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; generator flags include
 * -simd -n 20 -dit, without -fma).
 *
 * Parameters, as they are used in this body:
 *   Rp, Rm - arrays that are read and then overwritten in place.  Rows
 *            0..9 of each are addressed through WS(rs, k).  Rp moves
 *            forward by VL * ms per loop iteration, Rm moves backward.
 *   Ip, Im - only advanced in the loop header; never dereferenced here.
 *   W      - twiddle table.  19 entries are fetched per iteration with
 *            LDW at offsets TWVL * 0, TWVL * 2, ..., TWVL * 36, and the
 *            pointer advances by TWVL * 38 per iteration.
 *   rs     - stride passed to WS() to select a row.
 *   mb, me - the loop runs m = mb, mb + VL, ... while m < me.
 *   ms     - per-iteration stride; also forwarded to LD / ST.
 *
 * Each iteration performs 20 LD loads and 20 ST stores; every value
 * stored to Rm is wrapped in VCONJ.
 *
 * NOTE(review): LD / ST / LDW, the V* arithmetic macros, DVK / LDK, WS,
 * VL and TWVL come from rdft/simd/hc2cfv.h and the SIMD support headers,
 * which are not visible here; the comments below describe the dataflow
 * only and do not restate those macros' exact semantics.
 */
hc2cfdftv_20(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)292 static void hc2cfdftv_20(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
293 {
     /* Constants: 0.951... equals sin(2*pi/5) and 0.587... equals
      * sin(pi/5); 0.475... and 0.293... are exactly half of those.
      * 0.559... equals sqrt(5)/4 and 0.279... is half of it; 0.125 is half
      * of 0.25.  NOTE(review): the halved constants presumably fold a 0.5
      * output scale into the multiply -- confirm against the generator. */
294      DVK(KP293892626, +0.293892626146236564584352977319536384298826219);
295      DVK(KP475528258, +0.475528258147576786058219666689691071702849317);
296      DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
297      DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
298      DVK(KP125000000, +0.125000000000000000000000000000000000000000000);
299      DVK(KP279508497, +0.279508497187473712051146708591409529430077295);
300      DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
301      DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
302      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
303      {
304 	  INT m;
	  /* W is first offset by (mb - 1) * ((TWVL / VL) * 38) so that it
	   * lines up with the starting index mb; the comma-expression then
	   * steps all five pointers once per iteration. */
305 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 38)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 38), MAKE_VOLATILE_STRIDE(80, rs)) {
306 	       V TW, T1x, T2i, T2A, T1r, T1s, T1a, T1y, T1l, Tn, TK, TL, T1p, T1o, T27;
307 	       V T2t, T2a, T2u, T2e, T2C, T20, T2w, T23, T2x, T2d, T2B, T1W, T1X, T1U, T1V;
308 	       V T2z, T2K, T2G, T2N, T2J, T2v, T2y, T2F, T2D, T2E, T2M, T2H, T2I, T2L;
309 	       {
310 		    V T1u, T5, Tg, T1c, TV, T13, Ta, T1w, TQ, T11, TI, T1j, Tx, T18, Tl;
311 		    V T1e, TD, T1h, Ts, T16, T2g, T2h, T14, T19, T1f, T1k, Tb, Tm, Ty, TJ;
312 		    V T25, T26, T28, T29, T1Y, T1Z, T21, T22;
313 		    {
314 			 V T4, T3, T2, T1, Tf, Te, Td, Tc, T1b, TU, TT, TS, TR, T12, T9;
315 			 V T8, T7, T6, T1v, TP, TO, TN, TM, T10, TH, TG, TF, TE, T1i, Tw;
316 			 V Tv, Tu, Tt, T17, Tk, Tj, Ti, Th, T1d, TC, TB, TA, Tz, T1g, Tr;
317 			 V Tq, Tp, To, T15;
			 /* Input stage.  For each row k (visited in the order
			  * 0, 4, 7, 5, 2, 1, 3, 9, 6, 8): load Rp[k] and Rm[k],
			  * conjugate the Rm value with VCONJ, then form the
			  * difference (conj - Rp) multiplied by one twiddle via
			  * VZMULIJ and the sum multiplied by another twiddle
			  * via VZMULJ.  Only row 0's sum (T1u) is used without
			  * a twiddle. */
318 			 T4 = LD(&(Rp[0]), ms, &(Rp[0]));
319 			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
320 			 T3 = VCONJ(T2);
321 			 T1u = VADD(T4, T3);
322 			 T1 = LDW(&(W[0]));
323 			 T5 = VZMULIJ(T1, VSUB(T3, T4));
324 			 Tf = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
325 			 Td = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
326 			 Te = VCONJ(Td);
327 			 Tc = LDW(&(W[TWVL * 16]));
328 			 Tg = VZMULIJ(Tc, VSUB(Te, Tf));
329 			 T1b = LDW(&(W[TWVL * 14]));
330 			 T1c = VZMULJ(T1b, VADD(Te, Tf));
331 			 TU = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
332 			 TS = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
333 			 TT = VCONJ(TS);
334 			 TR = LDW(&(W[TWVL * 28]));
335 			 TV = VZMULIJ(TR, VSUB(TT, TU));
336 			 T12 = LDW(&(W[TWVL * 26]));
337 			 T13 = VZMULJ(T12, VADD(TT, TU));
338 			 T9 = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
339 			 T7 = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
340 			 T8 = VCONJ(T7);
341 			 T6 = LDW(&(W[TWVL * 20]));
342 			 Ta = VZMULIJ(T6, VSUB(T8, T9));
343 			 T1v = LDW(&(W[TWVL * 18]));
344 			 T1w = VZMULJ(T1v, VADD(T9, T8));
345 			 TP = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
346 			 TN = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
347 			 TO = VCONJ(TN);
348 			 TM = LDW(&(W[TWVL * 8]));
349 			 TQ = VZMULIJ(TM, VSUB(TO, TP));
350 			 T10 = LDW(&(W[TWVL * 6]));
351 			 T11 = VZMULJ(T10, VADD(TO, TP));
352 			 TH = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
353 			 TF = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
354 			 TG = VCONJ(TF);
355 			 TE = LDW(&(W[TWVL * 4]));
356 			 TI = VZMULIJ(TE, VSUB(TG, TH));
357 			 T1i = LDW(&(W[TWVL * 2]));
358 			 T1j = VZMULJ(T1i, VADD(TG, TH));
359 			 Tw = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
360 			 Tu = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
361 			 Tv = VCONJ(Tu);
362 			 Tt = LDW(&(W[TWVL * 12]));
363 			 Tx = VZMULIJ(Tt, VSUB(Tv, Tw));
364 			 T17 = LDW(&(W[TWVL * 10]));
365 			 T18 = VZMULJ(T17, VADD(Tw, Tv));
366 			 Tk = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
367 			 Ti = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
368 			 Tj = VCONJ(Ti);
369 			 Th = LDW(&(W[TWVL * 36]));
370 			 Tl = VZMULIJ(Th, VSUB(Tj, Tk));
371 			 T1d = LDW(&(W[TWVL * 34]));
372 			 T1e = VZMULJ(T1d, VADD(Tj, Tk));
373 			 TC = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
374 			 TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
375 			 TB = VCONJ(TA);
376 			 Tz = LDW(&(W[TWVL * 24]));
377 			 TD = VZMULIJ(Tz, VSUB(TB, TC));
378 			 T1g = LDW(&(W[TWVL * 22]));
379 			 T1h = VZMULJ(T1g, VADD(TB, TC));
380 			 Tr = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
381 			 Tp = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
382 			 Tq = VCONJ(Tp);
383 			 To = LDW(&(W[TWVL * 32]));
384 			 Ts = VZMULIJ(To, VSUB(Tq, Tr));
385 			 T15 = LDW(&(W[TWVL * 30]));
386 			 T16 = VZMULJ(T15, VADD(Tr, Tq));
387 		    }
		    /* All inputs are now in registers: build the sums and
		     * differences that the output stages below share.  No
		     * memory is touched until the first ST. */
388 		    TW = VSUB(TQ, TV);
389 		    T1x = VSUB(T1u, T1w);
390 		    T2g = VADD(T1u, T1w);
391 		    T2h = VADD(TQ, TV);
392 		    T2i = VADD(T2g, T2h);
393 		    T2A = VSUB(T2g, T2h);
394 		    T14 = VSUB(T11, T13);
395 		    T19 = VSUB(T16, T18);
396 		    T1r = VADD(T14, T19);
397 		    T1f = VSUB(T1c, T1e);
398 		    T1k = VSUB(T1h, T1j);
399 		    T1s = VADD(T1f, T1k);
400 		    T1a = VSUB(T14, T19);
401 		    T1y = VADD(T1r, T1s);
402 		    T1l = VSUB(T1f, T1k);
403 		    Tb = VSUB(T5, Ta);
404 		    Tm = VSUB(Tg, Tl);
405 		    Tn = VADD(Tb, Tm);
406 		    Ty = VSUB(Ts, Tx);
407 		    TJ = VSUB(TD, TI);
408 		    TK = VADD(Ty, TJ);
409 		    TL = VADD(Tn, TK);
410 		    T1p = VSUB(Ty, TJ);
411 		    T1o = VSUB(Tb, Tm);
412 		    T25 = VADD(T1c, T1e);
413 		    T26 = VADD(TD, TI);
414 		    T27 = VADD(T25, T26);
415 		    T2t = VSUB(T25, T26);
416 		    T28 = VADD(Ts, Tx);
417 		    T29 = VADD(T1h, T1j);
418 		    T2a = VADD(T28, T29);
419 		    T2u = VSUB(T29, T28);
420 		    T2e = VADD(T27, T2a);
421 		    T2C = VADD(T2t, T2u);
422 		    T1Y = VADD(T11, T13);
423 		    T1Z = VADD(Tg, Tl);
424 		    T20 = VADD(T1Y, T1Z);
425 		    T2w = VSUB(T1Y, T1Z);
426 		    T21 = VADD(T5, Ta);
427 		    T22 = VADD(T16, T18);
428 		    T23 = VADD(T21, T22);
429 		    T2x = VSUB(T22, T21);
430 		    T2d = VADD(T20, T23);
431 		    T2B = VADD(T2w, T2x);
432 	       }
	       /* Output group 1: Rp row 5 and Rm row 4, from T1U and T1V. */
433 	       T1U = VADD(T1x, T1y);
434 	       T1V = VBYI(VADD(TW, TL));
435 	       T1W = VMUL(LDK(KP500000000), VSUB(T1U, T1V));
436 	       T1X = VCONJ(VMUL(LDK(KP500000000), VADD(T1V, T1U)));
437 	       ST(&(Rp[WS(rs, 5)]), T1W, ms, &(Rp[WS(rs, 1)]));
438 	       ST(&(Rm[WS(rs, 4)]), T1X, -ms, &(Rm[0]));
	       /* Output group 2: Rm rows 9, 5, 1 and Rp rows 2, 6, built from
	        * T2A and the T2t / T2u / T2w / T2x differences. */
439 	       T2v = VSUB(T2t, T2u);
440 	       T2y = VSUB(T2w, T2x);
441 	       T2z = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T2y, VMUL(LDK(KP951056516), T2v))));
442 	       T2K = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T2y, VMUL(LDK(KP587785252), T2v))));
443 	       T2F = VMUL(LDK(KP279508497), VSUB(T2B, T2C));
444 	       T2D = VADD(T2B, T2C);
445 	       T2E = VFNMS(LDK(KP125000000), T2D, VMUL(LDK(KP500000000), T2A));
446 	       T2G = VSUB(T2E, T2F);
447 	       T2N = VCONJ(VMUL(LDK(KP500000000), VADD(T2A, T2D)));
448 	       T2J = VADD(T2F, T2E);
449 	       ST(&(Rm[WS(rs, 9)]), T2N, -ms, &(Rm[WS(rs, 1)]));
450 	       T2M = VCONJ(VADD(T2K, T2J));
451 	       ST(&(Rm[WS(rs, 5)]), T2M, -ms, &(Rm[WS(rs, 1)]));
452 	       T2H = VADD(T2z, T2G);
453 	       ST(&(Rp[WS(rs, 2)]), T2H, ms, &(Rp[0]));
454 	       T2I = VCONJ(VSUB(T2G, T2z));
455 	       ST(&(Rm[WS(rs, 1)]), T2I, -ms, &(Rm[WS(rs, 1)]));
456 	       T2L = VSUB(T2J, T2K);
457 	       ST(&(Rp[WS(rs, 6)]), T2L, ms, &(Rp[0]));
458 	       {
459 		    V T2c, T2p, T2l, T2s, T2o, T24, T2b, T2f, T2j, T2k, T2r, T2m, T2n, T2q, T1n;
460 		    V T1Q, T1E, T1K, T1B, T1R, T1F, T1N, T1m, T1J, TZ, T1I, TX, TY, T1q, T1M;
461 		    V T1A, T1L, T1t, T1z, T1C, T1S, T1T, T1D, T1G, T1O, T1P, T1H;
		    /* Output group 3: Rp rows 0, 4, 8 and Rm rows 7, 3, built
		     * from T2i and the T20 / T23 / T27 / T2a sums. */
462 		    T24 = VSUB(T20, T23);
463 		    T2b = VSUB(T27, T2a);
464 		    T2c = VMUL(LDK(KP500000000), VBYI(VFMA(LDK(KP951056516), T24, VMUL(LDK(KP587785252), T2b))));
465 		    T2p = VMUL(LDK(KP500000000), VBYI(VFNMS(LDK(KP587785252), T24, VMUL(LDK(KP951056516), T2b))));
466 		    T2f = VMUL(LDK(KP279508497), VSUB(T2d, T2e));
467 		    T2j = VADD(T2d, T2e);
468 		    T2k = VFNMS(LDK(KP125000000), T2j, VMUL(LDK(KP500000000), T2i));
469 		    T2l = VADD(T2f, T2k);
470 		    T2s = VMUL(LDK(KP500000000), VADD(T2i, T2j));
471 		    T2o = VSUB(T2k, T2f);
472 		    ST(&(Rp[0]), T2s, ms, &(Rp[0]));
473 		    T2r = VCONJ(VADD(T2p, T2o));
474 		    ST(&(Rm[WS(rs, 7)]), T2r, -ms, &(Rm[WS(rs, 1)]));
475 		    T2m = VADD(T2c, T2l);
476 		    ST(&(Rp[WS(rs, 4)]), T2m, ms, &(Rp[0]));
477 		    T2n = VCONJ(VSUB(T2l, T2c));
478 		    ST(&(Rm[WS(rs, 3)]), T2n, -ms, &(Rm[WS(rs, 1)]));
479 		    T2q = VSUB(T2o, T2p);
480 		    ST(&(Rp[WS(rs, 8)]), T2q, ms, &(Rp[0]));
		    /* Output group 4: the last eight stores (Rp rows 1, 7, 9, 3
		     * and Rm rows 6, 0, 2, 8).  T1n/T1Q/T1E/T1K derive from
		     * TW, TL, TK, Tn, T1a, T1l; T1B/T1R/T1F/T1N derive from
		     * T1x, T1y, T1r, T1s, T1o, T1p. */
481 		    T1m = VFMA(LDK(KP951056516), T1a, VMUL(LDK(KP587785252), T1l));
482 		    T1J = VFNMS(LDK(KP587785252), T1a, VMUL(LDK(KP951056516), T1l));
483 		    TX = VFMS(LDK(KP250000000), TL, TW);
484 		    TY = VMUL(LDK(KP559016994), VSUB(TK, Tn));
485 		    TZ = VADD(TX, TY);
486 		    T1I = VSUB(TY, TX);
487 		    T1n = VMUL(LDK(KP500000000), VBYI(VSUB(TZ, T1m)));
488 		    T1Q = VMUL(LDK(KP500000000), VBYI(VADD(T1I, T1J)));
489 		    T1E = VMUL(LDK(KP500000000), VBYI(VADD(TZ, T1m)));
490 		    T1K = VMUL(LDK(KP500000000), VBYI(VSUB(T1I, T1J)));
491 		    T1q = VFMA(LDK(KP475528258), T1o, VMUL(LDK(KP293892626), T1p));
492 		    T1M = VFNMS(LDK(KP293892626), T1o, VMUL(LDK(KP475528258), T1p));
493 		    T1t = VMUL(LDK(KP279508497), VSUB(T1r, T1s));
494 		    T1z = VFNMS(LDK(KP125000000), T1y, VMUL(LDK(KP500000000), T1x));
495 		    T1A = VADD(T1t, T1z);
496 		    T1L = VSUB(T1z, T1t);
497 		    T1B = VADD(T1q, T1A);
498 		    T1R = VADD(T1M, T1L);
499 		    T1F = VSUB(T1A, T1q);
500 		    T1N = VSUB(T1L, T1M);
501 		    T1C = VADD(T1n, T1B);
502 		    ST(&(Rp[WS(rs, 1)]), T1C, ms, &(Rp[WS(rs, 1)]));
503 		    T1S = VADD(T1Q, T1R);
504 		    ST(&(Rp[WS(rs, 7)]), T1S, ms, &(Rp[WS(rs, 1)]));
505 		    T1T = VCONJ(VSUB(T1R, T1Q));
506 		    ST(&(Rm[WS(rs, 6)]), T1T, -ms, &(Rm[0]));
507 		    T1D = VCONJ(VSUB(T1B, T1n));
508 		    ST(&(Rm[0]), T1D, -ms, &(Rm[0]));
509 		    T1G = VADD(T1E, T1F);
510 		    ST(&(Rp[WS(rs, 9)]), T1G, ms, &(Rp[WS(rs, 1)]));
511 		    T1O = VADD(T1K, T1N);
512 		    ST(&(Rp[WS(rs, 3)]), T1O, ms, &(Rp[WS(rs, 1)]));
513 		    T1P = VCONJ(VSUB(T1N, T1K));
514 		    ST(&(Rm[WS(rs, 2)]), T1P, -ms, &(Rm[0]));
515 		    T1H = VCONJ(VSUB(T1F, T1E));
516 		    ST(&(Rm[WS(rs, 8)]), T1H, -ms, &(Rm[0]));
517 	       }
518 	  }
519      }
     /* Invoked once after the loop, on the function's only exit path.
      * NOTE(review): VLEAVE is defined in the SIMD headers; its effect is
      * not visible from this file. */
520      VLEAVE();
521 }
522 
/*
 * Twiddle instruction table for the non-FMA variant: one VTW(1, k) entry
 * for each k = 1..19, closed by a { TW_NEXT, VL, 0 } record.
 * NOTE(review): VTW and TW_NEXT are defined outside this file; presumably
 * the 19 entries correspond to the 19 LDW fetches made per iteration by
 * hc2cfdftv_20 -- confirm against the twiddle headers.
 */
523 static const tw_instr twinstr[] = {
524      VTW(1, 1),
525      VTW(1, 2),
526      VTW(1, 3),
527      VTW(1, 4),
528      VTW(1, 5),
529      VTW(1, 6),
530      VTW(1, 7),
531      VTW(1, 8),
532      VTW(1, 9),
533      VTW(1, 10),
534      VTW(1, 11),
535      VTW(1, 12),
536      VTW(1, 13),
537      VTW(1, 14),
538      VTW(1, 15),
539      VTW(1, 16),
540      VTW(1, 17),
541      VTW(1, 18),
542      VTW(1, 19),
543      { TW_NEXT, VL, 0 }
544 };
545 
/*
 * Codelet descriptor for the non-FMA variant: size 20, the SIMD-decorated
 * name string, the twinstr table, the GENUS object, and the operation
 * counts { 131, 65, 12, 0 }.  The first three counts equal the additions,
 * multiplications and fused multiply/adds quoted in the generator's
 * statistics comment for this variant.
 */
546 static const hc2c_desc desc = { 20, XSIMD_STRING("hc2cfdftv_20"), twinstr, &GENUS, { 131, 65, 12, 0 } };
547 
/*
 * Registration hook (non-FMA variant): passes the codelet function and
 * its descriptor to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 * This is the only non-static symbol defined in this branch.
 */
XSIMD(codelet_hc2cfdftv_20)548 void XSIMD(codelet_hc2cfdftv_20) (planner *p) {
549      X(khc2c_register) (p, hc2cfdftv_20, &desc, HC2C_VIA_DFT);
550 }
551 #endif
552