1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:06:57 EST 2020 */
23 
24 #include "rdft/codelet-rdft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
29 
30 /*
31  * This function contains 249 FP additions, 224 FP multiplications,
32  * (or, 119 additions, 94 multiplications, 130 fused multiply/add),
33  * 154 stack variables, 8 constants, and 64 memory accesses
34  */
35 #include "rdft/simd/hc2cfv.h"
36 
/*
 * hc2cfdftv_32 -- FMA variant of the generated size-32 codelet (see the
 * "Generated by" comment above: gen_hc2cdft_c ... -fma -simd -n 32 -dit).
 *
 * Each loop pass:
 *   - loads 16 vectors from Rp[WS(rs, 0..15)] and 16 from Rm[WS(rs, 0..15)];
 *   - loads 31 twiddle vectors from W at offsets TWVL * {0, 2, 4, ..., 60};
 *   - combines each Rp/Rm pair with VFMACONJ / VFNMSCONJ and multiplies the
 *     results by the twiddles with VZMULJ / VZMULIJ (the index-0 VFMACONJ
 *     term T3 is the only one used without a twiddle);
 *   - runs an add/sub/fused-multiply network over the eight DVK constants;
 *   - stores 16 results to Rp and 16 to Rm.  Every stored value is scaled by
 *     KP500000000, and every Rm value additionally goes through VCONJ.
 *
 * All loads are issued before the first store, and the stores target the
 * same Rp/Rm slots that were loaded, so Rp and Rm are overwritten.
 *
 * Parameters:
 *   Rp, Rm  arrays that are read and then overwritten (indexed by WS(rs, k)).
 *   Ip, Im  only stepped in the loop header; not referenced in the loop body.
 *   W       twiddle table; offset once by (mb - 1) * ((TWVL / VL) * 62) and
 *           then advanced by TWVL * 62 per pass.
 *   rs      stride argument handed to WS() for every Rp/Rm index.
 *   mb, me  the loop runs m = mb, mb + VL, ... while m < me.
 *   ms      per-pass pointer step is VL * ms (forward for Rp/Ip, backward
 *           for Rm/Im); also passed to LD/ST.
 *
 * NOTE(review): the exact semantics of the V* macros, LD/ST/LDW and
 * MAKE_VOLATILE_STRIDE come from rdft/simd/hc2cfv.h and the SIMD headers,
 * which are not visible here -- confirm there before relying on details.
 * NOTE(review): the leading decimal on each line and the signature fragment
 * fused in front of "static void" below look like source-listing residue.
 */
hc2cfdftv_32(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)37 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
     /*
      * Numeric values of the constants below:
      *   KP831469612 = cos(3*pi/16)   KP668178637 = tan(3*pi/16)
      *   KP980785280 = cos(pi/16)     KP198912367 = tan(pi/16)
      *   KP923879532 = cos(pi/8)      KP414213562 = tan(pi/8) = sqrt(2) - 1
      *   KP707106781 = sqrt(2)/2      KP500000000 = 1/2 (output scaling)
      */
39      DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
40      DVK(KP668178637, +0.668178637919298919997757686523080761552472251);
41      DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
42      DVK(KP198912367, +0.198912367379658006911597622644676228597850501);
43      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
44      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
45      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
46      DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
47      {
48 	  INT m;
	  /*
	   * m advances by VL per pass.  Rp/Ip step forward and Rm/Im step
	   * backward by VL * ms; W steps by TWVL * 62.  Before the first
	   * pass W is offset by (mb - 1) * ((TWVL / VL) * 62).
	   */
49 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
	       /* Values that live across the nested blocks below. */
50 	       V T47, T48, T4l, T3w, T3F, T3B, T41, Ts, T2y, T1Q, T2B, T27, T2J, T3a, T40;
51 	       V T1X, T2C, T43, T44, T4a, T4b, T4m, T3p, T3E, T15, T2K, T1u, T2F, T3h, T3C;
52 	       V T1n, T2E, T2a, T2z, T1a, T18, TU, T3m, T3f, T1r, T1p, T13, T3n, T3e, TB;
53 	       V T3k, T1l, T3c, TK, T3j, T1g, T3b, T3l, T3o, TL, T14, T1s, T1t, T3d, T3g;
54 	       V T1b, T1m, T28, T29, T3Q, T3W, T3T, T3X, T3O, T3P, T3R, T3S, T3U, T3Z, T3V;
55 	       V T3Y;
	       /*
	        * Stage 1: even-indexed inputs, WS(rs, 0/2/4/.../14).
	        * Each Rp/Rm pair gives a VFMACONJ term (twiddled with VZMULJ)
	        * and a VFNMSCONJ term (twiddled with VZMULIJ).
	        */
56 	       {
57 		    V T1U, T1S, T3, T3u, T7, T1z, T1D, T3t, T24, T22, Tc, Tg, Th, T3q, T1J;
58 		    V Tl, Tp, Tq, T3r, T1O, T3s, T3v, T3z, T3A, T8, Tr, T1E, T1P, T25, T26;
59 		    V T38, T39, T1V, T1W;
60 		    {
61 			 V T1, T2, T5, T6, T1T, T1R, T4, T1x, T1y, T1B, T1C, T1w, T1A, T23, T21;
62 			 V T1I, T1G, Ta, Tb, T9, T1H, Te, Tf, Td, T1F, T1N, T1L, Tj, Tk, Ti;
63 			 V T1M, Tn, To, Tm, T1K;
64 			 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
65 			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
66 			 T1T = LDW(&(W[0]));
67 			 T1U = VZMULIJ(T1T, VFNMSCONJ(T2, T1));
68 			 T5 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
69 			 T6 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
70 			 T1R = LDW(&(W[TWVL * 32]));
71 			 T1S = VZMULIJ(T1R, VFNMSCONJ(T6, T5));
			 /* T3 is the one input term that gets no twiddle multiply. */
72 			 T3 = VFMACONJ(T2, T1);
73 			 T3u = VADD(T1U, T1S);
74 			 T4 = LDW(&(W[TWVL * 30]));
75 			 T7 = VZMULJ(T4, VFMACONJ(T6, T5));
76 			 T1x = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
77 			 T1y = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
78 			 T1w = LDW(&(W[TWVL * 48]));
79 			 T1z = VZMULIJ(T1w, VFNMSCONJ(T1y, T1x));
80 			 T1B = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
81 			 T1C = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
82 			 T1A = LDW(&(W[TWVL * 16]));
83 			 T1D = VZMULIJ(T1A, VFNMSCONJ(T1C, T1B));
84 			 T3t = VADD(T1D, T1z);
85 			 T23 = LDW(&(W[TWVL * 46]));
86 			 T24 = VZMULJ(T23, VFMACONJ(T1y, T1x));
87 			 T21 = LDW(&(W[TWVL * 14]));
88 			 T22 = VZMULJ(T21, VFMACONJ(T1C, T1B));
89 			 Ta = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
90 			 Tb = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
91 			 T9 = LDW(&(W[TWVL * 6]));
92 			 Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
93 			 T1H = LDW(&(W[TWVL * 8]));
94 			 T1I = VZMULIJ(T1H, VFNMSCONJ(Tb, Ta));
95 			 Te = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
96 			 Tf = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
97 			 Td = LDW(&(W[TWVL * 38]));
98 			 Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
99 			 T1F = LDW(&(W[TWVL * 40]));
100 			 T1G = VZMULIJ(T1F, VFNMSCONJ(Tf, Te));
101 			 Th = VSUB(Tc, Tg);
102 			 T3q = VADD(T1I, T1G);
103 			 T1J = VSUB(T1G, T1I);
104 			 Tj = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
105 			 Tk = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
106 			 Ti = LDW(&(W[TWVL * 54]));
107 			 Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
108 			 T1M = LDW(&(W[TWVL * 56]));
109 			 T1N = VZMULIJ(T1M, VFNMSCONJ(Tk, Tj));
110 			 Tn = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
111 			 To = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
112 			 Tm = LDW(&(W[TWVL * 22]));
113 			 Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
114 			 T1K = LDW(&(W[TWVL * 24]));
115 			 T1L = VZMULIJ(T1K, VFNMSCONJ(To, Tn));
116 			 Tq = VSUB(Tl, Tp);
117 			 T3r = VADD(T1N, T1L);
118 			 T1O = VSUB(T1L, T1N);
119 		    }
		    /* Sums/differences of the even-indexed terms, kept for the output stages. */
120 		    T47 = VADD(T3u, T3t);
121 		    T48 = VADD(T3q, T3r);
122 		    T4l = VSUB(T48, T47);
123 		    T3s = VSUB(T3q, T3r);
124 		    T3v = VSUB(T3t, T3u);
125 		    T3w = VFNMS(LDK(KP414213562), T3v, T3s);
126 		    T3F = VFMA(LDK(KP414213562), T3s, T3v);
127 		    T3z = VADD(Tl, Tp);
128 		    T3A = VADD(Tc, Tg);
129 		    T3B = VSUB(T3z, T3A);
130 		    T41 = VADD(T3A, T3z);
131 		    T8 = VSUB(T3, T7);
132 		    Tr = VADD(Th, Tq);
133 		    Ts = VFNMS(LDK(KP707106781), Tr, T8);
134 		    T2y = VFMA(LDK(KP707106781), Tr, T8);
135 		    T1E = VSUB(T1z, T1D);
136 		    T1P = VSUB(T1J, T1O);
137 		    T1Q = VFNMS(LDK(KP707106781), T1P, T1E);
138 		    T2B = VFMA(LDK(KP707106781), T1P, T1E);
139 		    T25 = VSUB(T22, T24);
140 		    T26 = VSUB(Tq, Th);
141 		    T27 = VFMA(LDK(KP707106781), T26, T25);
142 		    T2J = VFNMS(LDK(KP707106781), T26, T25);
143 		    T38 = VADD(T3, T7);
144 		    T39 = VADD(T22, T24);
145 		    T3a = VSUB(T38, T39);
146 		    T40 = VADD(T38, T39);
147 		    T1V = VSUB(T1S, T1U);
148 		    T1W = VADD(T1J, T1O);
149 		    T1X = VFNMS(LDK(KP707106781), T1W, T1V);
150 		    T2C = VFMA(LDK(KP707106781), T1W, T1V);
151 	       }
	       /*
	        * Stage 2: odd-indexed inputs, WS(rs, 1/3/5/.../15).
	        * Same pairing as stage 1; here every term is twiddled.
	        */
152 	       {
153 		    V TP, TT, TN, TO, TM, T19, TR, TS, TQ, T17, TY, T12, TW, TX, TV;
154 		    V T1q, T10, T11, TZ, T1o, Tw, T1i, TA, T1k, Tu, Tv, Tt, T1h, Ty, Tz;
155 		    V Tx, T1j, TF, T1f, TJ, T1d, TD, TE, TC, T1e, TH, TI, TG, T1c;
156 		    TN = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
157 		    TO = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
158 		    TM = LDW(&(W[TWVL * 10]));
159 		    TP = VZMULJ(TM, VFMACONJ(TO, TN));
160 		    T19 = LDW(&(W[TWVL * 12]));
161 		    T1a = VZMULIJ(T19, VFNMSCONJ(TO, TN));
162 		    TR = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
163 		    TS = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
164 		    TQ = LDW(&(W[TWVL * 42]));
165 		    TT = VZMULJ(TQ, VFMACONJ(TS, TR));
166 		    T17 = LDW(&(W[TWVL * 44]));
167 		    T18 = VZMULIJ(T17, VFNMSCONJ(TS, TR));
168 		    TU = VSUB(TP, TT);
169 		    T3m = VADD(T1a, T18);
170 		    T3f = VADD(TP, TT);
171 		    TW = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
172 		    TX = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
173 		    TV = LDW(&(W[TWVL * 58]));
174 		    TY = VZMULJ(TV, VFMACONJ(TX, TW));
175 		    T1q = LDW(&(W[TWVL * 60]));
176 		    T1r = VZMULIJ(T1q, VFNMSCONJ(TX, TW));
177 		    T10 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
178 		    T11 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
179 		    TZ = LDW(&(W[TWVL * 26]));
180 		    T12 = VZMULJ(TZ, VFMACONJ(T11, T10));
181 		    T1o = LDW(&(W[TWVL * 28]));
182 		    T1p = VZMULIJ(T1o, VFNMSCONJ(T11, T10));
183 		    T13 = VSUB(TY, T12);
184 		    T3n = VADD(T1r, T1p);
185 		    T3e = VADD(TY, T12);
186 		    Tu = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
187 		    Tv = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
188 		    Tt = LDW(&(W[TWVL * 18]));
189 		    Tw = VZMULJ(Tt, VFMACONJ(Tv, Tu));
190 		    T1h = LDW(&(W[TWVL * 20]));
191 		    T1i = VZMULIJ(T1h, VFNMSCONJ(Tv, Tu));
192 		    Ty = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
193 		    Tz = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
194 		    Tx = LDW(&(W[TWVL * 50]));
195 		    TA = VZMULJ(Tx, VFMACONJ(Tz, Ty));
196 		    T1j = LDW(&(W[TWVL * 52]));
197 		    T1k = VZMULIJ(T1j, VFNMSCONJ(Tz, Ty));
198 		    TB = VSUB(Tw, TA);
199 		    T3k = VADD(T1i, T1k);
200 		    T1l = VSUB(T1i, T1k);
201 		    T3c = VADD(Tw, TA);
202 		    TD = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
203 		    TE = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
204 		    TC = LDW(&(W[TWVL * 2]));
205 		    TF = VZMULJ(TC, VFMACONJ(TE, TD));
206 		    T1e = LDW(&(W[TWVL * 4]));
207 		    T1f = VZMULIJ(T1e, VFNMSCONJ(TE, TD));
208 		    TH = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
209 		    TI = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
210 		    TG = LDW(&(W[TWVL * 34]));
211 		    TJ = VZMULJ(TG, VFMACONJ(TI, TH));
212 		    T1c = LDW(&(W[TWVL * 36]));
213 		    T1d = VZMULIJ(T1c, VFNMSCONJ(TI, TH));
214 		    TK = VSUB(TF, TJ);
215 		    T3j = VADD(T1f, T1d);
216 		    T1g = VSUB(T1d, T1f);
217 		    T3b = VADD(TF, TJ);
218 	       }
	       /* Sums/differences of the odd-indexed terms; no memory access from here until the stores. */
219 	       T43 = VADD(T3b, T3c);
220 	       T44 = VADD(T3e, T3f);
221 	       T4a = VADD(T3j, T3k);
222 	       T4b = VADD(T3n, T3m);
223 	       T4m = VSUB(T4a, T4b);
224 	       T3l = VSUB(T3j, T3k);
225 	       T3o = VSUB(T3m, T3n);
226 	       T3p = VFMA(LDK(KP414213562), T3o, T3l);
227 	       T3E = VFNMS(LDK(KP414213562), T3l, T3o);
228 	       TL = VFMA(LDK(KP414213562), TK, TB);
229 	       T14 = VFNMS(LDK(KP414213562), T13, TU);
230 	       T15 = VSUB(TL, T14);
231 	       T2K = VADD(TL, T14);
232 	       T1s = VSUB(T1p, T1r);
233 	       T1t = VADD(T1g, T1l);
234 	       T1u = VFNMS(LDK(KP707106781), T1t, T1s);
235 	       T2F = VFMA(LDK(KP707106781), T1t, T1s);
236 	       T3d = VSUB(T3b, T3c);
237 	       T3g = VSUB(T3e, T3f);
238 	       T3h = VADD(T3d, T3g);
239 	       T3C = VSUB(T3g, T3d);
240 	       T1b = VSUB(T18, T1a);
241 	       T1m = VSUB(T1g, T1l);
242 	       T1n = VFNMS(LDK(KP707106781), T1m, T1b);
243 	       T2E = VFMA(LDK(KP707106781), T1m, T1b);
244 	       T28 = VFMA(LDK(KP414213562), TU, T13);
245 	       T29 = VFNMS(LDK(KP414213562), TB, TK);
246 	       T2a = VSUB(T28, T29);
247 	       T2z = VADD(T29, T28);
	       /*
	        * Output group A: writes Rp[0, 4, 6, 8, 10, 12] and
	        * Rm[3, 5, 7, 9, 11, 15].  Rp values are VMUL(1/2, ...);
	        * Rm values are VCONJ(VMUL(1/2, ...)).
	        */
248 	       {
249 		    V T4o, T4u, T4r, T4v, T4k, T4n, T4p, T4q, T4s, T4x, T4t, T4w, T3y, T3K, T3H;
250 		    V T3L, T3i, T3x, T3D, T3G, T3I, T3N, T3J, T3M, T46, T4g, T4d, T4h, T42, T45;
251 		    V T49, T4c, T4e, T4j, T4f, T4i;
252 		    T4k = VSUB(T40, T41);
253 		    T4n = VADD(T4l, T4m);
254 		    T4o = VFMA(LDK(KP707106781), T4n, T4k);
255 		    T4u = VFNMS(LDK(KP707106781), T4n, T4k);
256 		    T4p = VSUB(T44, T43);
257 		    T4q = VSUB(T4m, T4l);
258 		    T4r = VFMA(LDK(KP707106781), T4q, T4p);
259 		    T4v = VFNMS(LDK(KP707106781), T4q, T4p);
260 		    T4s = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4r, T4o)));
261 		    ST(&(Rm[WS(rs, 3)]), T4s, -ms, &(Rm[WS(rs, 1)]));
262 		    T4x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T4v, T4u)));
263 		    ST(&(Rm[WS(rs, 11)]), T4x, -ms, &(Rm[WS(rs, 1)]));
264 		    T4t = VMUL(LDK(KP500000000), VFMAI(T4r, T4o));
265 		    ST(&(Rp[WS(rs, 4)]), T4t, ms, &(Rp[0]));
266 		    T4w = VMUL(LDK(KP500000000), VFNMSI(T4v, T4u));
267 		    ST(&(Rp[WS(rs, 12)]), T4w, ms, &(Rp[0]));
268 		    T3i = VFNMS(LDK(KP707106781), T3h, T3a);
269 		    T3x = VSUB(T3p, T3w);
270 		    T3y = VFMA(LDK(KP923879532), T3x, T3i);
271 		    T3K = VFNMS(LDK(KP923879532), T3x, T3i);
272 		    T3D = VFNMS(LDK(KP707106781), T3C, T3B);
273 		    T3G = VSUB(T3E, T3F);
274 		    T3H = VFNMS(LDK(KP923879532), T3G, T3D);
275 		    T3L = VFMA(LDK(KP923879532), T3G, T3D);
276 		    T3I = VMUL(LDK(KP500000000), VFNMSI(T3H, T3y));
277 		    ST(&(Rp[WS(rs, 6)]), T3I, ms, &(Rp[0]));
278 		    T3N = VMUL(LDK(KP500000000), VFMAI(T3L, T3K));
279 		    ST(&(Rp[WS(rs, 10)]), T3N, ms, &(Rp[0]));
280 		    T3J = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3H, T3y)));
281 		    ST(&(Rm[WS(rs, 5)]), T3J, -ms, &(Rm[WS(rs, 1)]));
282 		    T3M = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3L, T3K)));
283 		    ST(&(Rm[WS(rs, 9)]), T3M, -ms, &(Rm[WS(rs, 1)]));
284 		    T42 = VADD(T40, T41);
285 		    T45 = VADD(T43, T44);
286 		    T46 = VSUB(T42, T45);
287 		    T4g = VADD(T42, T45);
288 		    T49 = VADD(T47, T48);
289 		    T4c = VADD(T4a, T4b);
290 		    T4d = VSUB(T49, T4c);
291 		    T4h = VADD(T49, T4c);
292 		    T4e = VMUL(LDK(KP500000000), VFMAI(T4d, T46));
293 		    ST(&(Rp[WS(rs, 8)]), T4e, ms, &(Rp[0]));
294 		    T4j = VCONJ(VMUL(LDK(KP500000000), VADD(T4h, T4g)));
295 		    ST(&(Rm[WS(rs, 15)]), T4j, -ms, &(Rm[WS(rs, 1)]));
296 		    T4f = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T4d, T46)));
297 		    ST(&(Rm[WS(rs, 7)]), T4f, -ms, &(Rm[WS(rs, 1)]));
298 		    T4i = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
299 		    ST(&(Rp[0]), T4i, ms, &(Rp[0]));
300 	       }
	       /* Output group B: writes Rp[2, 14] and Rm[1, 13]. */
301 	       T3O = VFMA(LDK(KP707106781), T3h, T3a);
302 	       T3P = VADD(T3F, T3E);
303 	       T3Q = VFMA(LDK(KP923879532), T3P, T3O);
304 	       T3W = VFNMS(LDK(KP923879532), T3P, T3O);
305 	       T3R = VFMA(LDK(KP707106781), T3C, T3B);
306 	       T3S = VADD(T3w, T3p);
307 	       T3T = VFMA(LDK(KP923879532), T3S, T3R);
308 	       T3X = VFNMS(LDK(KP923879532), T3S, T3R);
309 	       T3U = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T3T, T3Q)));
310 	       ST(&(Rm[WS(rs, 1)]), T3U, -ms, &(Rm[WS(rs, 1)]));
311 	       T3Z = VCONJ(VMUL(LDK(KP500000000), VFMAI(T3X, T3W)));
312 	       ST(&(Rm[WS(rs, 13)]), T3Z, -ms, &(Rm[WS(rs, 1)]));
313 	       T3V = VMUL(LDK(KP500000000), VFMAI(T3T, T3Q));
314 	       ST(&(Rp[WS(rs, 2)]), T3V, ms, &(Rp[0]));
315 	       T3Y = VMUL(LDK(KP500000000), VFNMSI(T3X, T3W));
316 	       ST(&(Rp[WS(rs, 14)]), T3Y, ms, &(Rp[0]));
	       /*
	        * Output group C: writes the odd Rp slots (1, 3, ..., 15) and
	        * the even Rm slots (0, 2, ..., 14).  The first half uses the
	        * KP198912367 / KP980785280 pair, the second half the
	        * KP668178637 / KP831469612 pair.
	        */
317 	       {
318 		    V T2I, T35, T2S, T31, T2P, T34, T2T, T2Y, T2A, T2Z, T2H, T30, T2D, T2G, T2L;
319 		    V T2W, T2O, T2X, T2M, T2N, T2Q, T36, T37, T2R, T2U, T32, T33, T2V, T20, T2v;
320 		    V T2i, T2r, T2f, T2u, T2j, T2o, T16, T2p, T1Z, T2q, T1v, T1Y, T2b, T2m, T2e;
321 		    V T2n, T2c, T2d, T2g, T2w, T2x, T2h, T2k, T2s, T2t, T2l;
322 		    T2A = VFNMS(LDK(KP923879532), T2z, T2y);
323 		    T2Z = VFMA(LDK(KP923879532), T2K, T2J);
324 		    T2D = VFMA(LDK(KP198912367), T2C, T2B);
325 		    T2G = VFNMS(LDK(KP198912367), T2F, T2E);
326 		    T2H = VSUB(T2D, T2G);
327 		    T30 = VADD(T2D, T2G);
328 		    T2I = VFMA(LDK(KP980785280), T2H, T2A);
329 		    T35 = VFNMS(LDK(KP980785280), T30, T2Z);
330 		    T2S = VFNMS(LDK(KP980785280), T2H, T2A);
331 		    T31 = VFMA(LDK(KP980785280), T30, T2Z);
332 		    T2L = VFNMS(LDK(KP923879532), T2K, T2J);
333 		    T2W = VFMA(LDK(KP923879532), T2z, T2y);
334 		    T2M = VFMA(LDK(KP198912367), T2E, T2F);
335 		    T2N = VFNMS(LDK(KP198912367), T2B, T2C);
336 		    T2O = VSUB(T2M, T2N);
337 		    T2X = VADD(T2N, T2M);
338 		    T2P = VFMA(LDK(KP980785280), T2O, T2L);
339 		    T34 = VFNMS(LDK(KP980785280), T2X, T2W);
340 		    T2T = VFNMS(LDK(KP980785280), T2O, T2L);
341 		    T2Y = VFMA(LDK(KP980785280), T2X, T2W);
342 		    T2Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2P, T2I)));
343 		    ST(&(Rm[WS(rs, 6)]), T2Q, -ms, &(Rm[0]));
344 		    T36 = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T35, T34)));
345 		    ST(&(Rm[WS(rs, 14)]), T36, -ms, &(Rm[0]));
346 		    T37 = VMUL(LDK(KP500000000), VFMAI(T35, T34));
347 		    ST(&(Rp[WS(rs, 15)]), T37, ms, &(Rp[WS(rs, 1)]));
348 		    T2R = VMUL(LDK(KP500000000), VFMAI(T2P, T2I));
349 		    ST(&(Rp[WS(rs, 7)]), T2R, ms, &(Rp[WS(rs, 1)]));
350 		    T2U = VMUL(LDK(KP500000000), VFNMSI(T2T, T2S));
351 		    ST(&(Rp[WS(rs, 9)]), T2U, ms, &(Rp[WS(rs, 1)]));
352 		    T32 = VMUL(LDK(KP500000000), VFNMSI(T31, T2Y));
353 		    ST(&(Rp[WS(rs, 1)]), T32, ms, &(Rp[WS(rs, 1)]));
354 		    T33 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T31, T2Y)));
355 		    ST(&(Rm[0]), T33, -ms, &(Rm[0]));
356 		    T2V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2T, T2S)));
357 		    ST(&(Rm[WS(rs, 8)]), T2V, -ms, &(Rm[0]));
358 		    T16 = VFNMS(LDK(KP923879532), T15, Ts);
359 		    T2p = VFMA(LDK(KP923879532), T2a, T27);
360 		    T1v = VFMA(LDK(KP668178637), T1u, T1n);
361 		    T1Y = VFNMS(LDK(KP668178637), T1X, T1Q);
362 		    T1Z = VSUB(T1v, T1Y);
363 		    T2q = VADD(T1Y, T1v);
364 		    T20 = VFMA(LDK(KP831469612), T1Z, T16);
365 		    T2v = VFNMS(LDK(KP831469612), T2q, T2p);
366 		    T2i = VFNMS(LDK(KP831469612), T1Z, T16);
367 		    T2r = VFMA(LDK(KP831469612), T2q, T2p);
368 		    T2b = VFNMS(LDK(KP923879532), T2a, T27);
369 		    T2m = VFMA(LDK(KP923879532), T15, Ts);
370 		    T2c = VFNMS(LDK(KP668178637), T1n, T1u);
371 		    T2d = VFMA(LDK(KP668178637), T1Q, T1X);
372 		    T2e = VSUB(T2c, T2d);
373 		    T2n = VADD(T2d, T2c);
374 		    T2f = VFNMS(LDK(KP831469612), T2e, T2b);
375 		    T2u = VFNMS(LDK(KP831469612), T2n, T2m);
376 		    T2j = VFMA(LDK(KP831469612), T2e, T2b);
377 		    T2o = VFMA(LDK(KP831469612), T2n, T2m);
378 		    T2g = VMUL(LDK(KP500000000), VFNMSI(T2f, T20));
379 		    ST(&(Rp[WS(rs, 5)]), T2g, ms, &(Rp[WS(rs, 1)]));
380 		    T2w = VMUL(LDK(KP500000000), VFNMSI(T2v, T2u));
381 		    ST(&(Rp[WS(rs, 13)]), T2w, ms, &(Rp[WS(rs, 1)]));
382 		    T2x = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2v, T2u)));
383 		    ST(&(Rm[WS(rs, 12)]), T2x, -ms, &(Rm[0]));
384 		    T2h = VCONJ(VMUL(LDK(KP500000000), VFMAI(T2f, T20)));
385 		    ST(&(Rm[WS(rs, 4)]), T2h, -ms, &(Rm[0]));
386 		    T2k = VMUL(LDK(KP500000000), VFMAI(T2j, T2i));
387 		    ST(&(Rp[WS(rs, 11)]), T2k, ms, &(Rp[WS(rs, 1)]));
388 		    T2s = VMUL(LDK(KP500000000), VFMAI(T2r, T2o));
389 		    ST(&(Rp[WS(rs, 3)]), T2s, ms, &(Rp[WS(rs, 1)]));
390 		    T2t = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2r, T2o)));
391 		    ST(&(Rm[WS(rs, 2)]), T2t, -ms, &(Rm[0]));
392 		    T2l = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T2j, T2i)));
393 		    ST(&(Rm[WS(rs, 10)]), T2l, -ms, &(Rm[0]));
394 	       }
395 	  }
396      }
     /* Invoked once, after the loop has finished. */
397      VLEAVE();
398 }
399 
/*
 * Twiddle instruction table for this codelet: one VTW(1, k) entry for each
 * k = 1 .. 31, followed by a { TW_NEXT, VL, 0 } entry that ends the list.
 * NOTE(review): VTW's expansion is defined in the SIMD headers, not here;
 * the 31 entries line up with the 31 LDW twiddle loads in hc2cfdftv_32.
 */
400 static const tw_instr twinstr[] = {
401      VTW(1, 1),
402      VTW(1, 2),
403      VTW(1, 3),
404      VTW(1, 4),
405      VTW(1, 5),
406      VTW(1, 6),
407      VTW(1, 7),
408      VTW(1, 8),
409      VTW(1, 9),
410      VTW(1, 10),
411      VTW(1, 11),
412      VTW(1, 12),
413      VTW(1, 13),
414      VTW(1, 14),
415      VTW(1, 15),
416      VTW(1, 16),
417      VTW(1, 17),
418      VTW(1, 18),
419      VTW(1, 19),
420      VTW(1, 20),
421      VTW(1, 21),
422      VTW(1, 22),
423      VTW(1, 23),
424      VTW(1, 24),
425      VTW(1, 25),
426      VTW(1, 26),
427      VTW(1, 27),
428      VTW(1, 28),
429      VTW(1, 29),
430      VTW(1, 30),
431      VTW(1, 31),
432      { TW_NEXT, VL, 0 }
433 };
434 
/*
 * Descriptor handed to the planner at registration: size 32, the codelet
 * name string, the twiddle table above, the GENUS object, and the counts
 * { 119, 94, 130, 0 }, which match the "119 additions, 94 multiplications,
 * 130 fused multiply/add" figures in the generator comment for this variant.
 * NOTE(review): field meanings are inferred from that comment; confirm
 * against the hc2c_desc declaration.
 */
435 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, { 119, 94, 130, 0 } };
436 
/*
 * Registration entry point: passes the kernel function hc2cfdftv_32 and its
 * descriptor to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 * NOTE(review): the text fused in front of "void" on the next line looks
 * like source-listing residue rather than C.
 */
XSIMD(codelet_hc2cfdftv_32)437 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
438      X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
439 }
440 #else
441 
442 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 32 -dit -name hc2cfdftv_32 -include rdft/simd/hc2cfv.h */
443 
444 /*
445  * This function contains 249 FP additions, 133 FP multiplications,
446  * (or, 233 additions, 117 multiplications, 16 fused multiply/add),
447  * 130 stack variables, 9 constants, and 64 memory accesses
448  */
449 #include "rdft/simd/hc2cfv.h"
450 
hc2cfdftv_32(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)451 static void hc2cfdftv_32(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
452 {
453      DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
454      DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
455      DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
456      DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
457      DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
458      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
459      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
460      DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
461      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
462      {
463 	  INT m;
464 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 62)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 62), MAKE_VOLATILE_STRIDE(128, rs)) {
465 	       V Ta, T2m, Tx, T2h, T3R, T4h, T3q, T4g, T3B, T4n, T3E, T4o, T1B, T2S, T1O;
466 	       V T2R, TV, T2p, T1i, T2o, T3L, T4q, T3I, T4r, T3w, T4k, T3t, T4j, T26, T2V;
467 	       V T2d, T2U;
468 	       {
469 		    V T4, T1m, T1H, T2j, T1M, T2l, T9, T1o, Tf, T1r, Tq, T1w, Tv, T1y, Tk;
470 		    V T1t, Tl, Tw, T3P, T3Q, T3o, T3p, T3z, T3A, T3C, T3D, T1p, T1N, T1A, T1C;
471 		    V T1u, T1z;
472 		    {
473 			 V T1, T3, T2, T1l, T1G, T1F, T1E, T1D, T2i, T1L, T1K, T1J, T1I, T2k, T6;
474 			 V T8, T7, T5, T1n, Tc, Te, Td, Tb, T1q, Tn, Tp, To, Tm, T1v, Ts;
475 			 V Tu, Tt, Tr, T1x, Th, Tj, Ti, Tg, T1s;
476 			 T1 = LD(&(Rp[0]), ms, &(Rp[0]));
477 			 T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
478 			 T3 = VCONJ(T2);
479 			 T4 = VADD(T1, T3);
480 			 T1l = LDW(&(W[0]));
481 			 T1m = VZMULIJ(T1l, VSUB(T3, T1));
482 			 T1G = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
483 			 T1E = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
484 			 T1F = VCONJ(T1E);
485 			 T1D = LDW(&(W[TWVL * 16]));
486 			 T1H = VZMULIJ(T1D, VSUB(T1F, T1G));
487 			 T2i = LDW(&(W[TWVL * 14]));
488 			 T2j = VZMULJ(T2i, VADD(T1G, T1F));
489 			 T1L = LD(&(Rp[WS(rs, 12)]), ms, &(Rp[0]));
490 			 T1J = LD(&(Rm[WS(rs, 12)]), -ms, &(Rm[0]));
491 			 T1K = VCONJ(T1J);
492 			 T1I = LDW(&(W[TWVL * 48]));
493 			 T1M = VZMULIJ(T1I, VSUB(T1K, T1L));
494 			 T2k = LDW(&(W[TWVL * 46]));
495 			 T2l = VZMULJ(T2k, VADD(T1L, T1K));
496 			 T6 = LD(&(Rp[WS(rs, 8)]), ms, &(Rp[0]));
497 			 T7 = LD(&(Rm[WS(rs, 8)]), -ms, &(Rm[0]));
498 			 T8 = VCONJ(T7);
499 			 T5 = LDW(&(W[TWVL * 30]));
500 			 T9 = VZMULJ(T5, VADD(T6, T8));
501 			 T1n = LDW(&(W[TWVL * 32]));
502 			 T1o = VZMULIJ(T1n, VSUB(T8, T6));
503 			 Tc = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
504 			 Td = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
505 			 Te = VCONJ(Td);
506 			 Tb = LDW(&(W[TWVL * 6]));
507 			 Tf = VZMULJ(Tb, VADD(Tc, Te));
508 			 T1q = LDW(&(W[TWVL * 8]));
509 			 T1r = VZMULIJ(T1q, VSUB(Te, Tc));
510 			 Tn = LD(&(Rp[WS(rs, 14)]), ms, &(Rp[0]));
511 			 To = LD(&(Rm[WS(rs, 14)]), -ms, &(Rm[0]));
512 			 Tp = VCONJ(To);
513 			 Tm = LDW(&(W[TWVL * 54]));
514 			 Tq = VZMULJ(Tm, VADD(Tn, Tp));
515 			 T1v = LDW(&(W[TWVL * 56]));
516 			 T1w = VZMULIJ(T1v, VSUB(Tp, Tn));
517 			 Ts = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
518 			 Tt = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
519 			 Tu = VCONJ(Tt);
520 			 Tr = LDW(&(W[TWVL * 22]));
521 			 Tv = VZMULJ(Tr, VADD(Ts, Tu));
522 			 T1x = LDW(&(W[TWVL * 24]));
523 			 T1y = VZMULIJ(T1x, VSUB(Tu, Ts));
524 			 Th = LD(&(Rp[WS(rs, 10)]), ms, &(Rp[0]));
525 			 Ti = LD(&(Rm[WS(rs, 10)]), -ms, &(Rm[0]));
526 			 Tj = VCONJ(Ti);
527 			 Tg = LDW(&(W[TWVL * 38]));
528 			 Tk = VZMULJ(Tg, VADD(Th, Tj));
529 			 T1s = LDW(&(W[TWVL * 40]));
530 			 T1t = VZMULIJ(T1s, VSUB(Tj, Th));
531 		    }
532 		    Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
533 		    T2m = VSUB(T2j, T2l);
534 		    Tl = VSUB(Tf, Tk);
535 		    Tw = VSUB(Tq, Tv);
536 		    Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
537 		    T2h = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
538 		    T3P = VADD(Tq, Tv);
539 		    T3Q = VADD(Tf, Tk);
540 		    T3R = VSUB(T3P, T3Q);
541 		    T4h = VADD(T3Q, T3P);
542 		    T3o = VADD(T4, T9);
543 		    T3p = VADD(T2j, T2l);
544 		    T3q = VMUL(LDK(KP500000000), VSUB(T3o, T3p));
545 		    T4g = VADD(T3o, T3p);
546 		    T3z = VADD(T1m, T1o);
547 		    T3A = VADD(T1H, T1M);
548 		    T3B = VSUB(T3z, T3A);
549 		    T4n = VADD(T3z, T3A);
550 		    T3C = VADD(T1w, T1y);
551 		    T3D = VADD(T1r, T1t);
552 		    T3E = VSUB(T3C, T3D);
553 		    T4o = VADD(T3D, T3C);
554 		    T1p = VSUB(T1m, T1o);
555 		    T1N = VSUB(T1H, T1M);
556 		    T1u = VSUB(T1r, T1t);
557 		    T1z = VSUB(T1w, T1y);
558 		    T1A = VMUL(LDK(KP707106781), VADD(T1u, T1z));
559 		    T1C = VMUL(LDK(KP707106781), VSUB(T1z, T1u));
560 		    T1B = VADD(T1p, T1A);
561 		    T2S = VADD(T1N, T1C);
562 		    T1O = VSUB(T1C, T1N);
563 		    T2R = VSUB(T1p, T1A);
564 	       }
565 	       {
566 		    V TD, T1R, T1b, T29, T1g, T2b, TI, T1T, TO, T1Y, T10, T22, T15, T24, TT;
567 		    V T1W, TJ, TU, T16, T1h, T3J, T3K, T3G, T3H, T3u, T3v, T3r, T3s, T25, T2c;
568 		    V T20, T27, T1U, T1Z;
569 		    {
570 			 V TA, TC, TB, Tz, T1Q, T18, T1a, T19, T17, T28, T1d, T1f, T1e, T1c, T2a;
571 			 V TF, TH, TG, TE, T1S, TL, TN, TM, TK, T1X, TX, TZ, TY, TW, T21;
572 			 V T12, T14, T13, T11, T23, TQ, TS, TR, TP, T1V;
573 			 TA = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
574 			 TB = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
575 			 TC = VCONJ(TB);
576 			 Tz = LDW(&(W[TWVL * 2]));
577 			 TD = VZMULJ(Tz, VADD(TA, TC));
578 			 T1Q = LDW(&(W[TWVL * 4]));
579 			 T1R = VZMULIJ(T1Q, VSUB(TC, TA));
580 			 T18 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
581 			 T19 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
582 			 T1a = VCONJ(T19);
583 			 T17 = LDW(&(W[TWVL * 10]));
584 			 T1b = VZMULJ(T17, VADD(T18, T1a));
585 			 T28 = LDW(&(W[TWVL * 12]));
586 			 T29 = VZMULIJ(T28, VSUB(T1a, T18));
587 			 T1d = LD(&(Rp[WS(rs, 11)]), ms, &(Rp[WS(rs, 1)]));
588 			 T1e = LD(&(Rm[WS(rs, 11)]), -ms, &(Rm[WS(rs, 1)]));
589 			 T1f = VCONJ(T1e);
590 			 T1c = LDW(&(W[TWVL * 42]));
591 			 T1g = VZMULJ(T1c, VADD(T1d, T1f));
592 			 T2a = LDW(&(W[TWVL * 44]));
593 			 T2b = VZMULIJ(T2a, VSUB(T1f, T1d));
594 			 TF = LD(&(Rp[WS(rs, 9)]), ms, &(Rp[WS(rs, 1)]));
595 			 TG = LD(&(Rm[WS(rs, 9)]), -ms, &(Rm[WS(rs, 1)]));
596 			 TH = VCONJ(TG);
597 			 TE = LDW(&(W[TWVL * 34]));
598 			 TI = VZMULJ(TE, VADD(TF, TH));
599 			 T1S = LDW(&(W[TWVL * 36]));
600 			 T1T = VZMULIJ(T1S, VSUB(TH, TF));
601 			 TL = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
602 			 TM = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
603 			 TN = VCONJ(TM);
604 			 TK = LDW(&(W[TWVL * 18]));
605 			 TO = VZMULJ(TK, VADD(TL, TN));
606 			 T1X = LDW(&(W[TWVL * 20]));
607 			 T1Y = VZMULIJ(T1X, VSUB(TN, TL));
608 			 TX = LD(&(Rp[WS(rs, 15)]), ms, &(Rp[WS(rs, 1)]));
609 			 TY = LD(&(Rm[WS(rs, 15)]), -ms, &(Rm[WS(rs, 1)]));
610 			 TZ = VCONJ(TY);
611 			 TW = LDW(&(W[TWVL * 58]));
612 			 T10 = VZMULJ(TW, VADD(TX, TZ));
613 			 T21 = LDW(&(W[TWVL * 60]));
614 			 T22 = VZMULIJ(T21, VSUB(TZ, TX));
615 			 T12 = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
616 			 T13 = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
617 			 T14 = VCONJ(T13);
618 			 T11 = LDW(&(W[TWVL * 26]));
619 			 T15 = VZMULJ(T11, VADD(T12, T14));
620 			 T23 = LDW(&(W[TWVL * 28]));
621 			 T24 = VZMULIJ(T23, VSUB(T14, T12));
622 			 TQ = LD(&(Rp[WS(rs, 13)]), ms, &(Rp[WS(rs, 1)]));
623 			 TR = LD(&(Rm[WS(rs, 13)]), -ms, &(Rm[WS(rs, 1)]));
624 			 TS = VCONJ(TR);
625 			 TP = LDW(&(W[TWVL * 50]));
626 			 TT = VZMULJ(TP, VADD(TQ, TS));
627 			 T1V = LDW(&(W[TWVL * 52]));
628 			 T1W = VZMULIJ(T1V, VSUB(TS, TQ));
629 		    }
630 		    TJ = VSUB(TD, TI);
631 		    TU = VSUB(TO, TT);
632 		    TV = VFNMS(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TJ));
633 		    T2p = VFMA(LDK(KP382683432), TJ, VMUL(LDK(KP923879532), TU));
634 		    T16 = VSUB(T10, T15);
635 		    T1h = VSUB(T1b, T1g);
636 		    T1i = VFMA(LDK(KP923879532), T16, VMUL(LDK(KP382683432), T1h));
637 		    T2o = VFNMS(LDK(KP923879532), T1h, VMUL(LDK(KP382683432), T16));
638 		    T3J = VADD(T1Y, T1W);
639 		    T3K = VADD(T1R, T1T);
640 		    T3L = VSUB(T3J, T3K);
641 		    T4q = VADD(T3K, T3J);
642 		    T3G = VADD(T22, T24);
643 		    T3H = VADD(T29, T2b);
644 		    T3I = VSUB(T3G, T3H);
645 		    T4r = VADD(T3G, T3H);
646 		    T3u = VADD(T10, T15);
647 		    T3v = VADD(T1b, T1g);
648 		    T3w = VSUB(T3u, T3v);
649 		    T4k = VADD(T3u, T3v);
650 		    T3r = VADD(TD, TI);
651 		    T3s = VADD(TO, TT);
652 		    T3t = VSUB(T3r, T3s);
653 		    T4j = VADD(T3r, T3s);
654 		    T25 = VSUB(T22, T24);
655 		    T2c = VSUB(T29, T2b);
656 		    T1U = VSUB(T1R, T1T);
657 		    T1Z = VSUB(T1W, T1Y);
658 		    T20 = VMUL(LDK(KP707106781), VADD(T1U, T1Z));
659 		    T27 = VMUL(LDK(KP707106781), VSUB(T1Z, T1U));
660 		    T26 = VADD(T20, T25);
661 		    T2V = VADD(T27, T2c);
662 		    T2d = VSUB(T27, T2c);
663 		    T2U = VSUB(T25, T20);
664 	       }
665 	       {
666 		    V T4m, T4w, T4t, T4x, T4i, T4l, T4p, T4s, T4u, T4z, T4v, T4y, T4E, T4L, T4H;
667 		    V T4K, T4A, T4F, T4D, T4G, T4B, T4C, T4I, T4N, T4J, T4M, T3O, T4c, T4d, T3X;
668 		    V T40, T46, T49, T41, T3y, T47, T3T, T45, T3N, T44, T3W, T48, T3x, T3S, T3F;
669 		    V T3M, T3U, T3V, T3Y, T4e, T4f, T3Z, T42, T4a, T4b, T43;
670 		    T4i = VADD(T4g, T4h);
671 		    T4l = VADD(T4j, T4k);
672 		    T4m = VADD(T4i, T4l);
673 		    T4w = VSUB(T4i, T4l);
674 		    T4p = VADD(T4n, T4o);
675 		    T4s = VADD(T4q, T4r);
676 		    T4t = VADD(T4p, T4s);
677 		    T4x = VBYI(VSUB(T4s, T4p));
678 		    T4u = VCONJ(VMUL(LDK(KP500000000), VSUB(T4m, T4t)));
679 		    ST(&(Rm[WS(rs, 15)]), T4u, -ms, &(Rm[WS(rs, 1)]));
680 		    T4z = VMUL(LDK(KP500000000), VADD(T4w, T4x));
681 		    ST(&(Rp[WS(rs, 8)]), T4z, ms, &(Rp[0]));
682 		    T4v = VMUL(LDK(KP500000000), VADD(T4m, T4t));
683 		    ST(&(Rp[0]), T4v, ms, &(Rp[0]));
684 		    T4y = VCONJ(VMUL(LDK(KP500000000), VSUB(T4w, T4x)));
685 		    ST(&(Rm[WS(rs, 7)]), T4y, -ms, &(Rm[WS(rs, 1)]));
686 		    T4A = VMUL(LDK(KP500000000), VSUB(T4g, T4h));
687 		    T4F = VSUB(T4k, T4j);
688 		    T4B = VSUB(T4n, T4o);
689 		    T4C = VSUB(T4r, T4q);
690 		    T4D = VMUL(LDK(KP353553390), VADD(T4B, T4C));
691 		    T4G = VMUL(LDK(KP707106781), VSUB(T4C, T4B));
692 		    T4E = VADD(T4A, T4D);
693 		    T4L = VMUL(LDK(KP500000000), VBYI(VSUB(T4G, T4F)));
694 		    T4H = VMUL(LDK(KP500000000), VBYI(VADD(T4F, T4G)));
695 		    T4K = VSUB(T4A, T4D);
696 		    T4I = VCONJ(VSUB(T4E, T4H));
697 		    ST(&(Rm[WS(rs, 3)]), T4I, -ms, &(Rm[WS(rs, 1)]));
698 		    T4N = VADD(T4K, T4L);
699 		    ST(&(Rp[WS(rs, 12)]), T4N, ms, &(Rp[0]));
700 		    T4J = VADD(T4E, T4H);
701 		    ST(&(Rp[WS(rs, 4)]), T4J, ms, &(Rp[0]));
702 		    T4M = VCONJ(VSUB(T4K, T4L));
703 		    ST(&(Rm[WS(rs, 11)]), T4M, -ms, &(Rm[WS(rs, 1)]));
704 		    T3x = VMUL(LDK(KP353553390), VADD(T3t, T3w));
705 		    T3y = VADD(T3q, T3x);
706 		    T47 = VSUB(T3q, T3x);
707 		    T3S = VMUL(LDK(KP707106781), VSUB(T3w, T3t));
708 		    T3T = VADD(T3R, T3S);
709 		    T45 = VSUB(T3S, T3R);
710 		    T3F = VFMA(LDK(KP923879532), T3B, VMUL(LDK(KP382683432), T3E));
711 		    T3M = VFNMS(LDK(KP382683432), T3L, VMUL(LDK(KP923879532), T3I));
712 		    T3N = VMUL(LDK(KP500000000), VADD(T3F, T3M));
713 		    T44 = VSUB(T3M, T3F);
714 		    T3U = VFNMS(LDK(KP382683432), T3B, VMUL(LDK(KP923879532), T3E));
715 		    T3V = VFMA(LDK(KP923879532), T3L, VMUL(LDK(KP382683432), T3I));
716 		    T3W = VADD(T3U, T3V);
717 		    T48 = VMUL(LDK(KP500000000), VSUB(T3V, T3U));
718 		    T3O = VADD(T3y, T3N);
719 		    T4c = VMUL(LDK(KP500000000), VBYI(VADD(T45, T44)));
720 		    T4d = VADD(T47, T48);
721 		    T3X = VMUL(LDK(KP500000000), VBYI(VADD(T3T, T3W)));
722 		    T40 = VSUB(T3y, T3N);
723 		    T46 = VMUL(LDK(KP500000000), VBYI(VSUB(T44, T45)));
724 		    T49 = VSUB(T47, T48);
725 		    T41 = VMUL(LDK(KP500000000), VBYI(VSUB(T3W, T3T)));
726 		    T3Y = VCONJ(VSUB(T3O, T3X));
727 		    ST(&(Rm[WS(rs, 1)]), T3Y, -ms, &(Rm[WS(rs, 1)]));
728 		    T4e = VADD(T4c, T4d);
729 		    ST(&(Rp[WS(rs, 6)]), T4e, ms, &(Rp[0]));
730 		    T4f = VCONJ(VSUB(T4d, T4c));
731 		    ST(&(Rm[WS(rs, 5)]), T4f, -ms, &(Rm[WS(rs, 1)]));
732 		    T3Z = VADD(T3O, T3X);
733 		    ST(&(Rp[WS(rs, 2)]), T3Z, ms, &(Rp[0]));
734 		    T42 = VCONJ(VSUB(T40, T41));
735 		    ST(&(Rm[WS(rs, 13)]), T42, -ms, &(Rm[WS(rs, 1)]));
736 		    T4a = VADD(T46, T49);
737 		    ST(&(Rp[WS(rs, 10)]), T4a, ms, &(Rp[0]));
738 		    T4b = VCONJ(VSUB(T49, T46));
739 		    ST(&(Rm[WS(rs, 9)]), T4b, -ms, &(Rm[WS(rs, 1)]));
740 		    T43 = VADD(T40, T41);
741 		    ST(&(Rp[WS(rs, 14)]), T43, ms, &(Rp[0]));
742 		    {
743 			 V T2g, T2K, T2L, T2v, T2y, T2E, T2H, T2z, T1k, T2F, T2u, T2G, T2f, T2C, T2r;
744 			 V T2D, Ty, T1j, T2s, T2t, T1P, T2e, T2n, T2q, T2w, T2M, T2N, T2x, T2A, T2I;
745 			 V T2J, T2B;
746 			 Ty = VADD(Ta, Tx);
747 			 T1j = VMUL(LDK(KP500000000), VADD(TV, T1i));
748 			 T1k = VADD(Ty, T1j);
749 			 T2F = VSUB(Ty, T1j);
750 			 T2s = VFNMS(LDK(KP195090322), T1B, VMUL(LDK(KP980785280), T1O));
751 			 T2t = VFMA(LDK(KP195090322), T26, VMUL(LDK(KP980785280), T2d));
752 			 T2u = VADD(T2s, T2t);
753 			 T2G = VMUL(LDK(KP500000000), VSUB(T2t, T2s));
754 			 T1P = VFMA(LDK(KP980785280), T1B, VMUL(LDK(KP195090322), T1O));
755 			 T2e = VFNMS(LDK(KP195090322), T2d, VMUL(LDK(KP980785280), T26));
756 			 T2f = VMUL(LDK(KP500000000), VADD(T1P, T2e));
757 			 T2C = VSUB(T2e, T1P);
758 			 T2n = VSUB(T2h, T2m);
759 			 T2q = VSUB(T2o, T2p);
760 			 T2r = VADD(T2n, T2q);
761 			 T2D = VSUB(T2q, T2n);
762 			 T2g = VADD(T1k, T2f);
763 			 T2K = VMUL(LDK(KP500000000), VBYI(VADD(T2D, T2C)));
764 			 T2L = VADD(T2F, T2G);
765 			 T2v = VMUL(LDK(KP500000000), VBYI(VADD(T2r, T2u)));
766 			 T2y = VSUB(T1k, T2f);
767 			 T2E = VMUL(LDK(KP500000000), VBYI(VSUB(T2C, T2D)));
768 			 T2H = VSUB(T2F, T2G);
769 			 T2z = VMUL(LDK(KP500000000), VBYI(VSUB(T2u, T2r)));
770 			 T2w = VCONJ(VSUB(T2g, T2v));
771 			 ST(&(Rm[0]), T2w, -ms, &(Rm[0]));
772 			 T2M = VADD(T2K, T2L);
773 			 ST(&(Rp[WS(rs, 7)]), T2M, ms, &(Rp[WS(rs, 1)]));
774 			 T2N = VCONJ(VSUB(T2L, T2K));
775 			 ST(&(Rm[WS(rs, 6)]), T2N, -ms, &(Rm[0]));
776 			 T2x = VADD(T2g, T2v);
777 			 ST(&(Rp[WS(rs, 1)]), T2x, ms, &(Rp[WS(rs, 1)]));
778 			 T2A = VCONJ(VSUB(T2y, T2z));
779 			 ST(&(Rm[WS(rs, 14)]), T2A, -ms, &(Rm[0]));
780 			 T2I = VADD(T2E, T2H);
781 			 ST(&(Rp[WS(rs, 9)]), T2I, ms, &(Rp[WS(rs, 1)]));
782 			 T2J = VCONJ(VSUB(T2H, T2E));
783 			 ST(&(Rm[WS(rs, 8)]), T2J, -ms, &(Rm[0]));
784 			 T2B = VADD(T2y, T2z);
785 			 ST(&(Rp[WS(rs, 15)]), T2B, ms, &(Rp[WS(rs, 1)]));
786 		    }
787 		    {
788 			 V T2Y, T3k, T3l, T35, T38, T3e, T3h, T39, T2Q, T3f, T34, T3g, T2X, T3c, T31;
789 			 V T3d, T2O, T2P, T32, T33, T2T, T2W, T2Z, T30, T36, T3m, T3n, T37, T3a, T3i;
790 			 V T3j, T3b;
791 			 T2O = VSUB(Ta, Tx);
792 			 T2P = VMUL(LDK(KP500000000), VADD(T2p, T2o));
793 			 T2Q = VADD(T2O, T2P);
794 			 T3f = VSUB(T2O, T2P);
795 			 T32 = VFNMS(LDK(KP555570233), T2R, VMUL(LDK(KP831469612), T2S));
796 			 T33 = VFMA(LDK(KP555570233), T2U, VMUL(LDK(KP831469612), T2V));
797 			 T34 = VADD(T32, T33);
798 			 T3g = VMUL(LDK(KP500000000), VSUB(T33, T32));
799 			 T2T = VFMA(LDK(KP831469612), T2R, VMUL(LDK(KP555570233), T2S));
800 			 T2W = VFNMS(LDK(KP555570233), T2V, VMUL(LDK(KP831469612), T2U));
801 			 T2X = VMUL(LDK(KP500000000), VADD(T2T, T2W));
802 			 T3c = VSUB(T2W, T2T);
803 			 T2Z = VADD(T2m, T2h);
804 			 T30 = VSUB(T1i, TV);
805 			 T31 = VADD(T2Z, T30);
806 			 T3d = VSUB(T30, T2Z);
807 			 T2Y = VADD(T2Q, T2X);
808 			 T3k = VMUL(LDK(KP500000000), VBYI(VADD(T3d, T3c)));
809 			 T3l = VADD(T3f, T3g);
810 			 T35 = VMUL(LDK(KP500000000), VBYI(VADD(T31, T34)));
811 			 T38 = VSUB(T2Q, T2X);
812 			 T3e = VMUL(LDK(KP500000000), VBYI(VSUB(T3c, T3d)));
813 			 T3h = VSUB(T3f, T3g);
814 			 T39 = VMUL(LDK(KP500000000), VBYI(VSUB(T34, T31)));
815 			 T36 = VCONJ(VSUB(T2Y, T35));
816 			 ST(&(Rm[WS(rs, 2)]), T36, -ms, &(Rm[0]));
817 			 T3m = VADD(T3k, T3l);
818 			 ST(&(Rp[WS(rs, 5)]), T3m, ms, &(Rp[WS(rs, 1)]));
819 			 T3n = VCONJ(VSUB(T3l, T3k));
820 			 ST(&(Rm[WS(rs, 4)]), T3n, -ms, &(Rm[0]));
821 			 T37 = VADD(T2Y, T35);
822 			 ST(&(Rp[WS(rs, 3)]), T37, ms, &(Rp[WS(rs, 1)]));
823 			 T3a = VCONJ(VSUB(T38, T39));
824 			 ST(&(Rm[WS(rs, 12)]), T3a, -ms, &(Rm[0]));
825 			 T3i = VADD(T3e, T3h);
826 			 ST(&(Rp[WS(rs, 11)]), T3i, ms, &(Rp[WS(rs, 1)]));
827 			 T3j = VCONJ(VSUB(T3h, T3e));
828 			 ST(&(Rm[WS(rs, 10)]), T3j, -ms, &(Rm[0]));
829 			 T3b = VADD(T38, T39);
830 			 ST(&(Rp[WS(rs, 13)]), T3b, ms, &(Rp[WS(rs, 1)]));
831 		    }
832 	       }
833 	  }
834      }
835      VLEAVE();
836 }
837 
838 static const tw_instr twinstr[] = {
839      VTW(1, 1),
840      VTW(1, 2),
841      VTW(1, 3),
842      VTW(1, 4),
843      VTW(1, 5),
844      VTW(1, 6),
845      VTW(1, 7),
846      VTW(1, 8),
847      VTW(1, 9),
848      VTW(1, 10),
849      VTW(1, 11),
850      VTW(1, 12),
851      VTW(1, 13),
852      VTW(1, 14),
853      VTW(1, 15),
854      VTW(1, 16),
855      VTW(1, 17),
856      VTW(1, 18),
857      VTW(1, 19),
858      VTW(1, 20),
859      VTW(1, 21),
860      VTW(1, 22),
861      VTW(1, 23),
862      VTW(1, 24),
863      VTW(1, 25),
864      VTW(1, 26),
865      VTW(1, 27),
866      VTW(1, 28),
867      VTW(1, 29),
868      VTW(1, 30),
869      VTW(1, 31),
870      { TW_NEXT, VL, 0 }
871 };
872 
873 static const hc2c_desc desc = { 32, XSIMD_STRING("hc2cfdftv_32"), twinstr, &GENUS, { 233, 117, 16, 0 } };
874 
XSIMD(codelet_hc2cfdftv_32)875 void XSIMD(codelet_hc2cfdftv_32) (planner *p) {
876      X(khc2c_register) (p, hc2cfdftv_32, &desc, HC2C_VIA_DFT);
877 }
878 #endif
879