1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:06:57 EST 2020 */
23 
24 #include "rdft/codelet-rdft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
29 
30 /*
31  * This function contains 103 FP additions, 96 FP multiplications,
32  * (or, 53 additions, 46 multiplications, 50 fused multiply/add),
33  * 92 stack variables, 4 constants, and 32 memory accesses
34  */
35 #include "rdft/simd/hc2cfv.h"
36 
/*
 * hc2cfdftv_16, FMA variant (this branch is compiled when ARCH_PREFERS_FMA
 * or ISA_EXTENSION_PREFERS_FMA is defined).  Machine-generated SIMD kernel;
 * the generator command line in the comment above gives "-n 16 -dit".
 *
 * How the parameters are used by the body:
 *   Rp, Rm - read and then overwritten.  Per iteration eight vectors are
 *            loaded from Rp[0], Rp[WS(rs, 1..7)] and eight from Rm[0],
 *            Rm[WS(rs, 1..7)]; the same sixteen locations are stored to at
 *            the end.  LD/ST receive ms for Rp accesses and -ms for Rm
 *            accesses.
 *   Ip, Im - stepped in the loop header only; never dereferenced here.
 *   W      - twiddle table.  Re-based once by (mb - 1) * ((TWVL / VL) * 30),
 *            then advanced by TWVL * 30 per iteration.  Fifteen vectors are
 *            read per iteration, at W[TWVL * 2j] for j = 0..14.
 *   rs     - stride argument given to WS().
 *   mb, me - the loop counter m runs over [mb, me) in steps of VL.
 *   ms     - Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *            iteration.
 *
 * All loads of an iteration are issued before its first store.  Every stored
 * value carries a factor KP500000000, and every value stored to Rm is
 * wrapped in VCONJ while values stored to Rp are not.
 *
 * Constants: KP923879532 is numerically cos(pi/8), KP707106781 is sqrt(1/2),
 * KP414213562 is sqrt(2) - 1 (= tan(pi/8)), KP500000000 is 1/2.
 *
 * The vector macros (LD, ST, LDW, LDK, DVK, VZMULJ, VZMULIJ, VFMACONJ,
 * VFNMSCONJ, VFMAI, VFNMSI, ...) come from the included headers and are not
 * visible in this file.
 *
 * NOTE(review): on the line below, the text in front of the listing number
 * repeats the function signature.  It looks like residue from extracting a
 * numbered listing rather than part of the generated code -- confirm against
 * the upstream generated file.
 */
hc2cfdftv_16(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)37 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
39      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
41      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
42      DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
43      {
44 	  INT m;
	  /* One pass per group of VL values of m.  The loop header also moves
	   * all four data pointers and the twiddle pointer; Ip and Im are moved
	   * but not otherwise used. */
45 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
46 	       V T8, TZ, TH, T12, T1q, T1I, T1x, T1J, Tr, T10, T1A, T1K, TS, T13, T1t;
47 	       V T1N, T3, Tw, TF, TW, T7, Tu, TB, TY, T1, T2, Tv, TD, TE, TC;
48 	       V TV, T5, T6, T4, Tt, Tz, TA, Ty, TX, Tx, TG, T1o, T1p, T1v, T1w;
49 	       V T1C, T1D, T1u, T1B, T1G, T1H, T1E, T1F;
	       /* Even-index inputs (0, 2, 4, 6).  Each Rp/Rm pair feeds one
	        * VFMACONJ and one VFNMSCONJ combination; every combination except
	        * T3 is then multiplied by a twiddle vector loaded with LDW. */
50 	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
51 	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
52 	       T3 = VFMACONJ(T2, T1);
53 	       Tv = LDW(&(W[0]));
54 	       Tw = VZMULIJ(Tv, VFNMSCONJ(T2, T1));
55 	       TD = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
56 	       TE = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
57 	       TC = LDW(&(W[TWVL * 8]));
58 	       TF = VZMULIJ(TC, VFNMSCONJ(TE, TD));
59 	       TV = LDW(&(W[TWVL * 6]));
60 	       TW = VZMULJ(TV, VFMACONJ(TE, TD));
61 	       T5 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
62 	       T6 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
63 	       T4 = LDW(&(W[TWVL * 14]));
64 	       T7 = VZMULJ(T4, VFMACONJ(T6, T5));
65 	       Tt = LDW(&(W[TWVL * 16]));
66 	       Tu = VZMULIJ(Tt, VFNMSCONJ(T6, T5));
67 	       Tz = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
68 	       TA = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
69 	       Ty = LDW(&(W[TWVL * 24]));
70 	       TB = VZMULIJ(Ty, VFNMSCONJ(TA, Tz));
71 	       TX = LDW(&(W[TWVL * 22]));
72 	       TY = VZMULJ(TX, VFMACONJ(TA, Tz));
	       /* Sums and differences of the even-index terms; these are the
	        * values that stay live past the nested block below. */
73 	       T8 = VSUB(T3, T7);
74 	       TZ = VSUB(TW, TY);
75 	       Tx = VSUB(Tu, Tw);
76 	       TG = VSUB(TB, TF);
77 	       TH = VFNMS(LDK(KP414213562), TG, Tx);
78 	       T12 = VFMA(LDK(KP414213562), Tx, TG);
79 	       T1o = VADD(T3, T7);
80 	       T1p = VADD(TW, TY);
81 	       T1q = VADD(T1o, T1p);
82 	       T1I = VSUB(T1o, T1p);
83 	       T1v = VADD(Tw, Tu);
84 	       T1w = VADD(TF, TB);
85 	       T1x = VADD(T1v, T1w);
86 	       T1J = VSUB(T1w, T1v);
	       /* Odd-index inputs (1, 3, 5, 7), handled in the same way inside a
	        * nested scope; only Tr, T10, T1A, T1K, TS, T13, T1t and T1N are
	        * assigned for use after it. */
87 	       {
88 		    V Tc, TQ, Tp, TJ, Tg, TO, Tl, TL, Ta, Tb, T9, TP, Tn, To, Tm;
89 		    V TI, Te, Tf, Td, TN, Tj, Tk, Ti, TK, Th, Tq, T1y, T1z, TM, TR;
90 		    V T1r, T1s;
91 		    Ta = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
92 		    Tb = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
93 		    T9 = LDW(&(W[TWVL * 2]));
94 		    Tc = VZMULJ(T9, VFMACONJ(Tb, Ta));
95 		    TP = LDW(&(W[TWVL * 4]));
96 		    TQ = VZMULIJ(TP, VFNMSCONJ(Tb, Ta));
97 		    Tn = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
98 		    To = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
99 		    Tm = LDW(&(W[TWVL * 10]));
100 		    Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
101 		    TI = LDW(&(W[TWVL * 12]));
102 		    TJ = VZMULIJ(TI, VFNMSCONJ(To, Tn));
103 		    Te = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
104 		    Tf = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
105 		    Td = LDW(&(W[TWVL * 18]));
106 		    Tg = VZMULJ(Td, VFMACONJ(Tf, Te));
107 		    TN = LDW(&(W[TWVL * 20]));
108 		    TO = VZMULIJ(TN, VFNMSCONJ(Tf, Te));
109 		    Tj = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
110 		    Tk = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
111 		    Ti = LDW(&(W[TWVL * 26]));
112 		    Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
113 		    TK = LDW(&(W[TWVL * 28]));
114 		    TL = VZMULIJ(TK, VFNMSCONJ(Tk, Tj));
115 		    Th = VSUB(Tc, Tg);
116 		    Tq = VSUB(Tl, Tp);
117 		    Tr = VADD(Th, Tq);
118 		    T10 = VSUB(Tq, Th);
119 		    T1y = VADD(TQ, TO);
120 		    T1z = VADD(TL, TJ);
121 		    T1A = VADD(T1y, T1z);
122 		    T1K = VSUB(T1y, T1z);
123 		    TM = VSUB(TJ, TL);
124 		    TR = VSUB(TO, TQ);
125 		    TS = VFMA(LDK(KP414213562), TR, TM);
126 		    T13 = VFNMS(LDK(KP414213562), TM, TR);
127 		    T1r = VADD(Tc, Tg);
128 		    T1s = VADD(Tl, Tp);
129 		    T1t = VADD(T1r, T1s);
130 		    T1N = VSUB(T1s, T1r);
131 	       }
	       /* First four outputs: Rp[4] / Rm[3] from the differences T1u, T1B
	        * and Rp[0] / Rm[7] from the sums T1E, T1F.  No load follows this
	        * point. */
132 	       T1u = VSUB(T1q, T1t);
133 	       T1B = VSUB(T1x, T1A);
134 	       T1C = VMUL(LDK(KP500000000), VFMAI(T1B, T1u));
135 	       T1D = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1B, T1u)));
136 	       ST(&(Rp[WS(rs, 4)]), T1C, ms, &(Rp[0]));
137 	       ST(&(Rm[WS(rs, 3)]), T1D, -ms, &(Rm[WS(rs, 1)]));
138 	       T1E = VADD(T1q, T1t);
139 	       T1F = VADD(T1x, T1A);
140 	       T1G = VMUL(LDK(KP500000000), VSUB(T1E, T1F));
141 	       T1H = VCONJ(VMUL(LDK(KP500000000), VADD(T1F, T1E)));
142 	       ST(&(Rp[0]), T1G, ms, &(Rp[0]));
143 	       ST(&(Rm[WS(rs, 7)]), T1H, -ms, &(Rm[WS(rs, 1)]));
	       /* Remaining twelve outputs, in three groups of four stores:
	        *   Rm[1], Rm[5], Rp[2], Rp[6]  (KP707106781 only),
	        *   Rp[1], Rp[7], Rm[0], Rm[6]  (KP707106781 then KP923879532),
	        *   Rm[2], Rm[4], Rp[3], Rp[5]  (KP707106781 then KP923879532). */
144 	       {
145 		    V T1M, T1S, T1P, T1T, T1L, T1O, T1Q, T1V, T1R, T1U, TU, T18, T15, T19, Ts;
146 		    V TT, T11, T14, T16, T1b, T17, T1a, T1e, T1k, T1h, T1l, T1c, T1d, T1f, T1g;
147 		    V T1i, T1n, T1j, T1m;
148 		    T1L = VADD(T1J, T1K);
149 		    T1M = VFMA(LDK(KP707106781), T1L, T1I);
150 		    T1S = VFNMS(LDK(KP707106781), T1L, T1I);
151 		    T1O = VSUB(T1K, T1J);
152 		    T1P = VFMA(LDK(KP707106781), T1O, T1N);
153 		    T1T = VFNMS(LDK(KP707106781), T1O, T1N);
154 		    T1Q = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1P, T1M)));
155 		    ST(&(Rm[WS(rs, 1)]), T1Q, -ms, &(Rm[WS(rs, 1)]));
156 		    T1V = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1T, T1S)));
157 		    ST(&(Rm[WS(rs, 5)]), T1V, -ms, &(Rm[WS(rs, 1)]));
158 		    T1R = VMUL(LDK(KP500000000), VFMAI(T1P, T1M));
159 		    ST(&(Rp[WS(rs, 2)]), T1R, ms, &(Rp[0]));
160 		    T1U = VMUL(LDK(KP500000000), VFNMSI(T1T, T1S));
161 		    ST(&(Rp[WS(rs, 6)]), T1U, ms, &(Rp[0]));
162 		    Ts = VFMA(LDK(KP707106781), Tr, T8);
163 		    TT = VADD(TH, TS);
164 		    TU = VFMA(LDK(KP923879532), TT, Ts);
165 		    T18 = VFNMS(LDK(KP923879532), TT, Ts);
166 		    T11 = VFNMS(LDK(KP707106781), T10, TZ);
167 		    T14 = VADD(T12, T13);
168 		    T15 = VFMA(LDK(KP923879532), T14, T11);
169 		    T19 = VFNMS(LDK(KP923879532), T14, T11);
170 		    T16 = VMUL(LDK(KP500000000), VFNMSI(T15, TU));
171 		    ST(&(Rp[WS(rs, 1)]), T16, ms, &(Rp[WS(rs, 1)]));
172 		    T1b = VMUL(LDK(KP500000000), VFMAI(T19, T18));
173 		    ST(&(Rp[WS(rs, 7)]), T1b, ms, &(Rp[WS(rs, 1)]));
174 		    T17 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T15, TU)));
175 		    ST(&(Rm[0]), T17, -ms, &(Rm[0]));
176 		    T1a = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T19, T18)));
177 		    ST(&(Rm[WS(rs, 6)]), T1a, -ms, &(Rm[0]));
178 		    T1c = VFNMS(LDK(KP707106781), Tr, T8);
179 		    T1d = VSUB(T12, T13);
180 		    T1e = VFMA(LDK(KP923879532), T1d, T1c);
181 		    T1k = VFNMS(LDK(KP923879532), T1d, T1c);
182 		    T1f = VFMA(LDK(KP707106781), T10, TZ);
183 		    T1g = VSUB(TS, TH);
184 		    T1h = VFMA(LDK(KP923879532), T1g, T1f);
185 		    T1l = VFNMS(LDK(KP923879532), T1g, T1f);
186 		    T1i = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1h, T1e)));
187 		    ST(&(Rm[WS(rs, 2)]), T1i, -ms, &(Rm[0]));
188 		    T1n = VCONJ(VMUL(LDK(KP500000000), VFMAI(T1l, T1k)));
189 		    ST(&(Rm[WS(rs, 4)]), T1n, -ms, &(Rm[0]));
190 		    T1j = VMUL(LDK(KP500000000), VFMAI(T1h, T1e));
191 		    ST(&(Rp[WS(rs, 3)]), T1j, ms, &(Rp[WS(rs, 1)]));
192 		    T1m = VMUL(LDK(KP500000000), VFNMSI(T1l, T1k));
193 		    ST(&(Rp[WS(rs, 5)]), T1m, ms, &(Rp[WS(rs, 1)]));
194 	       }
195 	  }
196      }
	  /* VLEAVE() is invoked once, after the loop has finished. */
197      VLEAVE();
198 }
199 
/*
 * Twiddle program for the FMA variant: fifteen VTW(1, k) entries, k = 1..15,
 * closed by a { TW_NEXT, VL, 0 } record.  The count matches the fifteen LDW
 * twiddle loads that hc2cfdftv_16 performs per loop iteration.  VTW and
 * TW_NEXT are defined outside this file.
 */
200 static const tw_instr twinstr[] = {
201      VTW(1, 1),
202      VTW(1, 2),
203      VTW(1, 3),
204      VTW(1, 4),
205      VTW(1, 5),
206      VTW(1, 6),
207      VTW(1, 7),
208      VTW(1, 8),
209      VTW(1, 9),
210      VTW(1, 10),
211      VTW(1, 11),
212      VTW(1, 12),
213      VTW(1, 13),
214      VTW(1, 14),
215      VTW(1, 15),
216      { TW_NEXT, VL, 0 }
217 };
218 
/*
 * Descriptor handed to X(khc2c_register) for the FMA variant.  Initializers,
 * in order: 16 (matching the "-n 16" generator option), the codelet name
 * string, the twiddle program, &GENUS, and the counts { 53, 46, 50, 0 }.
 * The first three counts equal the additions, multiplications and fused
 * multiply/adds reported in the generator's comment for this variant.  The
 * field names belong to the hc2c_desc declaration, which is not in this file.
 */
219 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, { 53, 46, 50, 0 } };
220 
/*
 * Registration entry point for the FMA variant: passes the kernel
 * hc2cfdftv_16 and its descriptor to the planner p through
 * X(khc2c_register), tagged HC2C_VIA_DFT.  It is the only non-static
 * definition in this branch of the #if.
 *
 * NOTE(review): on the line below, the text in front of the listing number
 * repeats the function name.  It looks like residue from extracting a
 * numbered listing rather than part of the generated code -- confirm against
 * the upstream generated file.
 */
XSIMD(codelet_hc2cfdftv_16)221 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
222      X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
223 }
224 #else
225 
226 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 16 -dit -name hc2cfdftv_16 -include rdft/simd/hc2cfv.h */
227 
228 /*
229  * This function contains 103 FP additions, 56 FP multiplications,
230  * (or, 99 additions, 52 multiplications, 4 fused multiply/add),
231  * 101 stack variables, 5 constants, and 32 memory accesses
232  */
233 #include "rdft/simd/hc2cfv.h"
234 
/*
 * hc2cfdftv_16, non-FMA variant (this branch is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 * Machine-generated SIMD kernel; the generator command line in the comment
 * above gives "-n 16 -dit" and, unlike the other branch, no "-fma".
 *
 * How the parameters are used by the body:
 *   Rp, Rm - read and then overwritten.  Per iteration eight vectors are
 *            loaded from Rp[0], Rp[WS(rs, 1..7)] and eight from Rm[0],
 *            Rm[WS(rs, 1..7)]; the same sixteen locations are stored to at
 *            the end.  LD/ST receive ms for Rp accesses and -ms for Rm
 *            accesses.
 *   Ip, Im - stepped in the loop header only; never dereferenced here.
 *   W      - twiddle table.  Re-based once by (mb - 1) * ((TWVL / VL) * 30),
 *            then advanced by TWVL * 30 per iteration.  Fifteen vectors are
 *            read per iteration, at W[TWVL * 2j] for j = 0..14.
 *   rs     - stride argument given to WS().
 *   mb, me - the loop counter m runs over [mb, me) in steps of VL.
 *   ms     - Rp/Ip move forward and Rm/Im move backward by VL * ms per
 *            iteration.
 *
 * All loads of an iteration are issued before its first store.  Every value
 * loaded from Rm is passed through VCONJ before use, and every value stored
 * to Rm is wrapped in VCONJ; values stored to Rp are not.
 *
 * Constants: KP923879532 and KP382683432 are numerically cos(pi/8) and
 * sin(pi/8), KP707106781 is sqrt(1/2), KP353553390 is sqrt(1/2) / 2,
 * KP500000000 is 1/2.
 *
 * The vector macros (LD, ST, LDW, LDK, DVK, VZMULJ, VZMULIJ, VBYI, VCONJ,
 * VFMA, VFNMS, ...) come from the included headers and are not visible in
 * this file.
 *
 * NOTE(review): on the line below, the text in front of the listing number
 * repeats the function signature.  It looks like residue from extracting a
 * numbered listing rather than part of the generated code -- confirm against
 * the upstream generated file.
 */
hc2cfdftv_16(R * Rp,R * Ip,R * Rm,R * Im,const R * W,stride rs,INT mb,INT me,INT ms)235 static void hc2cfdftv_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
236 {
237      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
238      DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
239      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
240      DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
241      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
242      {
243 	  INT m;
	  /* One pass per group of VL values of m.  The loop header also moves
	   * all four data pointers and the twiddle pointer; Ip and Im are moved
	   * but not otherwise used. */
244 	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 30)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(64, rs)) {
245 	       V T1D, T1E, T1R, TP, T1b, Ta, T1w, T18, T1x, T1z, T1A, T1G, T1H, T1S, Tx;
246 	       V T13, T10, T1a, T1, T3, TA, TM, TL, TN, T6, T8, TC, TH, TG, TI;
247 	       V T2, Tz, TK, TJ, T7, TB, TF, TE, TD, TO, T4, T9, T5, T15, T17;
248 	       V T14, T16;
	       /* Even-index inputs (0, 6, 4, 2 in load order).  The Rm value is
	        * conjugated, and the difference (conjugate minus Rp value) is
	        * multiplied by a twiddle through VZMULIJ. */
249 	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
250 	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
251 	       T3 = VCONJ(T2);
252 	       Tz = LDW(&(W[0]));
253 	       TA = VZMULIJ(Tz, VSUB(T3, T1));
254 	       TM = LD(&(Rp[WS(rs, 6)]), ms, &(Rp[0]));
255 	       TK = LD(&(Rm[WS(rs, 6)]), -ms, &(Rm[0]));
256 	       TL = VCONJ(TK);
257 	       TJ = LDW(&(W[TWVL * 24]));
258 	       TN = VZMULIJ(TJ, VSUB(TL, TM));
259 	       T6 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
260 	       T7 = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
261 	       T8 = VCONJ(T7);
262 	       TB = LDW(&(W[TWVL * 16]));
263 	       TC = VZMULIJ(TB, VSUB(T8, T6));
264 	       TH = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
265 	       TF = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
266 	       TG = VCONJ(TF);
267 	       TE = LDW(&(W[TWVL * 8]));
268 	       TI = VZMULIJ(TE, VSUB(TG, TH));
269 	       T1D = VADD(TA, TC);
270 	       T1E = VADD(TI, TN);
271 	       T1R = VSUB(T1D, T1E);
272 	       TD = VSUB(TA, TC);
273 	       TO = VSUB(TI, TN);
274 	       TP = VFNMS(LDK(KP382683432), TO, VMUL(LDK(KP923879532), TD));
275 	       T1b = VFMA(LDK(KP382683432), TD, VMUL(LDK(KP923879532), TO));
	       /* Sums (Rp value plus conjugate) of the same even-index inputs;
	        * all but T4 are multiplied by a twiddle through VZMULJ. */
276 	       T4 = VADD(T1, T3);
277 	       T5 = LDW(&(W[TWVL * 14]));
278 	       T9 = VZMULJ(T5, VADD(T6, T8));
279 	       Ta = VMUL(LDK(KP500000000), VSUB(T4, T9));
280 	       T1w = VADD(T4, T9);
281 	       T14 = LDW(&(W[TWVL * 6]));
282 	       T15 = VZMULJ(T14, VADD(TH, TG));
283 	       T16 = LDW(&(W[TWVL * 22]));
284 	       T17 = VZMULJ(T16, VADD(TM, TL));
285 	       T18 = VSUB(T15, T17);
286 	       T1x = VADD(T15, T17);
	       /* Odd-index inputs (1, 3, 5, 7) in a nested scope; only T1z, T1A,
	        * T1G, T1H, T1S, Tx, T13, T10 and T1a are assigned for use after
	        * it. */
287 	       {
288 		    V Tf, TR, Tv, TY, Tk, TT, Tq, TW, Tc, Te, Td, Tb, TQ, Ts, Tu;
289 		    V Tt, Tr, TX, Th, Tj, Ti, Tg, TS, Tn, Tp, To, Tm, TV, Tl, Tw;
290 		    V TU, TZ;
291 		    Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
292 		    Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
293 		    Te = VCONJ(Td);
294 		    Tb = LDW(&(W[TWVL * 2]));
295 		    Tf = VZMULJ(Tb, VADD(Tc, Te));
296 		    TQ = LDW(&(W[TWVL * 4]));
297 		    TR = VZMULIJ(TQ, VSUB(Te, Tc));
298 		    Ts = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
299 		    Tt = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
300 		    Tu = VCONJ(Tt);
301 		    Tr = LDW(&(W[TWVL * 10]));
302 		    Tv = VZMULJ(Tr, VADD(Ts, Tu));
303 		    TX = LDW(&(W[TWVL * 12]));
304 		    TY = VZMULIJ(TX, VSUB(Tu, Ts));
305 		    Th = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
306 		    Ti = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
307 		    Tj = VCONJ(Ti);
308 		    Tg = LDW(&(W[TWVL * 18]));
309 		    Tk = VZMULJ(Tg, VADD(Th, Tj));
310 		    TS = LDW(&(W[TWVL * 20]));
311 		    TT = VZMULIJ(TS, VSUB(Tj, Th));
312 		    Tn = LD(&(Rp[WS(rs, 7)]), ms, &(Rp[WS(rs, 1)]));
313 		    To = LD(&(Rm[WS(rs, 7)]), -ms, &(Rm[WS(rs, 1)]));
314 		    Tp = VCONJ(To);
315 		    Tm = LDW(&(W[TWVL * 26]));
316 		    Tq = VZMULJ(Tm, VADD(Tn, Tp));
317 		    TV = LDW(&(W[TWVL * 28]));
318 		    TW = VZMULIJ(TV, VSUB(Tp, Tn));
319 		    T1z = VADD(Tf, Tk);
320 		    T1A = VADD(Tq, Tv);
321 		    T1G = VADD(TR, TT);
322 		    T1H = VADD(TW, TY);
323 		    T1S = VSUB(T1H, T1G);
324 		    Tl = VSUB(Tf, Tk);
325 		    Tw = VSUB(Tq, Tv);
326 		    Tx = VMUL(LDK(KP353553390), VADD(Tl, Tw));
327 		    T13 = VMUL(LDK(KP707106781), VSUB(Tw, Tl));
328 		    TU = VSUB(TR, TT);
329 		    TZ = VSUB(TW, TY);
330 		    T10 = VFMA(LDK(KP382683432), TU, VMUL(LDK(KP923879532), TZ));
331 		    T1a = VFNMS(LDK(KP923879532), TU, VMUL(LDK(KP382683432), TZ));
332 	       }
	       /* All sixteen outputs, in four groups of four stores:
	        *   Rm[1], Rp[6], Rp[2], Rm[5]
	        *   Rm[7], Rp[4], Rp[0], Rm[3]
	        *   Rm[0], Rp[7], Rp[1], Rm[6]
	        *   Rp[5], Rm[2], Rm[4], Rp[3]
	        * No load follows this point. */
333 	       {
334 		    V T1U, T20, T1X, T21, T1Q, T1T, T1V, T1W, T1Y, T23, T1Z, T22, T1C, T1M, T1J;
335 		    V T1N, T1y, T1B, T1F, T1I, T1K, T1P, T1L, T1O, T12, T1g, T1d, T1h, Ty, T11;
336 		    V T19, T1c, T1e, T1j, T1f, T1i, T1m, T1s, T1p, T1t, T1k, T1l, T1n, T1o, T1q;
337 		    V T1v, T1r, T1u;
338 		    T1Q = VMUL(LDK(KP500000000), VSUB(T1w, T1x));
339 		    T1T = VMUL(LDK(KP353553390), VADD(T1R, T1S));
340 		    T1U = VADD(T1Q, T1T);
341 		    T20 = VSUB(T1Q, T1T);
342 		    T1V = VSUB(T1A, T1z);
343 		    T1W = VMUL(LDK(KP707106781), VSUB(T1S, T1R));
344 		    T1X = VMUL(LDK(KP500000000), VBYI(VADD(T1V, T1W)));
345 		    T21 = VMUL(LDK(KP500000000), VBYI(VSUB(T1W, T1V)));
346 		    T1Y = VCONJ(VSUB(T1U, T1X));
347 		    ST(&(Rm[WS(rs, 1)]), T1Y, -ms, &(Rm[WS(rs, 1)]));
348 		    T23 = VADD(T20, T21);
349 		    ST(&(Rp[WS(rs, 6)]), T23, ms, &(Rp[0]));
350 		    T1Z = VADD(T1U, T1X);
351 		    ST(&(Rp[WS(rs, 2)]), T1Z, ms, &(Rp[0]));
352 		    T22 = VCONJ(VSUB(T20, T21));
353 		    ST(&(Rm[WS(rs, 5)]), T22, -ms, &(Rm[WS(rs, 1)]));
354 		    T1y = VADD(T1w, T1x);
355 		    T1B = VADD(T1z, T1A);
356 		    T1C = VADD(T1y, T1B);
357 		    T1M = VSUB(T1y, T1B);
358 		    T1F = VADD(T1D, T1E);
359 		    T1I = VADD(T1G, T1H);
360 		    T1J = VADD(T1F, T1I);
361 		    T1N = VBYI(VSUB(T1I, T1F));
362 		    T1K = VCONJ(VMUL(LDK(KP500000000), VSUB(T1C, T1J)));
363 		    ST(&(Rm[WS(rs, 7)]), T1K, -ms, &(Rm[WS(rs, 1)]));
364 		    T1P = VMUL(LDK(KP500000000), VADD(T1M, T1N));
365 		    ST(&(Rp[WS(rs, 4)]), T1P, ms, &(Rp[0]));
366 		    T1L = VMUL(LDK(KP500000000), VADD(T1C, T1J));
367 		    ST(&(Rp[0]), T1L, ms, &(Rp[0]));
368 		    T1O = VCONJ(VMUL(LDK(KP500000000), VSUB(T1M, T1N)));
369 		    ST(&(Rm[WS(rs, 3)]), T1O, -ms, &(Rm[WS(rs, 1)]));
370 		    Ty = VADD(Ta, Tx);
371 		    T11 = VMUL(LDK(KP500000000), VADD(TP, T10));
372 		    T12 = VADD(Ty, T11);
373 		    T1g = VSUB(Ty, T11);
374 		    T19 = VSUB(T13, T18);
375 		    T1c = VSUB(T1a, T1b);
376 		    T1d = VMUL(LDK(KP500000000), VBYI(VADD(T19, T1c)));
377 		    T1h = VMUL(LDK(KP500000000), VBYI(VSUB(T1c, T19)));
378 		    T1e = VCONJ(VSUB(T12, T1d));
379 		    ST(&(Rm[0]), T1e, -ms, &(Rm[0]));
380 		    T1j = VADD(T1g, T1h);
381 		    ST(&(Rp[WS(rs, 7)]), T1j, ms, &(Rp[WS(rs, 1)]));
382 		    T1f = VADD(T12, T1d);
383 		    ST(&(Rp[WS(rs, 1)]), T1f, ms, &(Rp[WS(rs, 1)]));
384 		    T1i = VCONJ(VSUB(T1g, T1h));
385 		    ST(&(Rm[WS(rs, 6)]), T1i, -ms, &(Rm[0]));
386 		    T1k = VSUB(T10, TP);
387 		    T1l = VADD(T18, T13);
388 		    T1m = VMUL(LDK(KP500000000), VBYI(VSUB(T1k, T1l)));
389 		    T1s = VMUL(LDK(KP500000000), VBYI(VADD(T1l, T1k)));
390 		    T1n = VSUB(Ta, Tx);
391 		    T1o = VMUL(LDK(KP500000000), VADD(T1b, T1a));
392 		    T1p = VSUB(T1n, T1o);
393 		    T1t = VADD(T1n, T1o);
394 		    T1q = VADD(T1m, T1p);
395 		    ST(&(Rp[WS(rs, 5)]), T1q, ms, &(Rp[WS(rs, 1)]));
396 		    T1v = VCONJ(VSUB(T1t, T1s));
397 		    ST(&(Rm[WS(rs, 2)]), T1v, -ms, &(Rm[0]));
398 		    T1r = VCONJ(VSUB(T1p, T1m));
399 		    ST(&(Rm[WS(rs, 4)]), T1r, -ms, &(Rm[0]));
400 		    T1u = VADD(T1s, T1t);
401 		    ST(&(Rp[WS(rs, 3)]), T1u, ms, &(Rp[WS(rs, 1)]));
402 	       }
403 	  }
404      }
	  /* VLEAVE() is invoked once, after the loop has finished. */
405      VLEAVE();
406 }
407 
/*
 * Twiddle program for the non-FMA variant: fifteen VTW(1, k) entries,
 * k = 1..15, closed by a { TW_NEXT, VL, 0 } record.  The count matches the
 * fifteen LDW twiddle loads that hc2cfdftv_16 performs per loop iteration,
 * and the entries are the same as in the other branch of the #if.  VTW and
 * TW_NEXT are defined outside this file.
 */
408 static const tw_instr twinstr[] = {
409      VTW(1, 1),
410      VTW(1, 2),
411      VTW(1, 3),
412      VTW(1, 4),
413      VTW(1, 5),
414      VTW(1, 6),
415      VTW(1, 7),
416      VTW(1, 8),
417      VTW(1, 9),
418      VTW(1, 10),
419      VTW(1, 11),
420      VTW(1, 12),
421      VTW(1, 13),
422      VTW(1, 14),
423      VTW(1, 15),
424      { TW_NEXT, VL, 0 }
425 };
426 
/*
 * Descriptor handed to X(khc2c_register) for the non-FMA variant.
 * Initializers, in order: 16 (matching the "-n 16" generator option), the
 * codelet name string, the twiddle program, &GENUS, and the counts
 * { 99, 52, 4, 0 }.  The first three counts equal the additions,
 * multiplications and fused multiply/adds reported in the generator's
 * comment for this variant.  The field names belong to the hc2c_desc
 * declaration, which is not in this file.
 */
427 static const hc2c_desc desc = { 16, XSIMD_STRING("hc2cfdftv_16"), twinstr, &GENUS, { 99, 52, 4, 0 } };
428 
/*
 * Registration entry point for the non-FMA variant: passes the kernel
 * hc2cfdftv_16 and its descriptor to the planner p through
 * X(khc2c_register), tagged HC2C_VIA_DFT.  It is the only non-static
 * definition in this branch of the #if.
 *
 * NOTE(review): on the line below, the text in front of the listing number
 * repeats the function name.  It looks like residue from extracting a
 * numbered listing rather than part of the generated code -- confirm against
 * the upstream generated file.
 */
XSIMD(codelet_hc2cfdftv_16)429 void XSIMD(codelet_hc2cfdftv_16) (planner *p) {
430      X(khc2c_register) (p, hc2cfdftv_16, &desc, HC2C_VIA_DFT);
431 }
432 #endif
433