/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu Dec 10 07:06:57 EST 2020 */
23 
24 #include "rdft/codelet-rdft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include rdft/simd/hc2cfv.h */

/*
 * This function contains 71 FP additions, 66 FP multiplications,
 * (or, 41 additions, 36 multiplications, 30 fused multiply/add),
 * 86 stack variables, 2 constants, and 24 memory accesses
 */
35 #include "rdft/simd/hc2cfv.h"
36 
/*
 * hc2cfdftv_12 -- FMA variant of the size-12 SIMD codelet emitted by genfft
 * (generator flags: -fma -simd -n 12 -dit).  The arithmetic below is machine
 * generated; do not reorder or hand-edit it.
 *
 * Parameters, as used by this body:
 *   Rp, Rm  data arrays that are both read and overwritten.  Each iteration
 *           touches Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..5.  Rp is
 *           walked forward by VL * ms per iteration, Rm backward.
 *   Ip, Im  advanced in lock-step with Rp / Rm but never dereferenced here.
 *   W       twiddle table.  It is first offset by
 *           (mb - 1) * ((TWVL / VL) * 22); each iteration then reads the
 *           eleven entries W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 20] and
 *           advances W by TWVL * 22.
 *   rs      stride handed to WS() to address the six slots of each array.
 *   mb, me  half-open iteration range [mb, me), stepped by VL.
 *   ms      per-iteration element stride (also passed to LD / ST).
 *
 * Every stored value is multiplied by KP500000000 (0.5), and every value
 * written to Rm is additionally wrapped in VCONJ.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMULJ, VZMULIJ,
 * VFMACONJ, VFNMSCONJ, VFMAI, VFNMSI and MAKE_VOLATILE_STRIDE come from
 * rdft/simd/hc2cfv.h and are not visible in this file.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* 0.866025... is numerically sqrt(3)/2. */
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
	  INT m;
	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
	       V Td, TQ, Tr, TR, TI, TY, TA, TX, T12, T1e, TV, T1d, TK, TL, Ts;
	       V TJ, TO, TP, TM, TN, TW, T16, T13, T17, TS, TZ, T14, T19, T15, T18;
	       V T1f, T1j, T1c, T1i, T1a, T1b, T1g, T1l, T1h, T1k;
	       {
		    /*
		     * Input stage: load the six Rp/Rm pairs, combine each pair
		     * with VFMACONJ / VFNMSCONJ, and multiply the results by
		     * the twiddles loaded from W (slot 0 is only twiddled on
		     * its VFNMSCONJ half).
		     */
		    V T3, Tu, T7, Tw, Tp, TH, Tl, TE, Th, TC, Tb, Tz, T1, T2, Tt;
		    V T5, T6, T4, Tv, Tn, To, Tm, TG, Tj, Tk, Ti, TD, Tf, Tg, Te;
		    V TB, T9, Ta, T8, Ty, Tc, Tq, TF, Tx, T10, T11, TT, TU;
		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
		    T3 = VFMACONJ(T2, T1);
		    Tt = LDW(&(W[0]));
		    Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1));
		    T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
		    T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
		    T4 = LDW(&(W[TWVL * 6]));
		    T7 = VZMULJ(T4, VFMACONJ(T6, T5));
		    Tv = LDW(&(W[TWVL * 8]));
		    Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5));
		    Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
		    To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
		    Tm = LDW(&(W[TWVL * 2]));
		    Tp = VZMULJ(Tm, VFMACONJ(To, Tn));
		    TG = LDW(&(W[TWVL * 4]));
		    TH = VZMULIJ(TG, VFNMSCONJ(To, Tn));
		    Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
		    Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
		    Ti = LDW(&(W[TWVL * 18]));
		    Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj));
		    TD = LDW(&(W[TWVL * 20]));
		    TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj));
		    Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
		    Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
		    Te = LDW(&(W[TWVL * 10]));
		    Th = VZMULJ(Te, VFMACONJ(Tg, Tf));
		    TB = LDW(&(W[TWVL * 12]));
		    TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf));
		    T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
		    Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
		    T8 = LDW(&(W[TWVL * 14]));
		    Tb = VZMULJ(T8, VFMACONJ(Ta, T9));
		    Ty = LDW(&(W[TWVL * 16]));
		    Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9));
		    /* Sums and differences feeding the output combinations. */
		    Tc = VADD(T7, Tb);
		    Td = VADD(T3, Tc);
		    TQ = VFNMS(LDK(KP500000000), Tc, T3);
		    Tq = VADD(Tl, Tp);
		    Tr = VADD(Th, Tq);
		    TR = VFNMS(LDK(KP500000000), Tq, Th);
		    TF = VADD(TC, TE);
		    TI = VADD(TF, TH);
		    TY = VFNMS(LDK(KP500000000), TF, TH);
		    Tx = VADD(Tu, Tw);
		    TA = VADD(Tx, Tz);
		    TX = VFNMS(LDK(KP500000000), Tx, Tz);
		    T10 = VSUB(Tb, T7);
		    T11 = VSUB(Tp, Tl);
		    T12 = VSUB(T10, T11);
		    T1e = VADD(T10, T11);
		    TT = VSUB(TC, TE);
		    TU = VSUB(Tu, Tw);
		    TV = VSUB(TT, TU);
		    T1d = VADD(TU, TT);
	       }
	       /* Outputs Rp[3] / Rm[2]. */
	       Ts = VSUB(Td, Tr);
	       TJ = VSUB(TA, TI);
	       TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts));
	       TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts)));
	       ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)]));
	       ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0]));
	       /* Outputs Rp[0] / Rm[5]. */
	       TM = VADD(Td, Tr);
	       TN = VADD(TA, TI);
	       TO = VMUL(LDK(KP500000000), VSUB(TM, TN));
	       TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM)));
	       ST(&(Rp[0]), TO, ms, &(Rp[0]));
	       ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)]));
	       /* Outputs Rp[1], Rm[4], Rm[0], Rp[5]. */
	       TS = VSUB(TQ, TR);
	       TW = VFMA(LDK(KP866025403), TV, TS);
	       T16 = VFNMS(LDK(KP866025403), TV, TS);
	       TZ = VSUB(TX, TY);
	       T13 = VFNMS(LDK(KP866025403), T12, TZ);
	       T17 = VFMA(LDK(KP866025403), T12, TZ);
	       T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW));
	       ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)]));
	       T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16)));
	       ST(&(Rm[WS(rs, 4)]), T19, -ms, &(Rm[0]));
	       T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW)));
	       ST(&(Rm[0]), T15, -ms, &(Rm[0]));
	       T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16));
	       ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)]));
	       /* Outputs Rm[1], Rp[4], Rp[2], Rm[3]. */
	       T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e));
	       T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e));
	       T1a = VADD(TX, TY);
	       T1b = VADD(TQ, TR);
	       T1c = VADD(T1a, T1b);
	       T1i = VSUB(T1b, T1a);
	       T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c)));
	       ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)]));
	       T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i));
	       ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0]));
	       T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c));
	       ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0]));
	       T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i)));
	       ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)]));
	  }
     }
     VLEAVE();
}
151 
152 static const tw_instr twinstr[] = {
153      VTW(1, 1),
154      VTW(1, 2),
155      VTW(1, 3),
156      VTW(1, 4),
157      VTW(1, 5),
158      VTW(1, 6),
159      VTW(1, 7),
160      VTW(1, 8),
161      VTW(1, 9),
162      VTW(1, 10),
163      VTW(1, 11),
164      { TW_NEXT, VL, 0 }
165 };
166 
167 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, { 41, 36, 30, 0 } };
168 
XSIMD(codelet_hc2cfdftv_12)169 void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
170      X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
171 }
172 #else
173 
/* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 12 -dit -name hc2cfdftv_12 -include rdft/simd/hc2cfv.h */

/*
 * This function contains 71 FP additions, 41 FP multiplications,
 * (or, 67 additions, 37 multiplications, 4 fused multiply/add),
 * 58 stack variables, 4 constants, and 24 memory accesses
 */
181 #include "rdft/simd/hc2cfv.h"
182 
/*
 * hc2cfdftv_12 -- non-FMA variant of the size-12 SIMD codelet emitted by
 * genfft (generator flags: -simd -n 12 -dit, without -fma).  The arithmetic
 * below is machine generated; do not reorder or hand-edit it.
 *
 * Parameters, as used by this body:
 *   Rp, Rm  data arrays that are both read and overwritten.  Each iteration
 *           touches Rp[WS(rs, k)] and Rm[WS(rs, k)] for k = 0..5.  Rp is
 *           walked forward by VL * ms per iteration, Rm backward.
 *   Ip, Im  advanced in lock-step with Rp / Rm but never dereferenced here.
 *   W       twiddle table.  It is first offset by
 *           (mb - 1) * ((TWVL / VL) * 22); each iteration then reads the
 *           eleven entries W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 20] and
 *           advances W by TWVL * 22.
 *   rs      stride handed to WS() to address the six slots of each array.
 *   mb, me  half-open iteration range [mb, me), stepped by VL.
 *   ms      per-iteration element stride (also passed to LD / ST).
 *
 * Each Rm load is conjugated with VCONJ before being added to or subtracted
 * from its Rp partner, and every value stored to Rm is wrapped in VCONJ.
 *
 * NOTE(review): the exact semantics of LD, ST, LDW, VZMULJ, VZMULIJ, VBYI
 * and MAKE_VOLATILE_STRIDE come from rdft/simd/hc2cfv.h and are not visible
 * in this file.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     /* 0.433012... is numerically half of KP866025403 (0.866025...). */
     DVK(KP433012701, +0.433012701892219323381861585376468091735701313);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
	  INT m;
	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(48, rs)) {
	       V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN;
	       V T16;
	       {
		    /*
		     * Input stage: load the six Rp/Rm pairs, conjugate the Rm
		     * value, form sum and difference with the Rp value, and
		     * multiply by the twiddles loaded from W.
		     */
		    V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7;
		    V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr;
		    V Tn, Tp, To, Tm, TJ, Th, TM;
		    T1 = LD(&(Rp[0]), ms, &(Rp[0]));
		    T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
		    T3 = VCONJ(T2);
		    Tz = LDW(&(W[0]));
		    TA = VZMULIJ(Tz, VSUB(T3, T1));
		    Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0]));
		    Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0]));
		    Td = VCONJ(Tc);
		    Ta = LDW(&(W[TWVL * 14]));
		    Te = VZMULJ(Ta, VADD(Tb, Td));
		    T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
		    T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
		    T8 = VCONJ(T7);
		    T5 = LDW(&(W[TWVL * 6]));
		    T9 = VZMULJ(T5, VADD(T6, T8));
		    TB = LDW(&(W[TWVL * 8]));
		    TC = VZMULIJ(TB, VSUB(T8, T6));
		    TX = VSUB(TC, TA);
		    T13 = VSUB(Te, T9);
		    T4 = VADD(T1, T3);
		    Tf = VADD(T9, Te);
		    TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4));
		    TD = VADD(TA, TC);
		    TE = LDW(&(W[TWVL * 16]));
		    TF = VZMULIJ(TE, VSUB(Td, Tb));
		    T17 = VFNMS(LDK(KP500000000), TD, TF);
		    Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
		    Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
		    Tk = VCONJ(Tj);
		    TH = LDW(&(W[TWVL * 12]));
		    TI = VZMULIJ(TH, VSUB(Tk, Ti));
		    Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
		    Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
		    Tu = VCONJ(Tt);
		    Tr = LDW(&(W[TWVL * 2]));
		    Tv = VZMULJ(Tr, VADD(Ts, Tu));
		    Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)]));
		    To = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)]));
		    Tp = VCONJ(To);
		    Tm = LDW(&(W[TWVL * 18]));
		    Tq = VZMULJ(Tm, VADD(Tn, Tp));
		    TJ = LDW(&(W[TWVL * 20]));
		    TK = VZMULIJ(TJ, VSUB(Tp, Tn));
		    TW = VSUB(TK, TI);
		    T14 = VSUB(Tv, Tq);
		    Tw = VADD(Tq, Tv);
		    Th = LDW(&(W[TWVL * 10]));
		    Tl = VZMULJ(Th, VADD(Ti, Tk));
		    T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl));
		    TL = VADD(TI, TK);
		    TM = LDW(&(W[TWVL * 4]));
		    TN = VZMULIJ(TM, VSUB(Tu, Ts));
		    T16 = VFNMS(LDK(KP500000000), TL, TN);
	       }
	       {
		    /* Output stage: combine the intermediates and store all
		     * twelve results (six to Rp, six conjugated to Rm). */
		    V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l;
		    V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11;
		    V T15, T18, T1a, T1f, T1b, T1e;
		    /* Outputs Rm[5], Rp[3], Rp[0], Rm[2]. */
		    Tg = VADD(T4, Tf);
		    Tx = VADD(Tl, Tw);
		    Ty = VADD(Tg, Tx);
		    TS = VSUB(Tg, Tx);
		    TG = VADD(TD, TF);
		    TO = VADD(TL, TN);
		    TP = VADD(TG, TO);
		    TT = VBYI(VSUB(TO, TG));
		    TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP)));
		    ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)]));
		    TV = VMUL(LDK(KP500000000), VADD(TS, TT));
		    ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)]));
		    TR = VMUL(LDK(KP500000000), VADD(Ty, TP));
		    ST(&(Rp[0]), TR, ms, &(Rp[0]));
		    TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT)));
		    ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0]));
		    /* Outputs Rp[2], Rm[3], Rm[1], Rp[4]. */
		    T1g = VADD(TX, TW);
		    T1h = VADD(T13, T14);
		    T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h))));
		    T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h))));
		    T1j = VADD(TZ, T10);
		    T1k = VMUL(LDK(KP500000000), VADD(T17, T16));
		    T1l = VSUB(T1j, T1k);
		    T1p = VADD(T1j, T1k);
		    T1m = VADD(T1i, T1l);
		    ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0]));
		    T1r = VCONJ(VSUB(T1p, T1o));
		    ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)]));
		    T1n = VCONJ(VSUB(T1l, T1i));
		    ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)]));
		    T1q = VADD(T1o, T1p);
		    ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0]));
		    /* Outputs Rm[0], Rm[4], Rp[1], Rp[5]. */
		    TY = VMUL(LDK(KP433012701), VSUB(TW, TX));
		    T11 = VSUB(TZ, T10);
		    T12 = VADD(TY, T11);
		    T1c = VSUB(T11, TY);
		    T15 = VMUL(LDK(KP866025403), VSUB(T13, T14));
		    T18 = VSUB(T16, T17);
		    T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18)));
		    T1d = VMUL(LDK(KP500000000), VBYI(VADD(T15, T18)));
		    T1a = VCONJ(VSUB(T12, T19));
		    ST(&(Rm[0]), T1a, -ms, &(Rm[0]));
		    T1f = VCONJ(VADD(T1c, T1d));
		    ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0]));
		    T1b = VADD(T12, T19);
		    ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)]));
		    T1e = VSUB(T1c, T1d);
		    ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)]));
	       }
	  }
     }
     VLEAVE();
}
309 
310 static const tw_instr twinstr[] = {
311      VTW(1, 1),
312      VTW(1, 2),
313      VTW(1, 3),
314      VTW(1, 4),
315      VTW(1, 5),
316      VTW(1, 6),
317      VTW(1, 7),
318      VTW(1, 8),
319      VTW(1, 9),
320      VTW(1, 10),
321      VTW(1, 11),
322      { TW_NEXT, VL, 0 }
323 };
324 
325 static const hc2c_desc desc = { 12, XSIMD_STRING("hc2cfdftv_12"), twinstr, &GENUS, { 67, 37, 4, 0 } };
326 
XSIMD(codelet_hc2cfdftv_12)327 void XSIMD(codelet_hc2cfdftv_12) (planner *p) {
328      X(khc2c_register) (p, hc2cfdftv_12, &desc, HC2C_VIA_DFT);
329 }
330 #endif
331