1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:05:29 EST 2020 */
23 
24 #include "dft/codelet-dft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include dft/simd/t1b.h -sign 1 */
29 
30 /*
31  * This function contains 59 FP additions, 42 FP multiplications,
32  * (or, 41 additions, 24 multiplications, 18 fused multiply/add),
33  * 28 stack variables, 2 constants, and 24 memory accesses
34  */
35 #include "dft/simd/t1b.h"
36 
/*
 * t1bv_12 (FMA branch): in-place 12-point twiddle codelet body.  Per the
 * generator command line recorded in this file it was produced with
 * "-n 12 -sign 1 -fma -simd".
 *
 * Parameters:
 *   ri     - not referenced by name anywhere in this body.
 *   ii     - base pointer of the data; copied into x, which is both read
 *            and written.
 *   W      - twiddle table; input k (k = 1..11) is passed through BYTW
 *            together with &W[TWVL * 2 * (k - 1)].
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - the loop runs from m = mb while m < me, in steps of VL.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * Each iteration loads x[WS(rs, k)] for k = 0..11, combines them, and
 * stores 12 results back to the same 12 locations.
 *
 * NOTE(review): LD, ST, BYTW, VFMA, VFNMS, VFMAI, VFNMSI, DVK, LDK,
 * MAKE_VOLATILE_STRIDE and VLEAVE come from headers not visible here;
 * their exact semantics should be confirmed there.
 * NOTE(review): the leading number on every line, and the repeated
 * signature text in front of "static void" below, look like extraction
 * residue rather than C source - confirm against the upstream file.
 */
t1bv_12(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)37 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
     /* The two numeric constants used below: 0.866025403... (numerically
        sqrt(3)/2) and 0.5. */
39      DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
40      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
41      {
42 	  INT m;
43 	  R *x;
44 	  x = ii;
	  /* W is first offset by mb * ((TWVL / VL) * 22); every pass then
	     steps m by VL, x by VL * ms and W by TWVL * 22.
	     NOTE(review): 22 = 2 * 11, presumably two R values for each of
	     the 11 twiddle entries - confirm. */
45 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
46 	       V T1, TK, T6, TA, Tq, TI, Tv, TE, T9, TL, Te, TB, Ti, TH, Tn;
47 	       V TD;
	       /* Inputs 0, 4, 8: x[0] is used as loaded; 4 and 8 go through
	          BYTW.  Keeps the difference (TK), the sum (T6) and
	          VFNMS(0.5, sum, x[0]) (TA). */
48 	       {
49 		    V T5, T3, T4, T2;
50 		    T1 = LD(&(x[0]), ms, &(x[0]));
51 		    T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
52 		    T5 = BYTW(&(W[TWVL * 14]), T4);
53 		    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
54 		    T3 = BYTW(&(W[TWVL * 6]), T2);
55 		    TK = VSUB(T3, T5);
56 		    T6 = VADD(T3, T5);
57 		    TA = VFNMS(LDK(KP500000000), T6, T1);
58 	       }
	       /* Inputs 9, 5, 1 (all through BYTW): difference TI, sum Tv of
	          inputs 1 and 5, and VFNMS(0.5, sum, input 9) in TE. */
59 	       {
60 		    V Tu, Ts, Tp, Tt, Tr;
61 		    Tp = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
62 		    Tq = BYTW(&(W[TWVL * 16]), Tp);
63 		    Tt = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
64 		    Tu = BYTW(&(W[TWVL * 8]), Tt);
65 		    Tr = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
66 		    Ts = BYTW(&(W[0]), Tr);
67 		    TI = VSUB(Tu, Ts);
68 		    Tv = VADD(Ts, Tu);
69 		    TE = VFNMS(LDK(KP500000000), Tv, Tq);
70 	       }
	       /* Inputs 6, 2, 10 (all through BYTW): difference TL, sum Te of
	          inputs 10 and 2, and VFNMS(0.5, sum, input 6) in TB. */
71 	       {
72 		    V Td, Tb, T8, Tc, Ta;
73 		    T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
74 		    T9 = BYTW(&(W[TWVL * 10]), T8);
75 		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
76 		    Td = BYTW(&(W[TWVL * 2]), Tc);
77 		    Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
78 		    Tb = BYTW(&(W[TWVL * 18]), Ta);
79 		    TL = VSUB(Tb, Td);
80 		    Te = VADD(Tb, Td);
81 		    TB = VFNMS(LDK(KP500000000), Te, T9);
82 	       }
	       /* Inputs 3, 11, 7 (all through BYTW): difference TH, sum Tn of
	          inputs 7 and 11, and VFNMS(0.5, sum, input 3) in TD. */
83 	       {
84 		    V Tm, Tk, Th, Tl, Tj;
85 		    Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
86 		    Ti = BYTW(&(W[TWVL * 4]), Th);
87 		    Tl = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
88 		    Tm = BYTW(&(W[TWVL * 20]), Tl);
89 		    Tj = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
90 		    Tk = BYTW(&(W[TWVL * 12]), Tj);
91 		    TH = VSUB(Tk, Tm);
92 		    Tn = VADD(Tk, Tm);
93 		    TD = VFNMS(LDK(KP500000000), Tn, Ti);
94 	       }
	       /* Outputs 3, 0, 9, 6: built from the three-term sums of the
	          four input groups (T7, Tf, To, Tw). */
95 	       {
96 		    V Tg, Ty, Tx, Tz;
97 		    {
98 			 V T7, Tf, To, Tw;
99 			 T7 = VADD(T1, T6);
100 			 Tf = VADD(T9, Te);
101 			 Tg = VSUB(T7, Tf);
102 			 Ty = VADD(T7, Tf);
103 			 To = VADD(Ti, Tn);
104 			 Tw = VADD(Tq, Tv);
105 			 Tx = VSUB(To, Tw);
106 			 Tz = VADD(To, Tw);
107 		    }
108 		    ST(&(x[WS(rs, 3)]), VFNMSI(Tx, Tg), ms, &(x[WS(rs, 1)]));
109 		    ST(&(x[0]), VADD(Ty, Tz), ms, &(x[0]));
110 		    ST(&(x[WS(rs, 9)]), VFMAI(Tx, Tg), ms, &(x[WS(rs, 1)]));
111 		    ST(&(x[WS(rs, 6)]), VSUB(Ty, Tz), ms, &(x[0]));
112 	       }
	       /* Outputs 1, 7, 11, 5: built from the VFNMS results (TA, TB,
	          TD, TE) and the differences (TH, TI, TK, TL), with
	          KP866025403 applied through VFMA / VFNMS. */
113 	       {
114 		    V TS, TW, TV, TX;
115 		    {
116 			 V TQ, TR, TT, TU;
117 			 TQ = VSUB(TA, TB);
118 			 TR = VADD(TH, TI);
119 			 TS = VFNMS(LDK(KP866025403), TR, TQ);
120 			 TW = VFMA(LDK(KP866025403), TR, TQ);
121 			 TT = VSUB(TD, TE);
122 			 TU = VSUB(TK, TL);
123 			 TV = VFMA(LDK(KP866025403), TU, TT);
124 			 TX = VFNMS(LDK(KP866025403), TU, TT);
125 		    }
126 		    ST(&(x[WS(rs, 1)]), VFMAI(TV, TS), ms, &(x[WS(rs, 1)]));
127 		    ST(&(x[WS(rs, 7)]), VFNMSI(TX, TW), ms, &(x[WS(rs, 1)]));
128 		    ST(&(x[WS(rs, 11)]), VFNMSI(TV, TS), ms, &(x[WS(rs, 1)]));
129 		    ST(&(x[WS(rs, 5)]), VFMAI(TX, TW), ms, &(x[WS(rs, 1)]));
130 	       }
	       /* Outputs 10, 4, 2, 8: built from the same intermediates, here
	          with KP866025403 applied through plain VMUL. */
131 	       {
132 		    V TG, TO, TN, TP;
133 		    {
134 			 V TC, TF, TJ, TM;
135 			 TC = VADD(TA, TB);
136 			 TF = VADD(TD, TE);
137 			 TG = VSUB(TC, TF);
138 			 TO = VADD(TC, TF);
139 			 TJ = VSUB(TH, TI);
140 			 TM = VADD(TK, TL);
141 			 TN = VMUL(LDK(KP866025403), VSUB(TJ, TM));
142 			 TP = VMUL(LDK(KP866025403), VADD(TM, TJ));
143 		    }
144 		    ST(&(x[WS(rs, 10)]), VFNMSI(TN, TG), ms, &(x[0]));
145 		    ST(&(x[WS(rs, 4)]), VFMAI(TP, TO), ms, &(x[0]));
146 		    ST(&(x[WS(rs, 2)]), VFMAI(TN, TG), ms, &(x[0]));
147 		    ST(&(x[WS(rs, 8)]), VFNMSI(TP, TO), ms, &(x[0]));
148 	       }
149 	  }
150      }
151      VLEAVE();
152 }
153 
/*
 * Twiddle instruction table referenced by desc: one VTW(0, k) entry for
 * each k = 1..11, closed by a { TW_NEXT, VL, 0 } entry.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably each VTW entry describes the twiddle for input k and TW_NEXT
 * marks the step to the next group of VL transforms - confirm.
 */
154 static const tw_instr twinstr[] = {
155      VTW(0, 1),
156      VTW(0, 2),
157      VTW(0, 3),
158      VTW(0, 4),
159      VTW(0, 5),
160      VTW(0, 6),
161      VTW(0, 7),
162      VTW(0, 8),
163      VTW(0, 9),
164      VTW(0, 10),
165      VTW(0, 11),
166      { TW_NEXT, VL, 0 }
167 };
168 
/*
 * Codelet descriptor handed to the registration call: the value 12, the
 * name string "t1bv_12", the twinstr table, &GENUS, and the tuple
 * { 41, 24, 18, 0 }, whose first three numbers match the "41 additions,
 * 24 multiplications, 18 fused multiply/add" count in the generator
 * comment of this branch.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is not visible here - confirm.
 */
169 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, { 41, 24, 18, 0 }, 0, 0, 0 };
170 
/*
 * Registration entry point (FMA branch): passes the planner p, the
 * t1bv_12 function and its descriptor desc to X(kdft_dit_register).
 * NOTE(review): the text in front of "void" on the next line repeats the
 * function name plus a number and looks like extraction residue - confirm
 * against the upstream file.
 */
XSIMD(codelet_t1bv_12)171 void XSIMD(codelet_t1bv_12) (planner *p) {
172      X(kdft_dit_register) (p, t1bv_12, &desc);
173 }
174 #else
175 
176 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 12 -name t1bv_12 -include dft/simd/t1b.h -sign 1 */
177 
178 /*
179  * This function contains 59 FP additions, 30 FP multiplications,
180  * (or, 55 additions, 26 multiplications, 4 fused multiply/add),
181  * 28 stack variables, 2 constants, and 24 memory accesses
182  */
183 #include "dft/simd/t1b.h"
184 
/*
 * t1bv_12 (non-FMA branch): in-place 12-point twiddle codelet body.  Per
 * the generator command line recorded in this file it was produced with
 * "-n 12 -sign 1 -simd" (no -fma).
 *
 * Parameters:
 *   ri     - not referenced by name anywhere in this body.
 *   ii     - base pointer of the data; copied into x, which is both read
 *            and written.
 *   W      - twiddle table; input k (k = 1..11) is passed through BYTW
 *            together with &W[TWVL * 2 * (k - 1)].
 *   rs     - stride between the 12 points, applied through WS(rs, k).
 *   mb, me - the loop runs from m = mb while m < me, in steps of VL.
 *   ms     - x advances by VL * ms per iteration; also forwarded to LD/ST.
 *
 * Each iteration loads x[WS(rs, k)] for k = 0..11, combines them, and
 * stores 12 results back to the same 12 locations.
 *
 * NOTE(review): LD, ST, BYTW, VFNMS, VBYI, DVK, LDK, MAKE_VOLATILE_STRIDE
 * and VLEAVE come from headers not visible here; their exact semantics
 * should be confirmed there.
 * NOTE(review): the leading number on every line, and the repeated
 * signature text in front of "static void" below, look like extraction
 * residue rather than C source - confirm against the upstream file.
 */
t1bv_12(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)185 static void t1bv_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
186 {
     /* The two numeric constants used below: 0.866025403... (numerically
        sqrt(3)/2) and 0.5. */
187      DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
188      DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
189      {
190 	  INT m;
191 	  R *x;
192 	  x = ii;
	  /* W is first offset by mb * ((TWVL / VL) * 22); every pass then
	     steps m by VL, x by VL * ms and W by TWVL * 22.
	     NOTE(review): 22 = 2 * 11, presumably two R values for each of
	     the 11 twiddle entries - confirm. */
193 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 22)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(12, rs)) {
194 	       V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty;
195 	       V Tz;
	       /* Inputs 0, 4, 8: x[0] is used as loaded; 4 and 8 go through
	          BYTW.  Keeps the difference (Tt), the sum (T6) and
	          VFNMS(0.5, sum, x[0]) (T7). */
196 	       {
197 		    V T5, T3, T4, T2;
198 		    T1 = LD(&(x[0]), ms, &(x[0]));
199 		    T4 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
200 		    T5 = BYTW(&(W[TWVL * 14]), T4);
201 		    T2 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
202 		    T3 = BYTW(&(W[TWVL * 6]), T2);
203 		    Tt = VSUB(T3, T5);
204 		    T6 = VADD(T3, T5);
205 		    T7 = VFNMS(LDK(KP500000000), T6, T1);
206 	       }
	       /* Inputs 1, 9, 5 (all through BYTW): difference Tq, sum TC of
	          inputs 1 and 5, and VFNMS(0.5, sum, input 9) in TD. */
207 	       {
208 		    V Tn, Tp, Tm, TA, To;
209 		    Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
210 		    Tn = BYTW(&(W[0]), Tm);
211 		    TA = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
212 		    TB = BYTW(&(W[TWVL * 16]), TA);
213 		    To = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
214 		    Tp = BYTW(&(W[TWVL * 8]), To);
215 		    Tq = VSUB(Tn, Tp);
216 		    TC = VADD(Tn, Tp);
217 		    TD = VFNMS(LDK(KP500000000), TC, TB);
218 	       }
	       /* Inputs 6, 2, 10 (all through BYTW): difference Tu, sum Te of
	          inputs 10 and 2, and VFNMS(0.5, sum, input 6) in Tf. */
219 	       {
220 		    V Td, Tb, T8, Tc, Ta;
221 		    T8 = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
222 		    T9 = BYTW(&(W[TWVL * 10]), T8);
223 		    Tc = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
224 		    Td = BYTW(&(W[TWVL * 2]), Tc);
225 		    Ta = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
226 		    Tb = BYTW(&(W[TWVL * 18]), Ta);
227 		    Tu = VSUB(Tb, Td);
228 		    Te = VADD(Tb, Td);
229 		    Tf = VFNMS(LDK(KP500000000), Te, T9);
230 	       }
	       /* Inputs 7, 3, 11 (all through BYTW): difference Tl, sum Ty of
	          inputs 7 and 11, and VFNMS(0.5, sum, input 3) in Tz. */
231 	       {
232 		    V Ti, Tk, Th, Tw, Tj;
233 		    Th = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
234 		    Ti = BYTW(&(W[TWVL * 12]), Th);
235 		    Tw = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
236 		    Tx = BYTW(&(W[TWVL * 4]), Tw);
237 		    Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
238 		    Tk = BYTW(&(W[TWVL * 20]), Tj);
239 		    Tl = VSUB(Ti, Tk);
240 		    Ty = VADD(Ti, Tk);
241 		    Tz = VFNMS(LDK(KP500000000), Ty, Tx);
242 	       }
	       /* Outputs 11, 5, 1, 7: built from the VFNMS results (T7, Tf,
	          Tz, TD) and the differences (Tl, Tq, Tt, Tu), using explicit
	          VMUL by KP866025403 and VBYI. */
243 	       {
244 		    V Ts, TG, TF, TH;
245 		    {
246 			 V Tg, Tr, Tv, TE;
247 			 Tg = VSUB(T7, Tf);
248 			 Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq));
249 			 Ts = VSUB(Tg, Tr);
250 			 TG = VADD(Tg, Tr);
251 			 Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu));
252 			 TE = VSUB(Tz, TD);
253 			 TF = VBYI(VADD(Tv, TE));
254 			 TH = VBYI(VSUB(TE, Tv));
255 		    }
256 		    ST(&(x[WS(rs, 11)]), VSUB(Ts, TF), ms, &(x[WS(rs, 1)]));
257 		    ST(&(x[WS(rs, 5)]), VADD(TG, TH), ms, &(x[WS(rs, 1)]));
258 		    ST(&(x[WS(rs, 1)]), VADD(Ts, TF), ms, &(x[WS(rs, 1)]));
259 		    ST(&(x[WS(rs, 7)]), VSUB(TG, TH), ms, &(x[WS(rs, 1)]));
260 	       }
	       /* Outputs 3, 0, 9, 6: built from the three-term sums of the
	          four input groups (TQ, TR, TT, TU). */
261 	       {
262 		    V TS, TW, TV, TX;
263 		    {
264 			 V TQ, TR, TT, TU;
265 			 TQ = VADD(T1, T6);
266 			 TR = VADD(T9, Te);
267 			 TS = VSUB(TQ, TR);
268 			 TW = VADD(TQ, TR);
269 			 TT = VADD(Tx, Ty);
270 			 TU = VADD(TB, TC);
271 			 TV = VBYI(VSUB(TT, TU));
272 			 TX = VADD(TT, TU);
273 		    }
274 		    ST(&(x[WS(rs, 3)]), VSUB(TS, TV), ms, &(x[WS(rs, 1)]));
275 		    ST(&(x[0]), VADD(TW, TX), ms, &(x[0]));
276 		    ST(&(x[WS(rs, 9)]), VADD(TS, TV), ms, &(x[WS(rs, 1)]));
277 		    ST(&(x[WS(rs, 6)]), VSUB(TW, TX), ms, &(x[0]));
278 	       }
	       /* Outputs 2, 8, 10, 4: built from the differences (Tl, Tq, Tt,
	          Tu) scaled by KP866025403 and passed through VBYI, plus the
	          VFNMS results (T7, Tf, Tz, TD). */
279 	       {
280 		    V TK, TO, TN, TP;
281 		    {
282 			 V TI, TJ, TL, TM;
283 			 TI = VADD(Tl, Tq);
284 			 TJ = VADD(Tt, Tu);
285 			 TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
286 			 TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
287 			 TL = VADD(T7, Tf);
288 			 TM = VADD(Tz, TD);
289 			 TN = VSUB(TL, TM);
290 			 TP = VADD(TL, TM);
291 		    }
292 		    ST(&(x[WS(rs, 2)]), VADD(TK, TN), ms, &(x[0]));
293 		    ST(&(x[WS(rs, 8)]), VSUB(TP, TO), ms, &(x[0]));
294 		    ST(&(x[WS(rs, 10)]), VSUB(TN, TK), ms, &(x[0]));
295 		    ST(&(x[WS(rs, 4)]), VADD(TO, TP), ms, &(x[0]));
296 	       }
297 	  }
298      }
299      VLEAVE();
300 }
301 
/*
 * Twiddle instruction table referenced by desc: one VTW(0, k) entry for
 * each k = 1..11, closed by a { TW_NEXT, VL, 0 } entry.
 * NOTE(review): VTW and TW_NEXT are defined in headers not shown here;
 * presumably each VTW entry describes the twiddle for input k and TW_NEXT
 * marks the step to the next group of VL transforms - confirm.
 */
302 static const tw_instr twinstr[] = {
303      VTW(0, 1),
304      VTW(0, 2),
305      VTW(0, 3),
306      VTW(0, 4),
307      VTW(0, 5),
308      VTW(0, 6),
309      VTW(0, 7),
310      VTW(0, 8),
311      VTW(0, 9),
312      VTW(0, 10),
313      VTW(0, 11),
314      { TW_NEXT, VL, 0 }
315 };
316 
/*
 * Codelet descriptor handed to the registration call: the value 12, the
 * name string "t1bv_12", the twinstr table, &GENUS, and the tuple
 * { 55, 26, 4, 0 }, whose first three numbers match the "55 additions,
 * 26 multiplications, 4 fused multiply/add" count in the generator
 * comment of this branch.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is not visible here - confirm.
 */
317 static const ct_desc desc = { 12, XSIMD_STRING("t1bv_12"), twinstr, &GENUS, { 55, 26, 4, 0 }, 0, 0, 0 };
318 
/*
 * Registration entry point (non-FMA branch): passes the planner p, the
 * t1bv_12 function and its descriptor desc to X(kdft_dit_register).
 * NOTE(review): the text in front of "void" on the next line repeats the
 * function name plus a number and looks like extraction residue - confirm
 * against the upstream file.
 */
XSIMD(codelet_t1bv_12)319 void XSIMD(codelet_t1bv_12) (planner *p) {
320      X(kdft_dit_register) (p, t1bv_12, &desc);
321 }
322 #endif
323