1 /*
2  * Copyright (c) 2003, 2007-14 Matteo Frigo
3  * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  */
20 
21 /* This file was automatically generated --- DO NOT EDIT */
22 /* Generated on Thu Dec 10 07:05:32 EST 2020 */
23 
24 #include "dft/codelet-dft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_twiddle_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include dft/simd/t2b.h -sign 1 */
29 
30 /*
31  * This function contains 87 FP additions, 64 FP multiplications,
32  * (or, 53 additions, 30 multiplications, 34 fused multiply/add),
33  * 36 stack variables, 3 constants, and 32 memory accesses
34  */
35 #include "dft/simd/t2b.h"
36 
/*
 * t2bv_16 (FMA variant): size-16 SIMD twiddle codelet.  This body is
 * machine-generated (see the generator command line in the comment above:
 * -fma -n 16 -sign 1); regenerate it rather than editing by hand.
 *
 * Parameters, as used by the visible code:
 *   ri      not referenced anywhere in this body.
 *   ii      base pointer of the data; aliased as x, loaded from and stored
 *           back to at the same indices (in-place update).
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 30) before the
 *           first iteration and advanced by TWVL * 30 per iteration.
 *   rs      stride handed to WS() to address the 16 elements of one transform.
 *   mb, me  loop bounds: m starts at mb and steps by VL while m < me.
 *   ms      per-iteration stride (x advances by VL * ms); also passed to
 *           every LD/ST call.
 *
 * Each iteration loads x[0] and x[WS(rs, k)] for k = 1..15, passes input k
 * (k >= 1) through BYTW with W[TWVL * 2 * (k - 1)], combines the values with
 * VADD/VSUB/VFMA/VFNMS, and stores 16 results back into x.
 *
 * NOTE(review): LD, ST, BYTW, WS, VFMA, VFNMS, VFMAI, VFNMSI, LDK, DVK,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file (presumably
 * via dft/simd/t2b.h); their exact semantics are not visible here.
 * NOTE(review): the text in front of `static` on the signature line and the
 * leading number on each line look like listing residue -- confirm against
 * the upstream source.
 */
t2bv_16(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)37 static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
38 {
     /* Numeric constants: 0.92387... = cos(pi/8), 0.70710... = sqrt(2)/2,
      * 0.41421... = tan(pi/8) = sqrt(2) - 1. */
39      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
40      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
41      DVK(KP414213562, +0.414213562373095048801688724209698078569671875);
42      {
43 	  INT m;
44 	  R *x;
	  /* Only the ii pointer is used as the data base. */
45 	  x = ii;
	  /* The increment clause advances m, x and W together and also
	   * evaluates MAKE_VOLATILE_STRIDE(16, rs) (definition not visible). */
46 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
47 	       V T4, TW, T9, T19, TD, TI, TZ, T1a, Tf, Tk, Tl, T13, T1c, Tq, Tv;
48 	       V Tw, T16, T1d, T1, T3, T2;
	       /* Inputs 0 and 8: input 0 is used as loaded, input 8 goes
	        * through BYTW; keep their sum (T4) and difference (TW). */
49 	       T1 = LD(&(x[0]), ms, &(x[0]));
50 	       T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
51 	       T3 = BYTW(&(W[TWVL * 14]), T2);
52 	       T4 = VADD(T1, T3);
53 	       TW = VSUB(T1, T3);
	       /* Inputs 4 and 12: sum (T9) and difference (T19). */
54 	       {
55 		    V T6, T8, T5, T7;
56 		    T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
57 		    T6 = BYTW(&(W[TWVL * 6]), T5);
58 		    T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
59 		    T8 = BYTW(&(W[TWVL * 22]), T7);
60 		    T9 = VADD(T6, T8);
61 		    T19 = VSUB(T6, T8);
62 	       }
	       /* Inputs 2, 6, 10, 14: pair sums (TD, TI) plus the sum and
	        * difference (TZ, T1a) of the two pair differences. */
63 	       {
64 		    V TA, TH, TC, TF, TX, TY;
65 		    {
66 			 V Tz, TG, TB, TE;
67 			 Tz = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
68 			 TA = BYTW(&(W[TWVL * 2]), Tz);
69 			 TG = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
70 			 TH = BYTW(&(W[TWVL * 10]), TG);
71 			 TB = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
72 			 TC = BYTW(&(W[TWVL * 18]), TB);
73 			 TE = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
74 			 TF = BYTW(&(W[TWVL * 26]), TE);
75 		    }
76 		    TD = VADD(TA, TC);
77 		    TI = VADD(TF, TH);
78 		    TX = VSUB(TA, TC);
79 		    TY = VSUB(TF, TH);
80 		    TZ = VADD(TX, TY);
81 		    T1a = VSUB(TX, TY);
82 	       }
	       /* Inputs 1, 5, 9, 13: pair sums (Tf, Tk), their difference (Tl),
	        * and two KP414213562-weighted combinations of the pair
	        * differences (T13, T1c). */
83 	       {
84 		    V Tc, Tj, Te, Th, T11, T12;
85 		    {
86 			 V Tb, Ti, Td, Tg;
87 			 Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
88 			 Tc = BYTW(&(W[0]), Tb);
89 			 Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
90 			 Tj = BYTW(&(W[TWVL * 24]), Ti);
91 			 Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
92 			 Te = BYTW(&(W[TWVL * 16]), Td);
93 			 Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
94 			 Th = BYTW(&(W[TWVL * 8]), Tg);
95 		    }
96 		    Tf = VADD(Tc, Te);
97 		    Tk = VADD(Th, Tj);
98 		    Tl = VSUB(Tf, Tk);
99 		    T11 = VSUB(Tc, Te);
100 		    T12 = VSUB(Th, Tj);
101 		    T13 = VFNMS(LDK(KP414213562), T12, T11);
102 		    T1c = VFMA(LDK(KP414213562), T11, T12);
103 	       }
	       /* Inputs 3, 7, 11, 15: same pattern as the previous group; note
	        * that T15 is formed as (input 11) - (input 3), i.e. Tu - Ts. */
104 	       {
105 		    V Tn, Tu, Tp, Ts, T14, T15;
106 		    {
107 			 V Tm, Tt, To, Tr;
108 			 Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
109 			 Tn = BYTW(&(W[TWVL * 28]), Tm);
110 			 Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
111 			 Tu = BYTW(&(W[TWVL * 20]), Tt);
112 			 To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
113 			 Tp = BYTW(&(W[TWVL * 12]), To);
114 			 Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
115 			 Ts = BYTW(&(W[TWVL * 4]), Tr);
116 		    }
117 		    Tq = VADD(Tn, Tp);
118 		    Tv = VADD(Ts, Tu);
119 		    Tw = VSUB(Tq, Tv);
120 		    T14 = VSUB(Tn, Tp);
121 		    T15 = VSUB(Tu, Ts);
122 		    T16 = VFNMS(LDK(KP414213562), T15, T14);
123 		    T1d = VFMA(LDK(KP414213562), T14, T15);
124 	       }
	       /* Outputs 2, 6, 10, 14: built from T4 - T9, TD - TI and
	        * Tl +/- Tw, each scaled through KP707106781. */
125 	       {
126 		    V Ty, TM, TL, TN;
127 		    {
128 			 V Ta, Tx, TJ, TK;
129 			 Ta = VSUB(T4, T9);
130 			 Tx = VADD(Tl, Tw);
131 			 Ty = VFNMS(LDK(KP707106781), Tx, Ta);
132 			 TM = VFMA(LDK(KP707106781), Tx, Ta);
133 			 TJ = VSUB(TD, TI);
134 			 TK = VSUB(Tl, Tw);
135 			 TL = VFNMS(LDK(KP707106781), TK, TJ);
136 			 TN = VFMA(LDK(KP707106781), TK, TJ);
137 		    }
138 		    ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0]));
139 		    ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0]));
140 		    ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0]));
141 		    ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0]));
142 	       }
	       /* Outputs 3, 5, 11, 13: built from TW, TZ, T19, T1a and the
	        * odd-input terms T1c + T1d and T13 - T16. */
143 	       {
144 		    V T1k, T1o, T1n, T1p;
145 		    {
146 			 V T1i, T1j, T1l, T1m;
147 			 T1i = VFNMS(LDK(KP707106781), TZ, TW);
148 			 T1j = VADD(T1c, T1d);
149 			 T1k = VFNMS(LDK(KP923879532), T1j, T1i);
150 			 T1o = VFMA(LDK(KP923879532), T1j, T1i);
151 			 T1l = VFNMS(LDK(KP707106781), T1a, T19);
152 			 T1m = VSUB(T13, T16);
153 			 T1n = VFMA(LDK(KP923879532), T1m, T1l);
154 			 T1p = VFNMS(LDK(KP923879532), T1m, T1l);
155 		    }
156 		    ST(&(x[WS(rs, 5)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)]));
157 		    ST(&(x[WS(rs, 13)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)]));
158 		    ST(&(x[WS(rs, 11)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)]));
159 		    ST(&(x[WS(rs, 3)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)]));
160 	       }
	       /* Outputs 0, 4, 8, 12: built only from the pair sums; outputs 0
	        * and 8 use plain VADD/VSUB with no constant multiply. */
161 	       {
162 		    V TQ, TU, TT, TV;
163 		    {
164 			 V TO, TP, TR, TS;
165 			 TO = VADD(T4, T9);
166 			 TP = VADD(TD, TI);
167 			 TQ = VSUB(TO, TP);
168 			 TU = VADD(TO, TP);
169 			 TR = VADD(Tf, Tk);
170 			 TS = VADD(Tq, Tv);
171 			 TT = VSUB(TR, TS);
172 			 TV = VADD(TR, TS);
173 		    }
174 		    ST(&(x[WS(rs, 12)]), VFNMSI(TT, TQ), ms, &(x[0]));
175 		    ST(&(x[0]), VADD(TU, TV), ms, &(x[0]));
176 		    ST(&(x[WS(rs, 4)]), VFMAI(TT, TQ), ms, &(x[0]));
177 		    ST(&(x[WS(rs, 8)]), VSUB(TU, TV), ms, &(x[0]));
178 	       }
	       /* Outputs 1, 7, 9, 15: counterpart of the 3/5/11/13 group, using
	        * VFMA where that group used VFNMS on TZ and T1a, and the
	        * odd-input terms T13 + T16 and T1c - T1d. */
179 	       {
180 		    V T18, T1g, T1f, T1h;
181 		    {
182 			 V T10, T17, T1b, T1e;
183 			 T10 = VFMA(LDK(KP707106781), TZ, TW);
184 			 T17 = VADD(T13, T16);
185 			 T18 = VFNMS(LDK(KP923879532), T17, T10);
186 			 T1g = VFMA(LDK(KP923879532), T17, T10);
187 			 T1b = VFMA(LDK(KP707106781), T1a, T19);
188 			 T1e = VSUB(T1c, T1d);
189 			 T1f = VFNMS(LDK(KP923879532), T1e, T1b);
190 			 T1h = VFMA(LDK(KP923879532), T1e, T1b);
191 		    }
192 		    ST(&(x[WS(rs, 7)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)]));
193 		    ST(&(x[WS(rs, 1)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)]));
194 		    ST(&(x[WS(rs, 9)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)]));
195 		    ST(&(x[WS(rs, 15)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)]));
196 	       }
197 	  }
198      }
     /* Invoked once after the loop; definition not visible here. */
199      VLEAVE();
200 }
201 
/*
 * Twiddle instruction table referenced by desc (FMA variant): one
 * VTW(0, k) entry for each k = 1..15, closed by a { TW_NEXT, VL, 0 } record.
 * The 15 entries line up with the 15 BYTW() calls in t2bv_16 (inputs 1..15).
 * NOTE(review): the expansion of VTW and the meaning of the tw_instr fields
 * are defined outside this file -- confirm there before changing entries.
 */
202 static const tw_instr twinstr[] = {
203      VTW(0, 1),
204      VTW(0, 2),
205      VTW(0, 3),
206      VTW(0, 4),
207      VTW(0, 5),
208      VTW(0, 6),
209      VTW(0, 7),
210      VTW(0, 8),
211      VTW(0, 9),
212      VTW(0, 10),
213      VTW(0, 11),
214      VTW(0, 12),
215      VTW(0, 13),
216      VTW(0, 14),
217      VTW(0, 15),
218      { TW_NEXT, VL, 0 }
219 };
220 
/*
 * Codelet descriptor passed to the registration call below (FMA variant).
 * Initializers, in order: 16 (matches the generator's -n 16), the name
 * string "t2bv_16", the twinstr table, &GENUS, and { 53, 30, 34, 0 }, whose
 * first three numbers match the additions / multiplications / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared outside this file.
 */
221 static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, { 53, 30, 34, 0 }, 0, 0, 0 };
222 
/*
 * Registration entry point (FMA variant): hands the t2bv_16 kernel and its
 * descriptor desc to X(kdft_dit_register) for planner p.  This is the only
 * non-static symbol defined in this branch of the #if.
 * NOTE(review): the text in front of `void` on the signature line looks like
 * listing residue -- confirm against the upstream source.
 */
XSIMD(codelet_t2bv_16)223 void XSIMD(codelet_t2bv_16) (planner *p) {
224      X(kdft_dit_register) (p, t2bv_16, &desc);
225 }
226 #else
227 
228 /* Generated by: ../../../genfft/gen_twiddle_c.native -simd -compact -variables 4 -pipeline-latency 8 -n 16 -name t2bv_16 -include dft/simd/t2b.h -sign 1 */
229 
230 /*
231  * This function contains 87 FP additions, 42 FP multiplications,
232  * (or, 83 additions, 38 multiplications, 4 fused multiply/add),
233  * 36 stack variables, 3 constants, and 32 memory accesses
234  */
235 #include "dft/simd/t2b.h"
236 
/*
 * t2bv_16 (non-FMA variant): size-16 SIMD twiddle codelet, compiled when
 * neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.  This
 * body is machine-generated (see the generator command line in the comment
 * above: -n 16 -sign 1, without -fma); regenerate it rather than editing by
 * hand.
 *
 * Parameters, as used by the visible code:
 *   ri      not referenced anywhere in this body.
 *   ii      base pointer of the data; aliased as x, loaded from and stored
 *           back to at the same indices (in-place update).
 *   W       twiddle table; offset by mb * ((TWVL / VL) * 30) before the
 *           first iteration and advanced by TWVL * 30 per iteration.
 *   rs      stride handed to WS() to address the 16 elements of one transform.
 *   mb, me  loop bounds: m starts at mb and steps by VL while m < me.
 *   ms      per-iteration stride (x advances by VL * ms); also passed to
 *           every LD/ST call.
 *
 * Each iteration loads x[0] and x[WS(rs, k)] for k = 1..15, passes input k
 * (k >= 1) through BYTW with W[TWVL * 2 * (k - 1)], combines the values with
 * VADD/VSUB/VMUL/VFMA/VFNMS/VBYI, and stores 16 results back into x.
 *
 * NOTE(review): LD, ST, BYTW, WS, VMUL, VFMA, VFNMS, VBYI, LDK, DVK,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this file (presumably
 * via dft/simd/t2b.h); their exact semantics are not visible here.
 * NOTE(review): the text in front of `static` on the signature line and the
 * leading number on each line look like listing residue -- confirm against
 * the upstream source.
 */
t2bv_16(R * ri,R * ii,const R * W,stride rs,INT mb,INT me,INT ms)237 static void t2bv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
238 {
     /* Numeric constants: 0.38268... = sin(pi/8), 0.92387... = cos(pi/8),
      * 0.70710... = sqrt(2)/2. */
239      DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
240      DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
241      DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
242      {
243 	  INT m;
244 	  R *x;
	  /* Only the ii pointer is used as the data base. */
245 	  x = ii;
	  /* The increment clause advances m, x and W together and also
	   * evaluates MAKE_VOLATILE_STRIDE(16, rs) (definition not visible). */
246 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) {
247 	       V TJ, T1b, TD, T1c, T17, T18, Ty, TK, T10, T11, T12, Tb, TM, T13, T14;
248 	       V T15, Tm, TN, TG, TI, TH;
	       /* Inputs 0 and 8: input 0 is used as loaded, input 8 goes
	        * through BYTW; keep their difference (TJ) and sum (T1b). */
249 	       TG = LD(&(x[0]), ms, &(x[0]));
250 	       TH = LD(&(x[WS(rs, 8)]), ms, &(x[0]));
251 	       TI = BYTW(&(W[TWVL * 14]), TH);
252 	       TJ = VSUB(TG, TI);
253 	       T1b = VADD(TG, TI);
	       /* Inputs 4 and 12: difference (TD) and sum (T1c). */
254 	       {
255 		    V TA, TC, Tz, TB;
256 		    Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
257 		    TA = BYTW(&(W[TWVL * 6]), Tz);
258 		    TB = LD(&(x[WS(rs, 12)]), ms, &(x[0]));
259 		    TC = BYTW(&(W[TWVL * 22]), TB);
260 		    TD = VSUB(TA, TC);
261 		    T1c = VADD(TA, TC);
262 	       }
	       /* Inputs 2, 6, 10, 14: pair sums (T17, T18) plus the difference
	        * and sum of the pair differences, each multiplied by
	        * KP707106781 (Ty, TK). */
263 	       {
264 		    V Tp, Tw, Tr, Tu, Ts, Tx;
265 		    {
266 			 V To, Tv, Tq, Tt;
267 			 To = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
268 			 Tp = BYTW(&(W[TWVL * 2]), To);
269 			 Tv = LD(&(x[WS(rs, 6)]), ms, &(x[0]));
270 			 Tw = BYTW(&(W[TWVL * 10]), Tv);
271 			 Tq = LD(&(x[WS(rs, 10)]), ms, &(x[0]));
272 			 Tr = BYTW(&(W[TWVL * 18]), Tq);
273 			 Tt = LD(&(x[WS(rs, 14)]), ms, &(x[0]));
274 			 Tu = BYTW(&(W[TWVL * 26]), Tt);
275 		    }
276 		    T17 = VADD(Tp, Tr);
277 		    T18 = VADD(Tu, Tw);
278 		    Ts = VSUB(Tp, Tr);
279 		    Tx = VSUB(Tu, Tw);
280 		    Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx));
281 		    TK = VMUL(LDK(KP707106781), VADD(Ts, Tx));
282 	       }
	       /* Inputs 1, 5, 9, 13: pair sums (T10, T11), their difference
	        * (T12), and two combinations of the pair differences weighted
	        * by KP923879532 and KP382683432 (Tb, TM). */
283 	       {
284 		    V T2, T9, T4, T7, T5, Ta;
285 		    {
286 			 V T1, T8, T3, T6;
287 			 T1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
288 			 T2 = BYTW(&(W[0]), T1);
289 			 T8 = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)]));
290 			 T9 = BYTW(&(W[TWVL * 24]), T8);
291 			 T3 = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)]));
292 			 T4 = BYTW(&(W[TWVL * 16]), T3);
293 			 T6 = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)]));
294 			 T7 = BYTW(&(W[TWVL * 8]), T6);
295 		    }
296 		    T10 = VADD(T2, T4);
297 		    T11 = VADD(T7, T9);
298 		    T12 = VSUB(T10, T11);
299 		    T5 = VSUB(T2, T4);
300 		    Ta = VSUB(T7, T9);
301 		    Tb = VFNMS(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), T5));
302 		    TM = VFMA(LDK(KP382683432), T5, VMUL(LDK(KP923879532), Ta));
303 	       }
	       /* Inputs 3, 7, 11, 15: pair sums (T13, T14), their difference
	        * (T15), and the weighted combinations Tm and TN. */
304 	       {
305 		    V Td, Tk, Tf, Ti, Tg, Tl;
306 		    {
307 			 V Tc, Tj, Te, Th;
308 			 Tc = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)]));
309 			 Td = BYTW(&(W[TWVL * 28]), Tc);
310 			 Tj = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)]));
311 			 Tk = BYTW(&(W[TWVL * 20]), Tj);
312 			 Te = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)]));
313 			 Tf = BYTW(&(W[TWVL * 12]), Te);
314 			 Th = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
315 			 Ti = BYTW(&(W[TWVL * 4]), Th);
316 		    }
317 		    T13 = VADD(Td, Tf);
318 		    T14 = VADD(Ti, Tk);
319 		    T15 = VSUB(T13, T14);
320 		    Tg = VSUB(Td, Tf);
321 		    Tl = VSUB(Ti, Tk);
322 		    Tm = VFMA(LDK(KP923879532), Tg, VMUL(LDK(KP382683432), Tl));
323 		    TN = VFNMS(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl));
324 	       }
	       /* Outputs 2, 6, 10, 14: built from T1b - T1c, T17 - T18 and
	        * KP707106781 * (T12 +/- T15); T1a and T1g go through VBYI. */
325 	       {
326 		    V T1a, T1g, T1f, T1h;
327 		    {
328 			 V T16, T19, T1d, T1e;
329 			 T16 = VMUL(LDK(KP707106781), VSUB(T12, T15));
330 			 T19 = VSUB(T17, T18);
331 			 T1a = VBYI(VSUB(T16, T19));
332 			 T1g = VBYI(VADD(T19, T16));
333 			 T1d = VSUB(T1b, T1c);
334 			 T1e = VMUL(LDK(KP707106781), VADD(T12, T15));
335 			 T1f = VSUB(T1d, T1e);
336 			 T1h = VADD(T1d, T1e);
337 		    }
338 		    ST(&(x[WS(rs, 6)]), VADD(T1a, T1f), ms, &(x[0]));
339 		    ST(&(x[WS(rs, 14)]), VSUB(T1h, T1g), ms, &(x[0]));
340 		    ST(&(x[WS(rs, 10)]), VSUB(T1f, T1a), ms, &(x[0]));
341 		    ST(&(x[WS(rs, 2)]), VADD(T1g, T1h), ms, &(x[0]));
342 	       }
	       /* Outputs 0, 4, 8, 12: built only from the pair sums; no
	        * constant multiplies are involved in this group. */
343 	       {
344 		    V T1k, T1o, T1n, T1p;
345 		    {
346 			 V T1i, T1j, T1l, T1m;
347 			 T1i = VADD(T1b, T1c);
348 			 T1j = VADD(T17, T18);
349 			 T1k = VSUB(T1i, T1j);
350 			 T1o = VADD(T1i, T1j);
351 			 T1l = VADD(T10, T11);
352 			 T1m = VADD(T13, T14);
353 			 T1n = VBYI(VSUB(T1l, T1m));
354 			 T1p = VADD(T1l, T1m);
355 		    }
356 		    ST(&(x[WS(rs, 12)]), VSUB(T1k, T1n), ms, &(x[0]));
357 		    ST(&(x[0]), VADD(T1o, T1p), ms, &(x[0]));
358 		    ST(&(x[WS(rs, 4)]), VADD(T1k, T1n), ms, &(x[0]));
359 		    ST(&(x[WS(rs, 8)]), VSUB(T1o, T1p), ms, &(x[0]));
360 	       }
	       /* Outputs 3, 5, 11, 13: built from Tb - Tm, Ty - TD, TJ - TK
	        * and TM - TN. */
361 	       {
362 		    V TF, TQ, TP, TR;
363 		    {
364 			 V Tn, TE, TL, TO;
365 			 Tn = VSUB(Tb, Tm);
366 			 TE = VSUB(Ty, TD);
367 			 TF = VBYI(VSUB(Tn, TE));
368 			 TQ = VBYI(VADD(TE, Tn));
369 			 TL = VSUB(TJ, TK);
370 			 TO = VSUB(TM, TN);
371 			 TP = VSUB(TL, TO);
372 			 TR = VADD(TL, TO);
373 		    }
374 		    ST(&(x[WS(rs, 5)]), VADD(TF, TP), ms, &(x[WS(rs, 1)]));
375 		    ST(&(x[WS(rs, 13)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)]));
376 		    ST(&(x[WS(rs, 11)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)]));
377 		    ST(&(x[WS(rs, 3)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)]));
378 	       }
	       /* Outputs 1, 7, 9, 15: built from TJ + TK, Tb + Tm, TD + Ty
	        * and TM + TN. */
379 	       {
380 		    V TU, TY, TX, TZ;
381 		    {
382 			 V TS, TT, TV, TW;
383 			 TS = VADD(TJ, TK);
384 			 TT = VADD(Tb, Tm);
385 			 TU = VADD(TS, TT);
386 			 TY = VSUB(TS, TT);
387 			 TV = VADD(TD, Ty);
388 			 TW = VADD(TM, TN);
389 			 TX = VBYI(VADD(TV, TW));
390 			 TZ = VBYI(VSUB(TW, TV));
391 		    }
392 		    ST(&(x[WS(rs, 15)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)]));
393 		    ST(&(x[WS(rs, 7)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)]));
394 		    ST(&(x[WS(rs, 1)]), VADD(TU, TX), ms, &(x[WS(rs, 1)]));
395 		    ST(&(x[WS(rs, 9)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)]));
396 	       }
397 	  }
398      }
     /* Invoked once after the loop; definition not visible here. */
399      VLEAVE();
400 }
401 
/*
 * Twiddle instruction table referenced by desc (non-FMA variant): one
 * VTW(0, k) entry for each k = 1..15, closed by a { TW_NEXT, VL, 0 } record.
 * The 15 entries line up with the 15 BYTW() calls in t2bv_16 (inputs 1..15).
 * NOTE(review): the expansion of VTW and the meaning of the tw_instr fields
 * are defined outside this file -- confirm there before changing entries.
 */
402 static const tw_instr twinstr[] = {
403      VTW(0, 1),
404      VTW(0, 2),
405      VTW(0, 3),
406      VTW(0, 4),
407      VTW(0, 5),
408      VTW(0, 6),
409      VTW(0, 7),
410      VTW(0, 8),
411      VTW(0, 9),
412      VTW(0, 10),
413      VTW(0, 11),
414      VTW(0, 12),
415      VTW(0, 13),
416      VTW(0, 14),
417      VTW(0, 15),
418      { TW_NEXT, VL, 0 }
419 };
420 
/*
 * Codelet descriptor passed to the registration call below (non-FMA
 * variant).  Initializers, in order: 16 (matches the generator's -n 16), the
 * name string "t2bv_16", the twinstr table, &GENUS, and { 83, 38, 4, 0 },
 * whose first three numbers match the additions / multiplications / fused
 * multiply-add counts quoted in the generator comment for this variant.
 * NOTE(review): the meaning of the three trailing zeros depends on the
 * ct_desc layout, which is declared outside this file.
 */
421 static const ct_desc desc = { 16, XSIMD_STRING("t2bv_16"), twinstr, &GENUS, { 83, 38, 4, 0 }, 0, 0, 0 };
422 
/*
 * Registration entry point (non-FMA variant): hands the t2bv_16 kernel and
 * its descriptor desc to X(kdft_dit_register) for planner p.  This is the
 * only non-static symbol defined in this branch of the #if.
 * NOTE(review): the text in front of `void` on the signature line looks like
 * listing residue -- confirm against the upstream source.
 */
XSIMD(codelet_t2bv_16)423 void XSIMD(codelet_t2bv_16) (planner *p) {
424      X(kdft_dit_register) (p, t2bv_16, &desc);
425 }
426 #endif
427