/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu Dec 10 07:04:45 EST 2020 */

#include "dft/codelet-dft.h"

#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)

/* Generated by: ../../../genfft/gen_notw_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n1bv_12 -include dft/simd/n1b.h */

/*
 * This function contains 48 FP additions, 20 FP multiplications,
 * (or, 30 additions, 2 multiplications, 18 fused multiply/add),
 * 27 stack variables, 2 constants, and 24 memory accesses
 */
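
/*
 * FMA variant: compiled when the architecture or ISA extension prefers
 * fused multiply-add.  Per the generator flags above (-sign 1 -n 12), it
 * computes a vectorized size-12 backward complex DFT, folding most of the
 * multiplications by KP866025403 and KP500000000 into VFMA/VFNMS/VFMAI/
 * VFNMSI operations.
 */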
#include "dft/simd/n1b.h"

static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ii;
          xo = io;
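          /*
           * Main loop: each iteration handles VL transforms in SIMD lanes,
           * so i counts down from v in steps of VL while the input and
           * output pointers advance by VL * ivs and VL * ovs.
           */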
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T5, Ta, TJ, TB, Tq, Tp, Tg, Tl, TG, Ty, Tt, Ts;
               {
                    V T1, T6, T4, Tz, T9, TA;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    {
                         V T2, T3, T7, T8;
                         T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T4 = VADD(T2, T3);
                         Tz = VSUB(T2, T3);
                         T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T9 = VADD(T7, T8);
                         TA = VSUB(T7, T8);
                    }
                    T5 = VADD(T1, T4);
                    Ta = VADD(T6, T9);
                    TJ = VSUB(Tz, TA);
                    TB = VADD(Tz, TA);
                    Tq = VFNMS(LDK(KP500000000), T9, T6);
                    Tp = VFNMS(LDK(KP500000000), T4, T1);
               }
               {
                    V Tc, Th, Tf, Tw, Tk, Tx;
                    Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    {
                         V Td, Te, Ti, Tj;
                         Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Tf = VADD(Td, Te);
                         Tw = VSUB(Td, Te);
                         Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Tk = VADD(Ti, Tj);
                         Tx = VSUB(Tj, Ti);
                    }
                    Tg = VADD(Tc, Tf);
                    Tl = VADD(Th, Tk);
                    TG = VADD(Tw, Tx);
                    Ty = VSUB(Tw, Tx);
                    Tt = VFNMS(LDK(KP500000000), Tk, Th);
                    Ts = VFNMS(LDK(KP500000000), Tf, Tc);
               }
               {
                    V Tb, Tm, Tn, To;
                    Tb = VSUB(T5, Ta);
                    Tm = VSUB(Tg, Tl);
                    ST(&(xo[WS(os, 3)]), VFNMSI(Tm, Tb), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 9)]), VFMAI(Tm, Tb), ovs, &(xo[WS(os, 1)]));
                    Tn = VADD(T5, Ta);
                    To = VADD(Tg, Tl);
                    ST(&(xo[WS(os, 6)]), VSUB(Tn, To), ovs, &(xo[0]));
                    ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
               }
               {
                    V TC, TE, Tv, TD, Tr, Tu;
                    TC = VMUL(LDK(KP866025403), VSUB(Ty, TB));
                    TE = VMUL(LDK(KP866025403), VADD(TB, Ty));
                    Tr = VADD(Tp, Tq);
                    Tu = VADD(Ts, Tt);
                    Tv = VSUB(Tr, Tu);
                    TD = VADD(Tr, Tu);
                    ST(&(xo[WS(os, 10)]), VFNMSI(TC, Tv), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 4)]), VFMAI(TE, TD), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 2)]), VFMAI(TC, Tv), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 8)]), VFNMSI(TE, TD), ovs, &(xo[0]));
               }
               {
                    V TH, TL, TK, TM, TF, TI;
                    TF = VSUB(Tp, Tq);
                    TH = VFNMS(LDK(KP866025403), TG, TF);
                    TL = VFMA(LDK(KP866025403), TG, TF);
                    TI = VSUB(Ts, Tt);
                    TK = VFMA(LDK(KP866025403), TJ, TI);
                    TM = VFNMS(LDK(KP866025403), TJ, TI);
                    ST(&(xo[WS(os, 1)]), VFMAI(TK, TH), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 7)]), VFNMSI(TM, TL), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 11)]), VFNMSI(TK, TH), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 5)]), VFMAI(TM, TL), ovs, &(xo[WS(os, 1)]));
               }
          }
     }
     VLEAVE();
}

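/*
 * Codelet descriptor: DFT length 12, registered under the name "n1bv_12".
 * The operation counts { 30, 2, 18, 0 } mirror the adds/muls/FMAs listed
 * in the header comment of this variant (the last field is presumably
 * "other" operations).
 */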
static const kdft_desc desc = { 12, XSIMD_STRING("n1bv_12"), { 30, 2, 18, 0 }, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1bv_12) (planner *p) { X(kdft_register) (p, n1bv_12, &desc); }

#else

/* Generated by: ../../../genfft/gen_notw_c.native -simd -compact -variables 4 -pipeline-latency 8 -sign 1 -n 12 -name n1bv_12 -include dft/simd/n1b.h */

/*
 * This function contains 48 FP additions, 8 FP multiplications,
 * (or, 44 additions, 4 multiplications, 4 fused multiply/add),
 * 27 stack variables, 2 constants, and 24 memory accesses
 */
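
/*
 * Non-FMA variant: the same size-12 backward complex DFT, but the
 * KP866025403 factors are applied with explicit VMUL operations and the
 * imaginary contributions are formed with VBYI instead of the fused
 * VFMAI/VFNMSI forms used above.
 */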
#include "dft/simd/n1b.h"

static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          const R *xi;
          R *xo;
          xi = ii;
          xo = io;
          for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(24, is), MAKE_VOLATILE_STRIDE(24, os)) {
               V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts;
               {
                    V T1, T6, T4, Tk, T9, Tl;
                    T1 = LD(&(xi[0]), ivs, &(xi[0]));
                    T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    {
                         V T2, T3, T7, T8;
                         T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                         T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                         T4 = VADD(T2, T3);
                         Tk = VSUB(T2, T3);
                         T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                         T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                         T9 = VADD(T7, T8);
                         Tl = VSUB(T7, T8);
                    }
                    T5 = VFNMS(LDK(KP500000000), T4, T1);
                    Ta = VFNMS(LDK(KP500000000), T9, T6);
                    TG = VADD(T6, T9);
                    TF = VADD(T1, T4);
                    Ty = VADD(Tk, Tl);
                    Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl));
               }
               {
                    V Tn, Tq, Te, To, Th, Tr;
                    Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    {
                         V Tc, Td, Tf, Tg;
                         Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
                         Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                         Te = VSUB(Tc, Td);
                         To = VADD(Tc, Td);
                         Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                         Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                         Th = VSUB(Tf, Tg);
                         Tr = VADD(Tf, Tg);
                    }
                    Ti = VMUL(LDK(KP866025403), VSUB(Te, Th));
                    Tp = VFNMS(LDK(KP500000000), To, Tn);
                    TJ = VADD(Tq, Tr);
                    TI = VADD(Tn, To);
                    Tx = VADD(Te, Th);
                    Ts = VFNMS(LDK(KP500000000), Tr, Tq);
               }
               {
                    V TH, TK, TL, TM;
                    TH = VSUB(TF, TG);
                    TK = VBYI(VSUB(TI, TJ));
                    ST(&(xo[WS(os, 3)]), VSUB(TH, TK), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 9)]), VADD(TH, TK), ovs, &(xo[WS(os, 1)]));
                    TL = VADD(TF, TG);
                    TM = VADD(TI, TJ);
                    ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
                    ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
               }
               {
                    V Tj, Tv, Tu, Tw, Tb, Tt;
                    Tb = VSUB(T5, Ta);
                    Tj = VSUB(Tb, Ti);
                    Tv = VADD(Tb, Ti);
                    Tt = VSUB(Tp, Ts);
                    Tu = VBYI(VADD(Tm, Tt));
                    Tw = VBYI(VSUB(Tt, Tm));
                    ST(&(xo[WS(os, 11)]), VSUB(Tj, Tu), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 5)]), VADD(Tv, Tw), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 1)]), VADD(Tj, Tu), ovs, &(xo[WS(os, 1)]));
                    ST(&(xo[WS(os, 7)]), VSUB(Tv, Tw), ovs, &(xo[WS(os, 1)]));
               }
               {
                    V Tz, TD, TC, TE, TA, TB;
                    Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty)));
                    TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx)));
                    TA = VADD(T5, Ta);
                    TB = VADD(Tp, Ts);
                    TC = VSUB(TA, TB);
                    TE = VADD(TA, TB);
                    ST(&(xo[WS(os, 2)]), VADD(Tz, TC), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 8)]), VSUB(TE, TD), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 10)]), VSUB(TC, Tz), ovs, &(xo[0]));
                    ST(&(xo[WS(os, 4)]), VADD(TD, TE), ovs, &(xo[0]));
               }
          }
     }
     VLEAVE();
}

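/*
 * Descriptor for the non-FMA variant; the counts { 44, 4, 4, 0 } match the
 * adds/muls/FMAs listed in this variant's header comment.
 */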
static const kdft_desc desc = { 12, XSIMD_STRING("n1bv_12"), { 44, 4, 4, 0 }, &GENUS, 0, 0, 0, 0 };

void XSIMD(codelet_n1bv_12) (planner *p) { X(kdft_register) (p, n1bv_12, &desc); }

#endif