/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
20 
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu Dec 10 07:04:08 EST 2020 */
23 
24 #include "dft/codelet-dft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
29 
/*
 * This function contains 80 FP additions, 56 FP multiplications,
 * (or, 24 additions, 0 multiplications, 56 fused multiply/add),
 * 41 stack variables, 10 constants, and 36 memory accesses
 */
35 #include "dft/scalar/n.h"
36 
/*
 * n1_9 (FMA variant): generated size-9 DFT codelet, as described by the
 * genfft command line recorded in this file (gen_notw -fma -n 9).
 *
 * Parameters:
 *   ri, ii   - input arrays (presumably the real and imaginary parts);
 *              elements are read at offsets WS(is, 0) .. WS(is, 8).
 *   ro, io   - output arrays; elements are written at offsets
 *              WS(os, 0) .. WS(os, 8).
 *   is, os   - input and output strides, applied through the WS() macro.
 *   v        - number of loop iterations; each iteration reads nine
 *              (ri, ii) pairs and writes nine (ro, io) pairs.
 *   ivs, ovs - amounts added to the input / output pointers after every
 *              iteration.
 *
 * R, E, stride, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied
 * by the included project headers.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a * b + c and
 * c - a * b respectively; confirm against the header that defines them.
 *
 * The statement order and the exact FMA/FNMS pairing are generator output and
 * determine the floating-point rounding; do not reorder them by hand.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     /* Numeric constants used by the arithmetic below. */
     DK(KP954188894, +0.954188894138671133499268364187245676532219158);
     DK(KP363970234, +0.363970234266202361351047882776834043890471784);
     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
     DK(KP492403876, +0.492403876506104029683371512294761506835321626);
     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
     DK(KP777861913, +0.777861913430206160028177977318626690410586096);
     DK(KP839099631, +0.839099631177280011763127298123181364687434283);
     DK(KP176326980, +0.176326980708464973471090386868618986121633062);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
	  INT i;
	  /* One pass per transform; all four pointers advance each iteration. */
	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
	       E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
	       E TZ, Tx, T10;
	       {
		    /* ri at inputs 0, 3, 6. */
		    E T1, T2, T3, T4;
		    T1 = ri[0];
		    T2 = ri[WS(is, 3)];
		    T3 = ri[WS(is, 6)];
		    T4 = T2 + T3;
		    T5 = T1 + T4;
		    TL = FNMS(KP500000000, T4, T1);
		    Tm = T3 - T2;
	       }
	       {
		    /* ii at inputs 0, 3, 6. */
		    E Th, Ti, Tj, Tk;
		    Th = ii[0];
		    Ti = ii[WS(is, 3)];
		    Tj = ii[WS(is, 6)];
		    Tk = Ti + Tj;
		    Tl = FNMS(KP500000000, Tk, Th);
		    T1f = Th + Tk;
		    TM = Ti - Tj;
	       }
	       {
		    /* ri and ii at inputs 1, 4, 7. */
		    E T6, Tz, T9, TE, TC, TH, TD, TG;
		    T6 = ri[WS(is, 1)];
		    Tz = ii[WS(is, 1)];
		    {
			 E T7, T8, TA, TB;
			 T7 = ri[WS(is, 4)];
			 T8 = ri[WS(is, 7)];
			 T9 = T7 + T8;
			 TE = T7 - T8;
			 TA = ii[WS(is, 4)];
			 TB = ii[WS(is, 7)];
			 TC = TA + TB;
			 TH = TB - TA;
		    }
		    Ta = T6 + T9;
		    T1c = Tz + TC;
		    TD = FNMS(KP500000000, TC, Tz);
		    TF = FNMS(KP866025403, TE, TD);
		    TW = FMA(KP866025403, TE, TD);
		    TG = FNMS(KP500000000, T9, T6);
		    TI = FNMS(KP866025403, TH, TG);
		    TX = FMA(KP866025403, TH, TG);
	       }
	       {
		    /* ri and ii at inputs 2, 5, 8. */
		    E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
		    Tb = ri[WS(is, 2)];
		    Tt = ii[WS(is, 2)];
		    {
			 E Tc, Td, Tp, Tq;
			 Tc = ri[WS(is, 5)];
			 Td = ri[WS(is, 8)];
			 Te = Tc + Td;
			 Tw = Td - Tc;
			 Tp = ii[WS(is, 5)];
			 Tq = ii[WS(is, 8)];
			 Tr = Tp - Tq;
			 Tu = Tp + Tq;
		    }
		    Tf = Tb + Te;
		    T1d = Tt + Tu;
		    To = FNMS(KP500000000, Te, Tb);
		    Ts = FMA(KP866025403, Tr, To);
		    TZ = FNMS(KP866025403, Tr, To);
		    Tv = FNMS(KP500000000, Tu, Tt);
		    Tx = FMA(KP866025403, Tw, Tv);
		    T10 = FNMS(KP866025403, Tw, Tv);
	       }
	       {
		    /* Outputs 0, 3, 6. */
		    E T1e, Tg, T1b, T1i, T1g, T1h;
		    T1e = T1c - T1d;
		    Tg = Ta + Tf;
		    T1b = FNMS(KP500000000, Tg, T5);
		    ro[0] = T5 + Tg;
		    ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
		    ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
		    T1i = Tf - Ta;
		    T1g = T1c + T1d;
		    T1h = FNMS(KP500000000, T1g, T1f);
		    io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
		    io[0] = T1f + T1g;
		    io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
	       }
	       {
		    /* Outputs 1, 4, 7. */
		    E Tn, TN, TK, TS, TQ, TU, TR, TT;
		    Tn = FMA(KP866025403, Tm, Tl);
		    TN = FMA(KP866025403, TM, TL);
		    {
			 E Ty, TJ, TO, TP;
			 Ty = FNMS(KP176326980, Tx, Ts);
			 TJ = FNMS(KP839099631, TI, TF);
			 TK = FNMS(KP777861913, TJ, Ty);
			 TS = FMA(KP777861913, TJ, Ty);
			 TO = FMA(KP176326980, Ts, Tx);
			 TP = FMA(KP839099631, TF, TI);
			 TQ = FMA(KP777861913, TP, TO);
			 TU = FNMS(KP777861913, TP, TO);
		    }
		    io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
		    ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
		    TR = FNMS(KP492403876, TQ, TN);
		    ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
		    ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
		    TT = FMA(KP492403876, TK, Tn);
		    io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
		    io[WS(os, 4)] = FMA(KP852868531, TU, TT);
	       }
	       {
		    /* Outputs 2, 5, 8. */
		    E TV, T17, T12, T1a, T16, T18, T13, T19;
		    TV = FNMS(KP866025403, TM, TL);
		    T17 = FNMS(KP866025403, Tm, Tl);
		    {
			 E TY, T11, T14, T15;
			 TY = FMA(KP176326980, TX, TW);
			 T11 = FNMS(KP363970234, T10, TZ);
			 T12 = FNMS(KP954188894, T11, TY);
			 T1a = FMA(KP954188894, T11, TY);
			 T14 = FNMS(KP176326980, TW, TX);
			 T15 = FMA(KP363970234, TZ, T10);
			 T16 = FNMS(KP954188894, T15, T14);
			 T18 = FMA(KP954188894, T15, T14);
		    }
		    ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
		    io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
		    T13 = FNMS(KP492403876, T12, TV);
		    ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
		    ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
		    T19 = FMA(KP492403876, T18, T17);
		    io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
		    io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
	       }
	  }
     }
}
188 
189 static const kdft_desc desc = { 9, "n1_9", { 24, 0, 56, 0 }, &GENUS, 0, 0, 0, 0 };
190 
X(codelet_n1_9)191 void X(codelet_n1_9) (planner *p) { X(kdft_register) (p, n1_9, &desc);
192 }
193 
194 #else
195 
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
197 
/*
 * This function contains 80 FP additions, 40 FP multiplications,
 * (or, 60 additions, 20 multiplications, 20 fused multiply/add),
 * 39 stack variables, 8 constants, and 36 memory accesses
 */
203 #include "dft/scalar/n.h"
204 
/*
 * n1_9 (non-FMA variant): generated size-9 DFT codelet, as described by the
 * genfft command line recorded in this file (gen_notw -n 9, without -fma).
 * It is compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA
 * is defined.
 *
 * Parameters:
 *   ri, ii   - input arrays (presumably the real and imaginary parts);
 *              elements are read at offsets WS(is, 0) .. WS(is, 8).
 *   ro, io   - output arrays; elements are written at offsets
 *              WS(os, 0) .. WS(os, 8).
 *   is, os   - input and output strides, applied through the WS() macro.
 *   v        - number of loop iterations; each iteration reads nine
 *              (ri, ii) pairs and writes nine (ro, io) pairs.
 *   ivs, ovs - amounts added to the input / output pointers after every
 *              iteration.
 *
 * R, E, stride, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are supplied
 * by the included project headers.
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a * b + c and
 * c - a * b respectively; confirm against the header that defines them.
 *
 * The statement order is generator output and determines the floating-point
 * rounding; do not reorder it by hand.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     /* Numeric constants used by the arithmetic below. */
     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
	  INT i;
	  /* One pass per transform; all four pointers advance each iteration. */
	  for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
	       E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
	       E T10, TG, TZ;
	       {
		    /* ri at inputs 0, 3, 6. */
		    E T1, T2, T3, T4;
		    T1 = ri[0];
		    T2 = ri[WS(is, 3)];
		    T3 = ri[WS(is, 6)];
		    T4 = T2 + T3;
		    T5 = T1 + T4;
		    TO = KP866025403 * (T3 - T2);
		    Th = FNMS(KP500000000, T4, T1);
	       }
	       {
		    /* ii at inputs 0, 3, 6. */
		    E TP, Ti, Tj, TQ;
		    TP = ii[0];
		    Ti = ii[WS(is, 3)];
		    Tj = ii[WS(is, 6)];
		    TQ = Ti + Tj;
		    Tk = KP866025403 * (Ti - Tj);
		    T1g = TP + TQ;
		    TR = FNMS(KP500000000, TQ, TP);
	       }
	       {
		    /* ri and ii at inputs 1, 4, 7. */
		    E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
		    T6 = ri[WS(is, 1)];
		    Ts = ii[WS(is, 1)];
		    {
			 E T7, T8, Tn, To;
			 T7 = ri[WS(is, 4)];
			 T8 = ri[WS(is, 7)];
			 T9 = T7 + T8;
			 Tr = KP866025403 * (T8 - T7);
			 Tn = ii[WS(is, 4)];
			 To = ii[WS(is, 7)];
			 Tp = KP866025403 * (Tn - To);
			 Tt = Tn + To;
		    }
		    Ta = T6 + T9;
		    T1c = Ts + Tt;
		    Tm = FNMS(KP500000000, T9, T6);
		    Tq = Tm + Tp;
		    TW = Tm - Tp;
		    Tu = FNMS(KP500000000, Tt, Ts);
		    Tv = Tr + Tu;
		    TX = Tu - Tr;
	       }
	       {
		    /* ri and ii at inputs 2, 5, 8. */
		    E Tb, TD, Te, TC, TA, TE, Tx, TF;
		    Tb = ri[WS(is, 2)];
		    TD = ii[WS(is, 2)];
		    {
			 E Tc, Td, Ty, Tz;
			 Tc = ri[WS(is, 5)];
			 Td = ri[WS(is, 8)];
			 Te = Tc + Td;
			 TC = KP866025403 * (Td - Tc);
			 Ty = ii[WS(is, 5)];
			 Tz = ii[WS(is, 8)];
			 TA = KP866025403 * (Ty - Tz);
			 TE = Ty + Tz;
		    }
		    Tf = Tb + Te;
		    T1d = TD + TE;
		    Tx = FNMS(KP500000000, Te, Tb);
		    TB = Tx + TA;
		    T10 = Tx - TA;
		    TF = FNMS(KP500000000, TE, TD);
		    TG = TC + TF;
		    TZ = TF - TC;
	       }
	       {
		    /* Outputs 0, 3, 6. */
		    E T1e, Tg, T1b, T1f, T1h, T1i;
		    T1e = KP866025403 * (T1c - T1d);
		    Tg = Ta + Tf;
		    T1b = FNMS(KP500000000, Tg, T5);
		    ro[0] = T5 + Tg;
		    ro[WS(os, 3)] = T1b + T1e;
		    ro[WS(os, 6)] = T1b - T1e;
		    T1f = KP866025403 * (Tf - Ta);
		    T1h = T1c + T1d;
		    T1i = FNMS(KP500000000, T1h, T1g);
		    io[WS(os, 3)] = T1f + T1i;
		    io[0] = T1g + T1h;
		    io[WS(os, 6)] = T1i - T1f;
	       }
	       {
		    /* Outputs 1, 4, 7. */
		    E Tl, TS, TI, TN, TM, TT, TJ, TU;
		    Tl = Th + Tk;
		    TS = TO + TR;
		    {
			 E Tw, TH, TK, TL;
			 Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
			 TH = FMA(KP173648177, TB, KP984807753 * TG);
			 TI = Tw + TH;
			 TN = KP866025403 * (TH - Tw);
			 TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
			 TL = FNMS(KP984807753, TB, KP173648177 * TG);
			 TM = KP866025403 * (TK - TL);
			 TT = TK + TL;
		    }
		    ro[WS(os, 1)] = Tl + TI;
		    io[WS(os, 1)] = TS + TT;
		    TJ = FNMS(KP500000000, TI, Tl);
		    ro[WS(os, 7)] = TJ - TM;
		    ro[WS(os, 4)] = TJ + TM;
		    TU = FNMS(KP500000000, TT, TS);
		    io[WS(os, 4)] = TN + TU;
		    io[WS(os, 7)] = TU - TN;
	       }
	       {
		    /* Outputs 2, 5, 8. */
		    E TV, T14, T12, T13, T17, T1a, T18, T19;
		    TV = Th - Tk;
		    T14 = TR - TO;
		    {
			 E TY, T11, T15, T16;
			 TY = FMA(KP173648177, TW, KP984807753 * TX);
			 T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
			 T12 = TY + T11;
			 T13 = KP866025403 * (T11 - TY);
			 T15 = FNMS(KP984807753, TW, KP173648177 * TX);
			 T16 = FMA(KP342020143, T10, KP939692620 * TZ);
			 T17 = T15 - T16;
			 T1a = KP866025403 * (T15 + T16);
		    }
		    ro[WS(os, 2)] = TV + T12;
		    io[WS(os, 2)] = T14 + T17;
		    T18 = FNMS(KP500000000, T17, T14);
		    io[WS(os, 5)] = T13 + T18;
		    io[WS(os, 8)] = T18 - T13;
		    T19 = FNMS(KP500000000, T12, TV);
		    ro[WS(os, 8)] = T19 - T1a;
		    ro[WS(os, 5)] = T19 + T1a;
	       }
	  }
     }
}
354 
355 static const kdft_desc desc = { 9, "n1_9", { 60, 20, 20, 0 }, &GENUS, 0, 0, 0, 0 };
356 
X(codelet_n1_9)357 void X(codelet_n1_9) (planner *p) { X(kdft_register) (p, n1_9, &desc);
358 }
359 
360 #endif
361