/*
 * Copyright (c) 2003, 2007-14 Matteo Frigo
 * Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Thu Dec 10 07:06:57 EST 2020 */

24 #include "rdft/codelet-rdft.h"
25 
26 #if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
27 
28 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -fma -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include rdft/simd/hc2cfv.h */
29 
30 /*
31  * This function contains 41 FP additions, 40 FP multiplications,
32  * (or, 23 additions, 22 multiplications, 18 fused multiply/add),
33  * 52 stack variables, 2 constants, and 16 memory accesses
34  */
35 #include "rdft/simd/hc2cfv.h"
36 
/*
 * hc2cfdftv_8 -- size-8 hc2c SIMD kernel, FMA-oriented variant
 * (generated by gen_hc2cdft_c with "-fma -n 8 -dit").
 *
 * Every loop iteration works in place on eight vectors: it loads
 * Rp[0], Rp[WS(rs, 1..3)] and Rm[0], Rm[WS(rs, 1..3)], combines them with
 * seven twiddle vectors read from W at TWVL * {0, 2, 4, 6, 8, 10, 12},
 * and stores four results back through Rp and four through Rm.
 *
 * Rp, Rm  data pointers, read and written; after each iteration Rp moves
 *         forward and Rm moves backward by VL * ms.
 * Ip, Im  stepped in lock-step with Rp/Rm but never dereferenced here.
 * W       twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 14) and
 *         advanced by TWVL * 14 per iteration.
 * rs      stride handed to WS() to index within one iteration's data.
 * mb, me  loop bounds: m runs over [mb, me) in steps of VL.
 * ms      per-iteration stride of the data pointers (scaled by VL).
 *
 * NOTE(review): the exact meaning of LD/ST/LDW and of the V* arithmetic
 * macros comes from rdft/simd/hc2cfv.h and is not visible in this file.
 */
static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
	  INT m;
	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
	       V T8, Tt, TG, TF, TD, TC, Tn, Tu, T3, Tc, Tl, Ts, T7, Ta, Th;
	       V Tq, T1, T2, Tb, Tj, Tk, Ti, Tr, T5, T6, T4, T9, Tf, Tg, Te;
	       V Tp, Td, Tm, Tw, Tx, To, Tv, TM, TN, TK, TL, TA, TB, Ty, Tz;
	       V TI, TJ, TE, TH;
	       /* Pair 0 (Rp[0], Rm[0]): T3 stays untwiddled, Tc is scaled by W[0]. */
	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
	       T3 = VFMACONJ(T2, T1);
	       Tb = LDW(&(W[0]));
	       Tc = VZMULIJ(Tb, VFNMSCONJ(T2, T1));
	       /* Pair 3: twiddles W[TWVL * 12] and W[TWVL * 10]. */
	       Tj = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
	       Tk = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
	       Ti = LDW(&(W[TWVL * 12]));
	       Tl = VZMULIJ(Ti, VFNMSCONJ(Tk, Tj));
	       Tr = LDW(&(W[TWVL * 10]));
	       Ts = VZMULJ(Tr, VFMACONJ(Tk, Tj));
	       /* Pair 2: twiddles W[TWVL * 6] and W[TWVL * 8]. */
	       T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
	       T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
	       T4 = LDW(&(W[TWVL * 6]));
	       T7 = VZMULJ(T4, VFMACONJ(T6, T5));
	       T9 = LDW(&(W[TWVL * 8]));
	       Ta = VZMULIJ(T9, VFNMSCONJ(T6, T5));
	       /* Pair 1: twiddles W[TWVL * 4] and W[TWVL * 2]. */
	       Tf = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
	       Tg = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
	       Te = LDW(&(W[TWVL * 4]));
	       Th = VZMULIJ(Te, VFNMSCONJ(Tg, Tf));
	       Tp = LDW(&(W[TWVL * 2]));
	       Tq = VZMULJ(Tp, VFMACONJ(Tg, Tf));
	       /* Sums and differences shared by the four output pairs below. */
	       T8 = VSUB(T3, T7);
	       Tt = VSUB(Tq, Ts);
	       TG = VADD(Th, Tl);
	       TF = VADD(Tc, Ta);
	       TD = VADD(Tq, Ts);
	       TC = VADD(T3, T7);
	       Td = VSUB(Ta, Tc);
	       Tm = VSUB(Th, Tl);
	       Tn = VADD(Td, Tm);
	       Tu = VSUB(Tm, Td);
	       /* Outputs Rp[WS(rs, 1)] and Rm[0]. */
	       To = VFMA(LDK(KP707106781), Tn, T8);
	       Tv = VFNMS(LDK(KP707106781), Tu, Tt);
	       Tw = VMUL(LDK(KP500000000), VFNMSI(Tv, To));
	       Tx = VCONJ(VMUL(LDK(KP500000000), VFMAI(Tv, To)));
	       ST(&(Rp[WS(rs, 1)]), Tw, ms, &(Rp[WS(rs, 1)]));
	       ST(&(Rm[0]), Tx, -ms, &(Rm[0]));
	       /* Outputs Rp[0] and Rm[WS(rs, 3)]. */
	       TK = VADD(TC, TD);
	       TL = VADD(TF, TG);
	       TM = VMUL(LDK(KP500000000), VSUB(TK, TL));
	       TN = VCONJ(VMUL(LDK(KP500000000), VADD(TL, TK)));
	       ST(&(Rp[0]), TM, ms, &(Rp[0]));
	       ST(&(Rm[WS(rs, 3)]), TN, -ms, &(Rm[WS(rs, 1)]));
	       /* Outputs Rm[WS(rs, 2)] and Rp[WS(rs, 3)]. */
	       Ty = VFNMS(LDK(KP707106781), Tn, T8);
	       Tz = VFMA(LDK(KP707106781), Tu, Tt);
	       TA = VCONJ(VMUL(LDK(KP500000000), VFNMSI(Tz, Ty)));
	       TB = VMUL(LDK(KP500000000), VFMAI(Tz, Ty));
	       ST(&(Rm[WS(rs, 2)]), TA, -ms, &(Rm[0]));
	       ST(&(Rp[WS(rs, 3)]), TB, ms, &(Rp[WS(rs, 1)]));
	       /* Outputs Rp[WS(rs, 2)] and Rm[WS(rs, 1)]. */
	       TE = VSUB(TC, TD);
	       TH = VSUB(TF, TG);
	       TI = VMUL(LDK(KP500000000), VFMAI(TH, TE));
	       TJ = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TH, TE)));
	       ST(&(Rp[WS(rs, 2)]), TI, ms, &(Rp[0]));
	       ST(&(Rm[WS(rs, 1)]), TJ, -ms, &(Rm[WS(rs, 1)]));
	  }
     }
     VLEAVE();
}

110 static const tw_instr twinstr[] = {
111      VTW(1, 1),
112      VTW(1, 2),
113      VTW(1, 3),
114      VTW(1, 4),
115      VTW(1, 5),
116      VTW(1, 6),
117      VTW(1, 7),
118      { TW_NEXT, VL, 0 }
119 };
120 
121 static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cfdftv_8"), twinstr, &GENUS, { 23, 22, 18, 0 } };
122 
XSIMD(codelet_hc2cfdftv_8)123 void XSIMD(codelet_hc2cfdftv_8) (planner *p) {
124      X(khc2c_register) (p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
125 }
126 #else
127 
128 /* Generated by: ../../../genfft/gen_hc2cdft_c.native -simd -compact -variables 4 -pipeline-latency 8 -trivial-stores -variables 32 -no-generate-bytw -n 8 -dit -name hc2cfdftv_8 -include rdft/simd/hc2cfv.h */
129 
130 /*
131  * This function contains 41 FP additions, 23 FP multiplications,
132  * (or, 41 additions, 23 multiplications, 0 fused multiply/add),
133  * 57 stack variables, 3 constants, and 16 memory accesses
134  */
135 #include "rdft/simd/hc2cfv.h"
136 
/*
 * hc2cfdftv_8 -- size-8 hc2c SIMD kernel, non-FMA variant (generated by
 * gen_hc2cdft_c with "-n 8 -dit" and without "-fma"); it uses separate
 * VADD/VSUB/VMUL/VBYI/VCONJ steps instead of fused macros.
 *
 * Every loop iteration works in place on eight vectors: it loads
 * Rp[0], Rp[WS(rs, 1..3)] and Rm[0], Rm[WS(rs, 1..3)], combines them with
 * seven twiddle vectors read from W at TWVL * {0, 2, 4, 6, 8, 10, 12},
 * and stores four results back through Rp and four through Rm.
 *
 * Rp, Rm  data pointers, read and written; after each iteration Rp moves
 *         forward and Rm moves backward by VL * ms.
 * Ip, Im  stepped in lock-step with Rp/Rm but never dereferenced here.
 * W       twiddle table; pre-offset by (mb - 1) * ((TWVL / VL) * 14) and
 *         advanced by TWVL * 14 per iteration.
 * rs      stride handed to WS() to index within one iteration's data.
 * mb, me  loop bounds: m runs over [mb, me) in steps of VL.
 * ms      per-iteration stride of the data pointers (scaled by VL).
 *
 * NOTE(review): the exact meaning of LD/ST/LDW and of the V* arithmetic
 * macros comes from rdft/simd/hc2cfv.h and is not visible in this file.
 */
static void hc2cfdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     DVK(KP353553390, +0.353553390593273762200422181052424519642417969);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
	  INT m;
	  for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) {
	       V Ta, TE, Tr, TF, Tl, TK, Tw, TG, T1, T6, T3, T8, T2, T7, T4;
	       V T9, T5, To, Tq, Tn, Tp, Tc, Th, Te, Tj, Td, Ti, Tf, Tk, Tb;
	       V Tg, Tt, Tv, Ts, Tu, Ty, Tz, Tm, Tx, TC, TD, TA, TB, TI, TO;
	       V TL, TP, TH, TJ, TM, TR, TN, TQ;
	       /* Pairs 0 and 2: Rp loads as-is, Rm loads passed through VCONJ. */
	       T1 = LD(&(Rp[0]), ms, &(Rp[0]));
	       T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0]));
	       T2 = LD(&(Rm[0]), -ms, &(Rm[0]));
	       T3 = VCONJ(T2);
	       T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0]));
	       T8 = VCONJ(T7);
	       /* Sums of pairs 0 and 2; only pair 2 is twiddled (W[TWVL * 6]). */
	       T4 = VADD(T1, T3);
	       T5 = LDW(&(W[TWVL * 6]));
	       T9 = VZMULJ(T5, VADD(T6, T8));
	       Ta = VADD(T4, T9);
	       TE = VMUL(LDK(KP500000000), VSUB(T4, T9));
	       /* Differences of pairs 0 and 2, twiddled by W[0] and W[TWVL * 8]. */
	       Tn = LDW(&(W[0]));
	       To = VZMULIJ(Tn, VSUB(T3, T1));
	       Tp = LDW(&(W[TWVL * 8]));
	       Tq = VZMULIJ(Tp, VSUB(T8, T6));
	       Tr = VADD(To, Tq);
	       TF = VSUB(To, Tq);
	       /* Pairs 1 and 3: same load pattern as above. */
	       Tc = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)]));
	       Th = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)]));
	       Td = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)]));
	       Te = VCONJ(Td);
	       Ti = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)]));
	       Tj = VCONJ(Ti);
	       /* Sums of pairs 1 and 3, twiddled by W[TWVL * 2] and W[TWVL * 10]. */
	       Tb = LDW(&(W[TWVL * 2]));
	       Tf = VZMULJ(Tb, VADD(Tc, Te));
	       Tg = LDW(&(W[TWVL * 10]));
	       Tk = VZMULJ(Tg, VADD(Th, Tj));
	       Tl = VADD(Tf, Tk);
	       TK = VSUB(Tf, Tk);
	       /* Differences of pairs 1 and 3, twiddled by W[TWVL * 4] and W[TWVL * 12]. */
	       Ts = LDW(&(W[TWVL * 4]));
	       Tt = VZMULIJ(Ts, VSUB(Te, Tc));
	       Tu = LDW(&(W[TWVL * 12]));
	       Tv = VZMULIJ(Tu, VSUB(Tj, Th));
	       Tw = VADD(Tt, Tv);
	       TG = VSUB(Tv, Tt);
	       /* Outputs Rm[WS(rs, 3)] and Rp[0]. */
	       Tm = VADD(Ta, Tl);
	       Tx = VADD(Tr, Tw);
	       Ty = VCONJ(VMUL(LDK(KP500000000), VSUB(Tm, Tx)));
	       Tz = VMUL(LDK(KP500000000), VADD(Tm, Tx));
	       ST(&(Rm[WS(rs, 3)]), Ty, -ms, &(Rm[WS(rs, 1)]));
	       ST(&(Rp[0]), Tz, ms, &(Rp[0]));
	       /* Outputs Rm[WS(rs, 1)] and Rp[WS(rs, 2)]. */
	       TA = VSUB(Ta, Tl);
	       TB = VBYI(VSUB(Tw, Tr));
	       TC = VCONJ(VMUL(LDK(KP500000000), VSUB(TA, TB)));
	       TD = VMUL(LDK(KP500000000), VADD(TA, TB));
	       ST(&(Rm[WS(rs, 1)]), TC, -ms, &(Rm[WS(rs, 1)]));
	       ST(&(Rp[WS(rs, 2)]), TD, ms, &(Rp[0]));
	       /* Shared terms for the remaining four outputs. */
	       TH = VMUL(LDK(KP353553390), VADD(TF, TG));
	       TI = VADD(TE, TH);
	       TO = VSUB(TE, TH);
	       TJ = VMUL(LDK(KP707106781), VSUB(TG, TF));
	       TL = VMUL(LDK(KP500000000), VBYI(VSUB(TJ, TK)));
	       TP = VMUL(LDK(KP500000000), VBYI(VADD(TK, TJ)));
	       /* Outputs Rm[0], Rp[WS(rs, 3)], Rp[WS(rs, 1)] and Rm[WS(rs, 2)]. */
	       TM = VCONJ(VSUB(TI, TL));
	       ST(&(Rm[0]), TM, -ms, &(Rm[0]));
	       TR = VADD(TO, TP);
	       ST(&(Rp[WS(rs, 3)]), TR, ms, &(Rp[WS(rs, 1)]));
	       TN = VADD(TI, TL);
	       ST(&(Rp[WS(rs, 1)]), TN, ms, &(Rp[WS(rs, 1)]));
	       TQ = VCONJ(VSUB(TO, TP));
	       ST(&(Rm[WS(rs, 2)]), TQ, -ms, &(Rm[0]));
	  }
     }
     VLEAVE();
}

215 static const tw_instr twinstr[] = {
216      VTW(1, 1),
217      VTW(1, 2),
218      VTW(1, 3),
219      VTW(1, 4),
220      VTW(1, 5),
221      VTW(1, 6),
222      VTW(1, 7),
223      { TW_NEXT, VL, 0 }
224 };
225 
226 static const hc2c_desc desc = { 8, XSIMD_STRING("hc2cfdftv_8"), twinstr, &GENUS, { 41, 23, 0, 0 } };
227 
XSIMD(codelet_hc2cfdftv_8)228 void XSIMD(codelet_hc2cfdftv_8) (planner *p) {
229      X(khc2c_register) (p, hc2cfdftv_8, &desc, HC2C_VIA_DFT);
230 }
231 #endif
232