1 // -*- mode: c++ -*-
2 #ifndef FFT_H
3 #define FFT_H
4 
5 #include <math.h>
6 #include <string.h>
7 #include "utils.h"
8 
9 #if defined(ENABLE_SSE) && !defined(APPLE_PPC)
10 #include "sse.h"
11 #endif
12 
13 namespace _sbsms_ {
14 
15 typedef void (*fftplan)(t_fft *x);
16 void fft128(t_fft *x);
17 void ifft128(t_fft *x);
18 void fft256(t_fft *x);
19 void ifft256(t_fft *x);
20 void fft384(t_fft *x);
21 void fft512(t_fft *x);
22 
23 template<int N>
24 class Factor {
25 public:
26   enum { value = (N%8==0)?8:(N%7==0)?7:(N%4==0)?4:(N%5==0)?5:(N%6==0)?6:(N%3==0)?3:(N%2==0)?2:1 };
27 };
28 
29 template<int N>
30 class LastFactor {
31 public:
32   enum { radix = Factor<N>::value,
33          stride = N / radix,
34          value = (stride==1?radix:LastFactor<stride>::value) };
35 };
36 
37 template <>
38 class LastFactor<1> {
39 public:
40   enum { value = 1 };
41 };
42 
43 template<int istride, int ostride, int radix, int dir>
44 class __fft {
45 public:
46   static inline void execute(t_fft *x, t_fft *y, int step);
47 };
48 
49 template <int N, int dir>
50 class FloatTwiddles {
51 public:
52   float c[N];
53   float s[N];
FloatTwiddles()54   FloatTwiddles() {
55     for(int k=0; k<N; k++) {
56       c[k] = cos(TWOPI * (float)k / (float)N);
57       s[k] = sin(TWOPI * (float)(-dir*k) / (float)N);
58     }
59   }
60 };
61 
62 template <int N, int dir>
63 class FloatTwiddle {
64  public:
65   static const float *c;
66   static const float *s;
67   static const FloatTwiddles<N,dir> t;
twiddle(int k,t_fft * x,float r,float i)68   static inline void twiddle(int k, t_fft *x, float r, float i) {
69     float cc = c[k];
70     float ss = s[k];
71     (*x)[0] = cc * r - ss * i;
72     (*x)[1] = ss * r + cc * i;
73   }
74 };
75 
76 template <int N, int dir>
77 const FloatTwiddles<N,dir> FloatTwiddle<N,dir>::t;
78 
79 template <int N, int dir>
80 const float* FloatTwiddle<N,dir>::c = t.c;
81 
82 template <int N, int dir>
83 const float* FloatTwiddle<N,dir>::s = t.s;
84 
85 template<int istride, int ostride, int dir>
86 class __fft<istride,ostride,2,dir> {
87 public:
88   enum { i1 = istride, o1 = ostride,
89          ir0 = 0, ii0 = 1, or0 = 0, oi0 = 1,
90          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1 };
execute(t_fft * _x,t_fft * _y,int step)91   static inline void execute(t_fft *_x, t_fft *_y, int step) {
92     float *x = (float*)_x;
93     float *y = (float*)_y;
94     float y0 = x[ir0] - x[ir1]; float y1 = x[ii0] - x[ii1];
95     y[or0] = x[ir0] + x[ir1]; y[oi0] = x[ii0] + x[ii1];
96     y[or1] = y0; y[oi1] = y1;
97   }
98 };
99 
100 template<int istride, int ostride, int dir>
101 class __fft<istride,ostride,3,dir> {
102 public:
103   enum { N = istride * 3,
104          i1_ = istride, o1 = ostride,
105          i2_ = istride + istride, o2 = o1 + ostride,
106          i1 = (dir==1?i1_:i2_),
107          i2 = (dir==1?i2_:i1_),
108          ir0 = 0, ii0 = 1, or0 = 0, oi0 = 1,
109          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
110          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1 };
execute(t_fft * _x,t_fft * _y,int step)111   static inline void execute(t_fft *_x, t_fft *_y, int step) {
112     float *x = (float*)_x;
113     float *y = (float*)_y;
114     float z00 = x[ir1] + x[ir2]; float z01 = x[ii1] + x[ii2];
115     float z10 = x[ir0] - 0.5f*z00; float z11 = x[ii0] - 0.5f*z01;
116     float z20 = 0.86602540378444f*(x[ir2] - x[ir1]); float z21 = 0.86602540378444f*(x[ii2] - x[ii1]);
117     y[or0] = x[ir0] + z00; y[oi0] = x[ii0] + z01;
118     if(step) {
119       FloatTwiddle<N,dir>::twiddle(step, _y+o1,z10 - z21,z11 + z20);
120       FloatTwiddle<N,dir>::twiddle(step+step, _y+o2,z10 + z21,z11 - z20);
121     } else {
122       y[or1] = z10 - z21; y[oi1] = z11 + z20;
123       y[or2] = z10 + z21; y[oi2] = z11 - z20;
124     }
125   }
126 };
127 
128 template<int istride, int ostride, int dir>
129   class __fft<istride,ostride,5,dir> {
130 public:
131   enum { N = istride*5,
132          i0 = 0, o0 = 0,
133          i1_ = i0 + istride, o1 = o0 + ostride,
134          i2_ = i1_ + istride, o2 = o1 + ostride,
135          i3_ = i2_ + istride, o3 = o2 + ostride,
136          i4_ = i3_ + istride, o4 = o3 + ostride,
137          i1 = (dir==1?i4_:i1_),
138          i2 = (dir==1?i3_:i2_),
139          i3 = (dir==1?i2_:i3_),
140          i4 = (dir==1?i1_:i4_),
141          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
142          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
143          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
144          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1,
145          ir4 = i4<<1, ii4 = ir4 + 1, or4 = o4<<1, oi4 = or4 + 1 };
execute(t_fft * _x,t_fft * _y,int step)146   static inline void execute(t_fft *_x, t_fft *_y, int step) {
147     float *x = (float*)_x;
148     float *y = (float*)_y;
149     float z00, z01, z10, z11, z20, z21, z30, z31, z40, z41, z50, z51, z60, z61, z70, z71, z80, z81, z90, z91, z100, z101;
150     z00 = x[ir1] + x[ir4]; z01 = x[ii1] + x[ii4];
151     z10 = x[ir2] + x[ir3]; z11 = x[ii2] + x[ii3];
152     z20 = x[ir1] - x[ir4]; z21 = x[ii1] - x[ii4];
153     z30 = x[ir2] - x[ir3]; z31 = x[ii2] - x[ii3];
154     z40 = z00 + z10; z41 = z01 + z11;
155     z50 = 0.55901699437495f*(z00 - z10); z51 = 0.55901699437495f*(z01 - z11);
156     z60 = x[ir0] - 0.25f*z40; z61 = x[ii0] - 0.25f*z41;
157     z70 = z50 + z60; z71 = z51 + z61;
158     z80 = z60 - z50; z81 = z61 - z51;
159     z90 = 0.95105651629515f*z20 + 0.58778525229247f*z30; z91 = 0.95105651629515f*z21 + 0.58778525229247f*z31;
160     z100 = 0.58778525229247f*z20 - 0.95105651629515f*z30; z101 = 0.58778525229247f*z21 - 0.95105651629515f*z31;
161     y[or0] = x[ir0] + z40; y[oi0] = x[ii0] + z41;
162     if(step) {
163       int step2 = step + step;
164       FloatTwiddle<N,dir>::twiddle(step,_y+o1,z70 - z91,z71 + z90);
165       FloatTwiddle<N,dir>::twiddle(step2,_y+o2,z80 - z101,z81 + z100);
166       FloatTwiddle<N,dir>::twiddle(step2+step,_y+o3,z80 + z101,z81 - z100);
167       FloatTwiddle<N,dir>::twiddle(step2+step2,_y+o4,z70 + z91,z71 - z90);
168     } else {
169       y[or1] = z70 - z91; y[oi1] = z71 + z90;
170       y[or2] = z80 - z101; y[oi2] = z81 + z100;
171       y[or3] = z80 + z101; y[oi3] = z81 - z100;
172       y[or4] = z70 + z91; y[oi4] = z71 - z90;
173     }
174   }
175 };
176 
177 template<int istride, int ostride, int dir>
178 class __fft<istride,ostride,6,dir> {
179 public:
180   enum { N = istride*6,
181          i0 = 0, o0 = 0,
182          i1_ = i0 + istride, o1 = o0 + ostride,
183          i2_ = i1_ + istride, o2 = o1 + ostride,
184          i3 = i2_ + istride, o3 = o2 + ostride,
185          i4_ = i3 + istride, o4 = o3 + ostride,
186          i5_ = i4_ + istride, o5 = o4 + ostride,
187          i1 = (dir==1?i1_:i5_),
188          i2 = (dir==1?i2_:i4_),
189          i4 = (dir==1?i4_:i2_),
190          i5 = (dir==1?i5_:i1_),
191          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
192          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
193          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
194          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1,
195          ir4 = i4<<1, ii4 = ir4 + 1, or4 = o4<<1, oi4 = or4 + 1,
196          ir5 = i5<<1, ii5 = ir5 + 1, or5 = o5<<1, oi5 = or5 + 1 };
execute(t_fft * _x,t_fft * _y,int step)197   static inline void execute(t_fft *_x, t_fft *_y, int step) {
198     float *x = (float*)_x;
199     float *y = (float*)_y;
200     float za00, za01, za10, za11, za20, za21;
201     float a00, a01, a10, a11, a20, a21;
202     float zb00, zb01, zb10, zb11, zb20, zb21;
203     float b00, b01, b10, b11, b20, b21;
204     za00 = x[ir2] + x[ir4]; za01 = x[ii2] + x[ii4];
205     za10 = x[ir0] - 0.5f*za00; za11 = x[ii0] - 0.5f*za01;
206     za20 = 0.86602540378444f*(x[ir4] - x[ir2]); za21 = 0.86602540378444f*(x[ii4] - x[ii2]);
207     a00 = x[ir0] + za00; a01 = x[ii0] + za01;
208     a10 = za10 - za21; a11 = za11 + za20;
209     a20 = za10 + za21; a21 = za11 - za20;
210     zb00 = x[ir1] + x[ir5]; zb01 = x[ii1] + x[ii5];
211     zb10 = x[ir3] - 0.5f*zb00; zb11 = x[ii3] - 0.5f*zb01;
212     zb20 = 0.86602540378444f*(x[ir1] - x[ir5]); zb21 = 0.86602540378444f*(x[ii1] - x[ii5]);
213     b00 = x[ir3] + zb00; b01 = x[ii3] + zb01;
214     b10 = zb10 - zb21; b11 = zb11 + zb20;
215     b20 = zb10 + zb21; b21 = zb11 - zb20;
216     y[or0] = a00 + b00; y[oi0] = a01 + b01;
217     if(step) {
218       FloatTwiddle<N,dir>::twiddle(step,_y+o1,a10 - b10,a11 - b11);
219       int step2 = step + step;
220       FloatTwiddle<N,dir>::twiddle(step2,_y+o2,a20 + b20,a21 + b21);
221       int step3 = step2 + step;
222       FloatTwiddle<N,dir>::twiddle(step3,_y+o3,a00 - b00,a01 - b01);
223       FloatTwiddle<N,dir>::twiddle(step2+step2,_y+o4,a10 + b10,a11 + b11);
224       FloatTwiddle<N,dir>::twiddle(step3+step2,_y+o5,a20 - b20,a21 - b21);
225     } else {
226       y[or1] = a10 - b10; y[oi1] = a11 - b11;
227       y[or2] = a20 + b20; y[oi2] = a21 + b21;
228       y[or3] = a00 - b00; y[oi3] = a01 - b01;
229       y[or4] = a10 + b10; y[oi4] = a11 + b11;
230       y[or5] = a20 - b20; y[oi5] = a21 - b21;
231     }
232   }
233 };
234 
235 template<int istride, int ostride, int dir>
236 class __fft<istride,ostride,7,dir> {
237 public:
238   enum { N = istride*7,
239          i0 = 0, o0 = 0,
240          i1 = i0 + istride, o1 = o0 + ostride,
241          i2 = i1 + istride, o2 = o1 + ostride,
242          i3 = i2 + istride, o3 = o2 + ostride,
243          i4 = i3 + istride, o4 = o3 + ostride,
244          i5 = i4 + istride, o5 = o4 + ostride,
245          i6 = i5 + istride, o6 = o5 + ostride,
246          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
247          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
248          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
249          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1,
250          ir4 = i4<<1, ii4 = ir4 + 1, or4 = o4<<1, oi4 = or4 + 1,
251          ir5 = i5<<1, ii5 = ir5 + 1, or5 = o5<<1, oi5 = or5 + 1,
252          ir6 = i6<<1, ii6 = ir6 + 1, or6 = o6<<1, oi6 = or6 + 1 };
execute(t_fft * _x,t_fft * _y,int step)253   static inline void execute(t_fft *_x, t_fft *_y, int step) {
254     float *x = (float*)_x;
255     float *y = (float*)_y;
256     float u00, u01, u10, u11, u20, u21, u30, u31, u40, u41, u50, u51, u60, u61, u70, u71;
257     float b00, b01, b10, b11, b20, b21, b30, b31, b40, b41, b50, b51, b60, b61, b70, b71, b80, b81;
258     float T00, T01, T10, T11, T20, T21, T30, T31, T40, T41, T50, T51, T60, T61, T70, T71, T80, T81, T90, T91, T100, T101, T110, T111, T120, T121;
259     u00 = x[ir1] + x[ir6]; u01 = x[ii1] + x[ii6];
260     u10 = x[ir1] - x[ir6]; u11 = x[ii1] - x[ii6];
261     u20 = x[ir2] + x[ir5]; u21 = x[ii2] + x[ii5];
262     u30 = x[ir2] - x[ir5]; u31 = x[ii2] - x[ii5];
263     u40 = x[ir4] + x[ir3]; u41 = x[ii4] + x[ii3];
264     u50 = x[ir4] - x[ir3]; u51 = x[ii4] - x[ii3];
265     u60 = u20 + u00; u61 = u21 + u01;
266     u70 = u50 + u30; u71 = u51 + u31;
267     b00 = x[ir0] + u60 + u40; b01 = x[ii0] + u61 + u41;
268     b10 = -1.16666666666667f*(u60 + u40); b11 = -1.16666666666667f*(u61 + u41);
269     b20 = 0.79015646852540f*(u00 - u40); b21 = 0.79015646852540f*(u01 - u41);
270     b30 = 0.05585426728965f*(u40 - u20); b31 = 0.05585426728965f*(u41 - u21);
271     b40 = 0.73430220123575f*(u20 - u00); b41 = 0.73430220123575f*(u21 - u01);
272     if(dir==1) {
273       b50 = 0.44095855184410f*(u70 + u10);
274       b51 = 0.44095855184410f*(u71 + u11);
275       b60 = 0.34087293062393f*(u10 - u50);
276       b61 = 0.34087293062393f*(u11 - u51);
277       b70 = -0.53396936033773f*(u50 - u30);
278       b71 = -0.53396936033773f*(u51 - u31);
279       b80 = 0.87484229096166f*(u30 - u10);
280       b81 = 0.87484229096166f*(u31 - u11);
281     } else {
282       b50 = -0.44095855184410f*(u70 + u10);
283       b51 = -0.44095855184410f*(u71 + u11);
284       b60 = 0.34087293062393f*(u50 - u10);
285       b61 = 0.34087293062393f*(u51 - u11);
286       b70 = -0.53396936033773f*(u30 - u50);
287       b71 = -0.53396936033773f*(u31 - u51);
288       b80 = 0.87484229096166f*(u10 - u30);
289       b81 = 0.87484229096166f*(u11 - u31);
290     }
291     T00 = b00 + b10; T01 = b01 + b11;
292     T10 = b20 + b30; T11 = b21 + b31;
293     T20 = b40 - b30; T21 = b41 - b31;
294     T30 = -b20 - b40; T31 = -b21 - b41;
295     T40 = b60 + b70; T41 = b61 + b71;
296     T50 = b80 - b70; T51 = b81 - b71;
297     T60 = -b80 - b60; T61 = -b81 - b61;
298     T70 = T00 + T10; T71 = T01 + T11;
299     T80 = T00 + T20; T81 = T01 + T21;
300     T90 = T00 + T30; T91 = T01 + T31;
301     T100 = T40 + b50; T101 = T41 + b51;
302     T110 = T50 + b50; T111 = T51 + b51;
303     T120 = T60 + b50; T121 = T61 + b51;
304     y[or0] = b00; y[oi0] = b01;
305     if(step) {
306       FloatTwiddle<N,dir>::twiddle(step,_y+o1,T70 + T101,T71 - T100);
307       int step2 = step + step;
308       FloatTwiddle<N,dir>::twiddle(step2,_y+o2,T90 + T121,T91 - T120);
309       int step3 = step2 + step;
310       FloatTwiddle<N,dir>::twiddle(step3,_y+o3,T80 - T111,T81 + T110);
311       FloatTwiddle<N,dir>::twiddle(step2+step2,_y+o4,T80 + T111,T81 - T110);
312       FloatTwiddle<N,dir>::twiddle(step3+step2,_y+o5,T90 - T121,T91 + T120);
313       FloatTwiddle<N,dir>::twiddle(step3+step3,_y+o6,T70 - T101,T71 + T100);
314     } else {
315        y[or1] = T70 + T101; y[oi1] = T71 - T100;
316        y[or2] = T90 + T121; y[oi2] = T91 - T120;
317        y[or3] = T80 - T111; y[oi3] = T81 + T110;
318        y[or4] = T80 + T111; y[oi4] = T81 - T110;
319        y[or5] = T90 - T121; y[oi5] = T91 + T120;
320        y[or6] = T70 - T101; y[oi6] = T71 + T100;
321     }
322   }
323 };
324 
325 #if defined(ENABLE_SSE) && !defined(APPLE_PPC)
326 
327 template <int N, int dir>
328 class SSETwiddles {
329 public:
330   simd_vector cs[N];
SSETwiddles()331   SSETwiddles() {
332     for(int k=0; k<N; k++) {
333       float c = cos(TWOPI * (float)k / (float)N);
334       float s = sin(TWOPI * (float)(-dir*k) / (float)N);
335       simd_vector v = {c,s,c,-s};
336       cs[k] = v;
337     }
338   }
339 };
340 
341 template <int N, int dir>
342 class SSETwiddle {
343 public:
344   static const SSETwiddles<N,dir> t;
345   static const simd_vector *cs;
twiddle(int k,const simd_vector & v)346   static inline simd_vector twiddle(int k, const simd_vector &v) {
347     return VMUL(cs[k],v);
348   }
twiddle(int k,t_fft * x,const simd_vector & v)349   static inline void twiddle(int k, t_fft *x, const simd_vector &v) {
350     simd_vector y = twiddle(k,v);
351     simd_vector z = VADD(SHUFFLE(y,y,0,1,0,1),SHUFFLE(y,y,3,2,3,2));
352     STOREL(z,x);
353   }
twiddle2(int k1,int k2,t_fft * x1,t_fft * x2,const simd_vector & y1,const simd_vector & y2)354   static inline void twiddle2(int k1, int k2, t_fft *x1, t_fft *x2, const simd_vector &y1, const simd_vector &y2) {
355     simd_vector v1 = VMUL(cs[k1],y1);
356     simd_vector v2 = VMUL(cs[k2],y2);
357     simd_vector z = VADD(SHUFFLE(v1,v2,0,1,0,1),SHUFFLE(v1,v2,3,2,3,2));
358     STOREL(z,x1);
359     STOREH(z,x2);
360   }
361 };
362 
363 template <int N, int dir>
364   const SSETwiddles<N,dir> SSETwiddle<N,dir>::t;
365 
366 template <int N, int dir>
367   const simd_vector* SSETwiddle<N,dir>::cs = t.cs;
368 
369 template<int istride, int ostride, int dir>
370 class __fft<istride,ostride,4,dir> {
371 public:
372   enum { N = istride * 4,
373          ir0 = 0, or0 = 0,
374          _ir1 = ir0 + istride, or1 = or0 + ostride,
375          ir2 = _ir1 + istride, or2 = or1 + ostride,
376          _ir3 = ir2 + istride, or3 = or2 + ostride,
377          ir1 = (dir==1?_ir3:_ir1),
378          ir3 = (dir==1?_ir1:_ir3) };
execute(t_fft * x,t_fft * y,int step)379   static inline void execute(t_fft *x, t_fft *y, int step) {
380     simd_vector v1 = {};
381     simd_vector v2 = {};
382     simd_vector v3;
383     simd_vector v4;
384     simd_vector v5;
385     v1 = LOADH(LOADL(v1,x+ir0),x+ir1);
386     v2 = LOADH(LOADL(v2,x+ir2),x+ir3);
387     v3 = VADD(v1,v2);
388     v4 = VSUB(v1,v2);
389     v1 = SHUFFLE(v4,v3,0,1,0,1);
390     v2 = SHUFFLE(v4,v3,3,2,2,3);
391     v3 = VADD(v1,v2);
392     STOREH(v3,y+or0);
393     v4 = VSUB(v1,v2);
394     v1 = SHUFFLE(v4,v3,0,0,1,1);
395     v2 = SHUFFLE(v4,v4,2,2,3,3);
396     v5 = SHUFFLE(v3,v4,0,0,1,1);
397     if(step) {
398       v1 = SSETwiddle<N,dir>::twiddle(step,v1);
399       int step2 = step + step;
400       v2 = SSETwiddle<N,dir>::twiddle(step2,v2);
401       v3 = SSETwiddle<N,dir>::twiddle(step2+step,v5);
402       v3 = VADD(v3,SHUFFLE(v3,v3,3,2,1,0));
403       STOREL(v3,y+or3);
404       v5 = VADD(SHUFFLE(v1,v2,0,1,0,1),SHUFFLE(v1,v2,3,2,3,2));
405       STOREL(v5,y+or1);
406       STOREH(v5,y+or2);
407     } else {
408       v5 = SHUFFLE(v5,v5,0,2,0,2);
409       STOREL(v5,y+or3);
410       v3 = SHUFFLE(v1,v2,0,2,0,2);
411       STOREL(v3,y+or1);
412       STOREH(v3,y+or2);
413     }
414   }
415 };
416 
417 template<int istride, int ostride>
418 class __fft<istride,ostride,8,1> {
419 public:
420   enum { N = istride*8,
421          i0 = 0, o0 = 0,
422          i1 = i0 + istride, o1 = o0 + ostride,
423          i2 = i1 + istride, o2 = o1 + ostride,
424          i3 = i2 + istride, o3 = o2 + ostride,
425          i4 = i3 + istride, o4 = o3 + ostride,
426          i5 = i4 + istride, o5 = o4 + ostride,
427          i6 = i5 + istride, o6 = o5 + ostride,
428          i7 = i6 + istride, o7 = o6 + ostride };
execute(t_fft * x,t_fft * y,int step)429   static inline void execute(t_fft *x, t_fft *y, int step) {
430     simd_vector v1 = {}, v2 = {};
431     simd_vector v3,v4,v5,v6,v7,v8;
432     simd_vector x02,x37,x15,x17,x53,x46;
433     simd_vector w1 = {}, w2 = {};
434     simd_vector w3, w4;
435     w1 = LOADH(LOADL(w1,x+i0),x+i6);
436     w2 = LOADH(LOADL(w2,x+i4),x+i2);
437     w3 = VADD(w1,w2);
438     w4 = VSUB(w1,w2);
439     v1 = LOADH(LOADL(v1,x+i1),x+i7);
440     v2 = LOADH(LOADL(v2,x+i5),x+i3);
441     v3 = VADD(v1,v2);
442     v4 = VSUB(v1,v2);
443     v5 = SHUFFLE(v3,v4,2,1,0,1); //(z10,z01,z20,z21)
444     v6 = SHUFFLE(v3,v4,0,3,3,2); //(z00,z11,z31,z30)
445     v7 = VADD(v5,v6); // (x1,y30,y11)
446     v8 = VSUB(v5,v6); // (x51,x50,y10,y31)
447     x15 = SHUFFLE(v7,v8,0,1,1,0); //(x1,x5)
448     v2 = SHUFFLE(v8,v8,3,3,3,2); //(-,-,y31,y10)
449     v3 = VADD(v7,v2);
450     v4 = VSUB(v7,v2);
451     v5 = SHUFFLE(v3,v4,2,3,2,3);
452     x37 = VMUL(SET(0.7071067811865475f),v5); //(-x71,x30,-x70,x31)
453     v5 = SHUFFLE(w3,w4,0,1,0,1); //(z00,z01,z20,z21)
454     v6 = SHUFFLE(w3,w4,2,3,3,2); //(z10,z11,z31,z30)
455     v7 = VADD(v5,v6); //(x0,y30,y11)
456     v8 = VSUB(v5,v6); //(x4,y10,y31)
457     v1 = SHUFFLE(v7,v8,0,1,0,1); //(x0,x4)
458     v2 = SHUFFLE(v8,v7,3,2,2,3); //(x61,x20,x60,x21)
459     x02 = VADD(v1,x15);
460     x46 = VSUB(v1,x15);
461     x17 = VADD(v2,x37);
462     x53 = VSUB(v2,x37);
463     STOREL(x02,y+o0);
464     if(step) {
465       simd_vector y1;
466       simd_vector y2;
467       int step2 = step + step;
468       y1 = SHUFFLE(x02,x02,2,2,3,3);
469       SSETwiddle<N,1>::twiddle(step2,y+o2,y1);
470       int step3 = step2 + step;
471       y1 = SHUFFLE(x53,x53,1,1,3,3);
472       y2 = SHUFFLE(x53,x53,2,2,0,0);
473       SSETwiddle<N,1>::twiddle2(step3+step2,step3,y+o5,y+o3,y1,y2);
474       int step4 = step2 + step2;
475       y1 = SHUFFLE(x46,x46,0,0,1,1);
476       y2 = SHUFFLE(x46,x46,2,2,3,3);
477       SSETwiddle<N,1>::twiddle2(step4,step3+step3,y+o4,y+o6,y1,y2);
478       y1 = SHUFFLE(x17,x17,1,1,3,3);
479       y2 = SHUFFLE(x17,x17,2,2,0,0);
480       SSETwiddle<N,1>::twiddle2(step,step4+step3,y+o1,y+o7,y1,y2);
481     } else {
482       simd_vector y1;
483       y1 = SHUFFLE(x02,x02,2,3,3,3);
484       STOREL(y1,y+o2);
485       y1 = SHUFFLE(x53,x53,1,3,2,0);
486       STOREL(y1,y+o5);
487       STOREH(y1,y+o3);
488       STOREL(x46,y+o4);
489       STOREH(x46,y+o6);
490       y1 = SHUFFLE(x17,x17,1,3,2,0);
491       STOREL(y1,y+o1);
492       STOREH(y1,y+o7);
493     }
494   }
495 };
496 
497 template<int istride, int ostride>
498 class __fft<istride,ostride,8,-1> {
499 public:
500   enum { N = istride*8,
501          i0 = 0, o0 = 0,
502          i1 = i0 + istride, o1 = o0 + ostride,
503          i2 = i1 + istride, o2 = o1 + ostride,
504          i3 = i2 + istride, o3 = o2 + ostride,
505          i4 = i3 + istride, o4 = o3 + ostride,
506          i5 = i4 + istride, o5 = o4 + ostride,
507          i6 = i5 + istride, o6 = o5 + ostride,
508          i7 = i6 + istride, o7 = o6 + ostride };
execute(t_fft * x,t_fft * y,int step)509   static inline void execute(t_fft *x, t_fft *y, int step) {
510     simd_vector v1 = {}, v2 = {};
511     simd_vector v3,v4,v5,v6,v7,v8;
512     simd_vector x02,x37,x15,x17,x53,x46;
513     simd_vector w1 = {}, w2 = {};
514     simd_vector w3, w4;
515     w1 = LOADH(LOADL(w1,x+i0),x+i2);
516     w2 = LOADH(LOADL(w2,x+i4),x+i6);
517     w3 = VADD(w1,w2);
518     w4 = VSUB(w1,w2);
519     v1 = LOADH(LOADL(v1,x+i1),x+i3);
520     v2 = LOADH(LOADL(v2,x+i5),x+i7);
521     v3 = VADD(v1,v2);
522     v4 = VSUB(v1,v2);
523     v5 = SHUFFLE(v3,v4,0,3,0,1);
524     v6 = SHUFFLE(v3,v4,2,1,3,2);
525     v7 = VADD(v5,v6);
526     v8 = VSUB(v5,v6);
527     x15 = SHUFFLE(v7,v8,0,1,1,0);
528     v2 = SHUFFLE(v8,v8,3,3,3,2);
529     v3 = VADD(v7,v2);
530     v4 = VSUB(v2,v7);
531     v5 = SHUFFLE(v4,v3,2,3,2,3);
532     x37 = VMUL(SET(0.7071067811865475f),v5);
533     v5 = SHUFFLE(w3,w4,0,1,0,1);
534     v6 = SHUFFLE(w3,w4,2,3,3,2);
535     v7 = VADD(v5,v6);
536     v8 = VSUB(v5,v6);
537     v1 = SHUFFLE(v7,v8,0,1,0,1);
538     v2 = SHUFFLE(v8,v7,3,2,2,3);
539     x02 = VADD(v1,x15);
540     x46 = VSUB(v1,x15);
541     x17 = VADD(v2,x37);
542     x53 = VSUB(v2,x37);
543     STOREL(x02,y+o0);
544     if(step) {
545       simd_vector y1;
546       simd_vector y2;
547       int step2 = step + step;
548       y1 = SHUFFLE(x02,x02,2,2,3,3);
549       SSETwiddle<N,-1>::twiddle(step2,y+o2,y1);
550       int step3 = step2 + step;
551       y1 = SHUFFLE(x53,x53,1,1,3,3);
552       y2 = SHUFFLE(x53,x53,2,2,0,0);
553       SSETwiddle<N,-1>::twiddle2(step3+step2,step3,y+o5,y+o3,y1,y2);
554       int step4 = step2 + step2;
555       y1 = SHUFFLE(x46,x46,0,0,1,1);
556       y2 = SHUFFLE(x46,x46,2,2,3,3);
557       SSETwiddle<N,-1>::twiddle2(step4,step3+step3,y+o4,y+o6,y1,y2);
558       y1 = SHUFFLE(x17,x17,1,1,3,3);
559       y2 = SHUFFLE(x17,x17,2,2,0,0);
560       SSETwiddle<N,-1>::twiddle2(step,step4+step3,y+o1,y+o7,y1,y2);
561     } else {
562       simd_vector y1;
563       y1 = SHUFFLE(x02,x02,2,3,3,3);
564       STOREL(y1,y+o2);
565       y1 = SHUFFLE(x53,x53,1,3,2,0);
566       STOREL(y1,y+o5);
567       STOREH(y1,y+o3);
568       STOREL(x46,y+o4);
569       STOREH(x46,y+o6);
570       y1 = SHUFFLE(x17,x17,1,3,2,0);
571       STOREL(y1,y+o1);
572       STOREH(y1,y+o7);
573     }
574   }
575 };
576 
577 #else // !ENABLE_SSE
578 
579 template<int istride, int ostride, int dir>
580   class __fft<istride,ostride,4,dir> {
581  public:
582   enum { N = istride*4,
583          i0 = 0, o0 = 0,
584          i1_ = i0 + istride, o1 = o0 + ostride,
585          i2 = i1_ + istride, o2 = o1 + ostride,
586          i3_ = i2 + istride, o3 = o2 + ostride,
587          i1 = (dir==1?i1_:i3_),
588          i3 = (dir==1?i3_:i1_),
589          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
590          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
591          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
592          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1 };
execute(t_fft * _x,t_fft * _y,int step)593   static inline void execute(t_fft *_x, t_fft *_y, int step) {
594     float *x = (float*)_x;
595     float *y = (float*)_y;
596     float z00, z01, z10, z11, z20, z21, z30, z31;
597     z20 = x[ir0] - x[ir2]; z21 = x[ii0] - x[ii2];
598     z00 = x[ir0] + x[ir2]; z01 = x[ii0] + x[ii2];
599     z10 = x[ir1] + x[ir3]; z11 = x[ii1] + x[ii3];
600     y[or0] = z00 + z10; y[oi0] = z01 + z11;
601     z30 = x[ir3] - x[ir1]; z31 = x[ii3] - x[ii1];
602     if(step) {
603       int step2 = step + step;
604       FloatTwiddle<N,dir>::twiddle(step,_y+o1,z20 - z31,z21 + z30);
605       FloatTwiddle<N,dir>::twiddle(step2,_y+o2,z00 - z10,z01 - z11);
606       FloatTwiddle<N,dir>::twiddle(step2+step,_y+o3,z20 + z31,z21 - z30);
607     } else {
608       y[or1] = z20 - z31; y[oi1] = z21 + z30;
609       y[or2] = z00 - z10; y[oi2] = z01 - z11;
610       y[or3] = z20 + z31; y[oi3] = z21 - z30;
611     }
612   }
613 };
614 
615 template<int istride, int ostride>
616 class __fft<istride,ostride,8,1> {
617 public:
618   enum { N = istride*8,
619          i0 = 0, o0 = 0,
620          i1 = i0 + istride, o1 = o0 + ostride,
621          i2 = i1 + istride, o2 = o1 + ostride,
622          i3 = i2 + istride, o3 = o2 + ostride,
623          i4 = i3 + istride, o4 = o3 + ostride,
624          i5 = i4 + istride, o5 = o4 + ostride,
625          i6 = i5 + istride, o6 = o5 + ostride,
626          i7 = i6 + istride, o7 = o6 + ostride,
627          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
628          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
629          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
630          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1,
631          ir4 = i4<<1, ii4 = ir4 + 1, or4 = o4<<1, oi4 = or4 + 1,
632          ir5 = i5<<1, ii5 = ir5 + 1, or5 = o5<<1, oi5 = or5 + 1,
633          ir6 = i6<<1, ii6 = ir6 + 1, or6 = o6<<1, oi6 = or6 + 1,
634          ir7 = i7<<1, ii7 = ir7 + 1, or7 = o7<<1, oi7 = or7 + 1 };
execute(t_fft * _x,t_fft * _y,int step)635   static inline void execute(t_fft *_x, t_fft *_y, int step) {
636     float *x = (float*)_x;
637     float *y = (float*)_y;
638 		float z00, z01, z10, z11, z20, z21, z30, z31, y10, y11, y30, y31;
639     float yr0, yi0, yr1, yi1, yr2, yi2, yr3, yi3, yr4, yi4, yr5, yi5, yr6, yi6, yr7, yi7;
640 		z00 = x[ir1] + x[ir5]; z01 = x[ii1] + x[ii5];
641 		z10 = x[ir7] + x[ir3]; z11 = x[ii7] + x[ii3];
642 		z20 = x[ir1] - x[ir5]; z21 = x[ii1] - x[ii5];
643     z30 = x[ir7] - x[ir3]; z31 = x[ii7] - x[ii3];
644 		y10 = z20 - z31; y11 = z21 + z30;
645 		y30 = z20 + z31; y31 = z21 - z30;
646 		yr1 = z00 + z10; yi1 = z01 + z11;
647 		yr5 = z01 - z11; yi5 = z10 - z00;
648 		yr3 = 0.7071067811865475f * (y10 + y11); yi3 = 0.7071067811865475f * (y11 - y10);
649 		yr7 = 0.7071067811865475f * (y31 - y30); yi7 = -0.7071067811865475f * (y30 + y31);
650 		z20 = x[ir0] - x[ir4]; z21 = x[ii0] - x[ii4];
651     z00 = x[ir0] + x[ir4]; z01 = x[ii0] + x[ii4];
652 		z10 = x[ir6] + x[ir2]; z11 = x[ii6] + x[ii2];
653 		z30 = x[ir6] - x[ir2]; z31 = x[ii6] - x[ii2];
654 		yr0 = z00 + z10; yi0 = z01 + z11;
655 		yr4 = z00 - z10; yi4 = z01 - z11;
656 		yr2 = z20 - z31; yi2 = z21 + z30;
657 		yr6 = z20 + z31; yi6 = z21 - z30;
658 		y[or0] = yr0 + yr1; y[oi0] = yi0 + yi1;
659     if(step) {
660       FloatTwiddle<N,1>::twiddle(step,_y+o1,yr2 + yr3, yi2 + yi3);
661       int step2 = step + step;
662       FloatTwiddle<N,1>::twiddle(step2,_y+o2,yr4 + yr5, yi4 + yi5);
663       int step3 = step2 + step;
664       FloatTwiddle<N,1>::twiddle(step3,_y+o3,yr6 + yr7, yi6 + yi7);
665       FloatTwiddle<N,1>::twiddle(step2+step2,_y+o4,yr0 - yr1, yi0 - yi1);
666       FloatTwiddle<N,1>::twiddle(step3+step2,_y+o5,yr2 - yr3, yi2 - yi3);
667       FloatTwiddle<N,1>::twiddle(step3+step3,_y+o6,yr4 - yr5, yi4 - yi5);
668       FloatTwiddle<N,1>::twiddle(step3+step2+step2,_y+o7,yr6 - yr7, yi6 - yi7);
669     } else {
670       y[or1] = yr2 + yr3; y[oi1] = yi2 + yi3;
671       y[or2] = yr4 + yr5; y[oi2] = yi4 + yi5;
672       y[or3] = yr6 + yr7; y[oi3] = yi6 + yi7;
673       y[or4] = yr0 - yr1; y[oi4] = yi0 - yi1;
674       y[or5] = yr2 - yr3; y[oi5] = yi2 - yi3;
675       y[or6] = yr4 - yr5; y[oi6] = yi4 - yi5;
676       y[or7] = yr6 - yr7; y[oi7] = yi6 - yi7;
677     }
678   }
679 };
680 
681 template<int istride, int ostride>
682 class __fft<istride,ostride,8,-1> {
683 public:
684   enum { N = istride*8,
685          i0 = 0, o0 = 0,
686          i1 = i0 + istride, o1 = o0 + ostride,
687          i2 = i1 + istride, o2 = o1 + ostride,
688          i3 = i2 + istride, o3 = o2 + ostride,
689          i4 = i3 + istride, o4 = o3 + ostride,
690          i5 = i4 + istride, o5 = o4 + ostride,
691          i6 = i5 + istride, o6 = o5 + ostride,
692          i7 = i6 + istride, o7 = o6 + ostride,
693          ir0 = i0<<1, ii0 = ir0 + 1, or0 = o0<<1, oi0 = or0 + 1,
694          ir1 = i1<<1, ii1 = ir1 + 1, or1 = o1<<1, oi1 = or1 + 1,
695          ir2 = i2<<1, ii2 = ir2 + 1, or2 = o2<<1, oi2 = or2 + 1,
696          ir3 = i3<<1, ii3 = ir3 + 1, or3 = o3<<1, oi3 = or3 + 1,
697          ir4 = i4<<1, ii4 = ir4 + 1, or4 = o4<<1, oi4 = or4 + 1,
698          ir5 = i5<<1, ii5 = ir5 + 1, or5 = o5<<1, oi5 = or5 + 1,
699          ir6 = i6<<1, ii6 = ir6 + 1, or6 = o6<<1, oi6 = or6 + 1,
700          ir7 = i7<<1, ii7 = ir7 + 1, or7 = o7<<1, oi7 = or7 + 1 };
execute(t_fft * _x,t_fft * _y,int step)701   static inline void execute(t_fft *_x, t_fft *_y, int step) {
702     float *x = (float*)_x;
703     float *y = (float*)_y;
704 		float z00, z01, z10, z11, z20, z21, z30, z31, y10, y11, y30, y31;
705     float yr0, yi0, yr1, yi1, yr2, yi2, yr3, yi3, yr4, yi4, yr5, yi5, yr6, yi6, yr7, yi7;
706 		z00 = x[ir1] + x[ir5]; z01 = x[ii1] + x[ii5];
707 		z10 = x[ir3] + x[ir7]; z11 = x[ii3] + x[ii7];
708 		z20 = x[ir1] - x[ir5]; z21 = x[ii1] - x[ii5];
709 		z30 = x[ir3] - x[ir7]; z31 = x[ii3] - x[ii7];
710 		y10 = z20 - z31; y11 = z21 + z30;
711 		y30 = z20 + z31; y31 = z21 - z30;
712 		yr1 = z00 + z10; yi1 = z01 + z11;
713     yr5 = z11 - z01; yi5 = z00 - z10;
714 		yr3 = 0.7071067811865475f * (y10 - y11); yi3 = 0.7071067811865475f * (y10 + y11);
715 		yr7 = -0.7071067811865475f * (y30 + y31); yi7 = 0.7071067811865475f * (y30 - y31);
716 		z00 = x[ir0] + x[ir4]; z01 = x[ii0] + x[ii4];
717 		z10 = x[ir2] + x[ir6]; z11 = x[ii2] + x[ii6];
718 		z20 = x[ir0] - x[ir4]; z21 = x[ii0] - x[ii4];
719 		z30 = x[ir2] - x[ir6]; z31 = x[ii2] - x[ii6];
720 		yr0 = z00 + z10; yi0 = z01 + z11;
721 		yr4 = z00 - z10; yi4 = z01 - z11;
722 		yr2 = z20 - z31; yi2 = z21 + z30;
723 		yr6 = z20 + z31; yi6 = z21 - z30;
724 		y[or0] = yr0 + yr1;
725 		y[oi0] = yi0 + yi1;
726     if(step) {
727       FloatTwiddle<N,-1>::twiddle(step,_y+o1,yr2 + yr3, yi2 + yi3);
728       int step2 = step + step;
729       FloatTwiddle<N,-1>::twiddle(step2,_y+o2,yr4 + yr5, yi4 + yi5);
730       int step3 = step2 + step;
731       FloatTwiddle<N,-1>::twiddle(step3,_y+o3,yr6 + yr7, yi6 + yi7);
732       FloatTwiddle<N,-1>::twiddle(step2+step2,_y+o4,yr0 - yr1, yi0 - yi1);
733       FloatTwiddle<N,-1>::twiddle(step3+step2,_y+o5,yr2 - yr3, yi2 - yi3);
734       FloatTwiddle<N,-1>::twiddle(step3+step3,_y+o6,yr4 - yr5, yi4 - yi5);
735       FloatTwiddle<N,-1>::twiddle(step3+step2+step2,_y+o7,yr6 - yr7, yi6 - yi7);
736     } else {
737       y[or1] = yr2 + yr3; y[oi1] = yi2 + yi3;
738       y[or2] = yr4 + yr5; y[oi2] = yi4 + yi5;
739       y[or3] = yr6 + yr7; y[oi3] = yi6 + yi7;
740       y[or4] = yr0 - yr1; y[oi4] = yi0 - yi1;
741       y[or5] = yr2 - yr3; y[oi5] = yi2 - yi3;
742       y[or6] = yr4 - yr5; y[oi6] = yi4 - yi5;
743       y[or7] = yr6 - yr7; y[oi7] = yi6 - yi7;
744     }
745   }
746 };
747 
748 #endif // ENABLE_SSE
749 
750 template<int k, int stride, int radix, int dir>
751 class _fft {
752 public:
753   enum { N = radix * stride,
754          _radix = Factor<stride>::value,
755          _stride = stride / _radix };
execute(t_fft * x)756   static inline void execute(t_fft *x) {
757     for(int step=0; step<stride; step++) {
758       __fft<stride,stride,radix,dir>::execute(x+step,x+step,step);
759     }
760     _fft<N,_stride,_radix,dir>::loop(x+N);
761   }
loop(t_fft * x)762   static inline void loop(t_fft *x) {
763     _fft<k-N,stride,radix,dir>::execute(x-N);
764     _fft<k-N,stride,radix,dir>::loop(x-N);
765   }
766 };
767 
768 template<int stride, int radix, int dir>
769 class _fft<0,stride,radix,dir> {
770 public:
771   enum { N = radix * stride,
772          _radix = Factor<stride>::value,
773          _stride = stride / _radix };
execute(t_fft * x)774   static inline void execute(t_fft *x) {
775     for(int step=0; step<stride; step++) {
776       __fft<stride,stride,radix,dir>::execute(x+step,x+step,step);
777     }
778     _fft<N,_stride,_radix,dir>::loop(x+N);
779   }
loop(t_fft * x)780   static inline void loop(t_fft *x) { }
781 };
782 
783 template <int k, int radix, int dir>
784 class _fft<k,1,radix,dir> {
785 public:
execute(t_fft * x)786   static inline void execute(t_fft *x) { }
loop(t_fft * x)787   static inline void loop(t_fft *x) { }
788 };
789 
790 template <int radix, int dir>
791 class _fft<0,1,radix,dir> {
792 public:
execute(t_fft * x)793   static inline void execute(t_fft *x) { }
loop(t_fft * x)794   static inline void loop(t_fft *x) { }
795 };
796 
797 template<int N>
798 class fft_order_ {
799 public:
iterate(int k)800   static inline int iterate(int k) {
801     static const int N2 = Factor<N>::value;
802     static const int N1 = N/Factor<N>::value;
803     return (k%N2)*N1 + fft_order_<N1>::iterate(k/N2);
804   }
805 };
806 
807 template<>
808 class fft_order_<1> {
809 public:
iterate(int k0)810   static inline int iterate(int k0) {
811     return 0;
812   }
813 };
814 
815 template<int N>
816 class fft_order {
817 public:
818   fft_order<N>() {
819     for(int k=0;k<N;k++) {
820       int kr = fft_order_<N>::iterate(k);
821       order[kr] = k;
822     }
823   }
824   int order[N];
825 };
826 
827 template<int N, int dir>
828 class fft_reorder {
829 public:
830   enum { radix = LastFactor<N>::value,
831          ostride = N / radix,
832          s = N * sizeof(t_fft) };
833   static const fft_order<N> order;
reorder(t_fft * x)834   static inline void reorder(t_fft *x) {
835     t_fft y[N];
836     int *o = (int*)order.order;
837     memcpy(y,x,s);
838     for(int r=0; r<N; r+=radix) {
839       __fft<1,ostride,radix,dir>::execute(y+r,x+o[r],0);
840     }
841   }
842 };
843 
844 template<int N, int dir>
845 const fft_order<N> fft_reorder<N,dir>::order;
846 
847 template<int N, int dir>
fft(t_fft * x)848 void fft(t_fft *x) {
849   enum { radix = Factor<N>::value,
850          stride = N / radix };
851   _fft<0,stride,radix,dir>::execute(x);
852   fft_reorder<N,dir>::reorder(x);
853 }
854 
855 }
856 
857 #endif
858