/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
*/

/*
  ChangeLog:
  - 2011/10/02, version 1: This is the very first release of this file.
*/
#include "pffft.hxx"

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <assert.h>

/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
#elif defined(__GNUC__)
# define COMPILER_GCC
#endif

#if defined(COMPILER_GCC)
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif
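
/* note: the GCC variant of VLA_ARRAY_ON_STACK above is a true C99
   variable-length array, while the MSVC variant falls back to _alloca, whose
   allocation is only released when the enclosing function returns (not at the
   end of the block it appears in). */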


/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
# define SIMD_SZ 4
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p)
{
  v4sf v=vec_lde(0,p);
  return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0);
}
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) {                                                         \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);   \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__;          \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) {  \
    v4sf y0 = vec_mergeh(x0, x2);    \
    v4sf y1 = vec_mergel(x0, x2);    \
    v4sf y2 = vec_mergeh(x1, x3);    \
    v4sf y3 = vec_mergel(x1, x3);    \
    x0 = vec_mergeh(y0, y2);         \
    x1 = vec_mergel(y0, y2);         \
    x2 = vec_mergeh(y1, y3);         \
    x3 = vec_mergel(y1, y3);         \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) {                                     \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
#  warning "building with simd disabled !\n";
#  define PFFFT_SIMD_DISABLE // fallback to scalar code
# endif
#endif

// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
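
/* worked out from the two macros above: VCPLXMUL computes the in-place complex
   product (ar + i*ai) *= (br + i*bi), i.e.
     ar' = ar*br - ai*bi
     ai' = ai*br + ar*bi
   and VCPLXMULCONJ multiplies by the conjugate (br - i*bi):
     ar' = ar*br + ai*bi
     ai' = ai*br - ar*bi */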
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf  v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd()
{
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0;
  u = a1;
  t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9);
  assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10);
  assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12);
  assertv4(a1, 1, 5, 9, 13);
  assertv4(a2, 2, 6, 10, 14);
  assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE

/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes)
{
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p)
{
  if (p) free(*((void **) p - 1));
}
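
/* usage sketch (illustrative only): the allocator over-allocates by
   MALLOC_V4SF_ALIGNMENT bytes, rounds the pointer up to the next 64-byte
   boundary, and stores the original malloc() result in the word just below
   the aligned block so that pffft_aligned_free can recover it:

     float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     assert(VALIGNED(buf));    // aligned for v4sf loads/stores
     pffft_aligned_free(buf);  // must not be passed to free() directly
*/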

int pffft_simd_size()
{
  return SIMD_SZ;
}

/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign)
{
  int k, i;
  int l1ido = l1*ido;
  if (ido <= 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
      ch[0]         = VADD(cc[0], cc[ido+0]);
      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
      ch[1]         = VADD(cc[1], cc[ido+1]);
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      for (i=0; i<ido-1; i+=2) {
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
        VCPLXMUL(tr2, ti2, wr, wi);
        ch[i+l1ido]   = tr2;
        ch[i+l1ido+1] = ti2;
      }
    }
  }
}

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign)
{
  static const float taur = -0.5f;
  float taui = 0.866025403784439f*fsign;
  int i, k;
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
  int l1ido = l1*ido;
  float wr1, wi1, wr2, wi2;
  assert(ido > 2);
  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
      ch[i] = VADD(cc[i], tr2);
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
      ci2 = VADD(cc[i+1], SVMUL(taur,ti2));
      ch[i+1] = VADD(cc[i+1], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch[i+l1ido] = dr2;
      ch[i+l1ido + 1] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch[i+2*l1ido] = dr3;
      ch[i+2*l1ido+1] = di3;
    }
  }
} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign)
{
  /* isign == -1 for forward transform and +1 for backward transform */

  int i, k;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  int l1ido = l1*ido;
  if (ido == 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
      tr2 = VADD(cc[0], cc[2*ido + 0]);
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
      ti2 = VADD(cc[1], cc[2*ido + 1]);
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);

      ch[0*l1ido + 0] = VADD(tr2, tr3);
      ch[0*l1ido + 1] = VADD(ti2, ti3);
      ch[1*l1ido + 0] = VADD(tr1, tr4);
      ch[1*l1ido + 1] = VADD(ti1, ti4);
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
      ch[2*l1ido + 1] = VSUB(ti2, ti3);
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
      for (i=0; i<ido-1; i+=2) {
        float wr1, wi1, wr2, wi2, wr3, wi3;
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);

        ch[i] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);
        ch[i + 1] = VADD(ti2, ti3);
        ci3 = VSUB(ti2, ti3);

        cr2 = VADD(tr1, tr4);
        cr4 = VSUB(tr1, tr4);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        wr1=wa1[i], wi1=fsign*wa1[i+1];
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
        wr2=wa2[i], wi2=fsign*wa2[i+1];
        ch[i + l1ido] = cr2;
        ch[i + l1ido + 1] = ci2;

        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
        wr3=wa3[i], wi3=fsign*wa3[i+1];
        ch[i + 2*l1ido] = cr3;
        ch[i + 2*l1ido + 1] = ci3;

        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
        ch[i + 3*l1ido] = cr4;
        ch[i + 3*l1ido + 1] = ci4;
      }
    }
  }
} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2,
                                    const float *wa3, const float *wa4, float fsign)
{
  static const float tr11 = .309016994374947f;
  const float ti11 = .951056516295154f*fsign;
  static const float tr12 = -.809016994374947f;
  const float ti12 = .587785252292473f*fsign;

  /* Local variables */
  int i, k;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;

  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;

#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]

  assert(ido > 2);
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
    for (i = 0; i < ido-1; i += 2) {
      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch_ref(i - 1, 2) = dr2;
      ch_ref(i, 2)     = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch_ref(i - 1, 3) = dr3;
      ch_ref(i, 3)     = di3;
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
      ch_ref(i - 1, 4) = dr4;
      ch_ref(i, 4)     = di4;
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
      ch_ref(i - 1, 5) = dr5;
      ch_ref(i, 5)     = di5;
    }
  }
#undef ch_ref
#undef cc_ref
}

static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1)
{
  static const float minus_one = -1.f;
  int i, k, l1ido = l1*ido;
  for (k=0; k < l1ido; k += ido) {
    v4sf a = cc[k], b = cc[k + l1ido];
    ch[2*k] = VADD(a, b);
    ch[2*(k+ido)-1] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k=0; k < l1ido; k += ido) {
      for (i=2; i<ido; i+=2) {
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i + 2*k] = VADD(bi, ti2);
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
        ch[i - 1 + 2*k] = VADD(br, tr2);
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k += ido) {
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
    ch[2*k + ido-1] = cc[k + ido-1];
  }
} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1)
{
  static const float minus_two=-2;
  int i, k, l1ido = l1*ido;
  v4sf a,b,c,d, tr2, ti2;
  for (k=0; k < l1ido; k += ido) {
    a = cc[2*k];
    b = cc[2*(k+ido) - 1];
    ch[k] = VADD(a, b);
    ch[k + l1ido] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      for (i = 2; i < ido; i += 2) {
        a = cc[i-1 + 2*k];
        b = cc[2*(k + ido) - i - 1];
        c = cc[i+0 + 2*k];
        d = cc[2*(k + ido) - i + 0];
        ch[i-1 + k] = VADD(a, b);
        tr2 = VSUB(a, b);
        ch[i+0 + k] = VSUB(c, d);
        ti2 = VADD(c, d);
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i-1 + k + l1ido] = tr2;
        ch[i+0 + k + l1ido] = ti2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k = 0; k < l1ido; k += ido) {
    a = cc[2*k + ido-1];
    b = cc[2*k + ido];
    ch[k + ido-1] = VADD(a,a);
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
  }
} /* radb2 */

static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  int i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      wr1 = LD_PS1(wa1[i - 2]);
      wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido];
      di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);

      wr2 = LD_PS1(wa2[i - 2]);
      wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido];
      di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */


static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  int i, k, ic;
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
  for (k=0; k<l1; k++) {
    tr2 = cc[ido-1 + (3*k + 1)*ido];
    tr2 = VADD(tr2,tr2);
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      ch[i - 1 + (k + l1)*ido] = dr2;
      ch[i + (k + l1)*ido] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
      ch[i + (k + 2*l1)*ido] = di3;
    }
  }
} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{
  static const float minus_hsqt2 = (float)-0.7071067811865475;
  int i, k, l1ido = l1*ido;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
    v4sf * RESTRICT ch_ = ch;
    while (cc < cc_end) {
      // this loop represents between 25% and 40% of total radf4_ps cost !
      v4sf a0 = cc[0], a1 = cc[l1ido];
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
      v4sf tr1 = VADD(a1, a3);
      v4sf tr2 = VADD(a0, a2);
      ch[2*ido-1] = VSUB(a0, a2);
      ch[2*ido  ] = VSUB(a3, a1);
      ch[0      ] = VADD(tr1, tr2);
      ch[4*ido-1] = VSUB(tr2, tr1);
      cc += ido;
      ch += 4*ido;
    }
    cc = cc_;
    ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
      for (i=2; i<ido; i += 2, pc += 2) {
        int ic = ido - i;
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;

        cr2 = pc[1*l1ido+0];
        ci2 = pc[1*l1ido+1];
        wr=LD_PS1(wa1[i - 2]);
        wi=LD_PS1(wa1[i - 1]);
        VCPLXMULCONJ(cr2,ci2,wr,wi);

        cr3 = pc[2*l1ido+0];
        ci3 = pc[2*l1ido+1];
        wr = LD_PS1(wa2[i-2]);
        wi = LD_PS1(wa2[i-1]);
        VCPLXMULCONJ(cr3, ci3, wr, wi);

        cr4 = pc[3*l1ido];
        ci4 = pc[3*l1ido+1];
        wr = LD_PS1(wa3[i-2]);
        wi = LD_PS1(wa3[i-1]);
        VCPLXMULCONJ(cr4, ci4, wr, wi);

        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

        tr1 = VADD(cr2,cr4);
        tr4 = VSUB(cr4,cr2);
        tr2 = VADD(pc[0],cr3);
        tr3 = VSUB(pc[0],cr3);
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
        ti1 = VADD(ci2,ci4);
        ti4 = VSUB(ci2,ci4);
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
        ti2 = VADD(pc[1],ci3);
        ti3 = VSUB(pc[1],ci3);
        ch[i + 4*k] = VADD(ti1, ti2);
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k<l1ido; k += ido) {
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
    ch[ido-1 + 4*k] = VADD(tr1, c);
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
    ch[4*k + 1*ido] = VSUB(ti1, d);
    ch[4*k + 3*ido] = VADD(ti1, d);
  }
} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
  static const float minus_sqrt2 = (float)-1.414213562373095;
  static const float two = 2.f;
  int i, k, l1ido = l1*ido;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
    v4sf *ch_ = ch;
    while (ch < ch_end) {
      v4sf a = cc[0], b = cc[4*ido-1];
      v4sf c = cc[2*ido], d = cc[2*ido-1];
      tr3 = SVMUL(two,d);
      tr2 = VADD(a,b);
      tr1 = VSUB(a,b);
      tr4 = SVMUL(two,c);
      ch[0*l1ido] = VADD(tr2, tr3);
      ch[2*l1ido] = VSUB(tr2, tr3);
      ch[1*l1ido] = VSUB(tr1, tr4);
      ch[3*l1ido] = VADD(tr1, tr4);

      cc += 4*ido;
      ch += ido;
    }
    cc = cc_;
    ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
      for (i = 2; i < ido; i += 2) {

        tr1 = VSUB(pc[i], pc[4*ido - i]);
        tr2 = VADD(pc[i], pc[4*ido - i]);
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
        ph[0] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);

        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        cr2 = VSUB(tr1, tr4);
        cr4 = VADD(tr1, tr4);

        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);

        ph[1] = VADD(ti2, ti3);
        ph += l1ido;
        ci3 = VSUB(ti2, ti3);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
        ph[0] = cr2;
        ph[1] = ci2;
        ph += l1ido;
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
        ph[0] = cr3;
        ph[1] = ci3;
        ph += l1ido;
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
        ph[0] = cr4;
        ph[1] = ci4;
        ph = ph - 3*l1ido + 2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k+=ido) {
    int i0 = 4*k + ido;
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
    tr1 = VSUB(c,d);
    tr2 = VADD(c,d);
    ti1 = VADD(b,a);
    ti2 = VSUB(b,a);
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
  }
} /* radb4 */

static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* System generated locals */
  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * 6;
  ch -= ch_offset;
  cc_offset = 1 + ido * (1 + l1);
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      dr2 = LD_PS1(wa1[i-3]);
      di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]);
      di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]);
      di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]);
      di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */

static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * (1 + l1);
  ch -= ch_offset;
  cc_offset = 1 + ido * 6;
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2;
      ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3;
      ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4;
      ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5;
      ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */
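
/* the radf{2,3,4,5}_ps / radb{2,3,4,5}_ps routines above are the SIMD ports of
   the corresponding fftpack real-FFT radix butterflies; rfftf1_ps / rfftb1_ps
   below chain them according to the factorization stored in ifac, ping-ponging
   between the two work buffers. */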

static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac)
{
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l2 = n;
  int iw = n-1;
  assert(in != out && work1 != work2);
  for (k1 = 1; k1 <= nf; ++k1) {
    int kh = nf - k1;
    int ip = ifac[kh + 2];
    int l1 = l2 / ip;
    int ido = n / l2;
    iw -= (ip - 1)*ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      }
      break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      }
      break;
      case 3: {
        int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      }
      break;
      case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l2 = l1;
    if (out == work2) {
      out = work1;
      in = work2;
    } else {
      out = work2;
      in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac)
{
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out);
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1 + 1];
    int l2 = ip*l1;
    int ido = n / l2;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      }
      break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      }
      break;
      case 3: {
        int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      }
      break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido;

    if (out == work2) {
      out = work1;
      in = work2;
    } else {
      out = work2;
      in = work1;
    }
  }
  return in; /* this is in fact the output .. */
}

static int decompose(int n, int *ifac, const int *ntryh)
{
  int nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
    int ntry = ntryh[j];
    while (nl != 1) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr == 0) {
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
          for (i = 2; i <= nf; ++i) {
            int ib = nf - i + 2;
            ifac[ib + 1] = ifac[ib];
          }
          ifac[2] = 2;
        }
      } else break;
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
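
/* illustrative example: with the real-transform factor table { 4,2,3,5,0 },
   decompose(240, ifac, ntryh) finds 240 = 4*4*3*5 and stores
   ifac = { 240, 4,  4, 4, 3, 5 }  (size, factor count, then the factors).
   The "ntry == 2 && nf != 1" fixup above moves a late factor 2 to the front
   of the factor list, matching the fftpack convention. */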



static void rffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 4,2,3,5,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI) / n;
  int is = 0;
  int nfm1 = nf - 1;
  int l1 = 1;
  for (k1 = 1; k1 <= nfm1; k1++) {
    int ip = ifac[k1 + 1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int ipm = ip - 1;
    for (j = 1; j <= ipm; ++j) {
      float argld;
      int i = is, fi=0;
      ld += l1;
      argld = ld*argh;
      for (ii = 3; ii <= ido; ii += 2) {
        i += 2;
        fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
      }
      is += ido;
    }
    l1 = l2;
  }
} /* rffti1 */

void cffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 5,3,4,2,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI)/(float)n;
  int i = 1;
  int l1 = 1;
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1+1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int idot = ido + ido + 2;
    int ipm = ip - 1;
    for (j=1; j<=ipm; j++) {
      float argld;
      int i1 = i, fi = 0;
      wa[i-1] = 1;
      wa[i] = 0;
      ld += l1;
      argld = ld*argh;
      for (ii = 4; ii <= idot; ii += 2) {
        i += 2;
        fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
      }
      if (ip > 5) {
        wa[i1-1] = wa[i-1];
        wa[i1] = wa[i];
      }
    }
    l1 = l2;
  }
} /* cffti1 */


v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign)
{
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out && work1 != work2);
  for (k1=2; k1<=nf+1; k1++) {
    int ip = ifac[k1];
    int l2 = ip*l1;
    int ido = n / l2;
    int idot = ido + ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
      }
      break;
      case 4: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
      }
      break;
      case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
      }
      break;
      case 3: {
        int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
      }
      break;
      default:
        assert(0);
    }
    l1 = l2;
    iw += (ip - 1)*idot;
    if (out == work2) {
      out = work1;
      in = work2;
    } else {
      out = work2;
      in = work1;
    }
  }

  return in; /* this is in fact the output .. */
}


struct PFFFT_Setup {
  int N;
  int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
  int ifac[15];
  pffft_transform_t transform;
  v4sf *data;     // allocated room for twiddle coefs
  float *e;       // points into 'data', N/4*3 elements
  float *twiddle; // points into 'data', N/4 elements
};

PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
{
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
  if (transform == PFFFT_REAL) {
    assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0);
  }
  if (transform == PFFFT_COMPLEX) {
    assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0);
  }
  //assert((N % 32) == 0);
  s->N = N;
  s->transform = transform;
  /* nb of complex simd vectors */
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
  s->e = (float*)s->data;
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
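
  /* layout of s->data for SIMD_SZ == 4 (a reading aid, derived from the two
     pointer assignments above): the first three quarters of the buffer hold
     the 'e' coefficients used by the SIMD (pre|post)processing steps, the
     last quarter holds the scalar fftpack twiddle table -- consistent with
     the comments in the PFFFT_Setup struct. */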

  if (transform == PFFFT_REAL) {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
      }
    }
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
      }
    }
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }

  /* check that N is decomposable with allowed prime factors */
  for (k=0, m=1; k < s->ifac[1]; ++k) {
    m *= s->ifac[2+k];
  }
  if (m != N/SIMD_SZ) {
    pffft_destroy_setup(s);
    s = 0;
  }

  return s;
}


void pffft_destroy_setup(PFFFT_Setup *s)
{
  pffft_aligned_free(s->data);
  free(s);
}
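
/* typical setup lifecycle, as an illustrative sketch (the transform entry
   points themselves are assumed to be the usual ones declared in pffft.hxx):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL); // N must be a multiple of 32 here
     if (s) {
       float *in  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
       float *out = (float*)pffft_aligned_malloc(1024 * sizeof(float));
       // ... fill 'in', run the forward/backward transforms, use 'out' ...
       pffft_aligned_free(out);
       pffft_aligned_free(in);
       pffft_destroy_setup(s);
     }
*/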

#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out)
{
  v4sf g0, g1;
  int k;
  INTERLEAVE2(in[0], in[1], g0, g1);
  in += in_stride;

  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
  for (k=1; k < N; ++k) {
    v4sf h0, h1;
    INTERLEAVE2(in[0], in[1], h0, h1);
    in += in_stride;
    *--out = VSWAPHL(g1, h0);
    *--out = VSWAPHL(h0, h1);
    g1 = h1;
  }
  *--out = VSWAPHL(g1, g0);
}

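/* inverse operation of reversed_copy above: rebuilds the natural interleaved
   order from the half-reversed layout. */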
static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride)
{
  v4sf g0, g1, h0, h1;
  int k;
  g0 = g1 = in[0];
  ++in;
  for (k=1; k < N; ++k) {
    h0 = *in++;
    h1 = *in++;
    g1 = VSWAPHL(g1, h0);
    h0 = VSWAPHL(h0, h1);
    UNINTERLEAVE2(h0, g1, out[0], out[1]);
    out += out_stride;
    g1 = h1;
  }
  h0 = *in++;
  h1 = g0;
  g1 = VSWAPHL(g1, h0);
  h0 = VSWAPHL(h0, h1);
  UNINTERLEAVE2(h0, g1, out[0], out[1]);
}
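
/* reading aid, inferred from the code below: pffft_zreorder converts between
   the internal z-domain layout used by the SIMD transforms and the canonical
   interleaved order; PFFFT_FORWARD goes from internal to canonical order,
   PFFFT_BACKWARD the other way round. */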

void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
{
  int k, N = setup->N, Ncvec = setup->Ncvec;
  const v4sf *vin = (const v4sf*)in;
  v4sf *vout = (v4sf*)out;
  assert(in != out);
  if (setup->transform == PFFFT_REAL) {
    int k, dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
      }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
    } else {
      for (k=0; k < dk; ++k) {
        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
      }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
    }
  } else {
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
      }
    } else {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
      }
    }
  }
}

void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0];
    i0 = in[8*k+1];
    r1 = in[8*k+2];
    i1 = in[8*k+3];
    r2 = in[8*k+4];
    i2 = in[8*k+5];
    r3 = in[8*k+6];
    i3 = in[8*k+7];
    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);
    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);

    sr0 = VADD(r0,r2);
    dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3);
    dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2);
    di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3);
    di1 = VSUB(i1, i3);

    /*
      transformation for each column is:

      [1  1  1  1  0  0  0  0]   [r0]
      [1  0 -1  0  0 -1  0  1]   [r1]
      [1 -1  1 -1  0  0  0  0]   [r2]
      [1  0 -1  0  0  1  0 -1]   [r3]
      [0  0  0  0  1  1  1  1] * [i0]
      [0  1  0 -1  1  0 -1  0]   [i1]
      [0  0  0  0  1 -1  1 -1]   [i2]
      [0 -1  0  1  1  0 -1  0]   [i3]
    */

    r0 = VADD(sr0, sr1);
    i0 = VADD(si0, si1);
    r1 = VADD(dr0, di1);
    i1 = VSUB(di0, dr1);
    r2 = VSUB(sr0, sr1);
    i2 = VSUB(si0, si1);
    r3 = VSUB(dr0, di1);
    i3 = VADD(di0, dr1);

    *out++ = r0;
    *out++ = i0;
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
  }
}

void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0];
    i0 = in[8*k+1];
    r1 = in[8*k+2];
    i1 = in[8*k+3];
    r2 = in[8*k+4];
    i2 = in[8*k+5];
    r3 = in[8*k+6];
    i3 = in[8*k+7];

    sr0 = VADD(r0,r2);
    dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3);
    dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2);
    di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3);
    di1 = VSUB(i1, i3);

    r0 = VADD(sr0, sr1);
    i0 = VADD(si0, si1);
    r1 = VSUB(dr0, di1);
    i1 = VADD(di0, dr1);
    r2 = VSUB(sr0, sr1);
    i2 = VSUB(si0, si1);
    r3 = VADD(dr0, di1);
    i3 = VSUB(di0, dr1);

    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    *out++ = r0;
    *out++ = i0;
    *out++ = r1;
    *out++ = i1;
    *out++ = r2;
    *out++ = i2;
    *out++ = r3;
    *out++ = i3;
  }
}


static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                                                   const v4sf *e, v4sf *out)
{
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0;
  i0 = *in1;
  r1 = *in++;
  i1 = *in++;
  r2 = *in++;
  i2 = *in++;
  r3 = *in++;
  i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1  1  1  1  0  0  0  0]   [r0]
    [1  0 -1  0  0 -1  0  1]   [r1]
    [1  0 -1  0  0  1  0 -1]   [r2]
    [1 -1  1 -1  0  0  0  0]   [r3]
    [0  0  0  0  1  1  1  1] * [i0]
    [0 -1  0  1 -1  0  1  0]   [i1]
    [0 -1  0  1  1  0 -1  0]   [i2]
    [0  0  0  0 -1  1 -1  1]   [i3]
  */

  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  sr0 = VADD(r0,r2);
  dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3);
  dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2);
  di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3);
  di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;

}

static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = M_SQRT2/2;

  cr.v = in[0];
  ci.v = in[Ncvec*2-1];
  assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)   ] [1   1   1   1   0   0   0   0]
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]);
  uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]);
  uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);
  uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);
  uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);
  uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);
  uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);
  uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);
  uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
    save = save_next;
  }

}

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                                     const v4sf *e, v4sf *out, int first)
{
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]
  */
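  /* Up to the factor 4 absorbed by pffft's unscaled transforms, this matrix
     is the inverse of the one in pffft_real_finalize_4x4 (their product is
     4*Identity), which is why a FORWARD pass followed by a BACKWARD pass
     returns the signal scaled by N. */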

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
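  /* The scalar computations below apply this matrix to rebuild the
     fftpack-style DC/Nyquist columns: the four cr values fill uout[0] and
     the four ci values fill uout[2*Ncvec-1], mirroring what
     pffft_real_finalize extracted on the forward pass. */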
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2];
  uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2];
  uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2];
  uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2];
  uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);
  uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]);
  uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);
  uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]);
  uout[2*Ncvec-1].f[3] = ci3;
}


void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered)
{
  int k, Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput = (v4sf*)foutput;
  v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);
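  // The fftpack stages below ping-pong between buff[ib] and buff[!ib]; the
  // initial parity of ib (derived from the number of radix stages and the
  // reorder pass) is chosen so that the final result normally lands in
  // voutput without the extra copy at the end of this function.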

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib];
      ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a;
      voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling)
{
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
#  ifndef __clang__
     // enable the inline NEON asm path below (gcc only -- clang miscompiles it)
#    define ZCONVOLVE_USING_INLINE_ASM
#  endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version; miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to gcc
  const float *a_ = a, *b_ = b;
  float *ab_ = ab;
  int N = Ncvec;
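  // The asm loop consumes two interleaved complex v4sf pairs from each input
  // per iteration ("subs %3, #2"), so it requires Ncvec to be even -- which
  // holds for every transform size pffft accepts.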
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},  [%0,:128]!    \n"
               "vld1.f32    {q4,q5},  [%1,:128]!    \n"
               "vld1.f32    {q2,q3},  [%0,:128]!    \n"
               "vld1.f32    {q6,q7},  [%1,:128]!    \n"
               "vld1.f32    {q8,q9},  [r8,:128]!    \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8, q10, q15            \n"
               "vmla.f32    q9, q11, q15            \n"
               "vmla.f32    q0, q12, q15            \n"
               "vmla.f32    q1, q13, q15            \n"
               "vst1.f32    {q8,q9},[%2,:128]!      \n"
               "vst1.f32    {q0,q1},[%2,:128]!      \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0];
    ai = va[2*i+1];
    br = vb[2*i+0];
    bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2];
    ai = va[2*i+3];
    br = vb[2*i+2];
    bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
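  // For PFFFT_REAL, the first float lanes of vab[0] and vab[1] hold the
  // purely real DC and Nyquist bins; the complex multiply above treated them
  // as one complex value, so fix them up here with real-only products
  // computed from the scalars saved before the loop.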
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}


#else // defined(PFFFT_SIMD_DISABLE)

// standard routines using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
{
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  } else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered)
{
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output;
  buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // a scalar complex transform is always already ordered
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
      ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should happen only when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a;
      output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling)
{
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering: ab[0] (DC) and ab[2*Ncvec-1]
    // (Nyquist) are purely real, so accumulate them separately and skip
    // them in the complex loop below
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab;
    ++a;
    ++b;
    --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0];
    ai = a[2*i+1];
    br = b[2*i+0];
    bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
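
/* A minimal usage sketch for the two entry points above (illustrative only;
   it assumes pffft.hxx declares the usual PFFFT helpers pffft_new_setup,
   pffft_destroy_setup, pffft_aligned_malloc and pffft_aligned_free):

     int N = 1024;  // must be a transform size accepted by pffft_new_setup
     PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
     float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(N * sizeof(float));

     // ... fill 'in' with N samples ...
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     // 'out' now holds the spectrum in canonical interleaved (re,im) order
     pffft_transform_ordered(s, out, in, work, PFFFT_BACKWARD);
     // note: a forward followed by a backward pass scales the signal by N

     pffft_aligned_free(work);
     pffft_aligned_free(out);
     pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/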