/*
Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
of NCAR, in 1985.

As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.

FFTPACK license:

http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.

Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:

- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.

- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.

- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.

THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.

PFFFT : a Pretty Fast FFT.

This file is largely based on the original FFTPACK implementation, modified in
order to take advantage of the SIMD instructions of modern CPUs.

*/

/*
ChangeLog:
- 2011/10/02, version 1: This is the very first release of this file.
*/

#include "pffft.h"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
#include <altivec.h>
#endif

/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
#elif defined(__GNUC__)
# define COMPILER_GCC
#endif

#ifdef COMPILER_MSVC
#define _USE_MATH_DEFINES
#endif
#include <math.h>

#if defined(COMPILER_GCC)
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif


/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
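
/*
  Summary of the porting surface (grounded in the blocks below): a port to a
  new 4-wide SIMD ISA only needs to provide v4sf, SIMD_SZ, VZERO, VMUL, VADD,
  VMADD, VSUB, LD_PS1, INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and
  VALIGNED, and can then be sanity-checked with validate_pffft_simd() below.
*/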

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
# define SIMD_SZ 4
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
    vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
    vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    v4sf y0 = vec_mergeh(x0, x2); \
    v4sf y1 = vec_mergel(x0, x2); \
    v4sf y2 = vec_mergeh(x1, x3); \
    v4sf y3 = vec_mergel(x1, x3); \
    x0 = vec_mergeh(y0, y2); \
    x1 = vec_mergel(y0, y2); \
    x2 = vec_mergeh(y1, y3); \
    x3 = vec_mergel(y1, y3); \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway, so you will have to work if you want to enable AVX with its 256-bit vectors.
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(IOS))
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) { \
    float32x4x2_t t0_ = vzipq_f32(x0, x2); \
    float32x4x2_t t1_ = vzipq_f32(x1, x3); \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
//# warning "building with simd disabled !!!!!!!!!!!!!!\n";
#  define PFFFT_SIMD_DISABLE // fallback to scalar code
# endif
#endif

// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
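
/*
  What the two macros above compute, written out in scalar form: for complex
  numbers a = ar + i*ai and b = br + i*bi,
    VCPLXMUL:     a*b       = (ar*br - ai*bi) + i*(ai*br + ar*bi)
    VCPLXMULCONJ: a*conj(b) = (ar*br + ai*bi) + i*(ai*br - ar*bi)
  each evaluated element-wise on the lanes of a v4sf.
*/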

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd(void) {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
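
/*
  Usage note: calling validate_pffft_simd() once at startup in a debug build
  is a cheap way to catch a miscompiled or incorrectly ported set of vector
  macros before any transform is run.
*/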

/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
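
/*
  How the allocator works: it over-allocates by MALLOC_V4SF_ALIGNMENT bytes,
  rounds the returned pointer up to the next 64-byte boundary, and stores the
  original malloc() pointer in the word just before the aligned block so that
  pffft_aligned_free() can recover it. A minimal usage sketch (the buffer
  size here is arbitrary):

    float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
    // ... use buf as aligned input/output for the transforms below ...
    pffft_aligned_free(buf);
*/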

int32_t pffft_simd_size(void) { return SIMD_SZ; }

/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
  int32_t k, i;
  int32_t l1ido = l1*ido;
  if (ido <= 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      ch[0] = VADD(cc[0], cc[ido+0]);
      ch[l1ido] = VSUB(cc[0], cc[ido+0]);
      ch[1] = VADD(cc[1], cc[ido+1]);
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      for (i=0; i<ido-1; i+=2) {
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
        ch[i] = VADD(cc[i+0], cc[i+ido+0]);
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
        VCPLXMUL(tr2, ti2, wr, wi);
        ch[i+l1ido] = tr2;
        ch[i+l1ido+1] = ti2;
      }
    }
  }
}

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign) {
  static const float taur = -0.5f;
  float taui = 0.866025403784439f*fsign;
  int32_t i, k;
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
  int32_t l1ido = l1*ido;
  float wr1, wi1, wr2, wi2;
  assert(ido > 2);
  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
      ch[i] = VADD(cc[i], tr2);
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
      ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
      ch[i+1] = VADD(cc[i+1], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch[i+l1ido] = dr2;
      ch[i+l1ido + 1] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch[i+2*l1ido] = dr3;
      ch[i+2*l1ido+1] = di3;
    }
  }
} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* fsign == -1 for forward transform and +1 for backward transform */

  int32_t i, k;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  int32_t l1ido = l1*ido;
  if (ido == 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
      tr2 = VADD(cc[0], cc[2*ido + 0]);
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
      ti2 = VADD(cc[1], cc[2*ido + 1]);
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);

      ch[0*l1ido + 0] = VADD(tr2, tr3);
      ch[0*l1ido + 1] = VADD(ti2, ti3);
      ch[1*l1ido + 0] = VADD(tr1, tr4);
      ch[1*l1ido + 1] = VADD(ti1, ti4);
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
      ch[2*l1ido + 1] = VSUB(ti2, ti3);
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
      for (i=0; i<ido-1; i+=2) {
        float wr1, wi1, wr2, wi2, wr3, wi3;
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);

        ch[i] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);
        ch[i + 1] = VADD(ti2, ti3);
        ci3 = VSUB(ti2, ti3);

        cr2 = VADD(tr1, tr4);
        cr4 = VSUB(tr1, tr4);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        wr1=wa1[i], wi1=fsign*wa1[i+1];
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
        wr2=wa2[i], wi2=fsign*wa2[i+1];
        ch[i + l1ido] = cr2;
        ch[i + l1ido + 1] = ci2;

        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
        wr3=wa3[i], wi3=fsign*wa3[i+1];
        ch[i + 2*l1ido] = cr3;
        ch[i + 2*l1ido + 1] = ci3;

        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
        ch[i + 3*l1ido] = cr4;
        ch[i + 3*l1ido + 1] = ci4;
      }
    }
  }
} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2,
                                    const float *wa3, const float *wa4, float fsign) {
  static const float tr11 = .309016994374947f;
  const float ti11 = .951056516295154f*fsign;
  static const float tr12 = -.809016994374947f;
  const float ti12 = .587785252292473f*fsign;

  /* Local variables */
  int32_t i, k;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;

  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;

#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]

  assert(ido > 2);
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
    for (i = 0; i < ido-1; i += 2) {
      ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
      ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
      ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
      ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
      ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch_ref(i - 1, 2) = dr2;
      ch_ref(i, 2) = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch_ref(i - 1, 3) = dr3;
      ch_ref(i, 3) = di3;
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
      ch_ref(i - 1, 4) = dr4;
      ch_ref(i, 4) = di4;
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
      ch_ref(i - 1, 5) = dr5;
      ch_ref(i, 5) = di5;
    }
  }
#undef ch_ref
#undef cc_ref
}

static NEVER_INLINE(void) radf2_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
  static const float minus_one = -1.f;
  int32_t i, k, l1ido = l1*ido;
  for (k=0; k < l1ido; k += ido) {
    v4sf a = cc[k], b = cc[k + l1ido];
    ch[2*k] = VADD(a, b);
    ch[2*(k+ido)-1] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k=0; k < l1ido; k += ido) {
      for (i=2; i<ido; i+=2) {
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i + 2*k] = VADD(bi, ti2);
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
        ch[i - 1 + 2*k] = VADD(br, tr2);
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k += ido) {
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
    ch[2*k + ido-1] = cc[k + ido-1];
  }
} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch, const float *wa1) {
  static const float minus_two=-2;
  int32_t i, k, l1ido = l1*ido;
  v4sf a,b,c,d, tr2, ti2;
  for (k=0; k < l1ido; k += ido) {
    a = cc[2*k]; b = cc[2*(k+ido) - 1];
    ch[k] = VADD(a, b);
    ch[k + l1ido] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      for (i = 2; i < ido; i += 2) {
        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
        ch[i-1 + k] = VADD(a, b);
        tr2 = VSUB(a, b);
        ch[i+0 + k] = VSUB(c, d);
        ti2 = VADD(c, d);
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i-1 + k + l1ido] = tr2;
        ch[i+0 + k + l1ido] = ti2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k = 0; k < l1ido; k += ido) {
    a = cc[2*k + ido-1]; b = cc[2*k + ido];
    ch[k + ido-1] = VADD(a,a);
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
  }
} /* radb2 */

static void radf3_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  int32_t i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);

      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */


static void radb3_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  int32_t i, k, ic;
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
  for (k=0; k<l1; k++) {
    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      ch[i - 1 + (k + l1)*ido] = dr2;
      ch[i + (k + l1)*ido] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
      ch[i + (k + 2*l1)*ido] = di3;
    }
  }
} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{
  static const float minus_hsqt2 = (float)-0.7071067811865475;
  int32_t i, k, l1ido = l1*ido;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
    v4sf * RESTRICT ch_ = ch;
    while (cc < cc_end) {
      // this loop represents between 25% and 40% of total radf4_ps cost !
      v4sf a0 = cc[0], a1 = cc[l1ido];
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
      v4sf tr1 = VADD(a1, a3);
      v4sf tr2 = VADD(a0, a2);
      ch[2*ido-1] = VSUB(a0, a2);
      ch[2*ido  ] = VSUB(a3, a1);
      ch[0      ] = VADD(tr1, tr2);
      ch[4*ido-1] = VSUB(tr2, tr1);
      cc += ido; ch += 4*ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
      for (i=2; i<ido; i += 2, pc += 2) {
        int32_t ic = ido - i;
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;

        cr2 = pc[1*l1ido+0];
        ci2 = pc[1*l1ido+1];
        wr = LD_PS1(wa1[i - 2]);
        wi = LD_PS1(wa1[i - 1]);
        VCPLXMULCONJ(cr2,ci2,wr,wi);

        cr3 = pc[2*l1ido+0];
        ci3 = pc[2*l1ido+1];
        wr = LD_PS1(wa2[i-2]);
        wi = LD_PS1(wa2[i-1]);
        VCPLXMULCONJ(cr3, ci3, wr, wi);

        cr4 = pc[3*l1ido];
        ci4 = pc[3*l1ido+1];
        wr = LD_PS1(wa3[i-2]);
        wi = LD_PS1(wa3[i-1]);
        VCPLXMULCONJ(cr4, ci4, wr, wi);

        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

        tr1 = VADD(cr2,cr4);
        tr4 = VSUB(cr4,cr2);
        tr2 = VADD(pc[0],cr3);
        tr3 = VSUB(pc[0],cr3);
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
        ti1 = VADD(ci2,ci4);
        ti4 = VSUB(ci2,ci4);
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
        ti2 = VADD(pc[1],ci3);
        ti3 = VSUB(pc[1],ci3);
        ch[i + 4*k] = VADD(ti1, ti2);
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k<l1ido; k += ido) {
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
    ch[ido-1 + 4*k] = VADD(tr1, c);
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
    ch[4*k + 1*ido] = VSUB(ti1, d);
    ch[4*k + 3*ido] = VADD(ti1, d);
  }
} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
  static const float minus_sqrt2 = (float)-1.414213562373095;
  static const float two = 2.f;
  int32_t i, k, l1ido = l1*ido;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
    v4sf *ch_ = ch;
    while (ch < ch_end) {
      v4sf a = cc[0], b = cc[4*ido-1];
      v4sf c = cc[2*ido], d = cc[2*ido-1];
      tr3 = SVMUL(two,d);
      tr2 = VADD(a,b);
      tr1 = VSUB(a,b);
      tr4 = SVMUL(two,c);
      ch[0*l1ido] = VADD(tr2, tr3);
      ch[2*l1ido] = VSUB(tr2, tr3);
      ch[1*l1ido] = VSUB(tr1, tr4);
      ch[3*l1ido] = VADD(tr1, tr4);

      cc += 4*ido; ch += ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
      for (i = 2; i < ido; i += 2) {

        tr1 = VSUB(pc[i], pc[4*ido - i]);
        tr2 = VADD(pc[i], pc[4*ido - i]);
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
        ph[0] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);

        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        cr2 = VSUB(tr1, tr4);
        cr4 = VADD(tr1, tr4);

        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);

        ph[1] = VADD(ti2, ti3); ph += l1ido;
        ci3 = VSUB(ti2, ti3);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
        ph[0] = cr2;
        ph[1] = ci2; ph += l1ido;
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
        ph[0] = cr3;
        ph[1] = ci3; ph += l1ido;
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
        ph[0] = cr4;
        ph[1] = ci4; ph = ph - 3*l1ido + 2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k+=ido) {
    int32_t i0 = 4*k + ido;
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
    tr1 = VSUB(c,d);
    tr2 = VADD(c,d);
    ti1 = VADD(b,a);
    ti2 = VSUB(b,a);
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
  }
} /* radb4 */

static void radf5_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* System generated locals */
  int32_t cc_offset, ch_offset;

  /* Local variables */
  int32_t i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int32_t idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * 6;
  ch -= ch_offset;
  cc_offset = 1 + ido * (1 + l1);
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */

static void radb5_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  int32_t cc_offset, ch_offset;

  /* Local variables */
  int32_t i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int32_t idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * (1 + l1);
  ch -= ch_offset;
  cc_offset = 1 + ido * 6;
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
      ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
      ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
      ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */

static NEVER_INLINE(v4sf *) rfftf1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int32_t *ifac) {
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int32_t nf = ifac[1], k1;
  int32_t l2 = n;
  int32_t iw = n-1;
  assert(in != out && work1 != work2);
  for (k1 = 1; k1 <= nf; ++k1) {
    int32_t kh = nf - k1;
    int32_t ip = ifac[kh + 2];
    int32_t l1 = l2 / ip;
    int32_t ido = n / l2;
    iw -= (ip - 1)*ido;
    switch (ip) {
      case 5: {
        int32_t ix2 = iw + ido;
        int32_t ix3 = ix2 + ido;
        int32_t ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int32_t ix2 = iw + ido;
        int32_t ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int32_t ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l2 = l1;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int32_t *ifac) {
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int32_t nf = ifac[1], k1;
  int32_t l1 = 1;
  int32_t iw = 0;
  assert(in != out);
  for (k1=1; k1<=nf; k1++) {
    int32_t ip = ifac[k1 + 1];
    int32_t l2 = ip*l1;
    int32_t ido = n / l2;
    switch (ip) {
      case 5: {
        int32_t ix2 = iw + ido;
        int32_t ix3 = ix2 + ido;
        int32_t ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int32_t ix2 = iw + ido;
        int32_t ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int32_t ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido;

    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
}

static int32_t decompose(int32_t n, int32_t *ifac, const int32_t *ntryh) {
  int32_t nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
    int32_t ntry = ntryh[j];
    while (nl != 1) {
      int32_t nq = nl / ntry;
      int32_t nr = nl - ntry * nq;
      if (nr == 0) {
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
          for (i = 2; i <= nf; ++i) {
            int32_t ib = nf - i + 2;
            ifac[ib + 1] = ifac[ib];
          }
          ifac[2] = 2;
        }
      } else break;
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
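
/*
  Worked example of the factorization above: with ntryh = {4,2,3,5,0},
  decompose(96, ifac, ntryh) peels off 4, 4, then 2 and 3; since a factor 2
  is always moved to the front, the stored factors are {2,4,4,3}
  (2*4*4*3 == 96), with ifac[0] = 96 and ifac[1] = 4 (the factor count).
*/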



static void rffti1_ps(int32_t n, float *wa, int32_t *ifac)
{
  static const int32_t ntryh[] = { 4,2,3,5,0 };
  int32_t k1, j, ii;

  int32_t nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI) / n;
  int32_t is = 0;
  int32_t nfm1 = nf - 1;
  int32_t l1 = 1;
  for (k1 = 1; k1 <= nfm1; k1++) {
    int32_t ip = ifac[k1 + 1];
    int32_t ld = 0;
    int32_t l2 = l1*ip;
    int32_t ido = n / l2;
    int32_t ipm = ip - 1;
    for (j = 1; j <= ipm; ++j) {
      float argld;
      int32_t i = is, fi=0;
      ld += l1;
      argld = ld*argh;
      for (ii = 3; ii <= ido; ii += 2) {
        i += 2;
        fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
      }
      is += ido;
    }
    l1 = l2;
  }
} /* rffti1 */

void cffti1_ps(int32_t n, float *wa, int32_t *ifac)
{
  static const int32_t ntryh[] = { 5,3,4,2,0 };
  int32_t k1, j, ii;

  int32_t nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI)/(float)n;
  int32_t i = 1;
  int32_t l1 = 1;
  for (k1=1; k1<=nf; k1++) {
    int32_t ip = ifac[k1+1];
    int32_t ld = 0;
    int32_t l2 = l1*ip;
    int32_t ido = n / l2;
    int32_t idot = ido + ido + 2;
    int32_t ipm = ip - 1;
    for (j=1; j<=ipm; j++) {
      float argld;
      int32_t i1 = i, fi = 0;
      wa[i-1] = 1;
      wa[i] = 0;
      ld += l1;
      argld = ld*argh;
      for (ii = 4; ii <= idot; ii += 2) {
        i += 2;
        fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
      }
      if (ip > 5) {
        wa[i1-1] = wa[i-1];
        wa[i1] = wa[i];
      }
    }
    l1 = l2;
  }
} /* cffti1 */


v4sf *cfftf1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int32_t *ifac, int32_t isign) {
  v4sf *in = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int32_t nf = ifac[1], k1;
  int32_t l1 = 1;
  int32_t iw = 0;
  assert(in != out && work1 != work2);
  for (k1=2; k1<=nf+1; k1++) {
    int32_t ip = ifac[k1];
    int32_t l2 = ip*l1;
    int32_t ido = n / l2;
    int32_t idot = ido + ido;
    switch (ip) {
      case 5: {
        int32_t ix2 = iw + idot;
        int32_t ix3 = ix2 + idot;
        int32_t ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
      } break;
      case 4: {
        int32_t ix2 = iw + idot;
        int32_t ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
      } break;
      case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
      } break;
      case 3: {
        int32_t ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
      } break;
      default:
        assert(0);
    }
    l1 = l2;
    iw += (ip - 1)*idot;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }

  return in; /* this is in fact the output .. */
}


struct PFFFT_Setup {
  int32_t N;
  int32_t Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
  int32_t ifac[15];
  pffft_transform_t transform;
  v4sf *data; // allocated room for twiddle coefs
  float *e; // points into 'data', N/4*3 elements
  float *twiddle; // points into 'data', N/4 elements
};
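
/*
  Typical lifecycle of a setup object (a sketch; the transform size must
  satisfy the constraints asserted in pffft_new_setup below):

    PFFFT_Setup *s = pffft_new_setup(4096, PFFFT_REAL);
    if (s) {
      // ... run transforms using s ...
      pffft_destroy_setup(s);
    }
*/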
1215
pffft_new_setup(int32_t N,pffft_transform_t transform)1216 PFFFT_Setup *pffft_new_setup(int32_t N, pffft_transform_t transform) {
1217 PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1218 int32_t k, m;
1219 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1220 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1221 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1222 if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1223 if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1224 //assert((N % 32) == 0);
1225 s->N = N;
1226 s->transform = transform;
1227 /* nb of complex simd vectors */
1228 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1229 s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1230 s->e = (float*)s->data;
1231 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1232
1233 if (transform == PFFFT_REAL) {
1234 for (k=0; k < s->Ncvec; ++k) {
1235 int32_t i = k/SIMD_SZ;
1236 int32_t j = k%SIMD_SZ;
1237 for (m=0; m < SIMD_SZ-1; ++m) {
1238 float A = -2*M_PI*(m+1)*k / N;
1239 s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1240 s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1241 }
1242 }
1243 rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1244 } else {
1245 for (k=0; k < s->Ncvec; ++k) {
1246 int32_t i = k/SIMD_SZ;
1247 int32_t j = k%SIMD_SZ;
1248 for (m=0; m < SIMD_SZ-1; ++m) {
1249 float A = -2*M_PI*(m+1)*k / N;
1250 s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1251 s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1252 }
1253 }
1254 cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1255 }
1256
1257 /* check that N is decomposable with allowed prime factors */
1258 for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1259 if (m != N/SIMD_SZ) {
1260 pffft_destroy_setup(s); s = 0;
1261 }
1262
1263 return s;
1264 }
1265
1266
pffft_destroy_setup(PFFFT_Setup * s)1267 void pffft_destroy_setup(PFFFT_Setup *s) {
1268 pffft_aligned_free(s->data);
1269 free(s);
1270 }
1271
1272 #if !defined(PFFFT_SIMD_DISABLE)
1273
1274 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
reversed_copy(int32_t N,const v4sf * in,int32_t in_stride,v4sf * out)1275 static void reversed_copy(int32_t N, const v4sf *in, int32_t in_stride, v4sf *out) {
1276 v4sf g0, g1;
1277 int32_t k;
1278 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1279
1280 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1281 for (k=1; k < N; ++k) {
1282 v4sf h0, h1;
1283 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1284 *--out = VSWAPHL(g1, h0);
1285 *--out = VSWAPHL(h0, h1);
1286 g1 = h1;
1287 }
1288 *--out = VSWAPHL(g1, g0);
1289 }
1290
unreversed_copy(int32_t N,const v4sf * in,v4sf * out,int32_t out_stride)1291 static void unreversed_copy(int32_t N, const v4sf *in, v4sf *out, int32_t out_stride) {
1292 v4sf g0, g1, h0, h1;
1293 int32_t k;
1294 g0 = g1 = in[0]; ++in;
1295 for (k=1; k < N; ++k) {
1296 h0 = *in++; h1 = *in++;
1297 g1 = VSWAPHL(g1, h0);
1298 h0 = VSWAPHL(h0, h1);
1299 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1300 g1 = h1;
1301 }
1302 h0 = *in++; h1 = g0;
1303 g1 = VSWAPHL(g1, h0);
1304 h0 = VSWAPHL(h0, h1);
1305 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1306 }
1307
pffft_zreorder(PFFFT_Setup * setup,const float * in,float * out,pffft_direction_t direction)1308 void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1309 int32_t k, N = setup->N, Ncvec = setup->Ncvec;
1310 const v4sf *vin = (const v4sf*)in;
1311 v4sf *vout = (v4sf*)out;
1312 assert(in != out);
1313 if (setup->transform == PFFFT_REAL) {
1314 int32_t k, dk = N/32;
1315 if (direction == PFFFT_FORWARD) {
1316 for (k=0; k < dk; ++k) {
1317 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1318 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1319 }
1320 reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1321 reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1322 } else {
1323 for (k=0; k < dk; ++k) {
1324 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1325 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1326 }
1327 unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1328 unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1329 }
1330 } else {
1331 if (direction == PFFFT_FORWARD) {
1332 for (k=0; k < Ncvec; ++k) {
1333 int32_t kk = (k/4) + (k%4)*(Ncvec/4);
1334 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1335 }
1336 } else {
1337 for (k=0; k < Ncvec; ++k) {
1338 int32_t kk = (k/4) + (k%4)*(Ncvec/4);
1339 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1340 }
1341 }
1342 }
1343 }
1344
pffft_cplx_finalize(int32_t Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1345 void pffft_cplx_finalize(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1346 int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1347 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1348 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1349 assert(in != out);
1350 for (k=0; k < dk; ++k) {
1351 r0 = in[8*k+0]; i0 = in[8*k+1];
1352 r1 = in[8*k+2]; i1 = in[8*k+3];
1353 r2 = in[8*k+4]; i2 = in[8*k+5];
1354 r3 = in[8*k+6]; i3 = in[8*k+7];
1355 VTRANSPOSE4(r0,r1,r2,r3);
1356 VTRANSPOSE4(i0,i1,i2,i3);
1357 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1358 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1359 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1360
1361 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1362 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1363 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1364 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1365
1366 /*
1367 transformation for each column is:
1368
1369 [1 1 1 1 0 0 0 0] [r0]
1370 [1 0 -1 0 0 -1 0 1] [r1]
1371 [1 -1 1 -1 0 0 0 0] [r2]
1372 [1 0 -1 0 0 1 0 -1] [r3]
1373 [0 0 0 0 1 1 1 1] * [i0]
1374 [0 1 0 -1 1 0 -1 0] [i1]
1375 [0 0 0 0 1 -1 1 -1] [i2]
1376 [0 -1 0 1 1 0 -1 0] [i3]
1377 */
1378
1379 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1380 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1381 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1382 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1383
1384 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1385 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1386 }
1387 }
1388
pffft_cplx_preprocess(int32_t Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1389 void pffft_cplx_preprocess(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1390 int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1391 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1392 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1393 assert(in != out);
1394 for (k=0; k < dk; ++k) {
1395 r0 = in[8*k+0]; i0 = in[8*k+1];
1396 r1 = in[8*k+2]; i1 = in[8*k+3];
1397 r2 = in[8*k+4]; i2 = in[8*k+5];
1398 r3 = in[8*k+6]; i3 = in[8*k+7];
1399
1400 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1401 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1402 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1403 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1404
1405 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1406 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1407 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1408 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1409
1410 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1411 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1412 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1413
1414 VTRANSPOSE4(r0,r1,r2,r3);
1415 VTRANSPOSE4(i0,i1,i2,i3);
1416
1417 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1418 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1419 }
1420 }
1421
1422
pffft_real_finalize_4x4(const v4sf * in0,const v4sf * in1,const v4sf * in,const v4sf * e,v4sf * out)1423 static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1424 const v4sf *e, v4sf *out) {
1425 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1426 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1427 r0 = *in0; i0 = *in1;
1428 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1429 VTRANSPOSE4(r0,r1,r2,r3);
1430 VTRANSPOSE4(i0,i1,i2,i3);
1431
1432 /*
1433 transformation for each column is:
1434
1435 [1 1 1 1 0 0 0 0] [r0]
1436 [1 0 -1 0 0 -1 0 1] [r1]
1437 [1 0 -1 0 0 1 0 -1] [r2]
1438 [1 -1 1 -1 0 0 0 0] [r3]
1439 [0 0 0 0 1 1 1 1] * [i0]
1440 [0 -1 0 1 -1 0 1 0] [i1]
1441 [0 -1 0 1 1 0 -1 0] [i2]
1442 [0 0 0 0 -1 1 -1 1] [i3]
1443 */
1444
1445 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1446 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1447
1448 VCPLXMUL(r1,i1,e[0],e[1]);
1449 VCPLXMUL(r2,i2,e[2],e[3]);
1450 VCPLXMUL(r3,i3,e[4],e[5]);
1451
1452 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1453 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1454
1455 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1456 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1457 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1458 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1459
1460 r0 = VADD(sr0, sr1);
1461 r3 = VSUB(sr0, sr1);
1462 i0 = VADD(si0, si1);
1463 i3 = VSUB(si1, si0);
1464 r1 = VADD(dr0, di1);
1465 r2 = VSUB(dr0, di1);
1466 i1 = VSUB(dr1, di0);
1467 i2 = VADD(dr1, di0);
1468
1469 *out++ = r0;
1470 *out++ = i0;
1471 *out++ = r1;
1472 *out++ = i1;
1473 *out++ = r2;
1474 *out++ = i2;
1475 *out++ = r3;
1476 *out++ = i3;
1477
1478 }
1479
pffft_real_finalize(int32_t Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1480 static NEVER_INLINE(void) pffft_real_finalize(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1481 int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1482 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1483
1484 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1485 v4sf save = in[7], zero=VZERO();
1486 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1487 static const float s = (float) M_SQRT2/2;
1488
1489 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1490 assert(in != out);
1491 pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1492
1493 /*
1494 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1495
1496 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1497 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1498 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1499 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1500 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1501 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1502 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1503 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1504 */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
    save = save_next;
  }

}

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                                     const v4sf *e, v4sf *out, int32_t first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}
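
/* A quick sanity check one can run separately (hypothetical test code, not
   part of pffft): the preprocess matrix P above is 4 times the inverse of
   the finalize matrix F of pffft_real_finalize_4x4, which is where this
   stage contributes its factor 4 to the unscaled forward+backward round
   trip. */
#if 0
static void check_preprocess_inverts_finalize(void) {
  static const int F[8][8] = { /* matrix of pffft_real_finalize_4x4 */
    {1, 1, 1, 1, 0, 0, 0, 0}, {1, 0,-1, 0, 0,-1, 0, 1},
    {1, 0,-1, 0, 0, 1, 0,-1}, {1,-1, 1,-1, 0, 0, 0, 0},
    {0, 0, 0, 0, 1, 1, 1, 1}, {0,-1, 0, 1,-1, 0, 1, 0},
    {0,-1, 0, 1, 1, 0,-1, 0}, {0, 0, 0, 0,-1, 1,-1, 1} };
  static const int P[8][8] = { /* matrix of pffft_real_preprocess_4x4 */
    {1, 1, 1, 1, 0, 0, 0, 0}, {1, 0, 0,-1, 0,-1,-1, 0},
    {1,-1,-1, 1, 0, 0, 0, 0}, {1, 0, 0,-1, 0, 1, 1, 0},
    {0, 0, 0, 0, 1,-1, 1,-1}, {0,-1, 1, 0, 1, 0, 0, 1},
    {0, 0, 0, 0, 1, 1,-1,-1}, {0, 1,-1, 0, 1, 0, 0, 1} };
  int i, j, k, acc;
  for (i = 0; i < 8; ++i) for (j = 0; j < 8; ++j) {
    for (acc = 0, k = 0; k < 8; ++k) acc += P[i][k]*F[k][j];
    assert(acc == (i == j ? 4 : 0)); /* P*F == 4*identity */
  }
}
#endif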

static NEVER_INLINE(void) pffft_real_preprocess(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = (float) M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}
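
/* Note: this special-bin matrix is, up to the overall factor 4 picked up by
   this radix-4 stage, the inverse of the corresponding special-bin matrix in
   pffft_real_finalize -- accordingly, s here is sqrt(2), twice the sqrt(2)/2
   used there. */
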
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int32_t ordered) {
  int32_t k, Ncvec = setup->Ncvec;
  int32_t nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int32_t stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput = (v4sf*)foutput;
  v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
  int32_t ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int32_t Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_ASM // enables the NEON asm path below (same macro as tested there)
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int32_t i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int32_t N = Ncvec;
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},   [%0,:128]!   \n"
               "vld1.f32    {q4,q5},   [%1,:128]!   \n"
               "vld1.f32    {q2,q3},   [%0,:128]!   \n"
               "vld1.f32    {q6,q7},   [%1,:128]!   \n"
               "vld1.f32    {q8,q9},   [r8,:128]!   \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1},   [r8,:128]!   \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8,  q10, q15           \n"
               "vmla.f32    q9,  q11, q15           \n"
               "vmla.f32    q0,  q12, q15           \n"
               "vmla.f32    q1,  q13, q15           \n"
               "vst1.f32    {q8,q9},  [%2,:128]!    \n"
               "vst1.f32    {q0,q1},  [%2,:128]!    \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
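  // For the real transform, the loop above treated the DC and Nyquist bins
  // (held in the f[0] slots of the first two vectors of the internal layout)
  // as if they were one complex pair; recompute them from the scalar values
  // saved before the loop, as plain real products.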
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}
#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int32_t k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
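
/* e.g. for a real transform of size N=8, the forward reorder maps fftpack's
   [X0, r1, i1, r2, i2, r3, i3, X4] to pffft's "canonical" order
   [X0, X4, r1, i1, r2, i2, r3, i3] -- DC first, then the Nyquist bin, then
   the complex bins. */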

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int32_t ordered) {
  int32_t Ncvec = setup->Ncvec;
  int32_t nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int32_t stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int32_t ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int32_t k;
    // extra copy required -- this situation should only happen when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int32_t i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
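
/* Usage sketch (see pffft.h for the authoritative API); all buffers must come
   from pffft_aligned_malloc and hold N floats for a PFFFT_REAL transform of
   size N. The names x, X, h, H, Y, y, work are illustrative only:

     PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
     pffft_transform_ordered(s, x, X, work, PFFFT_FORWARD);
     ...
     pffft_transform_ordered(s, X, x, work, PFFFT_BACKWARD);
     // the backward transform is unscaled: divide the result by N to recover x

   Frequency-domain convolution can stay in the unordered (internal) layout:

     pffft_transform(s, x, X, work, PFFFT_FORWARD);
     pffft_transform(s, h, H, work, PFFFT_FORWARD);
     memset(Y, 0, N*sizeof(float));   // zconvolve *accumulates* into Y
     pffft_zconvolve_accumulate(s, X, H, Y, 1.0f/N);
     pffft_transform(s, Y, y, work, PFFFT_BACKWARD);

     pffft_destroy_setup(s); */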