/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified
   in order to take advantage of SIMD instructions of modern CPUs.
*/

/*
  ChangeLog:
  - 2011/10/02, version 1: This is the very first release of this file.
*/

#include "pffft.h"
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>

/* detect compiler flavour */
#if defined(_MSC_VER)
#  define COMPILER_MSVC
#elif defined(__GNUC__)
#  define COMPILER_GCC
#endif

#if defined(COMPILER_GCC)
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#  define VLA_ARRAY_ON_STACK_FREE(varname__)
#elif defined(COMPILER_MSVC)
#  include <malloc.h>
#  define ALWAYS_INLINE(return_type) __forceinline return_type
#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_malloca(size__ * sizeof(type__))
#  define VLA_ARRAY_ON_STACK_FREE(varname__) _freea(varname__)
#endif


/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE
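
/*
  Concretely, each platform block below fulfils the same small contract: a
  4-float v4sf vector type, SIMD_SZ, and the macros VZERO, VMUL, VADD, VMADD,
  VSUB, LD_PS1, INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED.
  The scalar fallback further down is the smallest complete example of that
  contract.
*/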

/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
#  define SIMD_SZ 4
#  define VZERO() ((vector float) vec_splat_u8(0))
#  define VMUL(a,b) vec_madd(a,b, VZERO())
#  define VADD(a,b) vec_add(a,b)
#  define VMADD(a,b,c) vec_madd(a,b,c)
#  define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
#  define LD_PS1(p) ld_ps1(&p)
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
#  define VTRANSPOSE4(x0,x1,x2,x3) {    \
    v4sf y0 = vec_mergeh(x0, x2);       \
    v4sf y1 = vec_mergel(x0, x2);       \
    v4sf y2 = vec_mergeh(x1, x3);       \
    v4sf y3 = vec_mergel(x1, x3);       \
    x0 = vec_mergeh(y0, y2);            \
    x1 = vec_mergel(y0, y2);            \
    x2 = vec_mergeh(y1, y3);            \
    x3 = vec_mergel(y1, y3);            \
  }
#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
#  define VZERO() _mm_setzero_ps()
#  define VMUL(a,b) _mm_mul_ps(a,b)
#  define VADD(a,b) _mm_add_ps(a,b)
#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#  define VSUB(a,b) _mm_sub_ps(a,b)
#  define LD_PS1(p) _mm_set1_ps(p)
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__ARMEL__) || defined(__aarch64__) || defined(_M_ARM64))
#  include <arm_neon.h>
typedef float32x4_t v4sf;
#  define SIMD_SZ 4
#  define VZERO() vdupq_n_f32(0)
#  define VMUL(a,b) vmulq_f32(a,b)
#  define VADD(a,b) vaddq_f32(a,b)
#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
#  define VSUB(a,b) vsubq_f32(a,b)
#  define LD_PS1(p) vld1q_dup_f32(&(p))
#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
#  if !defined(PFFFT_SIMD_DISABLE)
#    warning "building with simd disabled !\n";
#    define PFFFT_SIMD_DISABLE // fallback to scalar code
#  endif
#endif

// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
#  define SIMD_SZ 1
#  define VZERO() 0.f
#  define VMUL(a,b) ((a)*(b))
#  define VADD(a,b) ((a)+(b))
#  define VMADD(a,b,c) ((a)*(b)+(c))
#  define VSUB(a,b) ((a)-(b))
#  define LD_PS1(p) (p)
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
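
/*
  For reference: VCPLXMUL computes the in-place complex product
  (ar + i*ai) * (br + i*bi), i.e. ar' = ar*br - ai*bi and ai' = ai*br + ar*bi,
  while VCPLXMULCONJ multiplies by the conjugate, (ar + i*ai) * (br - i*bi),
  i.e. ar' = ar*br + ai*bi and ai' = ai*br - ar*bi, one SIMD lane at a time.
*/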

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf  v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd() {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE

/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
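
/*
  Usage sketch (hypothetical caller code, not part of this file): any buffer
  handed to the SIMD transforms should come from these helpers, or be 16-byte
  aligned some other way (see VALIGNED):

    float *buf = (float*)pffft_aligned_malloc(N * sizeof(float));
    ...
    pffft_aligned_free(buf);

  pffft_aligned_malloc over-allocates by MALLOC_V4SF_ALIGNMENT bytes, rounds
  the pointer up to the next 64-byte boundary, and stores the original
  malloc() pointer just below the returned address so that pffft_aligned_free
  can recover and free it.
*/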

int pffft_simd_size() { return SIMD_SZ; }

/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
  int k, i;
  int l1ido = l1*ido;
  if (ido <= 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
      ch[0]         = VADD(cc[0], cc[ido+0]);
      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
      ch[1]         = VADD(cc[1], cc[ido+1]);
      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
      for (i=0; i<ido-1; i+=2) {
        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
        VCPLXMUL(tr2, ti2, wr, wi);
        ch[i+l1ido]   = tr2;
        ch[i+l1ido+1] = ti2;
      }
    }
  }
}

/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, float fsign) {
  static const float taur = -0.5f;
  float taui = 0.866025403784439f*fsign;
  int i, k;
  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
  int l1ido = l1*ido;
  float wr1, wi1, wr2, wi2;
  assert(ido > 2);
  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
      cr2 = VADD(cc[i], SVMUL(taur,tr2));
      ch[i] = VADD(cc[i], tr2);
      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
      ci2 = VADD(cc[i+1], SVMUL(taur,ti2));
      ch[i+1] = VADD(cc[i+1], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch[i+l1ido] = dr2;
      ch[i+l1ido + 1] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch[i+2*l1ido] = dr3;
      ch[i+2*l1ido+1] = di3;
    }
  }
} /* passf3 */

static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* isign == -1 for forward transform and +1 for backward transform */

  int i, k;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  int l1ido = l1*ido;
  if (ido == 2) {
    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
      tr1 = VSUB(cc[0], cc[2*ido + 0]);
      tr2 = VADD(cc[0], cc[2*ido + 0]);
      ti1 = VSUB(cc[1], cc[2*ido + 1]);
      ti2 = VADD(cc[1], cc[2*ido + 1]);
      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);

      ch[0*l1ido + 0] = VADD(tr2, tr3);
      ch[0*l1ido + 1] = VADD(ti2, ti3);
      ch[1*l1ido + 0] = VADD(tr1, tr4);
      ch[1*l1ido + 1] = VADD(ti1, ti4);
      ch[2*l1ido + 0] = VSUB(tr2, tr3);
      ch[2*l1ido + 1] = VSUB(ti2, ti3);
      ch[3*l1ido + 0] = VSUB(tr1, tr4);
      ch[3*l1ido + 1] = VSUB(ti1, ti4);
    }
  } else {
    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
      for (i=0; i<ido-1; i+=2) {
        float wr1, wi1, wr2, wi2, wr3, wi3;
        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);

        ch[i] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);
        ch[i + 1] = VADD(ti2, ti3);
        ci3 = VSUB(ti2, ti3);

        cr2 = VADD(tr1, tr4);
        cr4 = VSUB(tr1, tr4);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        wr1=wa1[i], wi1=fsign*wa1[i+1];
        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
        wr2=wa2[i], wi2=fsign*wa2[i+1];
        ch[i + l1ido] = cr2;
        ch[i + l1ido + 1] = ci2;

        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
        wr3=wa3[i], wi3=fsign*wa3[i+1];
        ch[i + 2*l1ido] = cr3;
        ch[i + 2*l1ido + 1] = ci3;

        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
        ch[i + 3*l1ido] = cr4;
        ch[i + 3*l1ido + 1] = ci4;
      }
    }
  }
} /* passf4 */

/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
                                    const float *wa1, const float *wa2,
                                    const float *wa3, const float *wa4, float fsign) {
  static const float tr11 = .309016994374947f;
  const float ti11 = .951056516295154f*fsign;
  static const float tr12 = -.809016994374947f;
  const float ti12 = .587785252292473f*fsign;

  /* Local variables */
  int i, k;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;

  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;

#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]

  assert(ido > 2);
  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
    for (i = 0; i < ido-1; i += 2) {
      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
      ch_ref(i - 1, 2) = dr2;
      ch_ref(i, 2)     = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
      ch_ref(i - 1, 3) = dr3;
      ch_ref(i, 3)     = di3;
      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
      ch_ref(i - 1, 4) = dr4;
      ch_ref(i, 4)     = di4;
      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
      ch_ref(i - 1, 5) = dr5;
      ch_ref(i, 5)     = di5;
    }
  }
#undef ch_ref
#undef cc_ref
}

static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
  static const float minus_one = -1.f;
  int i, k, l1ido = l1*ido;
  for (k=0; k < l1ido; k += ido) {
    v4sf a = cc[k], b = cc[k + l1ido];
    ch[2*k] = VADD(a, b);
    ch[2*(k+ido)-1] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k=0; k < l1ido; k += ido) {
      for (i=2; i<ido; i+=2) {
        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
        v4sf br = cc[i - 1 + k], bi = cc[i + k];
        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i + 2*k] = VADD(bi, ti2);
        ch[2*(k+ido) - i] = VSUB(ti2, bi);
        ch[i - 1 + 2*k] = VADD(br, tr2);
        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k += ido) {
    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
    ch[2*k + ido-1] = cc[k + ido-1];
  }
} /* radf2 */


static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
  static const float minus_two=-2;
  int i, k, l1ido = l1*ido;
  v4sf a,b,c,d, tr2, ti2;
  for (k=0; k < l1ido; k += ido) {
    a = cc[2*k]; b = cc[2*(k+ido) - 1];
    ch[k] = VADD(a, b);
    ch[k + l1ido] = VSUB(a, b);
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      for (i = 2; i < ido; i += 2) {
        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
        ch[i-1 + k] = VADD(a, b);
        tr2 = VSUB(a, b);
        ch[i+0 + k] = VSUB(c, d);
        ti2 = VADD(c, d);
        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
        ch[i-1 + k + l1ido] = tr2;
        ch[i+0 + k + l1ido] = ti2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k = 0; k < l1ido; k += ido) {
    a = cc[2*k + ido-1]; b = cc[2*k + ido];
    ch[k + ido-1] = VADD(a,a);
    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
  }
} /* radb2 */

static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2) {
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  int i, k, ic;
  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
  for (k=0; k<l1; k++) {
    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
    ch[3*k*ido] = VADD(cc[k*ido], cr2);
    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
      VCPLXMULCONJ(dr2, di2, wr1, wi1);

      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
      VCPLXMULCONJ(dr3, di3, wr2, wi2);

      cr2 = VADD(dr2, dr3);
      ci2 = VADD(di2, di3);
      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
      tr3 = SVMUL(taui, VSUB(di2, di3));
      ti3 = SVMUL(taui, VSUB(dr3, dr2));
      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
    }
  }
} /* radf3 */


static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2)
{
  static const float taur = -0.5f;
  static const float taui = 0.866025403784439f;
  static const float taui_2 = 0.866025403784439f*2;
  int i, k, ic;
  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
  for (k=0; k<l1; k++) {
    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
    ch[k*ido] = VADD(cc[3*k*ido], tr2);
    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
    ch[(k + l1)*ido] = VSUB(cr2, ci3);
    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
  }
  if (ido == 1) return;
  for (k=0; k<l1; k++) {
    for (i=2; i<ido; i+=2) {
      ic = ido - i;
      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
      dr2 = VSUB(cr2, ci3);
      dr3 = VADD(cr2, ci3);
      di2 = VADD(ci2, cr3);
      di3 = VSUB(ci2, cr3);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
      ch[i - 1 + (k + l1)*ido] = dr2;
      ch[i + (k + l1)*ido] = di2;
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
      ch[i - 1 + (k + 2*l1)*ido] = dr3;
      ch[i + (k + 2*l1)*ido] = di3;
    }
  }
} /* radb3 */

static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
{
  static const float minus_hsqt2 = (float)-0.7071067811865475;
  int i, k, l1ido = l1*ido;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
    v4sf * RESTRICT ch_ = ch;
    while (cc < cc_end) {
      // this loop represents between 25% and 40% of total radf4_ps cost !
      v4sf a0 = cc[0], a1 = cc[l1ido];
      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
      v4sf tr1 = VADD(a1, a3);
      v4sf tr2 = VADD(a0, a2);
      ch[2*ido-1] = VSUB(a0, a2);
      ch[2*ido  ] = VSUB(a3, a1);
      ch[0      ] = VADD(tr1, tr2);
      ch[4*ido-1] = VSUB(tr2, tr1);
      cc += ido; ch += 4*ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
      for (i=2; i<ido; i += 2, pc += 2) {
        int ic = ido - i;
        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;

        cr2 = pc[1*l1ido+0];
        ci2 = pc[1*l1ido+1];
        wr=LD_PS1(wa1[i - 2]);
        wi=LD_PS1(wa1[i - 1]);
        VCPLXMULCONJ(cr2,ci2,wr,wi);

        cr3 = pc[2*l1ido+0];
        ci3 = pc[2*l1ido+1];
        wr = LD_PS1(wa2[i-2]);
        wi = LD_PS1(wa2[i-1]);
        VCPLXMULCONJ(cr3, ci3, wr, wi);

        cr4 = pc[3*l1ido];
        ci4 = pc[3*l1ido+1];
        wr = LD_PS1(wa3[i-2]);
        wi = LD_PS1(wa3[i-1]);
        VCPLXMULCONJ(cr4, ci4, wr, wi);

        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */

        tr1 = VADD(cr2,cr4);
        tr4 = VSUB(cr4,cr2);
        tr2 = VADD(pc[0],cr3);
        tr3 = VSUB(pc[0],cr3);
        ch[i - 1 + 4*k] = VADD(tr1,tr2);
        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
        ti1 = VADD(ci2,ci4);
        ti4 = VSUB(ci2,ci4);
        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
        ti2 = VADD(pc[1],ci3);
        ti3 = VSUB(pc[1],ci3);
        ch[i + 4*k] = VADD(ti1, ti2);
        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k<l1ido; k += ido) {
    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
    ch[ido-1 + 4*k] = VADD(tr1, c);
    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
    ch[4*k + 1*ido] = VSUB(ti1, d);
    ch[4*k + 3*ido] = VADD(ti1, d);
  }
} /* radf4 */


static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
  static const float minus_sqrt2 = (float)-1.414213562373095;
  static const float two = 2.f;
  int i, k, l1ido = l1*ido;
  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
  {
    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
    v4sf *ch_ = ch;
    while (ch < ch_end) {
      v4sf a = cc[0], b = cc[4*ido-1];
      v4sf c = cc[2*ido], d = cc[2*ido-1];
      tr3 = SVMUL(two,d);
      tr2 = VADD(a,b);
      tr1 = VSUB(a,b);
      tr4 = SVMUL(two,c);
      ch[0*l1ido] = VADD(tr2, tr3);
      ch[2*l1ido] = VSUB(tr2, tr3);
      ch[1*l1ido] = VSUB(tr1, tr4);
      ch[3*l1ido] = VADD(tr1, tr4);

      cc += 4*ido; ch += ido;
    }
    cc = cc_; ch = ch_;
  }
  if (ido < 2) return;
  if (ido != 2) {
    for (k = 0; k < l1ido; k += ido) {
      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
      for (i = 2; i < ido; i += 2) {

        tr1 = VSUB(pc[i], pc[4*ido - i]);
        tr2 = VADD(pc[i], pc[4*ido - i]);
        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
        ph[0] = VADD(tr2, tr3);
        cr3 = VSUB(tr2, tr3);

        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
        cr2 = VSUB(tr1, tr4);
        cr4 = VADD(tr1, tr4);

        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);

        ph[1] = VADD(ti2, ti3); ph += l1ido;
        ci3 = VSUB(ti2, ti3);
        ci2 = VADD(ti1, ti4);
        ci4 = VSUB(ti1, ti4);
        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
        ph[0] = cr2;
        ph[1] = ci2; ph += l1ido;
        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
        ph[0] = cr3;
        ph[1] = ci3; ph += l1ido;
        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
        ph[0] = cr4;
        ph[1] = ci4; ph = ph - 3*l1ido + 2;
      }
    }
    if (ido % 2 == 1) return;
  }
  for (k=0; k < l1ido; k+=ido) {
    int i0 = 4*k + ido;
    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
    tr1 = VSUB(c,d);
    tr2 = VADD(c,d);
    ti1 = VADD(b,a);
    ti2 = VSUB(b,a);
    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
  }
} /* radb4 */

static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  /* System generated locals */
  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
  int idp2;


#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * 6;
  ch -= ch_offset;
  cc_offset = 1 + ido * (1 + l1);
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
    //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
      cr2 = VADD(dr2, dr5);
      ci5 = VSUB(dr5, dr2);
      cr5 = VSUB(di2, di5);
      ci2 = VADD(di2, di5);
      cr3 = VADD(dr3, dr4);
      ci4 = VSUB(dr4, dr3);
      cr4 = VSUB(di3, di4);
      ci3 = VADD(di3, di4);
      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));
      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));
      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
      ch_ref(i, 3, k) = VADD(ti2, ti5);
      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
      ch_ref(i, 5, k) = VADD(ti3, ti4);
      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
    }
  }
#undef cc_ref
#undef ch_ref
} /* radf5 */

static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
  static const float tr11 = .309016994374947f;
  static const float ti11 = .951056516295154f;
  static const float tr12 = -.809016994374947f;
  static const float ti12 = .587785252292473f;

  int cc_offset, ch_offset;

  /* Local variables */
  int i, k, ic;
  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
  int idp2;

#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]

  /* Parameter adjustments */
  ch_offset = 1 + ido * (1 + l1);
  ch -= ch_offset;
  cc_offset = 1 + ido * 6;
  cc -= cc_offset;

  /* Function Body */
  for (k = 1; k <= l1; ++k) {
    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
    ch_ref(1, k, 2) = VSUB(cr2, ci5);
    ch_ref(1, k, 3) = VSUB(cr3, ci4);
    ch_ref(1, k, 4) = VADD(cr3, ci4);
    ch_ref(1, k, 5) = VADD(cr2, ci5);
  }
  if (ido == 1) {
    return;
  }
  idp2 = ido + 2;
  for (k = 1; k <= l1; ++k) {
    for (i = 3; i <= ido; i += 2) {
      ic = idp2 - i;
      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
      dr3 = VSUB(cr3, ci4);
      dr4 = VADD(cr3, ci4);
      di3 = VADD(ci3, cr4);
      di4 = VSUB(ci3, cr4);
      dr5 = VADD(cr2, ci5);
      dr2 = VSUB(cr2, ci5);
      di5 = VSUB(ci2, cr5);
      di2 = VADD(ci2, cr5);
      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));

      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
    }
  }
#undef cc_ref
#undef ch_ref
} /* radb5 */

static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l2 = n;
  int iw = n-1;
  assert(in != out && work1 != work2);
  for (k1 = 1; k1 <= nf; ++k1) {
    int kh = nf - k1;
    int ip = ifac[kh + 2];
    int l1 = l2 / ip;
    int ido = n / l2;
    iw -= (ip - 1)*ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l2 = l1;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
} /* rfftf1 */

static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
                                      const float *wa, const int *ifac) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out);
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1 + 1];
    int l2 = ip*l1;
    int ido = n / l2;
    switch (ip) {
      case 5: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
      } break;
      case 4: {
        int ix2 = iw + ido;
        int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
      } break;
      case 3: {
        int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
      } break;
      case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
        break;
      default:
        assert(0);
        break;
    }
    l1 = l2;
    iw += (ip - 1)*ido;

    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }
  return in; /* this is in fact the output .. */
}

static int decompose(int n, int *ifac, const int *ntryh) {
  int nl = n, nf = 0, i, j = 0;
  for (j=0; ntryh[j]; ++j) {
    int ntry = ntryh[j];
    while (nl != 1) {
      int nq = nl / ntry;
      int nr = nl - ntry * nq;
      if (nr == 0) {
        ifac[2+nf++] = ntry;
        nl = nq;
        if (ntry == 2 && nf != 1) {
          for (i = 2; i <= nf; ++i) {
            int ib = nf - i + 2;
            ifac[ib + 1] = ifac[ib];
          }
          ifac[2] = 2;
        }
      } else break;
    }
  }
  ifac[0] = n;
  ifac[1] = nf;
  return nf;
}
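
/*
  Worked example (illustration only, not code from the original file): with
  the real-FFT radix preference ntryh = {4,2,3,5,0}, decompose(240, ifac,
  ntryh) yields nf = 4 and ifac = {240, 4, 4, 4, 3, 5}, i.e. 240 = 4*4*3*5.
  The branch above that reshuffles ifac moves a factor of 2 to the front of
  the factor list, matching the original FFTPACK convention.
*/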



static void rffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 4,2,3,5,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI) / n;
  int is = 0;
  int nfm1 = nf - 1;
  int l1 = 1;
  for (k1 = 1; k1 <= nfm1; k1++) {
    int ip = ifac[k1 + 1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int ipm = ip - 1;
    for (j = 1; j <= ipm; ++j) {
      float argld;
      int i = is, fi=0;
      ld += l1;
      argld = ld*argh;
      for (ii = 3; ii <= ido; ii += 2) {
        i += 2;
        fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
      }
      is += ido;
    }
    l1 = l2;
  }
} /* rffti1 */

void cffti1_ps(int n, float *wa, int *ifac)
{
  static const int ntryh[] = { 5,3,4,2,0 };
  int k1, j, ii;

  int nf = decompose(n,ifac,ntryh);
  float argh = (2*M_PI)/(float)n;
  int i = 1;
  int l1 = 1;
  for (k1=1; k1<=nf; k1++) {
    int ip = ifac[k1+1];
    int ld = 0;
    int l2 = l1*ip;
    int ido = n / l2;
    int idot = ido + ido + 2;
    int ipm = ip - 1;
    for (j=1; j<=ipm; j++) {
      float argld;
      int i1 = i, fi = 0;
      wa[i-1] = 1;
      wa[i] = 0;
      ld += l1;
      argld = ld*argh;
      for (ii = 4; ii <= idot; ii += 2) {
        i += 2;
        fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i]   = sin(fi*argld);
      }
      if (ip > 5) {
        wa[i1-1] = wa[i-1];
        wa[i1]   = wa[i];
      }
    }
    l1 = l2;
  }
} /* cffti1 */


v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
  v4sf *in  = (v4sf*)input_readonly;
  v4sf *out = (in == work2 ? work1 : work2);
  int nf = ifac[1], k1;
  int l1 = 1;
  int iw = 0;
  assert(in != out && work1 != work2);
  for (k1=2; k1<=nf+1; k1++) {
    int ip = ifac[k1];
    int l2 = ip*l1;
    int ido = n / l2;
    int idot = ido + ido;
    switch (ip) {
      case 5: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
      } break;
      case 4: {
        int ix2 = iw + idot;
        int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
      } break;
      case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
      } break;
      case 3: {
        int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
      } break;
      default:
        assert(0);
    }
    l1 = l2;
    iw += (ip - 1)*idot;
    if (out == work2) {
      out = work1; in = work2;
    } else {
      out = work2; in = work1;
    }
  }

  return in; /* this is in fact the output .. */
}


struct PFFFT_Setup {
  int     N;
  int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
  int ifac[15];
  pffft_transform_t transform;
  v4sf *data;     // allocated room for twiddle coefs
  float *e;       // points into 'data', N/4*3 elements
  float *twiddle; // points into 'data', N/4 elements
};

PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
  //assert((N % 32) == 0);
  s->N = N;
  s->transform = transform;
  /* nb of complex simd vectors */
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
  s->e = (float*)s->data;
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);

  if (transform == PFFFT_REAL) {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
      }
    }
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
      }
    }
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }

  /* check that N is decomposable with allowed prime factors */
  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
  if (m != N/SIMD_SZ) {
    pffft_destroy_setup(s); s = 0;
  }

  return s;
}
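
/*
  Usage sketch (hypothetical caller code, relying on the public declarations
  in pffft.h):

    PFFFT_Setup *s = pffft_new_setup(512, PFFFT_REAL); // 512 is a multiple of 32
    float *in  = (float*)pffft_aligned_malloc(512 * sizeof(float));
    float *out = (float*)pffft_aligned_malloc(512 * sizeof(float));
    // ... fill in[] with 512 samples ...
    pffft_transform_ordered(s, in, out, 0, PFFFT_FORWARD);
    pffft_aligned_free(in);
    pffft_aligned_free(out);
    pffft_destroy_setup(s);

  Note that pffft_new_setup returns 0 when N/SIMD_SZ cannot be decomposed
  into factors of 2, 3 and 5, so callers should check the returned pointer.
*/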


void pffft_destroy_setup(PFFFT_Setup *s) {
  pffft_aligned_free(s->data);
  free(s);
}

#if !defined(PFFFT_SIMD_DISABLE)

/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
  v4sf g0, g1;
  int k;
  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;

  *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
  for (k=1; k < N; ++k) {
    v4sf h0, h1;
    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
    *--out = VSWAPHL(g1, h0);
    *--out = VSWAPHL(h0, h1);
    g1 = h1;
  }
  *--out = VSWAPHL(g1, g0);
}

static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
  v4sf g0, g1, h0, h1;
  int k;
  g0 = g1 = in[0]; ++in;
  for (k=1; k < N; ++k) {
    h0 = *in++; h1 = *in++;
    g1 = VSWAPHL(g1, h0);
    h0 = VSWAPHL(h0, h1);
    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
    g1 = h1;
  }
  h0 = *in++; h1 = g0;
  g1 = VSWAPHL(g1, h0);
  h0 = VSWAPHL(h0, h1);
  UNINTERLEAVE2(h0, g1, out[0], out[1]);
}

void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N, Ncvec = setup->Ncvec;
  const v4sf *vin = (const v4sf*)in;
  v4sf *vout = (v4sf*)out;
  assert(in != out);
  if (setup->transform == PFFFT_REAL) {
    int k, dk = N/32;
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < dk; ++k) {
        INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
        INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
      }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
    } else {
      for (k=0; k < dk; ++k) {
        UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
        UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
      }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
    }
  } else {
    if (direction == PFFFT_FORWARD) {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
      }
    } else {
      for (k=0; k < Ncvec; ++k) {
        int kk = (k/4) + (k%4)*(Ncvec/4);
        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
      }
    }
  }
}

void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];
    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);
    VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    /*
      transformation for each column is:

      [1  1  1  1  0  0  0  0]   [r0]
      [1  0 -1  0  0 -1  0  1]   [r1]
      [1 -1  1 -1  0  0  0  0]   [r2]
      [1  0 -1  0  0  1  0 -1]   [r3]
      [0  0  0  0  1  1  1  1] * [i0]
      [0  1  0 -1  1  0 -1  0]   [i1]
      [0  0  0  0  1 -1  1 -1]   [i2]
      [0 -1  0  1  1  0 -1  0]   [i3]
    */

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}

void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);

    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}


static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                                                   const v4sf *e, v4sf *out) {
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0; i0 = *in1;
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1  1  1  1  0  0  0  0]   [r0]
    [1  0 -1  0  0 -1  0  1]   [r1]
    [1  0 -1  0  0  1  0 -1]   [r2]
    [1 -1  1 -1  0  0  0  0]   [r3]
    [0  0  0  0  1  1  1  1] * [i0]
    [0 -1  0  1 -1  0  1  0]   [i1]
    [0 -1  0  1  1  0 -1  0]   [i2]
    [0  0  0  0 -1  1 -1  1]   [i3]
  */

  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = M_SQRT2/2;

  cr.v = in[0]; ci.v = in[Ncvec*2-1];
  assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)   ] [1  1  1  1  0  0  0  0]
    [Xr(N/4) ] [0  0  0  0  1  s  0 -s]
    [Xr(N/2) ] [1  0 -1  0  0  0  0  0]
    [Xr(3N/4)] [0  0  0  0  1 -s  0  s]
    [Xi(1)   ] [1 -1  1 -1  0  0  0  0]
    [Xi(N/4) ] [0  0  0  0  0 -s -1 -s]
    [Xi(N/2) ] [0 -1  0  1  0  0  0  0]
    [Xi(3N/4)] [0  0  0  0  0 -s  1 -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
    save = save_next;
  }
}

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                                     const v4sf *e, v4sf *out, int first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1  1  1  1  0  0  0  0]   [r0]
    [1  0  0 -1  0 -1 -1  0]   [r1]
    [1 -1 -1  1  0  0  0  0]   [r2]
    [1  0  0 -1  0  1  1  0]   [r3]
    [0  0  0  0  1 -1  1 -1] * [i0]
    [0 -1  1  0  1  0  0  1]   [i1]
    [0  0  0  0  1  1 -1 -1]   [i2]
    [0  1 -1  0  1  0  0  1]   [i3]
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1  0  2  0  1  0  0  0]
    [cr1] [1  0  0  0 -1  0 -2  0]
    [cr2] [1  0 -2  0  1  0  0  0]
    [cr3] [1  0  0  0 -1  0  2  0]
    [ci0] [0  2  0  2  0  0  0  0]
    [ci1] [0  s  0 -s  0 -s  0 -s]
    [ci2] [0  0  0  0  0 -2  0  2]
    [ci3] [0 -s  0  s  0 -s  0 -s]
  */
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}
1600
1601
pffft_transform_internal(PFFFT_Setup * setup,const float * finput,float * foutput,v4sf * scratch,pffft_direction_t direction,int ordered)1602 void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
1603 pffft_direction_t direction, int ordered) {
1604 int k, Ncvec = setup->Ncvec;
1605 int nf_odd = (setup->ifac[1] & 1);
1606
1607 // temporary buffer is allocated on the stack if the scratch pointer is NULL
1608 int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
1609 VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
1610
1611 const v4sf *vinput = (const v4sf*)finput;
1612 v4sf *voutput = (v4sf*)foutput;
1613 v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
1614 int ib = (nf_odd ^ ordered ? 1 : 0);
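  /* the fftpack passes ping-pong between buff[0] and buff[1] (one swap per
     factor of N), and the optional reordering pass adds one more swap; ib
     picks the starting buffer so that the last pass lands in voutput whenever
     possible, sparing the final copy at the end of this function. */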

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);

  VLA_ARRAY_ON_STACK_FREE(scratch_on_stack);
}

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_NEON_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_NEON_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov         r8, %2                \n"
               "vdup.f32    q15, %4               \n"
               "1:                                \n"
               "pld         [%0,#64]              \n"
               "pld         [%1,#64]              \n"
               "pld         [%2,#64]              \n"
               "pld         [%0,#96]              \n"
               "pld         [%1,#96]              \n"
               "pld         [%2,#96]              \n"
               "vld1.f32    {q0,q1},  [%0,:128]!  \n"
               "vld1.f32    {q4,q5},  [%1,:128]!  \n"
               "vld1.f32    {q2,q3},  [%0,:128]!  \n"
               "vld1.f32    {q6,q7},  [%1,:128]!  \n"
               "vld1.f32    {q8,q9},  [r8,:128]!  \n"

               "vmul.f32    q10, q0, q4           \n"
               "vmul.f32    q11, q0, q5           \n"
               "vmul.f32    q12, q2, q6           \n"
               "vmul.f32    q13, q2, q7           \n"
               "vmls.f32    q10, q1, q5           \n"
               "vmla.f32    q11, q1, q4           \n"
               "vld1.f32    {q0,q1},  [r8,:128]!  \n"
               "vmls.f32    q12, q3, q7           \n"
               "vmla.f32    q13, q3, q6           \n"
               "vmla.f32    q8,  q10, q15         \n"
               "vmla.f32    q9,  q11, q15         \n"
               "vmla.f32    q0,  q12, q15         \n"
               "vmla.f32    q1,  q13, q15         \n"
               "vst1.f32    {q8,q9},  [%2,:128]!  \n"
               "vst1.f32    {q0,q1},  [%2,:128]!  \n"
               "subs        %3, #2                \n"
               "bne         1b                    \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N)
               : "r"(scaling)
               : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9",
                 "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    // the complex loop above mixed up the real-only DC and Nyquist bins (the
    // first scalar slot of the first two vectors); patch them up as pure
    // real products using the values saved before the loop
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}
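
/* Usage sketch (illustrative only, kept out of the build): FFT-domain
   convolution of two real signals with pffft_zconvolve_accumulate. It assumes
   the public pffft.h entry points and n-float, 16-byte-aligned buffers (e.g.
   from pffft_aligned_malloc); convolve_sketch itself is a hypothetical
   helper, not part of the library. */
#if 0
static void convolve_sketch(PFFFT_Setup *s, int n,
                            const float *x, const float *y,
                            float *X, float *Y, float *out) {
  int k;
  /* forward transforms, left in the internal (unordered) frequency layout,
     which is exactly what pffft_zconvolve_accumulate expects */
  pffft_transform(s, x, X, 0, PFFFT_FORWARD);
  pffft_transform(s, y, Y, 0, PFFFT_FORWARD);
  /* the convolution is accumulated into out, so clear it first */
  for (k = 0; k < n; ++k) out[k] = 0;
  /* out += (X*Y)*scaling; the 1/n factor compensates for the unnormalized
     forward/backward round trip */
  pffft_zconvolve_accumulate(s, X, Y, out, 1.0f/n);
  /* back to the time domain: out now holds the circular convolution of x and y */
  pffft_transform(s, out, out, 0, PFFFT_BACKWARD);
}
#endif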


#else // defined(PFFFT_SIMD_DISABLE)

// standard routines using scalar floats, without any SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
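/* concretely, for N=8 with PFFFT_REAL, the forward direction maps the fftpack
   layout  [ r0, r1, i1, r2, i2, r3, i3, r4 ]  to the canonical layout
   [ r0, r4, r1, i1, r2, i2, r3, i3 ]: the real-only Nyquist coefficient r4
   moves next to the DC term, and PFFFT_BACKWARD undoes exactly this shuffle. */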

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // a complex transform is already in the ordered layout
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should happen only when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);

  VLA_ARRAY_ON_STACK_FREE(scratch_on_stack);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering: the DC and Nyquist bins are real-only,
    // so handle them separately and skip them in the complex loop below
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
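
/* Usage sketch (illustrative only, kept out of the build): an ordered
   forward/backward round trip. All functions called below are the public
   pffft.h entry points; roundtrip_sketch itself is a hypothetical example.
   The transforms are unnormalized, so the round trip scales the signal by N. */
#if 0
static void roundtrip_sketch(void) {
  int k, N = 1024; /* any size accepted by pffft_new_setup */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *x = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *X = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *y = (float*)pffft_aligned_malloc(N * sizeof(float));
  for (k = 0; k < N; ++k) x[k] = (float)sin(2*M_PI*k/N);
  /* ordered layout: X[0]=DC, X[1]=Nyquist, then Re/Im pairs of bins 1..N/2-1 */
  pffft_transform_ordered(s, x, X, 0, PFFFT_FORWARD);
  pffft_transform_ordered(s, X, y, 0, PFFFT_BACKWARD);
  for (k = 0; k < N; ++k) y[k] /= N; /* now y ~= x */
  pffft_aligned_free(y); pffft_aligned_free(X); pffft_aligned_free(x);
  pffft_destroy_setup(s);
}
#endif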