/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB
   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.


   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of SIMD instructions of modern CPUs.
*/

/*
  ChangeLog:
  - 2011/10/02, version 1: This is the very first release of this file.
*/

#include "pffft.h"
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>

/* detect compiler flavour */
#if defined(_MSC_VER)
#  define COMPILER_MSVC
#elif defined(__GNUC__)
#  define COMPILER_GCC
#endif

#if defined(COMPILER_GCC)
#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#  define VLA_ARRAY_ON_STACK_FREE(varname__)
#elif defined(COMPILER_MSVC)
#include <malloc.h>
#  define ALWAYS_INLINE(return_type) __forceinline return_type
#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
#  define RESTRICT __restrict
#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_malloca(size__ * sizeof(type__))
#  define VLA_ARRAY_ON_STACK_FREE(varname__) _freea(varname__)
#endif


/*
   vector support macros: the rest of the code is independent of
   SSE/Altivec/NEON -- adding support for other platforms with 4-element
   vectors should be limited to these macros
*/


// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
//#define PFFFT_SIMD_DISABLE

/*
   Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
typedef vector float v4sf;
#  define SIMD_SZ 4
#  define VZERO() ((vector float) vec_splat_u8(0))
#  define VMUL(a,b) vec_madd(a,b, VZERO())
#  define VADD(a,b) vec_add(a,b)
#  define VMADD(a,b,c) vec_madd(a,b,c)
#  define VSUB(a,b) vec_sub(a,b)
inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
#  define LD_PS1(p) ld_ps1(&p)
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
  }
#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
    v4sf y0 = vec_mergeh(x0, x2);               \
    v4sf y1 = vec_mergel(x0, x2);               \
    v4sf y2 = vec_mergeh(x1, x3);               \
    v4sf y3 = vec_mergel(x1, x3);               \
    x0 = vec_mergeh(y0, y2);                    \
    x1 = vec_mergel(y0, y2);                    \
    x2 = vec_mergeh(y1, y3);                    \
    x3 = vec_mergel(y1, y3);                    \
  }
#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
#  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
#  define VZERO() _mm_setzero_ps()
#  define VMUL(a,b) _mm_mul_ps(a,b)
#  define VADD(a,b) _mm_add_ps(a,b)
#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
#  define VSUB(a,b) _mm_sub_ps(a,b)
#  define LD_PS1(p) _mm_set1_ps(p)
#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__ARMEL__) || defined(__aarch64__) || defined(_M_ARM64))
#  include <arm_neon.h>
typedef float32x4_t v4sf;
#  define SIMD_SZ 4
#  define VZERO() vdupq_n_f32(0)
#  define VMUL(a,b) vmulq_f32(a,b)
#  define VADD(a,b) vaddq_f32(a,b)
#  define VMADD(a,b,c) vmlaq_f32(c,a,b)
#  define VSUB(a,b) vsubq_f32(a,b)
#  define LD_PS1(p) vld1q_dup_f32(&(p))
#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version
//#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
#  if !defined(PFFFT_SIMD_DISABLE)
#    warning "building with simd disabled !\n";
#    define PFFFT_SIMD_DISABLE // fallback to scalar code
#  endif
#endif

// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
#  define SIMD_SZ 1
#  define VZERO() 0.f
#  define VMUL(a,b) ((a)*(b))
#  define VADD(a,b) ((a)+(b))
#  define VMADD(a,b,c) ((a)*(b)+(c))
#  define VSUB(a,b) ((a)-(b))
#  define LD_PS1(p) (p)
#  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar with a vector
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif
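
/* The two macros above implement an in-place complex multiply per SIMD lane.
   As a scalar sketch (illustration only, not used by the library), with
   a = ar + i*ai and b = br + i*bi:

     static inline void cplxmul(float *ar, float *ai, float br, float bi) {
       float tmp = *ar * bi;        // save ar*bi before ar is overwritten
       *ar = *ar * br - *ai * bi;   // Re(a*b)
       *ai = *ai * br + tmp;        // Im(a*b)
     }

   VCPLXMULCONJ is the same operation with b conjugated, i.e.
   ar' = ar*br + ai*bi and ai' = ai*br - ar*bi. */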

#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
  v4sf  v;
  float f[4];
} v4sf_union;

#include <string.h>

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
void validate_pffft_simd() {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
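
/* Sketch of how the self-test above can be used (it is not called by the
   library itself): invoke it once at startup, e.g. in a debug build; it
   aborts through assert() if any of the SIMD macros is miscompiled:

     validate_pffft_simd();
*/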

/* SSE and co like 16-bytes aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}

int pffft_simd_size() { return SIMD_SZ; }
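
/* Usage sketch (illustration only): buffers handed to the SIMD code paths
   must satisfy VALIGNED(), which pffft_aligned_malloc() guarantees since it
   aligns on MALLOC_V4SF_ALIGNMENT (64) bytes:

     float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... use buf ...
     pffft_aligned_free(buf);

   Only pointers obtained from pffft_aligned_malloc() may be released with
   pffft_aligned_free(), because the original malloc() pointer is stored just
   before the aligned address. */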
262 
/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
267   int k, i;
268   int l1ido = l1*ido;
269   if (ido <= 2) {
270     for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
271       ch[0]         = VADD(cc[0], cc[ido+0]);
272       ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
273       ch[1]         = VADD(cc[1], cc[ido+1]);
274       ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
275     }
276   } else {
277     for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
278       for (i=0; i<ido-1; i+=2) {
279         v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
280         v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
281         v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
282         ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
283         ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
284         VCPLXMUL(tr2, ti2, wr, wi);
285         ch[i+l1ido]   = tr2;
286         ch[i+l1ido+1] = ti2;
287       }
288     }
289   }
290 }
291 
/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
296                                     const float *wa1, const float *wa2, float fsign) {
297   static const float taur = -0.5f;
298   float taui = 0.866025403784439f*fsign;
299   int i, k;
300   v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
301   int l1ido = l1*ido;
302   float wr1, wi1, wr2, wi2;
303   assert(ido > 2);
304   for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
305     for (i=0; i<ido-1; i+=2) {
306       tr2 = VADD(cc[i+ido], cc[i+2*ido]);
307       cr2 = VADD(cc[i], SVMUL(taur,tr2));
308       ch[i]    = VADD(cc[i], tr2);
309       ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
310       ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
311       ch[i+1]  = VADD(cc[i+1], ti2);
312       cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
313       ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
314       dr2 = VSUB(cr2, ci3);
315       dr3 = VADD(cr2, ci3);
316       di2 = VADD(ci2, cr3);
317       di3 = VSUB(ci2, cr3);
318       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
319       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
320       ch[i+l1ido] = dr2;
321       ch[i+l1ido + 1] = di2;
322       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
323       ch[i+2*l1ido] = dr3;
324       ch[i+2*l1ido+1] = di3;
325     }
326   }
327 } /* passf3 */
328 
static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
330                                     const float *wa1, const float *wa2, const float *wa3, float fsign) {
331   /* isign == -1 for forward transform and +1 for backward transform */
332 
333   int i, k;
334   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
335   int l1ido = l1*ido;
336   if (ido == 2) {
337     for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
338       tr1 = VSUB(cc[0], cc[2*ido + 0]);
339       tr2 = VADD(cc[0], cc[2*ido + 0]);
340       ti1 = VSUB(cc[1], cc[2*ido + 1]);
341       ti2 = VADD(cc[1], cc[2*ido + 1]);
342       ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
343       tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
344       tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
345       ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
346 
347       ch[0*l1ido + 0] = VADD(tr2, tr3);
348       ch[0*l1ido + 1] = VADD(ti2, ti3);
349       ch[1*l1ido + 0] = VADD(tr1, tr4);
350       ch[1*l1ido + 1] = VADD(ti1, ti4);
351       ch[2*l1ido + 0] = VSUB(tr2, tr3);
352       ch[2*l1ido + 1] = VSUB(ti2, ti3);
353       ch[3*l1ido + 0] = VSUB(tr1, tr4);
354       ch[3*l1ido + 1] = VSUB(ti1, ti4);
355     }
356   } else {
357     for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
358       for (i=0; i<ido-1; i+=2) {
359         float wr1, wi1, wr2, wi2, wr3, wi3;
360         tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
361         tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
362         ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
363         ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
364         tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
365         ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
366         tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
367         ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
368 
369         ch[i] = VADD(tr2, tr3);
370         cr3    = VSUB(tr2, tr3);
371         ch[i + 1] = VADD(ti2, ti3);
372         ci3 = VSUB(ti2, ti3);
373 
374         cr2 = VADD(tr1, tr4);
375         cr4 = VSUB(tr1, tr4);
376         ci2 = VADD(ti1, ti4);
377         ci4 = VSUB(ti1, ti4);
378         wr1=wa1[i], wi1=fsign*wa1[i+1];
379         VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
380         wr2=wa2[i], wi2=fsign*wa2[i+1];
381         ch[i + l1ido] = cr2;
382         ch[i + l1ido + 1] = ci2;
383 
384         VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
385         wr3=wa3[i], wi3=fsign*wa3[i+1];
386         ch[i + 2*l1ido] = cr3;
387         ch[i + 2*l1ido + 1] = ci3;
388 
389         VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
390         ch[i + 3*l1ido] = cr4;
391         ch[i + 3*l1ido + 1] = ci4;
392       }
393     }
394   }
395 } /* passf4 */
396 
/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
401                                     const float *wa1, const float *wa2,
402                                     const float *wa3, const float *wa4, float fsign) {
403   static const float tr11 = .309016994374947f;
404   const float ti11 = .951056516295154f*fsign;
405   static const float tr12 = -.809016994374947f;
406   const float ti12 = .587785252292473f*fsign;
407 
408   /* Local variables */
409   int i, k;
410   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
411     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
412 
413   float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
414 
415 #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
416 #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
417 
418   assert(ido > 2);
419   for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
420     for (i = 0; i < ido-1; i += 2) {
421       ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
422       ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
423       ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
424       ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
425       tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
426       tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
427       tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
428       tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
429       ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
430       ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
431       cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
432       ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
433       cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
434       ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
435       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
436       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
437       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
438       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
439       dr3 = VSUB(cr3, ci4);
440       dr4 = VADD(cr3, ci4);
441       di3 = VADD(ci3, cr4);
442       di4 = VSUB(ci3, cr4);
443       dr5 = VADD(cr2, ci5);
444       dr2 = VSUB(cr2, ci5);
445       di5 = VSUB(ci2, cr5);
446       di2 = VADD(ci2, cr5);
447       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
448       wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
449       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
450       ch_ref(i - 1, 2) = dr2;
451       ch_ref(i, 2)     = di2;
452       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
453       ch_ref(i - 1, 3) = dr3;
454       ch_ref(i, 3)     = di3;
455       VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
456       ch_ref(i - 1, 4) = dr4;
457       ch_ref(i, 4)     = di4;
458       VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
459       ch_ref(i - 1, 5) = dr5;
460       ch_ref(i, 5)     = di5;
461     }
462   }
463 #undef ch_ref
464 #undef cc_ref
465 }
466 
static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
468   static const float minus_one = -1.f;
469   int i, k, l1ido = l1*ido;
470   for (k=0; k < l1ido; k += ido) {
471     v4sf a = cc[k], b = cc[k + l1ido];
472     ch[2*k] = VADD(a, b);
473     ch[2*(k+ido)-1] = VSUB(a, b);
474   }
475   if (ido < 2) return;
476   if (ido != 2) {
477     for (k=0; k < l1ido; k += ido) {
478       for (i=2; i<ido; i+=2) {
479         v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
480         v4sf br = cc[i - 1 + k], bi = cc[i + k];
481         VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
482         ch[i + 2*k] = VADD(bi, ti2);
483         ch[2*(k+ido) - i] = VSUB(ti2, bi);
484         ch[i - 1 + 2*k] = VADD(br, tr2);
485         ch[2*(k+ido) - i -1] = VSUB(br, tr2);
486       }
487     }
488     if (ido % 2 == 1) return;
489   }
490   for (k=0; k < l1ido; k += ido) {
491     ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
492     ch[2*k + ido-1] = cc[k + ido-1];
493   }
494 } /* radf2 */
495 
496 
static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
498   static const float minus_two=-2;
499   int i, k, l1ido = l1*ido;
500   v4sf a,b,c,d, tr2, ti2;
501   for (k=0; k < l1ido; k += ido) {
502     a = cc[2*k]; b = cc[2*(k+ido) - 1];
503     ch[k] = VADD(a, b);
504     ch[k + l1ido] =VSUB(a, b);
505   }
506   if (ido < 2) return;
507   if (ido != 2) {
508     for (k = 0; k < l1ido; k += ido) {
509       for (i = 2; i < ido; i += 2) {
510         a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
511         c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
512         ch[i-1 + k] = VADD(a, b);
513         tr2 = VSUB(a, b);
514         ch[i+0 + k] = VSUB(c, d);
515         ti2 = VADD(c, d);
516         VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
517         ch[i-1 + k + l1ido] = tr2;
518         ch[i+0 + k + l1ido] = ti2;
519       }
520     }
521     if (ido % 2 == 1) return;
522   }
523   for (k = 0; k < l1ido; k += ido) {
524     a = cc[2*k + ido-1]; b = cc[2*k + ido];
525     ch[k + ido-1] = VADD(a,a);
526     ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
527   }
528 } /* radb2 */
529 
static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
531                      const float *wa1, const float *wa2) {
532   static const float taur = -0.5f;
533   static const float taui = 0.866025403784439f;
534   int i, k, ic;
535   v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
536   for (k=0; k<l1; k++) {
537     cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
538     ch[3*k*ido] = VADD(cc[k*ido], cr2);
539     ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
540     ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
541   }
542   if (ido == 1) return;
543   for (k=0; k<l1; k++) {
544     for (i=2; i<ido; i+=2) {
545       ic = ido - i;
546       wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
547       dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
548       VCPLXMULCONJ(dr2, di2, wr1, wi1);
549 
550       wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
551       dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
552       VCPLXMULCONJ(dr3, di3, wr2, wi2);
553 
554       cr2 = VADD(dr2, dr3);
555       ci2 = VADD(di2, di3);
556       ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
557       ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
558       tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
559       ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
560       tr3 = SVMUL(taui, VSUB(di2, di3));
561       ti3 = SVMUL(taui, VSUB(dr3, dr2));
562       ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
563       ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
564       ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
565       ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
566     }
567   }
568 } /* radf3 */
569 
570 
static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
572                      const float *wa1, const float *wa2)
573 {
574   static const float taur = -0.5f;
575   static const float taui = 0.866025403784439f;
576   static const float taui_2 = 0.866025403784439f*2;
577   int i, k, ic;
578   v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
579   for (k=0; k<l1; k++) {
580     tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
581     cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
582     ch[k*ido] = VADD(cc[3*k*ido], tr2);
583     ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
584     ch[(k + l1)*ido] = VSUB(cr2, ci3);
585     ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
586   }
587   if (ido == 1) return;
588   for (k=0; k<l1; k++) {
589     for (i=2; i<ido; i+=2) {
590       ic = ido - i;
591       tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
592       cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
593       ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
594       ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
595       ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
596       ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
597       cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
598       ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
599       dr2 = VSUB(cr2, ci3);
600       dr3 = VADD(cr2, ci3);
601       di2 = VADD(ci2, cr3);
602       di3 = VSUB(ci2, cr3);
603       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
604       ch[i - 1 + (k + l1)*ido] = dr2;
605       ch[i + (k + l1)*ido] = di2;
606       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
607       ch[i - 1 + (k + 2*l1)*ido] = dr3;
608       ch[i + (k + 2*l1)*ido] = di3;
609     }
610   }
611 } /* radb3 */
612 
static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
614                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
615 {
616   static const float minus_hsqt2 = (float)-0.7071067811865475;
617   int i, k, l1ido = l1*ido;
618   {
619     const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
620     v4sf * RESTRICT ch_ = ch;
621     while (cc < cc_end) {
622       // this loop represents between 25% and 40% of total radf4_ps cost !
623       v4sf a0 = cc[0], a1 = cc[l1ido];
624       v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
625       v4sf tr1 = VADD(a1, a3);
626       v4sf tr2 = VADD(a0, a2);
627       ch[2*ido-1] = VSUB(a0, a2);
628       ch[2*ido  ] = VSUB(a3, a1);
629       ch[0      ] = VADD(tr1, tr2);
630       ch[4*ido-1] = VSUB(tr2, tr1);
631       cc += ido; ch += 4*ido;
632     }
633     cc = cc_; ch = ch_;
634   }
635   if (ido < 2) return;
636   if (ido != 2) {
637     for (k = 0; k < l1ido; k += ido) {
638       const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
639       for (i=2; i<ido; i += 2, pc += 2) {
640         int ic = ido - i;
641         v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
642         v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
643 
644         cr2 = pc[1*l1ido+0];
645         ci2 = pc[1*l1ido+1];
646         wr=LD_PS1(wa1[i - 2]);
647         wi=LD_PS1(wa1[i - 1]);
648         VCPLXMULCONJ(cr2,ci2,wr,wi);
649 
650         cr3 = pc[2*l1ido+0];
651         ci3 = pc[2*l1ido+1];
652         wr = LD_PS1(wa2[i-2]);
653         wi = LD_PS1(wa2[i-1]);
654         VCPLXMULCONJ(cr3, ci3, wr, wi);
655 
656         cr4 = pc[3*l1ido];
657         ci4 = pc[3*l1ido+1];
658         wr = LD_PS1(wa3[i-2]);
659         wi = LD_PS1(wa3[i-1]);
660         VCPLXMULCONJ(cr4, ci4, wr, wi);
661 
662         /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
663 
664         tr1 = VADD(cr2,cr4);
665         tr4 = VSUB(cr4,cr2);
666         tr2 = VADD(pc[0],cr3);
667         tr3 = VSUB(pc[0],cr3);
668         ch[i - 1 + 4*k] = VADD(tr1,tr2);
669         ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
670         ti1 = VADD(ci2,ci4);
671         ti4 = VSUB(ci2,ci4);
672         ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
673         ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
674         ti2 = VADD(pc[1],ci3);
675         ti3 = VSUB(pc[1],ci3);
676         ch[i + 4*k] = VADD(ti1, ti2);
677         ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
678         ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
679         ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
680       }
681     }
682     if (ido % 2 == 1) return;
683   }
684   for (k=0; k<l1ido; k += ido) {
685     v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
686     v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
687     v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
688     v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
689     ch[ido-1 + 4*k] = VADD(tr1, c);
690     ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
691     ch[4*k + 1*ido] = VSUB(ti1, d);
692     ch[4*k + 3*ido] = VADD(ti1, d);
693   }
694 } /* radf4 */
695 
696 
static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
698                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
699 {
700   static const float minus_sqrt2 = (float)-1.414213562373095;
701   static const float two = 2.f;
702   int i, k, l1ido = l1*ido;
703   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
704   {
705     const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
706     v4sf *ch_ = ch;
707     while (ch < ch_end) {
708       v4sf a = cc[0], b = cc[4*ido-1];
709       v4sf c = cc[2*ido], d = cc[2*ido-1];
710       tr3 = SVMUL(two,d);
711       tr2 = VADD(a,b);
712       tr1 = VSUB(a,b);
713       tr4 = SVMUL(two,c);
714       ch[0*l1ido] = VADD(tr2, tr3);
715       ch[2*l1ido] = VSUB(tr2, tr3);
716       ch[1*l1ido] = VSUB(tr1, tr4);
717       ch[3*l1ido] = VADD(tr1, tr4);
718 
719       cc += 4*ido; ch += ido;
720     }
721     cc = cc_; ch = ch_;
722   }
723   if (ido < 2) return;
724   if (ido != 2) {
725     for (k = 0; k < l1ido; k += ido) {
726       const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
727       v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
728       for (i = 2; i < ido; i += 2) {
729 
730         tr1 = VSUB(pc[i], pc[4*ido - i]);
731         tr2 = VADD(pc[i], pc[4*ido - i]);
732         ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
733         tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
734         ph[0] = VADD(tr2, tr3);
735         cr3 = VSUB(tr2, tr3);
736 
737         ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
738         tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
739         cr2 = VSUB(tr1, tr4);
740         cr4 = VADD(tr1, tr4);
741 
742         ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
743         ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
744 
745         ph[1] = VADD(ti2, ti3); ph += l1ido;
746         ci3 = VSUB(ti2, ti3);
747         ci2 = VADD(ti1, ti4);
748         ci4 = VSUB(ti1, ti4);
749         VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
750         ph[0] = cr2;
751         ph[1] = ci2; ph += l1ido;
752         VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
753         ph[0] = cr3;
754         ph[1] = ci3; ph += l1ido;
755         VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
756         ph[0] = cr4;
757         ph[1] = ci4; ph = ph - 3*l1ido + 2;
758       }
759     }
760     if (ido % 2 == 1) return;
761   }
762   for (k=0; k < l1ido; k+=ido) {
763     int i0 = 4*k + ido;
764     v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
765     v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
766     tr1 = VSUB(c,d);
767     tr2 = VADD(c,d);
768     ti1 = VADD(b,a);
769     ti2 = VSUB(b,a);
770     ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
771     ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
772     ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
773     ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
774   }
775 } /* radb4 */
776 
static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
778                      const float *wa1, const float *wa2, const float *wa3, const float *wa4)
779 {
780   static const float tr11 = .309016994374947f;
781   static const float ti11 = .951056516295154f;
782   static const float tr12 = -.809016994374947f;
783   static const float ti12 = .587785252292473f;
784 
785   /* System generated locals */
786   int cc_offset, ch_offset;
787 
788   /* Local variables */
789   int i, k, ic;
790   v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
791     cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
792   int idp2;
793 
794 
795 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
796 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
797 
798   /* Parameter adjustments */
799   ch_offset = 1 + ido * 6;
800   ch -= ch_offset;
801   cc_offset = 1 + ido * (1 + l1);
802   cc -= cc_offset;
803 
804   /* Function Body */
805   for (k = 1; k <= l1; ++k) {
806     cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
807     ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
808     cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
809     ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
810     ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
811     ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
812     ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
813     ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
814     ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
815     //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
816   }
817   if (ido == 1) {
818     return;
819   }
820   idp2 = ido + 2;
821   for (k = 1; k <= l1; ++k) {
822     for (i = 3; i <= ido; i += 2) {
823       ic = idp2 - i;
824       dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
825       dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
826       dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
827       dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
828       VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
829       VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
830       VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
831       VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
832       cr2 = VADD(dr2, dr5);
833       ci5 = VSUB(dr5, dr2);
834       cr5 = VSUB(di2, di5);
835       ci2 = VADD(di2, di5);
836       cr3 = VADD(dr3, dr4);
837       ci4 = VSUB(dr4, dr3);
838       cr4 = VSUB(di3, di4);
839       ci3 = VADD(di3, di4);
840       ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
841       ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
842       tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
843       ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
844       tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
845       ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
846       tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
847       ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
848       tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
849       ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
850       ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
851       ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
852       ch_ref(i, 3, k) = VADD(ti2, ti5);
853       ch_ref(ic, 2, k) = VSUB(ti5, ti2);
854       ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
855       ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
856       ch_ref(i, 5, k) = VADD(ti3, ti4);
857       ch_ref(ic, 4, k) = VSUB(ti4, ti3);
858     }
859   }
860 #undef cc_ref
861 #undef ch_ref
862 } /* radf5 */
863 
static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
865                   const float *wa1, const float *wa2, const float *wa3, const float *wa4)
866 {
867   static const float tr11 = .309016994374947f;
868   static const float ti11 = .951056516295154f;
869   static const float tr12 = -.809016994374947f;
870   static const float ti12 = .587785252292473f;
871 
872   int cc_offset, ch_offset;
873 
874   /* Local variables */
875   int i, k, ic;
876   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
877     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
878   int idp2;
879 
880 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
881 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
882 
883   /* Parameter adjustments */
884   ch_offset = 1 + ido * (1 + l1);
885   ch -= ch_offset;
886   cc_offset = 1 + ido * 6;
887   cc -= cc_offset;
888 
889   /* Function Body */
890   for (k = 1; k <= l1; ++k) {
891     ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
892     ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
893     tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
894     tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
895     ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
896     cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
897     cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
898     ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
899     ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
900     ch_ref(1, k, 2) = VSUB(cr2, ci5);
901     ch_ref(1, k, 3) = VSUB(cr3, ci4);
902     ch_ref(1, k, 4) = VADD(cr3, ci4);
903     ch_ref(1, k, 5) = VADD(cr2, ci5);
904   }
905   if (ido == 1) {
906     return;
907   }
908   idp2 = ido + 2;
909   for (k = 1; k <= l1; ++k) {
910     for (i = 3; i <= ido; i += 2) {
911       ic = idp2 - i;
912       ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
913       ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
914       ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
915       ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
916       tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
917       tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
918       tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
919       tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
920       ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
921       ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
922       cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
923       ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
924       cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
925       ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
926       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
927       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
928       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
929       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
930       dr3 = VSUB(cr3, ci4);
931       dr4 = VADD(cr3, ci4);
932       di3 = VADD(ci3, cr4);
933       di4 = VSUB(ci3, cr4);
934       dr5 = VADD(cr2, ci5);
935       dr2 = VSUB(cr2, ci5);
936       di5 = VSUB(ci2, cr5);
937       di2 = VADD(ci2, cr5);
938       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
939       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
940       VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
941       VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
942 
943       ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
944       ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
945       ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
946       ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
947     }
948   }
949 #undef cc_ref
950 #undef ch_ref
951 } /* radb5 */
952 
static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
954                                       const float *wa, const int *ifac) {
955   v4sf *in  = (v4sf*)input_readonly;
956   v4sf *out = (in == work2 ? work1 : work2);
957   int nf = ifac[1], k1;
958   int l2 = n;
959   int iw = n-1;
960   assert(in != out && work1 != work2);
961   for (k1 = 1; k1 <= nf; ++k1) {
962     int kh = nf - k1;
963     int ip = ifac[kh + 2];
964     int l1 = l2 / ip;
965     int ido = n / l2;
966     iw -= (ip - 1)*ido;
967     switch (ip) {
968       case 5: {
969         int ix2 = iw + ido;
970         int ix3 = ix2 + ido;
971         int ix4 = ix3 + ido;
972         radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
973       } break;
974       case 4: {
975         int ix2 = iw + ido;
976         int ix3 = ix2 + ido;
977         radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
978       } break;
979       case 3: {
980         int ix2 = iw + ido;
981         radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
982       } break;
983       case 2:
984         radf2_ps(ido, l1, in, out, &wa[iw]);
985         break;
986       default:
987         assert(0);
988         break;
989     }
990     l2 = l1;
991     if (out == work2) {
992       out = work1; in = work2;
993     } else {
994       out = work2; in = work1;
995     }
996   }
997   return in; /* this is in fact the output .. */
998 } /* rfftf1 */
999 
static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1001                                       const float *wa, const int *ifac) {
1002   v4sf *in  = (v4sf*)input_readonly;
1003   v4sf *out = (in == work2 ? work1 : work2);
1004   int nf = ifac[1], k1;
1005   int l1 = 1;
1006   int iw = 0;
1007   assert(in != out);
1008   for (k1=1; k1<=nf; k1++) {
1009     int ip = ifac[k1 + 1];
1010     int l2 = ip*l1;
1011     int ido = n / l2;
1012     switch (ip) {
1013       case 5: {
1014         int ix2 = iw + ido;
1015         int ix3 = ix2 + ido;
1016         int ix4 = ix3 + ido;
1017         radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1018       } break;
1019       case 4: {
1020         int ix2 = iw + ido;
1021         int ix3 = ix2 + ido;
1022         radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1023       } break;
1024       case 3: {
1025         int ix2 = iw + ido;
1026         radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1027       } break;
1028       case 2:
1029         radb2_ps(ido, l1, in, out, &wa[iw]);
1030         break;
1031       default:
1032         assert(0);
1033         break;
1034     }
1035     l1 = l2;
1036     iw += (ip - 1)*ido;
1037 
1038     if (out == work2) {
1039       out = work1; in = work2;
1040     } else {
1041       out = work2; in = work1;
1042     }
1043   }
1044   return in; /* this is in fact the output .. */
1045 }
1046 
static int decompose(int n, int *ifac, const int *ntryh) {
1048   int nl = n, nf = 0, i, j = 0;
1049   for (j=0; ntryh[j]; ++j) {
1050     int ntry = ntryh[j];
1051     while (nl != 1) {
1052       int nq = nl / ntry;
1053       int nr = nl - ntry * nq;
1054       if (nr == 0) {
1055         ifac[2+nf++] = ntry;
1056         nl = nq;
1057         if (ntry == 2 && nf != 1) {
1058           for (i = 2; i <= nf; ++i) {
1059             int ib = nf - i + 2;
1060             ifac[ib + 1] = ifac[ib];
1061           }
1062           ifac[2] = 2;
1063         }
1064       } else break;
1065     }
1066   }
1067   ifac[0] = n;
1068   ifac[1] = nf;
1069   return nf;
1070 }
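
/* Illustration (not executed by the library): with the radix table
   { 4,2,3,5,0 } used by rffti1_ps below, decompose(16, ifac, ntryh) fills

     ifac[0] = 16               (the transform length n)
     ifac[1] = 2                (number of factors, nf)
     ifac[2] = 4, ifac[3] = 4   (16 = 4*4)

   i.e. ifac[0]=n, ifac[1]=nf and ifac[2..nf+1] hold the radix of each stage,
   with a factor of 2 (if any) moved to the front. */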
1071 
1072 
1073 
static void rffti1_ps(int n, float *wa, int *ifac)
1075 {
1076   static const int ntryh[] = { 4,2,3,5,0 };
1077   int k1, j, ii;
1078 
1079   int nf = decompose(n,ifac,ntryh);
1080   float argh = (2*M_PI) / n;
1081   int is = 0;
1082   int nfm1 = nf - 1;
1083   int l1 = 1;
1084   for (k1 = 1; k1 <= nfm1; k1++) {
1085     int ip = ifac[k1 + 1];
1086     int ld = 0;
1087     int l2 = l1*ip;
1088     int ido = n / l2;
1089     int ipm = ip - 1;
1090     for (j = 1; j <= ipm; ++j) {
1091       float argld;
1092       int i = is, fi=0;
1093       ld += l1;
1094       argld = ld*argh;
1095       for (ii = 3; ii <= ido; ii += 2) {
1096         i += 2;
1097         fi += 1;
1098         wa[i - 2] = cos(fi*argld);
1099         wa[i - 1] = sin(fi*argld);
1100       }
1101       is += ido;
1102     }
1103     l1 = l2;
1104   }
1105 } /* rffti1 */
1106 
void cffti1_ps(int n, float *wa, int *ifac)
1108 {
1109   static const int ntryh[] = { 5,3,4,2,0 };
1110   int k1, j, ii;
1111 
1112   int nf = decompose(n,ifac,ntryh);
1113   float argh = (2*M_PI)/(float)n;
1114   int i = 1;
1115   int l1 = 1;
1116   for (k1=1; k1<=nf; k1++) {
1117     int ip = ifac[k1+1];
1118     int ld = 0;
1119     int l2 = l1*ip;
1120     int ido = n / l2;
1121     int idot = ido + ido + 2;
1122     int ipm = ip - 1;
1123     for (j=1; j<=ipm; j++) {
1124       float argld;
1125       int i1 = i, fi = 0;
1126       wa[i-1] = 1;
1127       wa[i] = 0;
1128       ld += l1;
1129       argld = ld*argh;
1130       for (ii = 4; ii <= idot; ii += 2) {
1131         i += 2;
1132         fi += 1;
1133         wa[i-1] = cos(fi*argld);
1134         wa[i] = sin(fi*argld);
1135       }
1136       if (ip > 5) {
1137         wa[i1-1] = wa[i-1];
1138         wa[i1] = wa[i];
1139       }
1140     }
1141     l1 = l2;
1142   }
1143 } /* cffti1 */
1144 
1145 
v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1147   v4sf *in  = (v4sf*)input_readonly;
1148   v4sf *out = (in == work2 ? work1 : work2);
1149   int nf = ifac[1], k1;
1150   int l1 = 1;
1151   int iw = 0;
1152   assert(in != out && work1 != work2);
1153   for (k1=2; k1<=nf+1; k1++) {
1154     int ip = ifac[k1];
1155     int l2 = ip*l1;
1156     int ido = n / l2;
1157     int idot = ido + ido;
1158     switch (ip) {
1159       case 5: {
1160         int ix2 = iw + idot;
1161         int ix3 = ix2 + idot;
1162         int ix4 = ix3 + idot;
1163         passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1164       } break;
1165       case 4: {
1166         int ix2 = iw + idot;
1167         int ix3 = ix2 + idot;
1168         passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1169       } break;
1170       case 2: {
1171         passf2_ps(idot, l1, in, out, &wa[iw], isign);
1172       } break;
1173       case 3: {
1174         int ix2 = iw + idot;
1175         passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1176       } break;
1177       default:
1178         assert(0);
1179     }
1180     l1 = l2;
1181     iw += (ip - 1)*idot;
1182     if (out == work2) {
1183       out = work1; in = work2;
1184     } else {
1185       out = work2; in = work1;
1186     }
1187   }
1188 
1189   return in; /* this is in fact the output .. */
1190 }
1191 
1192 
1193 struct PFFFT_Setup {
1194   int     N;
1195   int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1196   int ifac[15];
1197   pffft_transform_t transform;
1198   v4sf *data; // allocated room for twiddle coefs
1199   float *e;    // points into 'data' , N/4*3 elements
1200   float *twiddle; // points into 'data', N/4 elements
1201 };
1202 
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
1204   PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1205   int k, m;
1206   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1207      and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1208      handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1209   if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1210   if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1211   //assert((N % 32) == 0);
1212   s->N = N;
1213   s->transform = transform;
1214   /* nb of complex simd vectors */
1215   s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1216   s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1217   s->e = (float*)s->data;
1218   s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1219 
1220   if (transform == PFFFT_REAL) {
1221     for (k=0; k < s->Ncvec; ++k) {
1222       int i = k/SIMD_SZ;
1223       int j = k%SIMD_SZ;
1224       for (m=0; m < SIMD_SZ-1; ++m) {
1225         float A = -2*M_PI*(m+1)*k / N;
1226         s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1227         s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1228       }
1229     }
1230     rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1231   } else {
1232     for (k=0; k < s->Ncvec; ++k) {
1233       int i = k/SIMD_SZ;
1234       int j = k%SIMD_SZ;
1235       for (m=0; m < SIMD_SZ-1; ++m) {
1236         float A = -2*M_PI*(m+1)*k / N;
1237         s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1238         s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1239       }
1240     }
1241     cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1242   }
1243 
1244   /* check that N is decomposable with allowed prime factors */
1245   for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1246   if (m != N/SIMD_SZ) {
1247     pffft_destroy_setup(s); s = 0;
1248   }
1249 
1250   return s;
1251 }
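
/* Usage sketch of the setup API (see pffft.h; pffft_transform_ordered is
   defined further down in this file). This is only an illustration:

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);  // N must satisfy the
                                                          // constraints above
     float *in  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... fill 'in' ...
     pffft_transform_ordered(s, in, out, 0, PFFFT_FORWARD);
     // ... use 'out' ...
     pffft_aligned_free(out);
     pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/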
1252 
1253 
void pffft_destroy_setup(PFFFT_Setup *s) {
1255   pffft_aligned_free(s->data);
1256   free(s);
1257 }
1258 
1259 #if !defined(PFFFT_SIMD_DISABLE)
1260 
1261 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1263   v4sf g0, g1;
1264   int k;
1265   INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1266 
1267   *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1268   for (k=1; k < N; ++k) {
1269     v4sf h0, h1;
1270     INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1271     *--out = VSWAPHL(g1, h0);
1272     *--out = VSWAPHL(h0, h1);
1273     g1 = h1;
1274   }
1275   *--out = VSWAPHL(g1, g0);
1276 }
1277 
static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1279   v4sf g0, g1, h0, h1;
1280   int k;
1281   g0 = g1 = in[0]; ++in;
1282   for (k=1; k < N; ++k) {
1283     h0 = *in++; h1 = *in++;
1284     g1 = VSWAPHL(g1, h0);
1285     h0 = VSWAPHL(h0, h1);
1286     UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1287     g1 = h1;
1288   }
1289   h0 = *in++; h1 = g0;
1290   g1 = VSWAPHL(g1, h0);
1291   h0 = VSWAPHL(h0, h1);
1292   UNINTERLEAVE2(h0, g1, out[0], out[1]);
1293 }
1294 
void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1296   int k, N = setup->N, Ncvec = setup->Ncvec;
1297   const v4sf *vin = (const v4sf*)in;
1298   v4sf *vout = (v4sf*)out;
1299   assert(in != out);
1300   if (setup->transform == PFFFT_REAL) {
1301     int k, dk = N/32;
1302     if (direction == PFFFT_FORWARD) {
1303       for (k=0; k < dk; ++k) {
1304         INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1305         INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1306       }
1307       reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1308       reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1309     } else {
1310       for (k=0; k < dk; ++k) {
1311         UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1312         UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1313       }
1314       unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1315       unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1316     }
1317   } else {
1318     if (direction == PFFFT_FORWARD) {
1319       for (k=0; k < Ncvec; ++k) {
1320         int kk = (k/4) + (k%4)*(Ncvec/4);
1321         INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1322       }
1323     } else {
1324       for (k=0; k < Ncvec; ++k) {
1325         int kk = (k/4) + (k%4)*(Ncvec/4);
1326         UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1327       }
1328     }
1329   }
1330 }
1331 
void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1333   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1334   v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1335   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1336   assert(in != out);
1337   for (k=0; k < dk; ++k) {
1338     r0 = in[8*k+0]; i0 = in[8*k+1];
1339     r1 = in[8*k+2]; i1 = in[8*k+3];
1340     r2 = in[8*k+4]; i2 = in[8*k+5];
1341     r3 = in[8*k+6]; i3 = in[8*k+7];
1342     VTRANSPOSE4(r0,r1,r2,r3);
1343     VTRANSPOSE4(i0,i1,i2,i3);
1344     VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1345     VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1346     VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1347 
1348     sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1349     sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1350     si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1351     si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1352 
1353     /*
1354       transformation for each column is:
1355 
1356       [1   1   1   1   0   0   0   0]   [r0]
1357       [1   0  -1   0   0  -1   0   1]   [r1]
1358       [1  -1   1  -1   0   0   0   0]   [r2]
1359       [1   0  -1   0   0   1   0  -1]   [r3]
1360       [0   0   0   0   1   1   1   1] * [i0]
1361       [0   1   0  -1   1   0  -1   0]   [i1]
1362       [0   0   0   0   1  -1   1  -1]   [i2]
1363       [0  -1   0   1   1   0  -1   0]   [i3]
1364     */
1365 
1366     r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1367     r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1368     r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1369     r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1370 
1371     *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1372     *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1373   }
1374 }
1375 
void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1377   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1378   v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1379   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1380   assert(in != out);
1381   for (k=0; k < dk; ++k) {
1382     r0 = in[8*k+0]; i0 = in[8*k+1];
1383     r1 = in[8*k+2]; i1 = in[8*k+3];
1384     r2 = in[8*k+4]; i2 = in[8*k+5];
1385     r3 = in[8*k+6]; i3 = in[8*k+7];
1386 
1387     sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1388     sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1389     si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1390     si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1391 
1392     r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1393     r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1394     r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1395     r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1396 
1397     VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1398     VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1399     VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1400 
1401     VTRANSPOSE4(r0,r1,r2,r3);
1402     VTRANSPOSE4(i0,i1,i2,i3);
1403 
1404     *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1405     *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1406   }
1407 }
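
/* Editor's sketch (not compiled): pffft_cplx_preprocess mirrors pffft_cplx_finalize with
   conjugated twiddles and the steps reversed, so a backward transform undoes a forward one
   up to the usual FFT scaling. As documented in pffft.h the transforms are unnormalized,
   i.e. BACKWARD(FORWARD(x)) == N*x. A quick property check, assuming the public API only: */
#if 0
static void roundtrip_check(int N) {
  PFFFT_Setup *su = pffft_new_setup(N, PFFFT_COMPLEX);
  float *x = (float*)pffft_aligned_malloc(2*N*sizeof(float));   // N complex values
  float *X = (float*)pffft_aligned_malloc(2*N*sizeof(float));
  float *y = (float*)pffft_aligned_malloc(2*N*sizeof(float));
  float *w = (float*)pffft_aligned_malloc(2*N*sizeof(float));   // work buffer (2*N floats, complex FFT)
  int k;
  for (k=0; k < 2*N; ++k) x[k] = sinf(0.01f*k);
  pffft_transform(su, x, X, w, PFFFT_FORWARD);
  pffft_transform(su, X, y, w, PFFFT_BACKWARD);
  for (k=0; k < 2*N; ++k) assert(fabsf(y[k] - N*x[k]) < 1e-2f*N);
  pffft_aligned_free(x); pffft_aligned_free(X); pffft_aligned_free(y); pffft_aligned_free(w);
  pffft_destroy_setup(su);
}
#endif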


static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                            const v4sf *e, v4sf *out) {
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0; i0 = *in1;
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0  -1   0   0  -1   0   1]   [r1]
    [1   0  -1   0   0   1   0  -1]   [r2]
    [1  -1   1  -1   0   0   0   0]   [r3]
    [0   0   0   0   1   1   1   1] * [i0]
    [0  -1   0   1  -1   0   1   0]   [i1]
    [0  -1   0   1   1   0  -1   0]   [i2]
    [0   0   0   0  -1   1  -1   1]   [i3]
  */

  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;

}

static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = M_SQRT2/2;

  cr.v = in[0]; ci.v = in[Ncvec*2-1];
  assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)   ] [1   1   1   1   0   0   0   0]
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                           e + k*6, out + k*8);
    save = save_next;
  }

}
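
/* Editor's note: for a real forward transform of size N, the ordered output packs the
   N/2+1 non-redundant bins into N floats. As documented in pffft.h, out[0] holds the real
   DC term, out[1] the real Nyquist term, and out[2k], out[2k+1] the real/imaginary parts of
   bin k for 0 < k < N/2. A sketch (hypothetical helper, not compiled) of reading a
   magnitude spectrum from that layout: */
#if 0
static void real_spectrum_magnitudes(const float *ordered_out, int N, float *mag /* N/2+1 values */) {
  int k;
  mag[0]   = fabsf(ordered_out[0]);     // DC bin, purely real
  mag[N/2] = fabsf(ordered_out[1]);     // Nyquist bin, purely real, stored in slot 1
  for (k=1; k < N/2; ++k) {
    float re = ordered_out[2*k], im = ordered_out[2*k+1];
    mag[k] = sqrtf(re*re + im*im);
  }
}
#endif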

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                             const v4sf *e, v4sf *out, int first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}


void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                             pffft_direction_t direction, int ordered) {
  int k, Ncvec   = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput      = (v4sf*)foutput;
  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);

  VLA_ARRAY_ON_STACK_FREE(scratch_on_stack);
}
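
/* Editor's note: when the scratch pointer is NULL, the temporary buffer above lands on the
   stack (via VLA_ARRAY_ON_STACK), which can be uncomfortable for large N on small stacks.
   pffft.h recommends instead passing an aligned work area of N floats (real FFT) or 2*N
   floats (complex FFT). An illustrative sketch only, using the public API: */
#if 0
static void transform_with_heap_work_buffer(PFFFT_Setup *setup, const float *in, float *out, int N) {
  // allocate the work area once and reuse it, instead of relying on the stack VLA
  float *work = (float*)pffft_aligned_malloc(N*sizeof(float));  // use 2*N*sizeof(float) for PFFFT_COMPLEX
  pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);
  pffft_aligned_free(work);
}
#endif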

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to GCC
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
               "vld1.f32    {q8,q9},   [r8,:128]!          \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8, q10, q15            \n"
               "vmla.f32    q9, q11, q15            \n"
               "vmla.f32    q0, q12, q15            \n"
               "vmla.f32    q1, q13, q15            \n"
               "vst1.f32    {q8,q9},[%2,:128]!    \n"
               "vst1.f32    {q0,q1},[%2,:128]!    \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}
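
/* Editor's sketch of the intended use of pffft_zconvolve_accumulate (not compiled):
   multiply two forward spectra, still in the internal/unordered layout, accumulate into a
   cleared buffer, then transform back. The 1.f/N scaling compensates for the unnormalized
   transforms. Assumes only the public pffft.h API; x, h, y and work are aligned buffers of
   N floats for a PFFFT_REAL setup. */
#if 0
#include <string.h>
static void fft_convolve(PFFFT_Setup *su, const float *x, const float *h, float *y, float *work, int N) {
  float *X = (float*)pffft_aligned_malloc(N*sizeof(float));
  float *H = (float*)pffft_aligned_malloc(N*sizeof(float));
  float *Y = (float*)pffft_aligned_malloc(N*sizeof(float));
  pffft_transform(su, x, X, work, PFFFT_FORWARD);   // unordered spectra are fine here
  pffft_transform(su, h, H, work, PFFFT_FORWARD);
  memset(Y, 0, N*sizeof(float));                    // accumulate into a cleared buffer
  pffft_zconvolve_accumulate(su, X, H, Y, 1.f/N);
  pffft_transform(su, Y, y, work, PFFFT_BACKWARD);  // y now holds the circular convolution of x and h
  pffft_aligned_free(X); pffft_aligned_free(H); pffft_aligned_free(Y);
}
#endif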


#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
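
/* Editor's worked example of the forward branch above, for a real transform with N == 8:
   fftpack returns the spectrum packed as in = { r0, r1, i1, r2, i2, r3, i3, r4 }, where r4
   is the purely real Nyquist bin. The loop shifts everything right by one slot and moves
   that last value into slot 1, giving out = { r0, r4, r1, i1, r2, i2, r3, i3 }, which is
   pffft's canonical ordered layout. The backward branch performs the inverse shuffle. */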

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                    pffft_direction_t direction, int ordered) {
  int Ncvec   = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should happen only when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);

  VLA_ARRAY_ON_STACK_FREE(scratch_on_stack);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
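
/* Editor's addition: a minimal end-to-end usage sketch of the public API declared in
   pffft.h (forward ordered real FFT of a test signal). Not compiled with the library.
   Per the pffft documentation, N must factor into powers of 2, 3 and 5 (with a minimum
   power-of-two factor for real/complex transforms; 1024 is safe), and all buffers must be
   16-byte aligned, hence pffft_aligned_malloc. */
#if 0
#include <stdio.h>
int main(void) {
  const int N = 1024;
  PFFFT_Setup *setup = pffft_new_setup(N, PFFFT_REAL);
  float *in   = (float*)pffft_aligned_malloc(N*sizeof(float));
  float *out  = (float*)pffft_aligned_malloc(N*sizeof(float));
  float *work = (float*)pffft_aligned_malloc(N*sizeof(float));
  int k;
  for (k=0; k < N; ++k) in[k] = cosf(2*(float)M_PI*10*k/N);   // a 10-cycle cosine
  pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);
  printf("DC=%g  Nyquist=%g  |bin 10|=%g\n",
         out[0], out[1], sqrtf(out[20]*out[20] + out[21]*out[21]));
  pffft_aligned_free(in); pffft_aligned_free(out); pffft_aligned_free(work);
  pffft_destroy_setup(setup);
  return 0;
}
#endif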