1 /*
2    Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
3 
4    Based on original fortran 77 code from FFTPACKv4 from NETLIB
5    (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
6    of NCAR, in 1985.
7 
8    As confirmed by the NCAR fftpack software curators, the following
9    FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
10    released under the same terms.
11 
12    FFTPACK license:
13 
14    http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
15 
16    Copyright (c) 2004 the University Corporation for Atmospheric
17    Research ("UCAR"). All rights reserved. Developed by NCAR's
18    Computational and Information Systems Laboratory, UCAR,
19    www.cisl.ucar.edu.
20 
21    Redistribution and use of the Software in source and binary forms,
22    with or without modification, is permitted provided that the
23    following conditions are met:
24 
25    - Neither the names of NCAR's Computational and Information Systems
26    Laboratory, the University Corporation for Atmospheric Research,
27    nor the names of its sponsors or contributors may be used to
28    endorse or promote products derived from this Software without
29    specific prior written permission.
30 
31    - Redistributions of source code must retain the above copyright
32    notices, this list of conditions, and the disclaimer below.
33 
34    - Redistributions in binary form must reproduce the above copyright
35    notice, this list of conditions, and the disclaimer below in the
36    documentation and/or other materials provided with the
37    distribution.
38 
39    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
40    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
41    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
42    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
43    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
44    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
45    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
46    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
47    SOFTWARE.
48 
49    PFFFT : a Pretty Fast FFT.
50 
51    This file is largely based on the original FFTPACK implementation, modified in
52    order to take advantage of SIMD instructions of modern CPUs.
53 
54 */
55 
56 /*
57   ChangeLog:
58   - 2011/10/02, version 1: This is the very first release of this file.
59 */
60 
61 #include "pffft.h"
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <assert.h>
65 #include <stdint.h>
66 #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
67 #include <altivec.h>
68 #endif
69 
70 /* detect compiler flavour */
71 #if defined(_MSC_VER)
72 #  define COMPILER_MSVC
73 #elif defined(__GNUC__)
74 #  define COMPILER_GCC
75 #endif
76 
77 #ifdef COMPILER_MSVC
78 #define _USE_MATH_DEFINES
79 #endif
80 #include <math.h>
81 
82 #if defined(COMPILER_GCC)
83 #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
84 #  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
85 #  define RESTRICT __restrict
86 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
87 #elif defined(COMPILER_MSVC)
88 #  define ALWAYS_INLINE(return_type) __forceinline return_type
89 #  define NEVER_INLINE(return_type) __declspec(noinline) return_type
90 #  define RESTRICT __restrict
91 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
92 #endif
93 
94 
95 /*
96    vector support macros: the rest of the code is independent of
97    SSE/Altivec/NEON -- adding support for other platforms with 4-element
98    vectors should be limited to these macros
99 */
100 
101 
102 // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
103 //#define PFFFT_SIMD_DISABLE
104 
105 /*
106    Altivec support macros
107 */
108 #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
109 typedef vector float v4sf;
110 #  define SIMD_SZ 4
111 #  define VZERO() ((vector float) vec_splat_u8(0))
112 #  define VMUL(a,b) vec_madd(a,b, VZERO())
113 #  define VADD(a,b) vec_add(a,b)
114 #  define VMADD(a,b,c) vec_madd(a,b,c)
115 #  define VSUB(a,b) vec_sub(a,b)
116 inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
117 #  define LD_PS1(p) ld_ps1(&p)
118 #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
119 #  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
120     vector unsigned char vperm1 =  (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
121     vector unsigned char vperm2 =  (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
122     v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
123   }
124 #  define VTRANSPOSE4(x0,x1,x2,x3) {              \
125     v4sf y0 = vec_mergeh(x0, x2);               \
126     v4sf y1 = vec_mergel(x0, x2);               \
127     v4sf y2 = vec_mergeh(x1, x3);               \
128     v4sf y3 = vec_mergel(x1, x3);               \
129     x0 = vec_mergeh(y0, y2);                    \
130     x1 = vec_mergel(y0, y2);                    \
131     x2 = vec_mergeh(y1, y3);                    \
132     x3 = vec_mergel(y1, y3);                    \
133   }
134 #  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
135 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
136 
137 /*
138   SSE1 support macros
139 */
140 #elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
141 
142 #include <xmmintrin.h>
143 typedef __m128 v4sf;
144 #  define SIMD_SZ 4 // 4 floats per simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway, so you will have to do some work if you want to enable AVX with its 256-bit vectors.
145 #  define VZERO() _mm_setzero_ps()
146 #  define VMUL(a,b) _mm_mul_ps(a,b)
147 #  define VADD(a,b) _mm_add_ps(a,b)
148 #  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
149 #  define VSUB(a,b) _mm_sub_ps(a,b)
150 #  define LD_PS1(p) _mm_set1_ps(p)
151 #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
152 #  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
153 #  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
154 #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
155 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)
156 
157 /*
158   ARM NEON support macros
159 */
160 #elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(IOS))
161 #  include <arm_neon.h>
162 typedef float32x4_t v4sf;
163 #  define SIMD_SZ 4
164 #  define VZERO() vdupq_n_f32(0)
165 #  define VMUL(a,b) vmulq_f32(a,b)
166 #  define VADD(a,b) vaddq_f32(a,b)
167 #  define VMADD(a,b,c) vmlaq_f32(c,a,b)
168 #  define VSUB(a,b) vsubq_f32(a,b)
169 #  define LD_PS1(p) vld1q_dup_f32(&(p))
170 #  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
171 #  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
172 #  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
173     float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
174     float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
175     float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
176     float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
177     x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
178   }
179 // marginally faster version
180 //#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
181 #  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
182 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
183 #else
184 #  if !defined(PFFFT_SIMD_DISABLE)
185 //#    warning "building with simd disabled !!!!!!!!!!!!!!\n";
186 #    define PFFFT_SIMD_DISABLE // fallback to scalar code
187 #  endif
188 #endif
189 
190 // fallback mode for situations where SSE/Altivec are not available: use scalar mode instead
191 #ifdef PFFFT_SIMD_DISABLE
192 typedef float v4sf;
193 #  define SIMD_SZ 1
194 #  define VZERO() 0.f
195 #  define VMUL(a,b) ((a)*(b))
196 #  define VADD(a,b) ((a)+(b))
197 #  define VMADD(a,b,c) ((a)*(b)+(c))
198 #  define VSUB(a,b) ((a)-(b))
199 #  define LD_PS1(p) (p)
200 #  define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
201 #endif
202 
203 // shortcuts for complex multiplications
204 #define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
205 #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
206 #ifndef SVMUL
207 // multiply a scalar with a vector
208 #define SVMUL(f,v) VMUL(LD_PS1(f),v)
209 #endif
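/* For reference, a scalar sketch of what the two macros above compute per lane
   (illustrative only, not used anywhere below): VCPLXMUL forms the complex
   product (ar + i*ai)*(br + i*bi), and VCPLXMULCONJ multiplies by the
   conjugate (br - i*bi). */
#if 0
static void cplx_mul_scalar(float *ar, float *ai, float br, float bi) {
  float tmp = *ar * bi;
  *ar = *ar * br - *ai * bi;  /* real part: ar*br - ai*bi */
  *ai = *ai * br + tmp;       /* imag part: ai*br + ar*bi */
}
static void cplx_mulconj_scalar(float *ar, float *ai, float br, float bi) {
  float tmp = *ar * bi;
  *ar = *ar * br + *ai * bi;  /* real part: ar*br + ai*bi */
  *ai = *ai * br - tmp;       /* imag part: ai*br - ar*bi */
}
#endif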
210 
211 #if !defined(PFFFT_SIMD_DISABLE)
212 typedef union v4sf_union {
213   v4sf  v;
214   float f[4];
215 } v4sf_union;
216 
217 #include <string.h>
218 
219 #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
220 
221 /* detect bugs with the vector support macros */
222 void validate_pffft_simd(void) {
223   float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
224   v4sf_union a0, a1, a2, a3, t, u;
225   memcpy(a0.f, f, 4*sizeof(float));
226   memcpy(a1.f, f+4, 4*sizeof(float));
227   memcpy(a2.f, f+8, 4*sizeof(float));
228   memcpy(a3.f, f+12, 4*sizeof(float));
229 
230   t = a0; u = a1; t.v = VZERO();
231   printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
232   t.v = VADD(a1.v, a2.v);
233   printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
234   t.v = VMUL(a1.v, a2.v);
235   printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
236   t.v = VMADD(a1.v, a2.v,a0.v);
237   printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
238 
239   INTERLEAVE2(a1.v,a2.v,t.v,u.v);
240   printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
241   assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
242   UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
243   printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
244   assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
245 
246   t.v=LD_PS1(f[15]);
247   printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
248   assertv4(t, 15, 15, 15, 15);
249   t.v = VSWAPHL(a1.v, a2.v);
250   printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
251   assertv4(t, 8, 9, 6, 7);
252   VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
253   printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
254          a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
255          a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
256   assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
257 }
258 #endif //!PFFFT_SIMD_DISABLE
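/* A minimal, hypothetical harness for the self-test above (not part of pffft
   itself): build this file with such a main() to sanity-check the vector
   macros when porting to a new platform. */
#if 0
int main(void) {
  validate_pffft_simd(); /* aborts via assert() if any macro misbehaves */
  return 0;
}
#endif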
259 
260 /* SSE and co like 16-bytes aligned pointers */
261 #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
262 void *pffft_aligned_malloc(size_t nb_bytes) {
263   void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
264   if (!p0) return (void *) 0;
265   p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
266   *((void **) p - 1) = p0;
267   return p;
268 }
269 
270 void pffft_aligned_free(void *p) {
271   if (p) free(*((void **) p - 1));
272 }
273 
274 int32_t pffft_simd_size(void) { return SIMD_SZ; }
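/* Usage sketch for the helpers above (assumption: float buffers handed to the
   transform routines should come from pffft_aligned_malloc, or at least be
   16-byte aligned). Names below are only an example. */
#if 0
static void example_aligned_buffer(void) {
  int32_t n = 1024;
  float *buf = (float*)pffft_aligned_malloc(n * sizeof(float));
  if (buf) {
    /* ... use buf as an input/output/work area ... */
    pffft_aligned_free(buf);
  }
}
#endif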
275 
276 /*
277   passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
278 */
279 static NEVER_INLINE(void) passf2_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
280   int32_t k, i;
281   int32_t l1ido = l1*ido;
282   if (ido <= 2) {
283     for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
284       ch[0]         = VADD(cc[0], cc[ido+0]);
285       ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
286       ch[1]         = VADD(cc[1], cc[ido+1]);
287       ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
288     }
289   } else {
290     for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
291       for (i=0; i<ido-1; i+=2) {
292         v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
293         v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
294         v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
295         ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
296         ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
297         VCPLXMUL(tr2, ti2, wr, wi);
298         ch[i+l1ido]   = tr2;
299         ch[i+l1ido+1] = ti2;
300       }
301     }
302   }
303 }
304 
305 /*
306   passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
307 */
308 static NEVER_INLINE(void) passf3_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
309                                     const float *wa1, const float *wa2, float fsign) {
310   static const float taur = -0.5f;
311   float taui = 0.866025403784439f*fsign;
312   int32_t i, k;
313   v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
314   int32_t l1ido = l1*ido;
315   float wr1, wi1, wr2, wi2;
316   assert(ido > 2);
317   for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
318     for (i=0; i<ido-1; i+=2) {
319       tr2 = VADD(cc[i+ido], cc[i+2*ido]);
320       cr2 = VADD(cc[i], SVMUL(taur,tr2));
321       ch[i]    = VADD(cc[i], tr2);
322       ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
323       ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
324       ch[i+1]  = VADD(cc[i+1], ti2);
325       cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
326       ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
327       dr2 = VSUB(cr2, ci3);
328       dr3 = VADD(cr2, ci3);
329       di2 = VADD(ci2, cr3);
330       di3 = VSUB(ci2, cr3);
331       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
332       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
333       ch[i+l1ido] = dr2;
334       ch[i+l1ido + 1] = di2;
335       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
336       ch[i+2*l1ido] = dr3;
337       ch[i+2*l1ido+1] = di3;
338     }
339   }
340 } /* passf3 */
341 
342 static NEVER_INLINE(void) passf4_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
343                                     const float *wa1, const float *wa2, const float *wa3, float fsign) {
344   /* isign == -1 for forward transform and +1 for backward transform */
345 
346   int32_t i, k;
347   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
348   int32_t l1ido = l1*ido;
349   if (ido == 2) {
350     for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
351       tr1 = VSUB(cc[0], cc[2*ido + 0]);
352       tr2 = VADD(cc[0], cc[2*ido + 0]);
353       ti1 = VSUB(cc[1], cc[2*ido + 1]);
354       ti2 = VADD(cc[1], cc[2*ido + 1]);
355       ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
356       tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
357       tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
358       ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
359 
360       ch[0*l1ido + 0] = VADD(tr2, tr3);
361       ch[0*l1ido + 1] = VADD(ti2, ti3);
362       ch[1*l1ido + 0] = VADD(tr1, tr4);
363       ch[1*l1ido + 1] = VADD(ti1, ti4);
364       ch[2*l1ido + 0] = VSUB(tr2, tr3);
365       ch[2*l1ido + 1] = VSUB(ti2, ti3);
366       ch[3*l1ido + 0] = VSUB(tr1, tr4);
367       ch[3*l1ido + 1] = VSUB(ti1, ti4);
368     }
369   } else {
370     for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
371       for (i=0; i<ido-1; i+=2) {
372         float wr1, wi1, wr2, wi2, wr3, wi3;
373         tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
374         tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
375         ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
376         ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
377         tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
378         ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
379         tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
380         ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
381 
382         ch[i] = VADD(tr2, tr3);
383         cr3    = VSUB(tr2, tr3);
384         ch[i + 1] = VADD(ti2, ti3);
385         ci3 = VSUB(ti2, ti3);
386 
387         cr2 = VADD(tr1, tr4);
388         cr4 = VSUB(tr1, tr4);
389         ci2 = VADD(ti1, ti4);
390         ci4 = VSUB(ti1, ti4);
391         wr1=wa1[i], wi1=fsign*wa1[i+1];
392         VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
393         wr2=wa2[i], wi2=fsign*wa2[i+1];
394         ch[i + l1ido] = cr2;
395         ch[i + l1ido + 1] = ci2;
396 
397         VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
398         wr3=wa3[i], wi3=fsign*wa3[i+1];
399         ch[i + 2*l1ido] = cr3;
400         ch[i + 2*l1ido + 1] = ci3;
401 
402         VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
403         ch[i + 3*l1ido] = cr4;
404         ch[i + 3*l1ido + 1] = ci4;
405       }
406     }
407   }
408 } /* passf4 */
409 
410 /*
411   passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
412 */
413 static NEVER_INLINE(void) passf5_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch,
414                                     const float *wa1, const float *wa2,
415                                     const float *wa3, const float *wa4, float fsign) {
416   static const float tr11 = .309016994374947f;
417   const float ti11 = .951056516295154f*fsign;
418   static const float tr12 = -.809016994374947f;
419   const float ti12 = .587785252292473f*fsign;
420 
421   /* Local variables */
422   int32_t i, k;
423   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
424     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
425 
426   float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
427 
428 #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
429 #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
430 
431   assert(ido > 2);
432   for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
433     for (i = 0; i < ido-1; i += 2) {
434       ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
435       ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
436       ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
437       ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
438       tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
439       tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
440       tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
441       tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
442       ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
443       ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
444       cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
445       ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
446       cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
447       ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
448       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
449       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
450       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
451       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
452       dr3 = VSUB(cr3, ci4);
453       dr4 = VADD(cr3, ci4);
454       di3 = VADD(ci3, cr4);
455       di4 = VSUB(ci3, cr4);
456       dr5 = VADD(cr2, ci5);
457       dr2 = VSUB(cr2, ci5);
458       di5 = VSUB(ci2, cr5);
459       di2 = VADD(ci2, cr5);
460       wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
461       wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
462       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
463       ch_ref(i - 1, 2) = dr2;
464       ch_ref(i, 2)     = di2;
465       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
466       ch_ref(i - 1, 3) = dr3;
467       ch_ref(i, 3)     = di3;
468       VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
469       ch_ref(i - 1, 4) = dr4;
470       ch_ref(i, 4)     = di4;
471       VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
472       ch_ref(i - 1, 5) = dr5;
473       ch_ref(i, 5)     = di5;
474     }
475   }
476 #undef ch_ref
477 #undef cc_ref
478 }
479 
480 static NEVER_INLINE(void) radf2_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
481   static const float minus_one = -1.f;
482   int32_t i, k, l1ido = l1*ido;
483   for (k=0; k < l1ido; k += ido) {
484     v4sf a = cc[k], b = cc[k + l1ido];
485     ch[2*k] = VADD(a, b);
486     ch[2*(k+ido)-1] = VSUB(a, b);
487   }
488   if (ido < 2) return;
489   if (ido != 2) {
490     for (k=0; k < l1ido; k += ido) {
491       for (i=2; i<ido; i+=2) {
492         v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
493         v4sf br = cc[i - 1 + k], bi = cc[i + k];
494         VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
495         ch[i + 2*k] = VADD(bi, ti2);
496         ch[2*(k+ido) - i] = VSUB(ti2, bi);
497         ch[i - 1 + 2*k] = VADD(br, tr2);
498         ch[2*(k+ido) - i -1] = VSUB(br, tr2);
499       }
500     }
501     if (ido % 2 == 1) return;
502   }
503   for (k=0; k < l1ido; k += ido) {
504     ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
505     ch[2*k + ido-1] = cc[k + ido-1];
506   }
507 } /* radf2 */
508 
509 
510 static NEVER_INLINE(void) radb2_ps(int32_t ido, int32_t l1, const v4sf *cc, v4sf *ch, const float *wa1) {
511   static const float minus_two=-2;
512   int32_t i, k, l1ido = l1*ido;
513   v4sf a,b,c,d, tr2, ti2;
514   for (k=0; k < l1ido; k += ido) {
515     a = cc[2*k]; b = cc[2*(k+ido) - 1];
516     ch[k] = VADD(a, b);
517     ch[k + l1ido] =VSUB(a, b);
518   }
519   if (ido < 2) return;
520   if (ido != 2) {
521     for (k = 0; k < l1ido; k += ido) {
522       for (i = 2; i < ido; i += 2) {
523         a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
524         c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
525         ch[i-1 + k] = VADD(a, b);
526         tr2 = VSUB(a, b);
527         ch[i+0 + k] = VSUB(c, d);
528         ti2 = VADD(c, d);
529         VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
530         ch[i-1 + k + l1ido] = tr2;
531         ch[i+0 + k + l1ido] = ti2;
532       }
533     }
534     if (ido % 2 == 1) return;
535   }
536   for (k = 0; k < l1ido; k += ido) {
537     a = cc[2*k + ido-1]; b = cc[2*k + ido];
538     ch[k + ido-1] = VADD(a,a);
539     ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
540   }
541 } /* radb2 */
542 
543 static void radf3_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
544                      const float *wa1, const float *wa2) {
545   static const float taur = -0.5f;
546   static const float taui = 0.866025403784439f;
547   int32_t i, k, ic;
548   v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
549   for (k=0; k<l1; k++) {
550     cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
551     ch[3*k*ido] = VADD(cc[k*ido], cr2);
552     ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
553     ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
554   }
555   if (ido == 1) return;
556   for (k=0; k<l1; k++) {
557     for (i=2; i<ido; i+=2) {
558       ic = ido - i;
559       wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
560       dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
561       VCPLXMULCONJ(dr2, di2, wr1, wi1);
562 
563       wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
564       dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
565       VCPLXMULCONJ(dr3, di3, wr2, wi2);
566 
567       cr2 = VADD(dr2, dr3);
568       ci2 = VADD(di2, di3);
569       ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
570       ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
571       tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
572       ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
573       tr3 = SVMUL(taui, VSUB(di2, di3));
574       ti3 = SVMUL(taui, VSUB(dr3, dr2));
575       ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
576       ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
577       ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
578       ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
579     }
580   }
581 } /* radf3 */
582 
583 
584 static void radb3_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
585                      const float *wa1, const float *wa2)
586 {
587   static const float taur = -0.5f;
588   static const float taui = 0.866025403784439f;
589   static const float taui_2 = 0.866025403784439f*2;
590   int32_t i, k, ic;
591   v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
592   for (k=0; k<l1; k++) {
593     tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
594     cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
595     ch[k*ido] = VADD(cc[3*k*ido], tr2);
596     ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
597     ch[(k + l1)*ido] = VSUB(cr2, ci3);
598     ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
599   }
600   if (ido == 1) return;
601   for (k=0; k<l1; k++) {
602     for (i=2; i<ido; i+=2) {
603       ic = ido - i;
604       tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
605       cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
606       ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
607       ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
608       ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
609       ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
610       cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
611       ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
612       dr2 = VSUB(cr2, ci3);
613       dr3 = VADD(cr2, ci3);
614       di2 = VADD(ci2, cr3);
615       di3 = VSUB(ci2, cr3);
616       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
617       ch[i - 1 + (k + l1)*ido] = dr2;
618       ch[i + (k + l1)*ido] = di2;
619       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
620       ch[i - 1 + (k + 2*l1)*ido] = dr3;
621       ch[i + (k + 2*l1)*ido] = di3;
622     }
623   }
624 } /* radb3 */
625 
626 static NEVER_INLINE(void) radf4_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
627                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
628 {
629   static const float minus_hsqt2 = (float)-0.7071067811865475;
630   int32_t i, k, l1ido = l1*ido;
631   {
632     const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
633     v4sf * RESTRICT ch_ = ch;
634     while (cc < cc_end) {
635       // this loop represents between 25% and 40% of total radf4_ps cost !
636       v4sf a0 = cc[0], a1 = cc[l1ido];
637       v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
638       v4sf tr1 = VADD(a1, a3);
639       v4sf tr2 = VADD(a0, a2);
640       ch[2*ido-1] = VSUB(a0, a2);
641       ch[2*ido  ] = VSUB(a3, a1);
642       ch[0      ] = VADD(tr1, tr2);
643       ch[4*ido-1] = VSUB(tr2, tr1);
644       cc += ido; ch += 4*ido;
645     }
646     cc = cc_; ch = ch_;
647   }
648   if (ido < 2) return;
649   if (ido != 2) {
650     for (k = 0; k < l1ido; k += ido) {
651       const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
652       for (i=2; i<ido; i += 2, pc += 2) {
653         int32_t ic = ido - i;
654         v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
655         v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
656 
657         cr2 = pc[1*l1ido+0];
658         ci2 = pc[1*l1ido+1];
659         wr=LD_PS1(wa1[i - 2]);
660         wi=LD_PS1(wa1[i - 1]);
661         VCPLXMULCONJ(cr2,ci2,wr,wi);
662 
663         cr3 = pc[2*l1ido+0];
664         ci3 = pc[2*l1ido+1];
665         wr = LD_PS1(wa2[i-2]);
666         wi = LD_PS1(wa2[i-1]);
667         VCPLXMULCONJ(cr3, ci3, wr, wi);
668 
669         cr4 = pc[3*l1ido];
670         ci4 = pc[3*l1ido+1];
671         wr = LD_PS1(wa3[i-2]);
672         wi = LD_PS1(wa3[i-1]);
673         VCPLXMULCONJ(cr4, ci4, wr, wi);
674 
675         /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
676 
677         tr1 = VADD(cr2,cr4);
678         tr4 = VSUB(cr4,cr2);
679         tr2 = VADD(pc[0],cr3);
680         tr3 = VSUB(pc[0],cr3);
681         ch[i - 1 + 4*k] = VADD(tr1,tr2);
682         ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
683         ti1 = VADD(ci2,ci4);
684         ti4 = VSUB(ci2,ci4);
685         ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
686         ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
687         ti2 = VADD(pc[1],ci3);
688         ti3 = VSUB(pc[1],ci3);
689         ch[i + 4*k] = VADD(ti1, ti2);
690         ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
691         ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
692         ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
693       }
694     }
695     if (ido % 2 == 1) return;
696   }
697   for (k=0; k<l1ido; k += ido) {
698     v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
699     v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
700     v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
701     v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
702     ch[ido-1 + 4*k] = VADD(tr1, c);
703     ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
704     ch[4*k + 1*ido] = VSUB(ti1, d);
705     ch[4*k + 3*ido] = VADD(ti1, d);
706   }
707 } /* radf4 */
708 
709 
710 static NEVER_INLINE(void) radb4_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
711                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
712 {
713   static const float minus_sqrt2 = (float)-1.414213562373095;
714   static const float two = 2.f;
715   int32_t i, k, l1ido = l1*ido;
716   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
717   {
718     const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
719     v4sf *ch_ = ch;
720     while (ch < ch_end) {
721       v4sf a = cc[0], b = cc[4*ido-1];
722       v4sf c = cc[2*ido], d = cc[2*ido-1];
723       tr3 = SVMUL(two,d);
724       tr2 = VADD(a,b);
725       tr1 = VSUB(a,b);
726       tr4 = SVMUL(two,c);
727       ch[0*l1ido] = VADD(tr2, tr3);
728       ch[2*l1ido] = VSUB(tr2, tr3);
729       ch[1*l1ido] = VSUB(tr1, tr4);
730       ch[3*l1ido] = VADD(tr1, tr4);
731 
732       cc += 4*ido; ch += ido;
733     }
734     cc = cc_; ch = ch_;
735   }
736   if (ido < 2) return;
737   if (ido != 2) {
738     for (k = 0; k < l1ido; k += ido) {
739       const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
740       v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
741       for (i = 2; i < ido; i += 2) {
742 
743         tr1 = VSUB(pc[i], pc[4*ido - i]);
744         tr2 = VADD(pc[i], pc[4*ido - i]);
745         ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
746         tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
747         ph[0] = VADD(tr2, tr3);
748         cr3 = VSUB(tr2, tr3);
749 
750         ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
751         tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
752         cr2 = VSUB(tr1, tr4);
753         cr4 = VADD(tr1, tr4);
754 
755         ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
756         ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
757 
758         ph[1] = VADD(ti2, ti3); ph += l1ido;
759         ci3 = VSUB(ti2, ti3);
760         ci2 = VADD(ti1, ti4);
761         ci4 = VSUB(ti1, ti4);
762         VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
763         ph[0] = cr2;
764         ph[1] = ci2; ph += l1ido;
765         VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
766         ph[0] = cr3;
767         ph[1] = ci3; ph += l1ido;
768         VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
769         ph[0] = cr4;
770         ph[1] = ci4; ph = ph - 3*l1ido + 2;
771       }
772     }
773     if (ido % 2 == 1) return;
774   }
775   for (k=0; k < l1ido; k+=ido) {
776     int32_t i0 = 4*k + ido;
777     v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
778     v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
779     tr1 = VSUB(c,d);
780     tr2 = VADD(c,d);
781     ti1 = VADD(b,a);
782     ti2 = VSUB(b,a);
783     ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
784     ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
785     ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
786     ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
787   }
788 } /* radb4 */
789 
790 static void radf5_ps(int32_t ido, int32_t l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
791                      const float *wa1, const float *wa2, const float *wa3, const float *wa4)
792 {
793   static const float tr11 = .309016994374947f;
794   static const float ti11 = .951056516295154f;
795   static const float tr12 = -.809016994374947f;
796   static const float ti12 = .587785252292473f;
797 
798   /* System generated locals */
799   int32_t cc_offset, ch_offset;
800 
801   /* Local variables */
802   int32_t i, k, ic;
803   v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
804     cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
805   int32_t idp2;
806 
807 
808 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
809 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
810 
811   /* Parameter adjustments */
812   ch_offset = 1 + ido * 6;
813   ch -= ch_offset;
814   cc_offset = 1 + ido * (1 + l1);
815   cc -= cc_offset;
816 
817   /* Function Body */
818   for (k = 1; k <= l1; ++k) {
819     cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
820     ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
821     cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
822     ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
823     ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
824     ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
825     ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
826     ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
827     ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
828     //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
829   }
830   if (ido == 1) {
831     return;
832   }
833   idp2 = ido + 2;
834   for (k = 1; k <= l1; ++k) {
835     for (i = 3; i <= ido; i += 2) {
836       ic = idp2 - i;
837       dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
838       dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
839       dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
840       dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
841       VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
842       VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
843       VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
844       VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
845       cr2 = VADD(dr2, dr5);
846       ci5 = VSUB(dr5, dr2);
847       cr5 = VSUB(di2, di5);
848       ci2 = VADD(di2, di5);
849       cr3 = VADD(dr3, dr4);
850       ci4 = VSUB(dr4, dr3);
851       cr4 = VSUB(di3, di4);
852       ci3 = VADD(di3, di4);
853       ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
854       ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
855       tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
856       ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
857       tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
858       ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
859       tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
860       ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
861       tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
862       ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
863       ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
864       ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
865       ch_ref(i, 3, k) = VADD(ti2, ti5);
866       ch_ref(ic, 2, k) = VSUB(ti5, ti2);
867       ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
868       ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
869       ch_ref(i, 5, k) = VADD(ti3, ti4);
870       ch_ref(ic, 4, k) = VSUB(ti4, ti3);
871     }
872   }
873 #undef cc_ref
874 #undef ch_ref
875 } /* radf5 */
876 
877 static void radb5_ps(int32_t ido, int32_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
878                   const float *wa1, const float *wa2, const float *wa3, const float *wa4)
879 {
880   static const float tr11 = .309016994374947f;
881   static const float ti11 = .951056516295154f;
882   static const float tr12 = -.809016994374947f;
883   static const float ti12 = .587785252292473f;
884 
885   int32_t cc_offset, ch_offset;
886 
887   /* Local variables */
888   int32_t i, k, ic;
889   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
890     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
891   int32_t idp2;
892 
893 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
894 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
895 
896   /* Parameter adjustments */
897   ch_offset = 1 + ido * (1 + l1);
898   ch -= ch_offset;
899   cc_offset = 1 + ido * 6;
900   cc -= cc_offset;
901 
902   /* Function Body */
903   for (k = 1; k <= l1; ++k) {
904     ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
905     ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
906     tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
907     tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
908     ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
909     cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
910     cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
911     ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
912     ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
913     ch_ref(1, k, 2) = VSUB(cr2, ci5);
914     ch_ref(1, k, 3) = VSUB(cr3, ci4);
915     ch_ref(1, k, 4) = VADD(cr3, ci4);
916     ch_ref(1, k, 5) = VADD(cr2, ci5);
917   }
918   if (ido == 1) {
919     return;
920   }
921   idp2 = ido + 2;
922   for (k = 1; k <= l1; ++k) {
923     for (i = 3; i <= ido; i += 2) {
924       ic = idp2 - i;
925       ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
926       ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
927       ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
928       ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
929       tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
930       tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
931       tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
932       tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
933       ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
934       ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
935       cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
936       ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
937       cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
938       ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
939       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
940       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
941       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
942       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
943       dr3 = VSUB(cr3, ci4);
944       dr4 = VADD(cr3, ci4);
945       di3 = VADD(ci3, cr4);
946       di4 = VSUB(ci3, cr4);
947       dr5 = VADD(cr2, ci5);
948       dr2 = VSUB(cr2, ci5);
949       di5 = VSUB(ci2, cr5);
950       di2 = VADD(ci2, cr5);
951       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
952       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
953       VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
954       VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
955 
956       ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
957       ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
958       ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
959       ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
960     }
961   }
962 #undef cc_ref
963 #undef ch_ref
964 } /* radb5 */
965 
966 static NEVER_INLINE(v4sf *) rfftf1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
967                                       const float *wa, const int32_t *ifac) {
968   v4sf *in  = (v4sf*)input_readonly;
969   v4sf *out = (in == work2 ? work1 : work2);
970   int32_t nf = ifac[1], k1;
971   int32_t l2 = n;
972   int32_t iw = n-1;
973   assert(in != out && work1 != work2);
974   for (k1 = 1; k1 <= nf; ++k1) {
975     int32_t kh = nf - k1;
976     int32_t ip = ifac[kh + 2];
977     int32_t l1 = l2 / ip;
978     int32_t ido = n / l2;
979     iw -= (ip - 1)*ido;
980     switch (ip) {
981       case 5: {
982         int32_t ix2 = iw + ido;
983         int32_t ix3 = ix2 + ido;
984         int32_t ix4 = ix3 + ido;
985         radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
986       } break;
987       case 4: {
988         int32_t ix2 = iw + ido;
989         int32_t ix3 = ix2 + ido;
990         radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
991       } break;
992       case 3: {
993         int32_t ix2 = iw + ido;
994         radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
995       } break;
996       case 2:
997         radf2_ps(ido, l1, in, out, &wa[iw]);
998         break;
999       default:
1000         assert(0);
1001         break;
1002     }
1003     l2 = l1;
1004     if (out == work2) {
1005       out = work1; in = work2;
1006     } else {
1007       out = work2; in = work1;
1008     }
1009   }
1010   return in; /* this is in fact the output .. */
1011 } /* rfftf1 */
1012 
1013 static NEVER_INLINE(v4sf *) rfftb1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1014                                       const float *wa, const int32_t *ifac) {
1015   v4sf *in  = (v4sf*)input_readonly;
1016   v4sf *out = (in == work2 ? work1 : work2);
1017   int32_t nf = ifac[1], k1;
1018   int32_t l1 = 1;
1019   int32_t iw = 0;
1020   assert(in != out);
1021   for (k1=1; k1<=nf; k1++) {
1022     int32_t ip = ifac[k1 + 1];
1023     int32_t l2 = ip*l1;
1024     int32_t ido = n / l2;
1025     switch (ip) {
1026       case 5: {
1027         int32_t ix2 = iw + ido;
1028         int32_t ix3 = ix2 + ido;
1029         int32_t ix4 = ix3 + ido;
1030         radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1031       } break;
1032       case 4: {
1033         int32_t ix2 = iw + ido;
1034         int32_t ix3 = ix2 + ido;
1035         radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1036       } break;
1037       case 3: {
1038         int32_t ix2 = iw + ido;
1039         radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1040       } break;
1041       case 2:
1042         radb2_ps(ido, l1, in, out, &wa[iw]);
1043         break;
1044       default:
1045         assert(0);
1046         break;
1047     }
1048     l1 = l2;
1049     iw += (ip - 1)*ido;
1050 
1051     if (out == work2) {
1052       out = work1; in = work2;
1053     } else {
1054       out = work2; in = work1;
1055     }
1056   }
1057   return in; /* this is in fact the output .. */
1058 }
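/* Note: rfftf1_ps and rfftb1_ps ping-pong between work1 and work2 and return a
   pointer to whichever buffer holds the final result; a caller that needs the
   data in a specific buffer must copy it back. Sketch (hypothetical caller,
   'wanted' being one of the two work buffers):

     v4sf *res = rfftf1_ps(n, input, buf0, buf1, wa, ifac);
     if (res != wanted) memcpy(wanted, res, n * sizeof(v4sf));
*/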
1059 
1060 static int32_t decompose(int32_t n, int32_t *ifac, const int32_t *ntryh) {
1061   int32_t nl = n, nf = 0, i, j = 0;
1062   for (j=0; ntryh[j]; ++j) {
1063     int32_t ntry = ntryh[j];
1064     while (nl != 1) {
1065       int32_t nq = nl / ntry;
1066       int32_t nr = nl - ntry * nq;
1067       if (nr == 0) {
1068         ifac[2+nf++] = ntry;
1069         nl = nq;
1070         if (ntry == 2 && nf != 1) {
1071           for (i = 2; i <= nf; ++i) {
1072             int32_t ib = nf - i + 2;
1073             ifac[ib + 1] = ifac[ib];
1074           }
1075           ifac[2] = 2;
1076         }
1077       } else break;
1078     }
1079   }
1080   ifac[0] = n;
1081   ifac[1] = nf;
1082   return nf;
1083 }
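/* Worked example (for illustration): decompose(240, ifac, ntryh) with
   ntryh = {4,2,3,5,0} factors 240 as 4*4*3*5 and leaves
   ifac[0]=240, ifac[1]=4 (number of factors), ifac[2..5] = {4,4,3,5}. */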
1084 
1085 
1086 
1087 static void rffti1_ps(int32_t n, float *wa, int32_t *ifac)
1088 {
1089   static const int32_t ntryh[] = { 4,2,3,5,0 };
1090   int32_t k1, j, ii;
1091 
1092   int32_t nf = decompose(n,ifac,ntryh);
1093   float argh = (2*M_PI) / n;
1094   int32_t is = 0;
1095   int32_t nfm1 = nf - 1;
1096   int32_t l1 = 1;
1097   for (k1 = 1; k1 <= nfm1; k1++) {
1098     int32_t ip = ifac[k1 + 1];
1099     int32_t ld = 0;
1100     int32_t l2 = l1*ip;
1101     int32_t ido = n / l2;
1102     int32_t ipm = ip - 1;
1103     for (j = 1; j <= ipm; ++j) {
1104       float argld;
1105       int32_t i = is, fi=0;
1106       ld += l1;
1107       argld = ld*argh;
1108       for (ii = 3; ii <= ido; ii += 2) {
1109         i += 2;
1110         fi += 1;
1111         wa[i - 2] = cos(fi*argld);
1112         wa[i - 1] = sin(fi*argld);
1113       }
1114       is += ido;
1115     }
1116     l1 = l2;
1117   }
1118 } /* rffti1 */
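/* The wa[] table filled above holds the real-transform twiddle factors: for
   each radix stage it stores cos(fi*argld)/sin(fi*argld) pairs, i.e.
   cos(2*pi*ld*fi/n) and sin(2*pi*ld*fi/n), with argh = 2*pi/n as computed in
   rffti1_ps. */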
1119 
1120 void cffti1_ps(int32_t n, float *wa, int32_t *ifac)
1121 {
1122   static const int32_t ntryh[] = { 5,3,4,2,0 };
1123   int32_t k1, j, ii;
1124 
1125   int32_t nf = decompose(n,ifac,ntryh);
1126   float argh = (2*M_PI)/(float)n;
1127   int32_t i = 1;
1128   int32_t l1 = 1;
1129   for (k1=1; k1<=nf; k1++) {
1130     int32_t ip = ifac[k1+1];
1131     int32_t ld = 0;
1132     int32_t l2 = l1*ip;
1133     int32_t ido = n / l2;
1134     int32_t idot = ido + ido + 2;
1135     int32_t ipm = ip - 1;
1136     for (j=1; j<=ipm; j++) {
1137       float argld;
1138       int32_t i1 = i, fi = 0;
1139       wa[i-1] = 1;
1140       wa[i] = 0;
1141       ld += l1;
1142       argld = ld*argh;
1143       for (ii = 4; ii <= idot; ii += 2) {
1144         i += 2;
1145         fi += 1;
1146         wa[i-1] = cos(fi*argld);
1147         wa[i] = sin(fi*argld);
1148       }
1149       if (ip > 5) {
1150         wa[i1-1] = wa[i-1];
1151         wa[i1] = wa[i];
1152       }
1153     }
1154     l1 = l2;
1155   }
1156 } /* cffti1 */
1157 
1158 
1159 v4sf *cfftf1_ps(int32_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int32_t *ifac, int32_t isign) {
1160   v4sf *in  = (v4sf*)input_readonly;
1161   v4sf *out = (in == work2 ? work1 : work2);
1162   int32_t nf = ifac[1], k1;
1163   int32_t l1 = 1;
1164   int32_t iw = 0;
1165   assert(in != out && work1 != work2);
1166   for (k1=2; k1<=nf+1; k1++) {
1167     int32_t ip = ifac[k1];
1168     int32_t l2 = ip*l1;
1169     int32_t ido = n / l2;
1170     int32_t idot = ido + ido;
1171     switch (ip) {
1172       case 5: {
1173         int32_t ix2 = iw + idot;
1174         int32_t ix3 = ix2 + idot;
1175         int32_t ix4 = ix3 + idot;
1176         passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1177       } break;
1178       case 4: {
1179         int32_t ix2 = iw + idot;
1180         int32_t ix3 = ix2 + idot;
1181         passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1182       } break;
1183       case 2: {
1184         passf2_ps(idot, l1, in, out, &wa[iw], isign);
1185       } break;
1186       case 3: {
1187         int32_t ix2 = iw + idot;
1188         passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1189       } break;
1190       default:
1191         assert(0);
1192     }
1193     l1 = l2;
1194     iw += (ip - 1)*idot;
1195     if (out == work2) {
1196       out = work1; in = work2;
1197     } else {
1198       out = work2; in = work1;
1199     }
1200   }
1201 
1202   return in; /* this is in fact the output .. */
1203 }
1204 
1205 
1206 struct PFFFT_Setup {
1207   int32_t     N;
1208   int32_t     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1209   int32_t ifac[15];
1210   pffft_transform_t transform;
1211   v4sf *data; // allocated room for twiddle coefs
1212   float *e;    // points into 'data' , N/4*3 elements
1213   float *twiddle; // points into 'data', N/4 elements
1214 };
1215 
1216 PFFFT_Setup *pffft_new_setup(int32_t N, pffft_transform_t transform) {
1217   PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1218   int32_t k, m;
1219   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1220      and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1221      handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1222   if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1223   if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1224   //assert((N % 32) == 0);
1225   s->N = N;
1226   s->transform = transform;
1227   /* nb of complex simd vectors */
1228   s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1229   s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1230   s->e = (float*)s->data;
1231   s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1232 
1233   if (transform == PFFFT_REAL) {
1234     for (k=0; k < s->Ncvec; ++k) {
1235       int32_t i = k/SIMD_SZ;
1236       int32_t j = k%SIMD_SZ;
1237       for (m=0; m < SIMD_SZ-1; ++m) {
1238         float A = -2*M_PI*(m+1)*k / N;
1239         s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1240         s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1241       }
1242     }
1243     rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1244   } else {
1245     for (k=0; k < s->Ncvec; ++k) {
1246       int32_t i = k/SIMD_SZ;
1247       int32_t j = k%SIMD_SZ;
1248       for (m=0; m < SIMD_SZ-1; ++m) {
1249         float A = -2*M_PI*(m+1)*k / N;
1250         s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1251         s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1252       }
1253     }
1254     cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1255   }
1256 
1257   /* check that N is decomposable with allowed prime factors */
1258   for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1259   if (m != N/SIMD_SZ) {
1260     pffft_destroy_setup(s); s = 0;
1261   }
1262 
1263   return s;
1264 }
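/* Lifecycle sketch for a setup object (assumes the pffft_transform() entry
   point declared in pffft.h; names below are only an example): */
#if 0
static void example_real_fft(void) {
  int32_t n = 1024; /* must satisfy the size constraints checked above */
  PFFFT_Setup *s = pffft_new_setup(n, PFFFT_REAL);
  if (s) {
    float *in   = (float*)pffft_aligned_malloc(n * sizeof(float));
    float *out  = (float*)pffft_aligned_malloc(n * sizeof(float));
    float *work = (float*)pffft_aligned_malloc(n * sizeof(float));
    /* ... fill 'in', then: */
    pffft_transform(s, in, out, work, PFFFT_FORWARD); /* result in internal order */
    pffft_aligned_free(work);
    pffft_aligned_free(out);
    pffft_aligned_free(in);
    pffft_destroy_setup(s);
  }
}
#endif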
1265 
1266 
1267 void pffft_destroy_setup(PFFFT_Setup *s) {
1268   pffft_aligned_free(s->data);
1269   free(s);
1270 }
1271 
1272 #if !defined(PFFFT_SIMD_DISABLE)
1273 
1274 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1275 static void reversed_copy(int32_t N, const v4sf *in, int32_t in_stride, v4sf *out) {
1276   v4sf g0, g1;
1277   int32_t k;
1278   INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1279 
1280   *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1281   for (k=1; k < N; ++k) {
1282     v4sf h0, h1;
1283     INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1284     *--out = VSWAPHL(g1, h0);
1285     *--out = VSWAPHL(h0, h1);
1286     g1 = h1;
1287   }
1288   *--out = VSWAPHL(g1, g0);
1289 }
1290 
1291 static void unreversed_copy(int32_t N, const v4sf *in, v4sf *out, int32_t out_stride) {
1292   v4sf g0, g1, h0, h1;
1293   int32_t k;
1294   g0 = g1 = in[0]; ++in;
1295   for (k=1; k < N; ++k) {
1296     h0 = *in++; h1 = *in++;
1297     g1 = VSWAPHL(g1, h0);
1298     h0 = VSWAPHL(h0, h1);
1299     UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1300     g1 = h1;
1301   }
1302   h0 = *in++; h1 = g0;
1303   g1 = VSWAPHL(g1, h0);
1304   h0 = VSWAPHL(h0, h1);
1305   UNINTERLEAVE2(h0, g1, out[0], out[1]);
1306 }
1307 
1308 void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1309   int32_t k, N = setup->N, Ncvec = setup->Ncvec;
1310   const v4sf *vin = (const v4sf*)in;
1311   v4sf *vout = (v4sf*)out;
1312   assert(in != out);
1313   if (setup->transform == PFFFT_REAL) {
1314     int32_t k, dk = N/32;
1315     if (direction == PFFFT_FORWARD) {
1316       for (k=0; k < dk; ++k) {
1317         INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1318         INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1319       }
1320       reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1321       reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1322     } else {
1323       for (k=0; k < dk; ++k) {
1324         UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1325         UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1326       }
1327       unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1328       unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1329     }
1330   } else {
1331     if (direction == PFFFT_FORWARD) {
1332       for (k=0; k < Ncvec; ++k) {
1333         int32_t kk = (k/4) + (k%4)*(Ncvec/4);
1334         INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1335       }
1336     } else {
1337       for (k=0; k < Ncvec; ++k) {
1338         int32_t kk = (k/4) + (k%4)*(Ncvec/4);
1339         UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1340       }
1341     }
1342   }
1343 }
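/* Usage note (assumption, following the pffft.h documentation): after a
   pffft_transform(..., PFFFT_FORWARD) the spectrum is laid out in the internal
   SIMD-friendly order; pffft_zreorder(setup, freq, ordered, PFFFT_FORWARD)
   rewrites it in the canonical interleaved order, and calling it with
   PFFFT_BACKWARD performs the inverse permutation before an inverse transform. */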
1344 
1345 void pffft_cplx_finalize(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1346   int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1347   v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1348   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1349   assert(in != out);
1350   for (k=0; k < dk; ++k) {
1351     r0 = in[8*k+0]; i0 = in[8*k+1];
1352     r1 = in[8*k+2]; i1 = in[8*k+3];
1353     r2 = in[8*k+4]; i2 = in[8*k+5];
1354     r3 = in[8*k+6]; i3 = in[8*k+7];
1355     VTRANSPOSE4(r0,r1,r2,r3);
1356     VTRANSPOSE4(i0,i1,i2,i3);
1357     VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1358     VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1359     VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1360 
1361     sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1362     sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1363     si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1364     si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1365 
1366     /*
1367       transformation for each column is:
1368 
1369       [1   1   1   1   0   0   0   0]   [r0]
1370       [1   0  -1   0   0  -1   0   1]   [r1]
1371       [1  -1   1  -1   0   0   0   0]   [r2]
1372       [1   0  -1   0   0   1   0  -1]   [r3]
1373       [0   0   0   0   1   1   1   1] * [i0]
1374       [0   1   0  -1   1   0  -1   0]   [i1]
1375       [0   0   0   0   1  -1   1  -1]   [i2]
1376       [0  -1   0   1   1   0  -1   0]   [i3]
1377     */
1378 
1379     r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1380     r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1381     r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1382     r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1383 
1384     *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1385     *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1386   }
1387 }
1388 
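/* pffft_cplx_preprocess is the inverse of pffft_cplx_finalize: it undoes the
   butterfly (using conjugated twiddles) and the transpositions so the data can
   be fed back through the fftpack complex pass for the backward transform. */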
void pffft_cplx_preprocess(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  assert(in != out);
  for (k=0; k < dk; ++k) {
    r0 = in[8*k+0]; i0 = in[8*k+1];
    r1 = in[8*k+2]; i1 = in[8*k+3];
    r2 = in[8*k+4]; i2 = in[8*k+5];
    r3 = in[8*k+6]; i3 = in[8*k+7];

    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);

    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);

    VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
    VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
    VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);

    VTRANSPOSE4(r0,r1,r2,r3);
    VTRANSPOSE4(i0,i1,i2,i3);

    *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
    *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
  }
}

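/* Processes one 4x4 block of the real forward finalize step. in0 points at the
   vector saved from the end of the previous block (or at zero for the first
   block) and in1 at the first vector of the current block; the remaining six
   vectors are read through in. */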
static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
                            const v4sf *e, v4sf *out) {
  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
  r0 = *in0; i0 = *in1;
  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0  -1   0   0  -1   0   1]   [r1]
    [1   0  -1   0   0   1   0  -1]   [r2]
    [1  -1   1  -1   0   0   0   0]   [r3]
    [0   0   0   0   1   1   1   1] * [i0]
    [0  -1   0   1  -1   0   1   0]   [i1]
    [0  -1   0   1   1   0  -1   0]   [i2]
    [0   0   0   0  -1   1  -1   1]   [i3]
  */

  //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  VCPLXMUL(r1,i1,e[0],e[1]);
  VCPLXMUL(r2,i2,e[2],e[3]);
  VCPLXMUL(r3,i3,e[4],e[5]);

  //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
  //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";

  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);

  r0 = VADD(sr0, sr1);
  r3 = VSUB(sr0, sr1);
  i0 = VADD(si0, si1);
  i3 = VSUB(si1, si0);
  r1 = VADD(dr0, di1);
  r2 = VSUB(dr0, di1);
  i1 = VSUB(dr1, di0);
  i2 = VADD(dr1, di0);

  *out++ = r0;
  *out++ = i0;
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

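/* Assembles the full real forward spectrum. The bulk of the work is done block
   by block in pffft_real_finalize_4x4; the handful of bins that do not fit the
   4x4 pattern (lane 0 of the first eight output vectors) are patched up in
   scalar code using the small matrix shown in the comment below. */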
static NEVER_INLINE(void) pffft_real_finalize(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union cr, ci, *uout = (v4sf_union*)out;
  v4sf save = in[7], zero=VZERO();
  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
  static const float s = (float) M_SQRT2/2;

  cr.v = in[0]; ci.v = in[Ncvec*2-1];
  assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

  /*
    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

    [Xr(1)   ] [1   1   1   1   0   0   0   0]
    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
  */

  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;

  for (k=1; k < dk; ++k) {
    v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                           e + k*6, out + k*8);
    save = save_next;
  }
}

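/* Inverse of pffft_real_finalize_4x4, run on each 4x4 block before the backward
   real fftpack pass. When 'first' is non-zero the r0/i0 outputs are skipped:
   the corresponding values of the first block are produced by the scalar fixup
   in pffft_real_preprocess instead. */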
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                                             const v4sf *e, v4sf *out, int32_t first) {
  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
  /*
    transformation for each column is:

    [1   1   1   1   0   0   0   0]   [r0]
    [1   0   0  -1   0  -1  -1   0]   [r1]
    [1  -1  -1   1   0   0   0   0]   [r2]
    [1   0   0  -1   0   1   1   0]   [r3]
    [0   0   0   0   1  -1   1  -1] * [i0]
    [0  -1   1   0   1   0   0   1]   [i1]
    [0   0   0   0   1   1  -1  -1]   [i2]
    [0   1  -1   0   1   0   0   1]   [i3]
  */

  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

  r0 = VADD(sr0, sr1);
  r2 = VSUB(sr0, sr1);
  r1 = VSUB(dr0, si1);
  r3 = VADD(dr0, si1);
  i0 = VSUB(di0, di1);
  i2 = VADD(di0, di1);
  i1 = VSUB(si0, dr1);
  i3 = VADD(si0, dr1);

  VCPLXMULCONJ(r1,i1,e[0],e[1]);
  VCPLXMULCONJ(r2,i2,e[2],e[3]);
  VCPLXMULCONJ(r3,i3,e[4],e[5]);

  VTRANSPOSE4(r0,r1,r2,r3);
  VTRANSPOSE4(i0,i1,i2,i3);

  if (!first) {
    *out++ = r0;
    *out++ = i0;
  }
  *out++ = r1;
  *out++ = i1;
  *out++ = r2;
  *out++ = i2;
  *out++ = r3;
  *out++ = i3;
}

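/* Counterpart of pffft_real_finalize for the backward real transform: it turns
   the internal spectrum back into fftpack order before rfftb1_ps is called,
   handling the special first and last bins in scalar code around the 4x4
   blocks. */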
static NEVER_INLINE(void) pffft_real_preprocess(int32_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
  int32_t k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
  static const float s = (float) M_SQRT2;
  assert(in != out);
  for (k=0; k < 4; ++k) {
    Xr.f[k] = ((float*)in)[8*k];
    Xi.f[k] = ((float*)in)[8*k+4];
  }

  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

  /*
    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

    [cr0] [1   0   2   0   1   0   0   0]
    [cr1] [1   0   0   0  -1   0  -2   0]
    [cr2] [1   0  -2   0   1   0   0   0]
    [cr3] [1   0   0   0  -1   0   2   0]
    [ci0] [0   2   0   2   0   0   0   0]
    [ci1] [0   s   0  -s   0  -s   0  -s]
    [ci2] [0   0   0   0   0  -2   0   2]
    [ci3] [0  -s   0   s   0  -s   0  -s]
  */
  for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
  }

  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
}

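/* Core driver shared by pffft_transform and pffft_transform_ordered. Data
   ping-pongs between the output buffer and a scratch buffer (allocated on the
   stack when the caller passes NULL); 'ib' tracks which of the two currently
   holds valid data so that the result always ends up in foutput. */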
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                             pffft_direction_t direction, int32_t ordered) {
  int32_t k, Ncvec   = setup->Ncvec;
  int32_t nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int32_t stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput      = (v4sf*)foutput;
  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
  int32_t ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}

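/* Multiply the (unordered) spectra a and b and accumulate into ab, i.e.
   ab += scaling * (a * b) with complex element-wise multiplication. For the
   real transform the DC and Nyquist bins are stored as two purely real values,
   so they are fixed up separately at the end. Note that the NEON inline-asm
   path below is only compiled when ZCONVOLVE_USING_INLINE_ASM is defined;
   the __arm__ block defines ZCONVOLVE_USING_INLINE_NEON_ASM, so in practice
   the generic VCPLXMUL loop is used unless that macro is defined elsewhere. */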
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int32_t Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int32_t i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so this path is restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int32_t N = Ncvec;
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},   [%0,:128]!   \n"
               "vld1.f32    {q4,q5},   [%1,:128]!   \n"
               "vld1.f32    {q2,q3},   [%0,:128]!   \n"
               "vld1.f32    {q6,q7},   [%1,:128]!   \n"
               "vld1.f32    {q8,q9},   [r8,:128]!   \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8, q10, q15            \n"
               "vmla.f32    q9, q11, q15            \n"
               "vmla.f32    q0, q12, q15            \n"
               "vmla.f32    q1, q13, q15            \n"
               "vst1.f32    {q8,q9},[%2,:128]!      \n"
               "vst1.f32    {q0,q1},[%2,:128]!      \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}
#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int32_t k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                    pffft_direction_t direction, int32_t ordered) {
  int32_t Ncvec   = setup->Ncvec;
  int32_t nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int32_t stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int32_t ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int32_t k;
    // extra copy required -- this situation should only happen when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int32_t i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

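/* The two public entry points below differ only in the layout of the
   frequency-domain data: pffft_transform keeps the internal (unordered) layout,
   which is sufficient for convolution via pffft_zconvolve_accumulate, while
   pffft_transform_ordered pays for an extra reordering pass to produce the
   canonical order.

   A minimal usage sketch (assuming the public API declared in pffft.h):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);
     float *in   = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... fill 'in' with 1024 samples ...
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     // ... use the spectrum in 'out' ...
     pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/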
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}