1 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
2 
3    Based on original fortran 77 code from FFTPACKv4 from NETLIB
4    (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5    of NCAR, in 1985.
6 
7    As confirmed by the NCAR fftpack software curators, the following
8    FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9    released under the same terms.
10 
11    FFTPACK license:
12 
13    http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14 
15    Copyright (c) 2004 the University Corporation for Atmospheric
16    Research ("UCAR"). All rights reserved. Developed by NCAR's
17    Computational and Information Systems Laboratory, UCAR,
18    www.cisl.ucar.edu.
19 
20    Redistribution and use of the Software in source and binary forms,
21    with or without modification, is permitted provided that the
22    following conditions are met:
23 
24    - Neither the names of NCAR's Computational and Information Systems
25    Laboratory, the University Corporation for Atmospheric Research,
26    nor the names of its sponsors or contributors may be used to
27    endorse or promote products derived from this Software without
28    specific prior written permission.
29 
30    - Redistributions of source code must retain the above copyright
31    notices, this list of conditions, and the disclaimer below.
32 
33    - Redistributions in binary form must reproduce the above copyright
34    notice, this list of conditions, and the disclaimer below in the
35    documentation and/or other materials provided with the
36    distribution.
37 
38    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46    SOFTWARE.
47 
48 
49    PFFFT : a Pretty Fast FFT.
50 
51    This file is largely based on the original FFTPACK implementation, modified in
52    order to take advantage of SIMD instructions of modern CPUs.
53 */
54 
55 /*
56   ChangeLog:
57   - 2011/10/02, version 1: This is the very first release of this file.
58 */
59 
60 #include "pffft.hxx"
61 
62 #include <stdlib.h>
63 #include <stdio.h>
64 #include <math.h>
65 #include <assert.h>
66 
67 /* detect compiler flavour */
68 #if defined(_MSC_VER)
69 #  define COMPILER_MSVC
70 #elif defined(__GNUC__)
71 #  define COMPILER_GCC
72 #endif
73 
74 #if defined(COMPILER_GCC)
75 #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
76 #  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
77 #  define RESTRICT __restrict
78 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
79 #elif defined(COMPILER_MSVC)
80 #  define ALWAYS_INLINE(return_type) __forceinline return_type
81 #  define NEVER_INLINE(return_type) __declspec(noinline) return_type
82 #  define RESTRICT __restrict
83 #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
84 #endif
85 
86 
87 /*
88    vector support macros: the rest of the code is independent of
89    SSE/Altivec/NEON -- adding support for other platforms with 4-element
90    vectors should be limited to these macros
91 */
92 
93 
94 // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
95 //#define PFFFT_SIMD_DISABLE
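/* Porting note (informal sketch, not exhaustive): adding another 4-element SIMD
   backend below amounts to providing the same small set of definitions that the
   Altivec/SSE/NEON branches provide, roughly:

     typedef <native 4-float vector type> v4sf;
     #define SIMD_SZ 4
     #define VZERO() / VMUL(a,b) / VADD(a,b) / VMADD(a,b,c) / VSUB(a,b)
     #define LD_PS1(p)      -- broadcast of the scalar p to all 4 lanes
     #define INTERLEAVE2 / UNINTERLEAVE2 / VTRANSPOSE4 / VSWAPHL
     #define VALIGNED(ptr)  -- pointer alignment test used in the asserts

   validate_pffft_simd() further below exercises exactly this contract. */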
96 
97 /*
98    Altivec support macros
99 */
100 #if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
101 typedef vector float v4sf;
102 #  define SIMD_SZ 4
103 #  define VZERO() ((vector float) vec_splat_u8(0))
104 #  define VMUL(a,b) vec_madd(a,b, VZERO())
105 #  define VADD(a,b) vec_add(a,b)
106 #  define VMADD(a,b,c) vec_madd(a,b,c)
107 #  define VSUB(a,b) vec_sub(a,b)
108 inline v4sf ld_ps1(const float *p)
109 {
110 	v4sf v=vec_lde(0,p);
111 	return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0);
112 }
113 #  define LD_PS1(p) ld_ps1(&p)
114 #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
115 #  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
116     vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
117     vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
118     v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
119   }
120 #  define VTRANSPOSE4(x0,x1,x2,x3) {              \
121     v4sf y0 = vec_mergeh(x0, x2);               \
122     v4sf y1 = vec_mergel(x0, x2);               \
123     v4sf y2 = vec_mergeh(x1, x3);               \
124     v4sf y3 = vec_mergel(x1, x3);               \
125     x0 = vec_mergeh(y0, y2);                    \
126     x1 = vec_mergel(y0, y2);                    \
127     x2 = vec_mergeh(y1, y3);                    \
128     x3 = vec_mergel(y1, y3);                    \
129   }
130 #  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
131 #  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
132 
133 /*
134   SSE1 support macros
135 */
136 #elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
137 
138 #include <xmmintrin.h>
139 typedef __m128 v4sf;
140 #  define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
141 #  define VZERO() _mm_setzero_ps()
142 #  define VMUL(a,b) _mm_mul_ps(a,b)
143 #  define VADD(a,b) _mm_add_ps(a,b)
144 #  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
145 #  define VSUB(a,b) _mm_sub_ps(a,b)
146 #  define LD_PS1(p) _mm_set1_ps(p)
147 #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
148 #  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
149 #  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
150 #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
151 #  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
152 
153 /*
154   ARM NEON support macros
155 */
156 #elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
157 #  include <arm_neon.h>
158 typedef float32x4_t v4sf;
159 #  define SIMD_SZ 4
160 #  define VZERO() vdupq_n_f32(0)
161 #  define VMUL(a,b) vmulq_f32(a,b)
162 #  define VADD(a,b) vaddq_f32(a,b)
163 #  define VMADD(a,b,c) vmlaq_f32(c,a,b)
164 #  define VSUB(a,b) vsubq_f32(a,b)
165 #  define LD_PS1(p) vld1q_dup_f32(&(p))
166 #  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
167 #  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
168 #  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
169     float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
170     float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
171     float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
172     float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
173     x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
174   }
175 // marginally faster version
176 //#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
177 #  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
178 #  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
179 #else
180 #  if !defined(PFFFT_SIMD_DISABLE)
181 #    warning "building with simd disabled !\n";
182 #    define PFFFT_SIMD_DISABLE // fallback to scalar code
183 #  endif
184 #endif
185 
186 // fallback mode for situations where SSE/Altivec are not available, use scalar mode instead
187 #ifdef PFFFT_SIMD_DISABLE
188 typedef float v4sf;
189 #  define SIMD_SZ 1
190 #  define VZERO() 0.f
191 #  define VMUL(a,b) ((a)*(b))
192 #  define VADD(a,b) ((a)+(b))
193 #  define VMADD(a,b,c) ((a)*(b)+(c))
194 #  define VSUB(a,b) ((a)-(b))
195 #  define LD_PS1(p) (p)
196 #  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
197 #endif
198 
199 // shortcuts for complex multiplications
200 #define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
201 #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
202 #ifndef SVMUL
203 // multiply a scalar with a vector
204 #define SVMUL(f,v) VMUL(LD_PS1(f),v)
205 #endif
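/* For reference, VCPLXMUL computes the usual complex product in place,
     (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ai*br + ar*bi),
   and VCPLXMULCONJ multiplies by the conjugate of (br,bi):
     (ar + i*ai) * (br - i*bi) = (ar*br + ai*bi) + i*(ai*br - ar*bi).
   Scalar example: VCPLXMUL with (ar,ai)=(1,2), (br,bi)=(3,4) leaves
   ar = 1*3 - 2*4 = -5 and ai = 2*3 + 1*4 = 10, i.e. (1+2i)(3+4i) = -5+10i. */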
206 
207 #if !defined(PFFFT_SIMD_DISABLE)
208 typedef union v4sf_union {
209 	v4sf  v;
210 	float f[4];
211 } v4sf_union;
212 
213 #include <string.h>
214 
215 #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
216 
217 /* detect bugs with the vector support macros */
218 void validate_pffft_simd()
219 {
220 	float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
221 	v4sf_union a0, a1, a2, a3, t, u;
222 	memcpy(a0.f, f, 4*sizeof(float));
223 	memcpy(a1.f, f+4, 4*sizeof(float));
224 	memcpy(a2.f, f+8, 4*sizeof(float));
225 	memcpy(a3.f, f+12, 4*sizeof(float));
226 
227 	t = a0;
228 	u = a1;
229 	t.v = VZERO();
230 	printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
231 	assertv4(t, 0, 0, 0, 0);
232 	t.v = VADD(a1.v, a2.v);
233 	printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
234 	assertv4(t, 12, 14, 16, 18);
235 	t.v = VMUL(a1.v, a2.v);
236 	printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
237 	assertv4(t, 32, 45, 60, 77);
238 	t.v = VMADD(a1.v, a2.v,a0.v);
239 	printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
240 	assertv4(t, 32, 46, 62, 80);
241 
242 	INTERLEAVE2(a1.v,a2.v,t.v,u.v);
243 	printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
244 	assertv4(t, 4, 8, 5, 9);
245 	assertv4(u, 6, 10, 7, 11);
246 	UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
247 	printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
248 	assertv4(t, 4, 6, 8, 10);
249 	assertv4(u, 5, 7, 9, 11);
250 
251 	t.v=LD_PS1(f[15]);
252 	printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
253 	assertv4(t, 15, 15, 15, 15);
254 	t.v = VSWAPHL(a1.v, a2.v);
255 	printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
256 	assertv4(t, 8, 9, 6, 7);
257 	VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
258 	printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
259 	       a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
260 	       a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
261 	assertv4(a0, 0, 4, 8, 12);
262 	assertv4(a1, 1, 5, 9, 13);
263 	assertv4(a2, 2, 6, 10, 14);
264 	assertv4(a3, 3, 7, 11, 15);
265 }
266 #endif //!PFFFT_SIMD_DISABLE
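/* Usage sketch (this self-test is not called automatically): when SIMD is
   enabled it can be run once at startup of a debug build to catch a broken
   macro port early, e.g.

     #if !defined(PFFFT_SIMD_DISABLE) && !defined(NDEBUG)
     validate_pffft_simd();   // aborts via assert() on any mismatch
     #endif
*/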
267 
268 /* SSE and co like 16-bytes aligned pointers */
269 #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
270 void *pffft_aligned_malloc(size_t nb_bytes)
271 {
272 	void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
273 	if (!p0) return (void *) 0;
274 	p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
275 	*((void **) p - 1) = p0;
276 	return p;
277 }
278 
279 void pffft_aligned_free(void *p)
280 {
281 	if (p) free(*((void **) p - 1));
282 }
283 
284 int pffft_simd_size()
285 {
286 	return SIMD_SZ;
287 }
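/* Usage sketch: buffers passed to the transform routines are expected to be
   16-byte aligned (see the comment above), which pffft_aligned_malloc
   guarantees, e.g.

     float *buf = (float*)pffft_aligned_malloc(n_floats * sizeof(float));
     if (buf) {
         // ... use buf ...
         pffft_aligned_free(buf);   // must be freed with the matching function
     }

   (n_floats is just a placeholder for the number of floats needed.) */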
288 
289 /*
290   passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
291 */
292 static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign)
293 {
294 	int k, i;
295 	int l1ido = l1*ido;
296 	if (ido <= 2) {
297 		for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
298 			ch[0]         = VADD(cc[0], cc[ido+0]);
299 			ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
300 			ch[1]         = VADD(cc[1], cc[ido+1]);
301 			ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
302 		}
303 	} else {
304 		for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
305 			for (i=0; i<ido-1; i+=2) {
306 				v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
307 				v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
308 				v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
309 				ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
310 				ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
311 				VCPLXMUL(tr2, ti2, wr, wi);
312 				ch[i+l1ido]   = tr2;
313 				ch[i+l1ido+1] = ti2;
314 			}
315 		}
316 	}
317 }
318 
319 /*
320   passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
321 */
322 static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
323                                     const float *wa1, const float *wa2, float fsign)
324 {
325 	static const float taur = -0.5f;
326 	float taui = 0.866025403784439f*fsign;
327 	int i, k;
328 	v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
329 	int l1ido = l1*ido;
330 	float wr1, wi1, wr2, wi2;
331 	assert(ido > 2);
332 	for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
333 		for (i=0; i<ido-1; i+=2) {
334 			tr2 = VADD(cc[i+ido], cc[i+2*ido]);
335 			cr2 = VADD(cc[i], SVMUL(taur,tr2));
336 			ch[i]    = VADD(cc[i], tr2);
337 			ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
338 			ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
339 			ch[i+1]  = VADD(cc[i+1], ti2);
340 			cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
341 			ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
342 			dr2 = VSUB(cr2, ci3);
343 			dr3 = VADD(cr2, ci3);
344 			di2 = VADD(ci2, cr3);
345 			di3 = VSUB(ci2, cr3);
346 			wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
347 			VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
348 			ch[i+l1ido] = dr2;
349 			ch[i+l1ido + 1] = di2;
350 			VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
351 			ch[i+2*l1ido] = dr3;
352 			ch[i+2*l1ido+1] = di3;
353 		}
354 	}
355 } /* passf3 */
356 
357 static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
358                                     const float *wa1, const float *wa2, const float *wa3, float fsign)
359 {
360 	/* isign == -1 for forward transform and +1 for backward transform */
361 
362 	int i, k;
363 	v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
364 	int l1ido = l1*ido;
365 	if (ido == 2) {
366 		for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
367 			tr1 = VSUB(cc[0], cc[2*ido + 0]);
368 			tr2 = VADD(cc[0], cc[2*ido + 0]);
369 			ti1 = VSUB(cc[1], cc[2*ido + 1]);
370 			ti2 = VADD(cc[1], cc[2*ido + 1]);
371 			ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
372 			tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
373 			tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
374 			ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
375 
376 			ch[0*l1ido + 0] = VADD(tr2, tr3);
377 			ch[0*l1ido + 1] = VADD(ti2, ti3);
378 			ch[1*l1ido + 0] = VADD(tr1, tr4);
379 			ch[1*l1ido + 1] = VADD(ti1, ti4);
380 			ch[2*l1ido + 0] = VSUB(tr2, tr3);
381 			ch[2*l1ido + 1] = VSUB(ti2, ti3);
382 			ch[3*l1ido + 0] = VSUB(tr1, tr4);
383 			ch[3*l1ido + 1] = VSUB(ti1, ti4);
384 		}
385 	} else {
386 		for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
387 			for (i=0; i<ido-1; i+=2) {
388 				float wr1, wi1, wr2, wi2, wr3, wi3;
389 				tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
390 				tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
391 				ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
392 				ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
393 				tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
394 				ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
395 				tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
396 				ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
397 
398 				ch[i] = VADD(tr2, tr3);
399 				cr3    = VSUB(tr2, tr3);
400 				ch[i + 1] = VADD(ti2, ti3);
401 				ci3 = VSUB(ti2, ti3);
402 
403 				cr2 = VADD(tr1, tr4);
404 				cr4 = VSUB(tr1, tr4);
405 				ci2 = VADD(ti1, ti4);
406 				ci4 = VSUB(ti1, ti4);
407 				wr1=wa1[i], wi1=fsign*wa1[i+1];
408 				VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
409 				wr2=wa2[i], wi2=fsign*wa2[i+1];
410 				ch[i + l1ido] = cr2;
411 				ch[i + l1ido + 1] = ci2;
412 
413 				VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
414 				wr3=wa3[i], wi3=fsign*wa3[i+1];
415 				ch[i + 2*l1ido] = cr3;
416 				ch[i + 2*l1ido + 1] = ci3;
417 
418 				VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
419 				ch[i + 3*l1ido] = cr4;
420 				ch[i + 3*l1ido + 1] = ci4;
421 			}
422 		}
423 	}
424 } /* passf4 */
425 
426 /*
427   passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
428 */
429 static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
430                                     const float *wa1, const float *wa2,
431                                     const float *wa3, const float *wa4, float fsign)
432 {
433 	static const float tr11 = .309016994374947f;
434 	const float ti11 = .951056516295154f*fsign;
435 	static const float tr12 = -.809016994374947f;
436 	const float ti12 = .587785252292473f*fsign;
437 
438 	/* Local variables */
439 	int i, k;
440 	v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
441 	     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
442 
443 	float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
444 
445 #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
446 #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
447 
448 	assert(ido > 2);
449 	for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
450 		for (i = 0; i < ido-1; i += 2) {
451 			ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
452 			ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
453 			ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
454 			ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
455 			tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
456 			tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
457 			tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
458 			tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
459 			ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
460 			ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
461 			cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
462 			ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
463 			cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
464 			ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
465 			cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
466 			ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
467 			cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
468 			ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
469 			dr3 = VSUB(cr3, ci4);
470 			dr4 = VADD(cr3, ci4);
471 			di3 = VADD(ci3, cr4);
472 			di4 = VSUB(ci3, cr4);
473 			dr5 = VADD(cr2, ci5);
474 			dr2 = VSUB(cr2, ci5);
475 			di5 = VSUB(ci2, cr5);
476 			di2 = VADD(ci2, cr5);
477 			wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
478 			wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
479 			VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
480 			ch_ref(i - 1, 2) = dr2;
481 			ch_ref(i, 2)     = di2;
482 			VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
483 			ch_ref(i - 1, 3) = dr3;
484 			ch_ref(i, 3)     = di3;
485 			VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
486 			ch_ref(i - 1, 4) = dr4;
487 			ch_ref(i, 4)     = di4;
488 			VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
489 			ch_ref(i - 1, 5) = dr5;
490 			ch_ref(i, 5)     = di5;
491 		}
492 	}
493 #undef ch_ref
494 #undef cc_ref
495 }
496 
497 static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1)
498 {
499 	static const float minus_one = -1.f;
500 	int i, k, l1ido = l1*ido;
501 	for (k=0; k < l1ido; k += ido) {
502 		v4sf a = cc[k], b = cc[k + l1ido];
503 		ch[2*k] = VADD(a, b);
504 		ch[2*(k+ido)-1] = VSUB(a, b);
505 	}
506 	if (ido < 2) return;
507 	if (ido != 2) {
508 		for (k=0; k < l1ido; k += ido) {
509 			for (i=2; i<ido; i+=2) {
510 				v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
511 				v4sf br = cc[i - 1 + k], bi = cc[i + k];
512 				VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
513 				ch[i + 2*k] = VADD(bi, ti2);
514 				ch[2*(k+ido) - i] = VSUB(ti2, bi);
515 				ch[i - 1 + 2*k] = VADD(br, tr2);
516 				ch[2*(k+ido) - i -1] = VSUB(br, tr2);
517 			}
518 		}
519 		if (ido % 2 == 1) return;
520 	}
521 	for (k=0; k < l1ido; k += ido) {
522 		ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
523 		ch[2*k + ido-1] = cc[k + ido-1];
524 	}
525 } /* radf2 */
526 
527 
528 static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1)
529 {
530 	static const float minus_two=-2;
531 	int i, k, l1ido = l1*ido;
532 	v4sf a,b,c,d, tr2, ti2;
533 	for (k=0; k < l1ido; k += ido) {
534 		a = cc[2*k];
535 		b = cc[2*(k+ido) - 1];
536 		ch[k] = VADD(a, b);
537 		ch[k + l1ido] =VSUB(a, b);
538 	}
539 	if (ido < 2) return;
540 	if (ido != 2) {
541 		for (k = 0; k < l1ido; k += ido) {
542 			for (i = 2; i < ido; i += 2) {
543 				a = cc[i-1 + 2*k];
544 				b = cc[2*(k + ido) - i - 1];
545 				c = cc[i+0 + 2*k];
546 				d = cc[2*(k + ido) - i + 0];
547 				ch[i-1 + k] = VADD(a, b);
548 				tr2 = VSUB(a, b);
549 				ch[i+0 + k] = VSUB(c, d);
550 				ti2 = VADD(c, d);
551 				VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
552 				ch[i-1 + k + l1ido] = tr2;
553 				ch[i+0 + k + l1ido] = ti2;
554 			}
555 		}
556 		if (ido % 2 == 1) return;
557 	}
558 	for (k = 0; k < l1ido; k += ido) {
559 		a = cc[2*k + ido-1];
560 		b = cc[2*k + ido];
561 		ch[k + ido-1] = VADD(a,a);
562 		ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
563 	}
564 } /* radb2 */
565 
566 static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
567                      const float *wa1, const float *wa2)
568 {
569 	static const float taur = -0.5f;
570 	static const float taui = 0.866025403784439f;
571 	int i, k, ic;
572 	v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
573 	for (k=0; k<l1; k++) {
574 		cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
575 		ch[3*k*ido] = VADD(cc[k*ido], cr2);
576 		ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
577 		ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
578 	}
579 	if (ido == 1) return;
580 	for (k=0; k<l1; k++) {
581 		for (i=2; i<ido; i+=2) {
582 			ic = ido - i;
583 			wr1 = LD_PS1(wa1[i - 2]);
584 			wi1 = LD_PS1(wa1[i - 1]);
585 			dr2 = cc[i - 1 + (k + l1)*ido];
586 			di2 = cc[i + (k + l1)*ido];
587 			VCPLXMULCONJ(dr2, di2, wr1, wi1);
588 
589 			wr2 = LD_PS1(wa2[i - 2]);
590 			wi2 = LD_PS1(wa2[i - 1]);
591 			dr3 = cc[i - 1 + (k + l1*2)*ido];
592 			di3 = cc[i + (k + l1*2)*ido];
593 			VCPLXMULCONJ(dr3, di3, wr2, wi2);
594 
595 			cr2 = VADD(dr2, dr3);
596 			ci2 = VADD(di2, di3);
597 			ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
598 			ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
599 			tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
600 			ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
601 			tr3 = SVMUL(taui, VSUB(di2, di3));
602 			ti3 = SVMUL(taui, VSUB(dr3, dr2));
603 			ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
604 			ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
605 			ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
606 			ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
607 		}
608 	}
609 } /* radf3 */
610 
611 
612 static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
613                      const float *wa1, const float *wa2)
614 {
615 	static const float taur = -0.5f;
616 	static const float taui = 0.866025403784439f;
617 	static const float taui_2 = 0.866025403784439f*2;
618 	int i, k, ic;
619 	v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
620 	for (k=0; k<l1; k++) {
621 		tr2 = cc[ido-1 + (3*k + 1)*ido];
622 		tr2 = VADD(tr2,tr2);
623 		cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
624 		ch[k*ido] = VADD(cc[3*k*ido], tr2);
625 		ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
626 		ch[(k + l1)*ido] = VSUB(cr2, ci3);
627 		ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
628 	}
629 	if (ido == 1) return;
630 	for (k=0; k<l1; k++) {
631 		for (i=2; i<ido; i+=2) {
632 			ic = ido - i;
633 			tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
634 			cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
635 			ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
636 			ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
637 			ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
638 			ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
639 			cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
640 			ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
641 			dr2 = VSUB(cr2, ci3);
642 			dr3 = VADD(cr2, ci3);
643 			di2 = VADD(ci2, cr3);
644 			di3 = VSUB(ci2, cr3);
645 			VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
646 			ch[i - 1 + (k + l1)*ido] = dr2;
647 			ch[i + (k + l1)*ido] = di2;
648 			VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
649 			ch[i - 1 + (k + 2*l1)*ido] = dr3;
650 			ch[i + (k + 2*l1)*ido] = di3;
651 		}
652 	}
653 } /* radb3 */
654 
655 static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
656                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
657 {
658 	static const float minus_hsqt2 = (float)-0.7071067811865475;
659 	int i, k, l1ido = l1*ido;
660 	{
661 		const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
662 		v4sf * RESTRICT ch_ = ch;
663 		while (cc < cc_end) {
664 			// this loop represents between 25% and 40% of total radf4_ps cost !
665 			v4sf a0 = cc[0], a1 = cc[l1ido];
666 			v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
667 			v4sf tr1 = VADD(a1, a3);
668 			v4sf tr2 = VADD(a0, a2);
669 			ch[2*ido-1] = VSUB(a0, a2);
670 			ch[2*ido  ] = VSUB(a3, a1);
671 			ch[0      ] = VADD(tr1, tr2);
672 			ch[4*ido-1] = VSUB(tr2, tr1);
673 			cc += ido;
674 			ch += 4*ido;
675 		}
676 		cc = cc_;
677 		ch = ch_;
678 	}
679 	if (ido < 2) return;
680 	if (ido != 2) {
681 		for (k = 0; k < l1ido; k += ido) {
682 			const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
683 			for (i=2; i<ido; i += 2, pc += 2) {
684 				int ic = ido - i;
685 				v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
686 				v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
687 
688 				cr2 = pc[1*l1ido+0];
689 				ci2 = pc[1*l1ido+1];
690 				wr=LD_PS1(wa1[i - 2]);
691 				wi=LD_PS1(wa1[i - 1]);
692 				VCPLXMULCONJ(cr2,ci2,wr,wi);
693 
694 				cr3 = pc[2*l1ido+0];
695 				ci3 = pc[2*l1ido+1];
696 				wr = LD_PS1(wa2[i-2]);
697 				wi = LD_PS1(wa2[i-1]);
698 				VCPLXMULCONJ(cr3, ci3, wr, wi);
699 
700 				cr4 = pc[3*l1ido];
701 				ci4 = pc[3*l1ido+1];
702 				wr = LD_PS1(wa3[i-2]);
703 				wi = LD_PS1(wa3[i-1]);
704 				VCPLXMULCONJ(cr4, ci4, wr, wi);
705 
706 				/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
707 
708 				tr1 = VADD(cr2,cr4);
709 				tr4 = VSUB(cr4,cr2);
710 				tr2 = VADD(pc[0],cr3);
711 				tr3 = VSUB(pc[0],cr3);
712 				ch[i - 1 + 4*k] = VADD(tr1,tr2);
713 				ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
714 				ti1 = VADD(ci2,ci4);
715 				ti4 = VSUB(ci2,ci4);
716 				ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
717 				ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
718 				ti2 = VADD(pc[1],ci3);
719 				ti3 = VSUB(pc[1],ci3);
720 				ch[i + 4*k] = VADD(ti1, ti2);
721 				ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
722 				ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
723 				ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
724 			}
725 		}
726 		if (ido % 2 == 1) return;
727 	}
728 	for (k=0; k<l1ido; k += ido) {
729 		v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
730 		v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
731 		v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
732 		v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
733 		ch[ido-1 + 4*k] = VADD(tr1, c);
734 		ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
735 		ch[4*k + 1*ido] = VSUB(ti1, d);
736 		ch[4*k + 3*ido] = VADD(ti1, d);
737 	}
738 } /* radf4 */
739 
740 
741 static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
742                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
743 {
744 	static const float minus_sqrt2 = (float)-1.414213562373095;
745 	static const float two = 2.f;
746 	int i, k, l1ido = l1*ido;
747 	v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
748 	{
749 		const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
750 		v4sf *ch_ = ch;
751 		while (ch < ch_end) {
752 			v4sf a = cc[0], b = cc[4*ido-1];
753 			v4sf c = cc[2*ido], d = cc[2*ido-1];
754 			tr3 = SVMUL(two,d);
755 			tr2 = VADD(a,b);
756 			tr1 = VSUB(a,b);
757 			tr4 = SVMUL(two,c);
758 			ch[0*l1ido] = VADD(tr2, tr3);
759 			ch[2*l1ido] = VSUB(tr2, tr3);
760 			ch[1*l1ido] = VSUB(tr1, tr4);
761 			ch[3*l1ido] = VADD(tr1, tr4);
762 
763 			cc += 4*ido;
764 			ch += ido;
765 		}
766 		cc = cc_;
767 		ch = ch_;
768 	}
769 	if (ido < 2) return;
770 	if (ido != 2) {
771 		for (k = 0; k < l1ido; k += ido) {
772 			const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
773 			v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
774 			for (i = 2; i < ido; i += 2) {
775 
776 				tr1 = VSUB(pc[i], pc[4*ido - i]);
777 				tr2 = VADD(pc[i], pc[4*ido - i]);
778 				ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
779 				tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
780 				ph[0] = VADD(tr2, tr3);
781 				cr3 = VSUB(tr2, tr3);
782 
783 				ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
784 				tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
785 				cr2 = VSUB(tr1, tr4);
786 				cr4 = VADD(tr1, tr4);
787 
788 				ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
789 				ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
790 
791 				ph[1] = VADD(ti2, ti3);
792 				ph += l1ido;
793 				ci3 = VSUB(ti2, ti3);
794 				ci2 = VADD(ti1, ti4);
795 				ci4 = VSUB(ti1, ti4);
796 				VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
797 				ph[0] = cr2;
798 				ph[1] = ci2;
799 				ph += l1ido;
800 				VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
801 				ph[0] = cr3;
802 				ph[1] = ci3;
803 				ph += l1ido;
804 				VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
805 				ph[0] = cr4;
806 				ph[1] = ci4;
807 				ph = ph - 3*l1ido + 2;
808 			}
809 		}
810 		if (ido % 2 == 1) return;
811 	}
812 	for (k=0; k < l1ido; k+=ido) {
813 		int i0 = 4*k + ido;
814 		v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
815 		v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
816 		tr1 = VSUB(c,d);
817 		tr2 = VADD(c,d);
818 		ti1 = VADD(b,a);
819 		ti2 = VSUB(b,a);
820 		ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
821 		ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
822 		ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
823 		ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
824 	}
825 } /* radb4 */
826 
827 static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
828                      const float *wa1, const float *wa2, const float *wa3, const float *wa4)
829 {
830 	static const float tr11 = .309016994374947f;
831 	static const float ti11 = .951056516295154f;
832 	static const float tr12 = -.809016994374947f;
833 	static const float ti12 = .587785252292473f;
834 
835 	/* System generated locals */
836 	int cc_offset, ch_offset;
837 
838 	/* Local variables */
839 	int i, k, ic;
840 	v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
841 	     cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
842 	int idp2;
843 
844 
845 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
846 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
847 
848 	/* Parameter adjustments */
849 	ch_offset = 1 + ido * 6;
850 	ch -= ch_offset;
851 	cc_offset = 1 + ido * (1 + l1);
852 	cc -= cc_offset;
853 
854 	/* Function Body */
855 	for (k = 1; k <= l1; ++k) {
856 		cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
857 		ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
858 		cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
859 		ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
860 		ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
861 		ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
862 		ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
863 		ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
864 		ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
865 		//printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
866 	}
867 	if (ido == 1) {
868 		return;
869 	}
870 	idp2 = ido + 2;
871 	for (k = 1; k <= l1; ++k) {
872 		for (i = 3; i <= ido; i += 2) {
873 			ic = idp2 - i;
874 			dr2 = LD_PS1(wa1[i-3]);
875 			di2 = LD_PS1(wa1[i-2]);
876 			dr3 = LD_PS1(wa2[i-3]);
877 			di3 = LD_PS1(wa2[i-2]);
878 			dr4 = LD_PS1(wa3[i-3]);
879 			di4 = LD_PS1(wa3[i-2]);
880 			dr5 = LD_PS1(wa4[i-3]);
881 			di5 = LD_PS1(wa4[i-2]);
882 			VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
883 			VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
884 			VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
885 			VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
886 			cr2 = VADD(dr2, dr5);
887 			ci5 = VSUB(dr5, dr2);
888 			cr5 = VSUB(di2, di5);
889 			ci2 = VADD(di2, di5);
890 			cr3 = VADD(dr3, dr4);
891 			ci4 = VSUB(dr4, dr3);
892 			cr4 = VSUB(di3, di4);
893 			ci3 = VADD(di3, di4);
894 			ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
895 			ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
896 			tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
897 			ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
898 			tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
899 			ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
900 			tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
901 			ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
902 			tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
903 			ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
904 			ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
905 			ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
906 			ch_ref(i, 3, k) = VADD(ti2, ti5);
907 			ch_ref(ic, 2, k) = VSUB(ti5, ti2);
908 			ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
909 			ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
910 			ch_ref(i, 5, k) = VADD(ti3, ti4);
911 			ch_ref(ic, 4, k) = VSUB(ti4, ti3);
912 		}
913 	}
914 #undef cc_ref
915 #undef ch_ref
916 } /* radf5 */
917 
918 static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
919                      const float *wa1, const float *wa2, const float *wa3, const float *wa4)
920 {
921 	static const float tr11 = .309016994374947f;
922 	static const float ti11 = .951056516295154f;
923 	static const float tr12 = -.809016994374947f;
924 	static const float ti12 = .587785252292473f;
925 
926 	int cc_offset, ch_offset;
927 
928 	/* Local variables */
929 	int i, k, ic;
930 	v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
931 	     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
932 	int idp2;
933 
934 #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
935 #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
936 
937 	/* Parameter adjustments */
938 	ch_offset = 1 + ido * (1 + l1);
939 	ch -= ch_offset;
940 	cc_offset = 1 + ido * 6;
941 	cc -= cc_offset;
942 
943 	/* Function Body */
944 	for (k = 1; k <= l1; ++k) {
945 		ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
946 		ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
947 		tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
948 		tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
949 		ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
950 		cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
951 		cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
952 		ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
953 		ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
954 		ch_ref(1, k, 2) = VSUB(cr2, ci5);
955 		ch_ref(1, k, 3) = VSUB(cr3, ci4);
956 		ch_ref(1, k, 4) = VADD(cr3, ci4);
957 		ch_ref(1, k, 5) = VADD(cr2, ci5);
958 	}
959 	if (ido == 1) {
960 		return;
961 	}
962 	idp2 = ido + 2;
963 	for (k = 1; k <= l1; ++k) {
964 		for (i = 3; i <= ido; i += 2) {
965 			ic = idp2 - i;
966 			ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
967 			ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
968 			ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
969 			ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
970 			tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
971 			tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
972 			tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
973 			tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
974 			ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
975 			ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
976 			cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
977 			ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
978 			cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
979 			ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
980 			cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
981 			ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
982 			cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
983 			ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
984 			dr3 = VSUB(cr3, ci4);
985 			dr4 = VADD(cr3, ci4);
986 			di3 = VADD(ci3, cr4);
987 			di4 = VSUB(ci3, cr4);
988 			dr5 = VADD(cr2, ci5);
989 			dr2 = VSUB(cr2, ci5);
990 			di5 = VSUB(ci2, cr5);
991 			di2 = VADD(ci2, cr5);
992 			VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
993 			VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
994 			VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
995 			VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
996 
997 			ch_ref(i-1, k, 2) = dr2;
998 			ch_ref(i, k, 2) = di2;
999 			ch_ref(i-1, k, 3) = dr3;
1000 			ch_ref(i, k, 3) = di3;
1001 			ch_ref(i-1, k, 4) = dr4;
1002 			ch_ref(i, k, 4) = di4;
1003 			ch_ref(i-1, k, 5) = dr5;
1004 			ch_ref(i, k, 5) = di5;
1005 		}
1006 	}
1007 #undef cc_ref
1008 #undef ch_ref
1009 } /* radb5 */
1010 
1011 static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1012                                       const float *wa, const int *ifac)
1013 {
1014 	v4sf *in  = (v4sf*)input_readonly;
1015 	v4sf *out = (in == work2 ? work1 : work2);
1016 	int nf = ifac[1], k1;
1017 	int l2 = n;
1018 	int iw = n-1;
1019 	assert(in != out && work1 != work2);
1020 	for (k1 = 1; k1 <= nf; ++k1) {
1021 		int kh = nf - k1;
1022 		int ip = ifac[kh + 2];
1023 		int l1 = l2 / ip;
1024 		int ido = n / l2;
1025 		iw -= (ip - 1)*ido;
1026 		switch (ip) {
1027 		case 5: {
1028 			int ix2 = iw + ido;
1029 			int ix3 = ix2 + ido;
1030 			int ix4 = ix3 + ido;
1031 			radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1032 		}
1033 		break;
1034 		case 4: {
1035 			int ix2 = iw + ido;
1036 			int ix3 = ix2 + ido;
1037 			radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1038 		}
1039 		break;
1040 		case 3: {
1041 			int ix2 = iw + ido;
1042 			radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1043 		}
1044 		break;
1045 		case 2:
1046 			radf2_ps(ido, l1, in, out, &wa[iw]);
1047 			break;
1048 		default:
1049 			assert(0);
1050 			break;
1051 		}
1052 		l2 = l1;
1053 		if (out == work2) {
1054 			out = work1;
1055 			in = work2;
1056 		} else {
1057 			out = work2;
1058 			in = work1;
1059 		}
1060 	}
1061 	return in; /* this is in fact the output .. */
1062 } /* rfftf1 */
1063 
1064 static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1065                                       const float *wa, const int *ifac)
1066 {
1067 	v4sf *in  = (v4sf*)input_readonly;
1068 	v4sf *out = (in == work2 ? work1 : work2);
1069 	int nf = ifac[1], k1;
1070 	int l1 = 1;
1071 	int iw = 0;
1072 	assert(in != out);
1073 	for (k1=1; k1<=nf; k1++) {
1074 		int ip = ifac[k1 + 1];
1075 		int l2 = ip*l1;
1076 		int ido = n / l2;
1077 		switch (ip) {
1078 		case 5: {
1079 			int ix2 = iw + ido;
1080 			int ix3 = ix2 + ido;
1081 			int ix4 = ix3 + ido;
1082 			radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1083 		}
1084 		break;
1085 		case 4: {
1086 			int ix2 = iw + ido;
1087 			int ix3 = ix2 + ido;
1088 			radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1089 		}
1090 		break;
1091 		case 3: {
1092 			int ix2 = iw + ido;
1093 			radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1094 		}
1095 		break;
1096 		case 2:
1097 			radb2_ps(ido, l1, in, out, &wa[iw]);
1098 			break;
1099 		default:
1100 			assert(0);
1101 			break;
1102 		}
1103 		l1 = l2;
1104 		iw += (ip - 1)*ido;
1105 
1106 		if (out == work2) {
1107 			out = work1;
1108 			in = work2;
1109 		} else {
1110 			out = work2;
1111 			in = work1;
1112 		}
1113 	}
1114 	return in; /* this is in fact the output .. */
1115 }
1116 
1117 static int decompose(int n, int *ifac, const int *ntryh)
1118 {
1119 	int nl = n, nf = 0, i, j = 0;
1120 	for (j=0; ntryh[j]; ++j) {
1121 		int ntry = ntryh[j];
1122 		while (nl != 1) {
1123 			int nq = nl / ntry;
1124 			int nr = nl - ntry * nq;
1125 			if (nr == 0) {
1126 				ifac[2+nf++] = ntry;
1127 				nl = nq;
1128 				if (ntry == 2 && nf != 1) {
1129 					for (i = 2; i <= nf; ++i) {
1130 						int ib = nf - i + 2;
1131 						ifac[ib + 1] = ifac[ib];
1132 					}
1133 					ifac[2] = 2;
1134 				}
1135 			} else break;
1136 		}
1137 	}
1138 	ifac[0] = n;
1139 	ifac[1] = nf;
1140 	return nf;
1141 }
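/* Worked example (traced from the code above): decompose(96, ifac, ntryh)
   with ntryh = {4,2,3,5,0} -- the radix order used by rffti1_ps below --
   stores ifac = { 96, 4, 2, 4, 4, 3 }, i.e. 96 = 2*4*4*3 with the lone
   factor 2 shuffled to the front; ifac[0] is n, ifac[1] the factor count,
   and the factors themselves follow. */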
1142 
1143 
1144 
1145 static void rffti1_ps(int n, float *wa, int *ifac)
1146 {
1147 	static const int ntryh[] = { 4,2,3,5,0 };
1148 	int k1, j, ii;
1149 
1150 	int nf = decompose(n,ifac,ntryh);
1151 	float argh = (2*M_PI) / n;
1152 	int is = 0;
1153 	int nfm1 = nf - 1;
1154 	int l1 = 1;
1155 	for (k1 = 1; k1 <= nfm1; k1++) {
1156 		int ip = ifac[k1 + 1];
1157 		int ld = 0;
1158 		int l2 = l1*ip;
1159 		int ido = n / l2;
1160 		int ipm = ip - 1;
1161 		for (j = 1; j <= ipm; ++j) {
1162 			float argld;
1163 			int i = is, fi=0;
1164 			ld += l1;
1165 			argld = ld*argh;
1166 			for (ii = 3; ii <= ido; ii += 2) {
1167 				i += 2;
1168 				fi += 1;
1169 				wa[i - 2] = cos(fi*argld);
1170 				wa[i - 1] = sin(fi*argld);
1171 			}
1172 			is += ido;
1173 		}
1174 		l1 = l2;
1175 	}
1176 } /* rffti1 */
1177 
1178 void cffti1_ps(int n, float *wa, int *ifac)
1179 {
1180 	static const int ntryh[] = { 5,3,4,2,0 };
1181 	int k1, j, ii;
1182 
1183 	int nf = decompose(n,ifac,ntryh);
1184 	float argh = (2*M_PI)/(float)n;
1185 	int i = 1;
1186 	int l1 = 1;
1187 	for (k1=1; k1<=nf; k1++) {
1188 		int ip = ifac[k1+1];
1189 		int ld = 0;
1190 		int l2 = l1*ip;
1191 		int ido = n / l2;
1192 		int idot = ido + ido + 2;
1193 		int ipm = ip - 1;
1194 		for (j=1; j<=ipm; j++) {
1195 			float argld;
1196 			int i1 = i, fi = 0;
1197 			wa[i-1] = 1;
1198 			wa[i] = 0;
1199 			ld += l1;
1200 			argld = ld*argh;
1201 			for (ii = 4; ii <= idot; ii += 2) {
1202 				i += 2;
1203 				fi += 1;
1204 				wa[i-1] = cos(fi*argld);
1205 				wa[i] = sin(fi*argld);
1206 			}
1207 			if (ip > 5) {
1208 				wa[i1-1] = wa[i-1];
1209 				wa[i1] = wa[i];
1210 			}
1211 		}
1212 		l1 = l2;
1213 	}
1214 } /* cffti1 */
1215 
1216 
1217 v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign)
1218 {
1219 	v4sf *in  = (v4sf*)input_readonly;
1220 	v4sf *out = (in == work2 ? work1 : work2);
1221 	int nf = ifac[1], k1;
1222 	int l1 = 1;
1223 	int iw = 0;
1224 	assert(in != out && work1 != work2);
1225 	for (k1=2; k1<=nf+1; k1++) {
1226 		int ip = ifac[k1];
1227 		int l2 = ip*l1;
1228 		int ido = n / l2;
1229 		int idot = ido + ido;
1230 		switch (ip) {
1231 		case 5: {
1232 			int ix2 = iw + idot;
1233 			int ix3 = ix2 + idot;
1234 			int ix4 = ix3 + idot;
1235 			passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1236 		}
1237 		break;
1238 		case 4: {
1239 			int ix2 = iw + idot;
1240 			int ix3 = ix2 + idot;
1241 			passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1242 		}
1243 		break;
1244 		case 2: {
1245 			passf2_ps(idot, l1, in, out, &wa[iw], isign);
1246 		}
1247 		break;
1248 		case 3: {
1249 			int ix2 = iw + idot;
1250 			passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1251 		}
1252 		break;
1253 		default:
1254 			assert(0);
1255 		}
1256 		l1 = l2;
1257 		iw += (ip - 1)*idot;
1258 		if (out == work2) {
1259 			out = work1;
1260 			in = work2;
1261 		} else {
1262 			out = work2;
1263 			in = work1;
1264 		}
1265 	}
1266 
1267 	return in; /* this is in fact the output .. */
1268 }
1269 
1270 
1271 struct PFFFT_Setup {
1272 	int     N;
1273 	int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1274 	int ifac[15];
1275 	pffft_transform_t transform;
1276 	v4sf *data; // allocated room for twiddle coefs
1277 	float *e;    // points into 'data' , N/4*3 elements
1278 	float *twiddle; // points into 'data', N/4 elements
1279 };
1280 
1281 PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
1282 {
1283 	PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1284 	int k, m;
1285 	/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1286 	   and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1287 	   handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1288 	if (transform == PFFFT_REAL) {
1289 		assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0);
1290 	}
1291 	if (transform == PFFFT_COMPLEX) {
1292 		assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0);
1293 	}
1294 	//assert((N % 32) == 0);
1295 	s->N = N;
1296 	s->transform = transform;
1297 	/* nb of complex simd vectors */
1298 	s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
1299 	s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1300 	s->e = (float*)s->data;
1301 	s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1302 
1303 	if (transform == PFFFT_REAL) {
1304 		for (k=0; k < s->Ncvec; ++k) {
1305 			int i = k/SIMD_SZ;
1306 			int j = k%SIMD_SZ;
1307 			for (m=0; m < SIMD_SZ-1; ++m) {
1308 				float A = -2*M_PI*(m+1)*k / N;
1309 				s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
1310 				s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1311 			}
1312 		}
1313 		rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1314 	} else {
1315 		for (k=0; k < s->Ncvec; ++k) {
1316 			int i = k/SIMD_SZ;
1317 			int j = k%SIMD_SZ;
1318 			for (m=0; m < SIMD_SZ-1; ++m) {
1319 				float A = -2*M_PI*(m+1)*k / N;
1320 				s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
1321 				s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
1322 			}
1323 		}
1324 		cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
1325 	}
1326 
1327 	/* check that N is decomposable with allowed prime factors */
1328 	for (k=0, m=1; k < s->ifac[1]; ++k) {
1329 		m *= s->ifac[2+k];
1330 	}
1331 	if (m != N/SIMD_SZ) {
1332 		pffft_destroy_setup(s);
1333 		s = 0;
1334 	}
1335 
1336 	return s;
1337 }
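/* Setup/teardown usage sketch (illustrative only; pffft_transform is part of
   the public API declared in pffft.hxx, and the N = 4096 / work-buffer
   choices here are just an example of a valid real-transform size):

     PFFFT_Setup *s = pffft_new_setup(4096, PFFFT_REAL);  // 0 if N cannot be factored into 2,3,5
     float *in   = (float*)pffft_aligned_malloc(4096 * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(4096 * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(4096 * sizeof(float));
     if (s && in && out && work) {
         // ... fill 'in' ...
         pffft_transform(s, in, out, work, PFFFT_FORWARD);
     }
     if (s) pffft_destroy_setup(s);
     pffft_aligned_free(in); pffft_aligned_free(out); pffft_aligned_free(work);
*/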
1338 
1339 
1340 void pffft_destroy_setup(PFFFT_Setup *s)
1341 {
1342 	pffft_aligned_free(s->data);
1343 	free(s);
1344 }
1345 
1346 #if !defined(PFFFT_SIMD_DISABLE)
1347 
1348 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1349 static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out)
1350 {
1351 	v4sf g0, g1;
1352 	int k;
1353 	INTERLEAVE2(in[0], in[1], g0, g1);
1354 	in += in_stride;
1355 
1356 	*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1357 	for (k=1; k < N; ++k) {
1358 		v4sf h0, h1;
1359 		INTERLEAVE2(in[0], in[1], h0, h1);
1360 		in += in_stride;
1361 		*--out = VSWAPHL(g1, h0);
1362 		*--out = VSWAPHL(h0, h1);
1363 		g1 = h1;
1364 	}
1365 	*--out = VSWAPHL(g1, g0);
1366 }
1367 
1368 static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride)
1369 {
1370 	v4sf g0, g1, h0, h1;
1371 	int k;
1372 	g0 = g1 = in[0];
1373 	++in;
1374 	for (k=1; k < N; ++k) {
1375 		h0 = *in++;
1376 		h1 = *in++;
1377 		g1 = VSWAPHL(g1, h0);
1378 		h0 = VSWAPHL(h0, h1);
1379 		UNINTERLEAVE2(h0, g1, out[0], out[1]);
1380 		out += out_stride;
1381 		g1 = h1;
1382 	}
1383 	h0 = *in++;
1384 	h1 = g0;
1385 	g1 = VSWAPHL(g1, h0);
1386 	h0 = VSWAPHL(h0, h1);
1387 	UNINTERLEAVE2(h0, g1, out[0], out[1]);
1388 }
1389 
1390 void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
1391 {
1392 	int k, N = setup->N, Ncvec = setup->Ncvec;
1393 	const v4sf *vin = (const v4sf*)in;
1394 	v4sf *vout = (v4sf*)out;
1395 	assert(in != out);
1396 	if (setup->transform == PFFFT_REAL) {
1397 		int k, dk = N/32;
1398 		if (direction == PFFFT_FORWARD) {
1399 			for (k=0; k < dk; ++k) {
1400 				INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1401 				INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1402 			}
1403 			reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
1404 			reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1405 		} else {
1406 			for (k=0; k < dk; ++k) {
1407 				UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1408 				UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1409 			}
1410 			unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
1411 			unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1412 		}
1413 	} else {
1414 		if (direction == PFFFT_FORWARD) {
1415 			for (k=0; k < Ncvec; ++k) {
1416 				int kk = (k/4) + (k%4)*(Ncvec/4);
1417 				INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1418 			}
1419 		} else {
1420 			for (k=0; k < Ncvec; ++k) {
1421 				int kk = (k/4) + (k%4)*(Ncvec/4);
1422 				UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1423 			}
1424 		}
1425 	}
1426 }
1427 
1428 void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
1429 {
1430 	int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1431 	v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1432 	v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1433 	assert(in != out);
1434 	for (k=0; k < dk; ++k) {
1435 		r0 = in[8*k+0];
1436 		i0 = in[8*k+1];
1437 		r1 = in[8*k+2];
1438 		i1 = in[8*k+3];
1439 		r2 = in[8*k+4];
1440 		i2 = in[8*k+5];
1441 		r3 = in[8*k+6];
1442 		i3 = in[8*k+7];
1443 		VTRANSPOSE4(r0,r1,r2,r3);
1444 		VTRANSPOSE4(i0,i1,i2,i3);
1445 		VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1446 		VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1447 		VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1448 
1449 		sr0 = VADD(r0,r2);
1450 		dr0 = VSUB(r0, r2);
1451 		sr1 = VADD(r1,r3);
1452 		dr1 = VSUB(r1, r3);
1453 		si0 = VADD(i0,i2);
1454 		di0 = VSUB(i0, i2);
1455 		si1 = VADD(i1,i3);
1456 		di1 = VSUB(i1, i3);
1457 
1458 		/*
1459 		  transformation for each column is:
1460 
1461 		  [1   1   1   1   0   0   0   0]   [r0]
1462 		  [1   0  -1   0   0  -1   0   1]   [r1]
1463 		  [1  -1   1  -1   0   0   0   0]   [r2]
1464 		  [1   0  -1   0   0   1   0  -1]   [r3]
1465 		  [0   0   0   0   1   1   1   1] * [i0]
1466 		  [0   1   0  -1   1   0  -1   0]   [i1]
1467 		  [0   0   0   0   1  -1   1  -1]   [i2]
1468 		  [0  -1   0   1   1   0  -1   0]   [i3]
1469 		*/
1470 
1471 		r0 = VADD(sr0, sr1);
1472 		i0 = VADD(si0, si1);
1473 		r1 = VADD(dr0, di1);
1474 		i1 = VSUB(di0, dr1);
1475 		r2 = VSUB(sr0, sr1);
1476 		i2 = VSUB(si0, si1);
1477 		r3 = VSUB(dr0, di1);
1478 		i3 = VADD(di0, dr1);
1479 
1480 		*out++ = r0;
1481 		*out++ = i0;
1482 		*out++ = r1;
1483 		*out++ = i1;
1484 		*out++ = r2;
1485 		*out++ = i2;
1486 		*out++ = r3;
1487 		*out++ = i3;
1488 	}
1489 }
1490 
1491 void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
1492 {
1493 	int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1494 	v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1495 	v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1496 	assert(in != out);
1497 	for (k=0; k < dk; ++k) {
1498 		r0 = in[8*k+0];
1499 		i0 = in[8*k+1];
1500 		r1 = in[8*k+2];
1501 		i1 = in[8*k+3];
1502 		r2 = in[8*k+4];
1503 		i2 = in[8*k+5];
1504 		r3 = in[8*k+6];
1505 		i3 = in[8*k+7];
1506 
1507 		sr0 = VADD(r0,r2);
1508 		dr0 = VSUB(r0, r2);
1509 		sr1 = VADD(r1,r3);
1510 		dr1 = VSUB(r1, r3);
1511 		si0 = VADD(i0,i2);
1512 		di0 = VSUB(i0, i2);
1513 		si1 = VADD(i1,i3);
1514 		di1 = VSUB(i1, i3);
1515 
1516 		r0 = VADD(sr0, sr1);
1517 		i0 = VADD(si0, si1);
1518 		r1 = VSUB(dr0, di1);
1519 		i1 = VADD(di0, dr1);
1520 		r2 = VSUB(sr0, sr1);
1521 		i2 = VSUB(si0, si1);
1522 		r3 = VADD(dr0, di1);
1523 		i3 = VSUB(di0, dr1);
1524 
1525 		VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1526 		VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1527 		VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1528 
1529 		VTRANSPOSE4(r0,r1,r2,r3);
1530 		VTRANSPOSE4(i0,i1,i2,i3);
1531 
1532 		*out++ = r0;
1533 		*out++ = i0;
1534 		*out++ = r1;
1535 		*out++ = i1;
1536 		*out++ = r2;
1537 		*out++ = i2;
1538 		*out++ = r3;
1539 		*out++ = i3;
1540 	}
1541 }
1542 
1543 
1544 static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1545                 const v4sf *e, v4sf *out)
1546 {
1547 	v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1548 	v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1549 	r0 = *in0;
1550 	i0 = *in1;
1551 	r1 = *in++;
1552 	i1 = *in++;
1553 	r2 = *in++;
1554 	i2 = *in++;
1555 	r3 = *in++;
1556 	i3 = *in++;
1557 	VTRANSPOSE4(r0,r1,r2,r3);
1558 	VTRANSPOSE4(i0,i1,i2,i3);
1559 
1560 	/*
1561 	  transformation for each column is:
1562 
1563 	  [1   1   1   1   0   0   0   0]   [r0]
1564 	  [1   0  -1   0   0  -1   0   1]   [r1]
1565 	  [1   0  -1   0   0   1   0  -1]   [r2]
1566 	  [1  -1   1  -1   0   0   0   0]   [r3]
1567 	  [0   0   0   0   1   1   1   1] * [i0]
1568 	  [0  -1   0   1  -1   0   1   0]   [i1]
1569 	  [0  -1   0   1   1   0  -1   0]   [i2]
1570 	  [0   0   0   0  -1   1  -1   1]   [i3]
1571 	*/
1572 
1573 	//cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1574 	//cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1575 
1576 	VCPLXMUL(r1,i1,e[0],e[1]);
1577 	VCPLXMUL(r2,i2,e[2],e[3]);
1578 	VCPLXMUL(r3,i3,e[4],e[5]);
1579 
1580 	//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1581 	//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1582 
1583 	sr0 = VADD(r0,r2);
1584 	dr0 = VSUB(r0,r2);
1585 	sr1 = VADD(r1,r3);
1586 	dr1 = VSUB(r3,r1);
1587 	si0 = VADD(i0,i2);
1588 	di0 = VSUB(i0,i2);
1589 	si1 = VADD(i1,i3);
1590 	di1 = VSUB(i3,i1);
1591 
1592 	r0 = VADD(sr0, sr1);
1593 	r3 = VSUB(sr0, sr1);
1594 	i0 = VADD(si0, si1);
1595 	i3 = VSUB(si1, si0);
1596 	r1 = VADD(dr0, di1);
1597 	r2 = VSUB(dr0, di1);
1598 	i1 = VSUB(dr1, di0);
1599 	i2 = VADD(dr1, di0);
1600 
1601 	*out++ = r0;
1602 	*out++ = i0;
1603 	*out++ = r1;
1604 	*out++ = i1;
1605 	*out++ = r2;
1606 	*out++ = i2;
1607 	*out++ = r3;
1608 	*out++ = i3;
1609 
1610 }
1611 
pffft_real_finalize(int Ncvec,const v4sf * in,v4sf * out,const v4sf * e)1612 static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
1613 {
1614 	int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1615 	/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
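	/* e.g. for a length-8 real fft: [X0.r, X1.r, X1.i, X2.r, X2.i, X3.r, X3.i, X4.r]
	   -- X0 (DC) and X4 (Nyquist) are purely real, so only their real parts are stored */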

	v4sf_union cr, ci, *uout = (v4sf_union*)out;
	v4sf save = in[7], zero=VZERO();
	float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
	static const float s = M_SQRT2/2;

	cr.v = in[0];
	ci.v = in[Ncvec*2-1];
	assert(in != out);
	pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);

	/*
	  [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]

	  [Xr(1)   ] [1   1   1   1   0   0   0   0]
	  [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
	  [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
	  [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
	  [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
	  [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
	  [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
	  [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
	*/

	xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]);
	uout[0].f[0] = xr0;
	xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]);
	uout[1].f[0] = xi0;
	xr2=(cr.f[0]-cr.f[2]);
	uout[4].f[0] = xr2;
	xi2=(cr.f[3]-cr.f[1]);
	uout[5].f[0] = xi2;
	xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);
	uout[2].f[0] = xr1;
	xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);
	uout[3].f[0] = xi1;
	xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);
	uout[6].f[0] = xr3;
	xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);
	uout[7].f[0] = xi3;

	for (k=1; k < dk; ++k) {
		v4sf save_next = in[8*k+7];
		pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
		                        e + k*6, out + k*8);
		save = save_next;
	}

}

static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
                const v4sf *e, v4sf *out, int first)
{
	v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
	/*
	  transformation for each column is:

	  [1   1   1   1   0   0   0   0]   [r0]
	  [1   0   0  -1   0  -1  -1   0]   [r1]
	  [1  -1  -1   1   0   0   0   0]   [r2]
	  [1   0   0  -1   0   1   1   0]   [r3]
	  [0   0   0   0   1  -1   1  -1] * [i0]
	  [0  -1   1   0   1   0   0   1]   [i1]
	  [0   0   0   0   1   1  -1  -1]   [i2]
	  [0   1  -1   0   1   0   0   1]   [i3]
	*/

	v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
	v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
	v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
	v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);

	r0 = VADD(sr0, sr1);
	r2 = VSUB(sr0, sr1);
	r1 = VSUB(dr0, si1);
	r3 = VADD(dr0, si1);
	i0 = VSUB(di0, di1);
	i2 = VADD(di0, di1);
	i1 = VSUB(si0, dr1);
	i3 = VADD(si0, dr1);

	VCPLXMULCONJ(r1,i1,e[0],e[1]);
	VCPLXMULCONJ(r2,i2,e[2],e[3]);
	VCPLXMULCONJ(r3,i3,e[4],e[5]);

	VTRANSPOSE4(r0,r1,r2,r3);
	VTRANSPOSE4(i0,i1,i2,i3);

	if (!first) {
		*out++ = r0;
		*out++ = i0;
	}
	*out++ = r1;
	*out++ = i1;
	*out++ = r2;
	*out++ = i2;
	*out++ = r3;
	*out++ = i3;
}

static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
	int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
	/* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */

	v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
	float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
	static const float s = M_SQRT2;
	assert(in != out);
	for (k=0; k < 4; ++k) {
		Xr.f[k] = ((float*)in)[8*k];
		Xi.f[k] = ((float*)in)[8*k+4];
	}

	pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values

	/*
	  [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]

	  [cr0] [1   0   2   0   1   0   0   0]
	  [cr1] [1   0   0   0  -1   0  -2   0]
	  [cr2] [1   0  -2   0   1   0   0   0]
	  [cr3] [1   0   0   0  -1   0   2   0]
	  [ci0] [0   2   0   2   0   0   0   0]
	  [ci1] [0   s   0  -s   0  -s   0  -s]
	  [ci2] [0   0   0   0   0  -2   0   2]
	  [ci3] [0  -s   0   s   0  -s   0  -s]
	*/
	for (k=1; k < dk; ++k) {
		pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
	}

	cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2];
	uout[0].f[0] = cr0;
	cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2];
	uout[0].f[1] = cr1;
	cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2];
	uout[0].f[2] = cr2;
	cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2];
	uout[0].f[3] = cr3;
	ci0= 2*(Xr.f[1]+Xr.f[3]);
	uout[2*Ncvec-1].f[0] = ci0;
	ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]);
	uout[2*Ncvec-1].f[1] = ci1;
	ci2= 2*(Xi.f[3]-Xi.f[1]);
	uout[2*Ncvec-1].f[2] = ci2;
	ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]);
	uout[2*Ncvec-1].f[3] = ci3;
}

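/*
  Driver for the SIMD transforms. buff[0] is the caller's output buffer and
  buff[1] is the scratch area; 'ib' selects which of the two receives the
  result of each step. Its initial value is derived from the parity of the
  fftpack factorisation (nf_odd) and from 'ordered' so that the data normally
  ends up in the output buffer and the extra copy loop at the end is skipped.
*/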
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered)
{
	int k, Ncvec   = setup->Ncvec;
	int nf_odd = (setup->ifac[1] & 1);

	// temporary buffer is allocated on the stack if the scratch pointer is NULL
	int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
	VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

	const v4sf *vinput = (const v4sf*)finput;
	v4sf *voutput      = (v4sf*)foutput;
	v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
	int ib = (nf_odd ^ ordered ? 1 : 0);

	assert(VALIGNED(finput) && VALIGNED(foutput));

	//assert(finput != foutput);
	if (direction == PFFFT_FORWARD) {
		ib = !ib;
		if (setup->transform == PFFFT_REAL) {
			ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
			                setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
			pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
		} else {
			v4sf *tmp = buff[ib];
			for (k=0; k < Ncvec; ++k) {
				UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
			}
			ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
			                setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
			pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
		}
		if (ordered) {
			pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
		} else ib = !ib;
	} else {
		if (vinput == buff[ib]) {
			ib = !ib; // may happen when finput == foutput
		}
		if (ordered) {
			pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
			vinput = buff[ib];
			ib = !ib;
		}
		if (setup->transform == PFFFT_REAL) {
			pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
			ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
			                setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
		} else {
			pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
			ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
			                setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
			for (k=0; k < Ncvec; ++k) {
				INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
			}
		}
	}

	if (buff[ib] != voutput) {
		/* extra copy required -- this situation should only happen when finput == foutput */
		assert(finput==foutput);
		for (k=0; k < Ncvec; ++k) {
			v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
			voutput[2*k] = a;
			voutput[2*k+1] = b;
		}
		ib = !ib;
	}
	assert(buff[ib] == voutput);
}

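/*
  Multiply the spectra 'a' and 'b' -- both in the unordered z-domain layout
  produced by pffft_transform(..., PFFFT_FORWARD) -- and accumulate the scaled
  product into 'ab': ab += scaling * a * b (element-wise complex multiply).
  For real transforms the purely real DC and Nyquist coefficients live in the
  first float of the first two SIMD vectors; they are recomputed after the
  main loop, since the vector complex multiply would otherwise mix them.
*/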
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling)
{
	int Ncvec = s->Ncvec;
	const v4sf * RESTRICT va = (const v4sf*)a;
	const v4sf * RESTRICT vb = (const v4sf*)b;
	v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
	__builtin_prefetch(va);
	__builtin_prefetch(vb);
	__builtin_prefetch(vab);
	__builtin_prefetch(va+2);
	__builtin_prefetch(vb+2);
	__builtin_prefetch(vab+2);
	__builtin_prefetch(va+4);
	__builtin_prefetch(vb+4);
	__builtin_prefetch(vab+4);
	__builtin_prefetch(va+6);
	__builtin_prefetch(vb+6);
	__builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_NEON_ASM
# endif
#endif

	float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
	v4sf vscal = LD_PS1(scaling);
	int i;
#endif

	assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
	ar = ((v4sf_union*)va)[0].f[0];
	ai = ((v4sf_union*)va)[1].f[0];
	br = ((v4sf_union*)vb)[0].f[0];
	bi = ((v4sf_union*)vb)[1].f[0];
	abr = ((v4sf_union*)vab)[0].f[0];
	abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so it is restricted to gcc
	const float *a_ = a, *b_ = b;
	float *ab_ = ab;
	int N = Ncvec;
	asm volatile("mov         r8, %2                  \n"
	             "vdup.f32    q15, %4                 \n"
	             "1:                                  \n"
	             "pld         [%0,#64]                \n"
	             "pld         [%1,#64]                \n"
	             "pld         [%2,#64]                \n"
	             "pld         [%0,#96]                \n"
	             "pld         [%1,#96]                \n"
	             "pld         [%2,#96]                \n"
	             "vld1.f32    {q0,q1},   [%0,:128]!         \n"
	             "vld1.f32    {q4,q5},   [%1,:128]!         \n"
	             "vld1.f32    {q2,q3},   [%0,:128]!         \n"
	             "vld1.f32    {q6,q7},   [%1,:128]!         \n"
	             "vld1.f32    {q8,q9},   [r8,:128]!          \n"

	             "vmul.f32    q10, q0, q4             \n"
	             "vmul.f32    q11, q0, q5             \n"
	             "vmul.f32    q12, q2, q6             \n"
	             "vmul.f32    q13, q2, q7             \n"
	             "vmls.f32    q10, q1, q5             \n"
	             "vmla.f32    q11, q1, q4             \n"
	             "vld1.f32    {q0,q1}, [r8,:128]!     \n"
	             "vmls.f32    q12, q3, q7             \n"
	             "vmla.f32    q13, q3, q6             \n"
	             "vmla.f32    q8, q10, q15            \n"
	             "vmla.f32    q9, q11, q15            \n"
	             "vmla.f32    q0, q12, q15            \n"
	             "vmla.f32    q1, q13, q15            \n"
	             "vst1.f32    {q8,q9},[%2,:128]!    \n"
	             "vst1.f32    {q0,q1},[%2,:128]!    \n"
	             "subs        %3, #2                  \n"
	             "bne         1b                      \n"
	             : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
	for (i=0; i < Ncvec; i += 2) {
		v4sf ar, ai, br, bi;
		ar = va[2*i+0];
		ai = va[2*i+1];
		br = vb[2*i+0];
		bi = vb[2*i+1];
		VCPLXMUL(ar, ai, br, bi);
		vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
		vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
		ar = va[2*i+2];
		ai = va[2*i+3];
		br = vb[2*i+2];
		bi = vb[2*i+3];
		VCPLXMUL(ar, ai, br, bi);
		vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
		vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
	}
#endif
	if (s->transform == PFFFT_REAL) {
		((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
		((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
	}
}

#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
{
	int k, N = setup->N;
	if (setup->transform == PFFFT_COMPLEX) {
		for (k=0; k < 2*N; ++k) out[k] = in[k];
		return;
	} else if (direction == PFFFT_FORWARD) {
		float x_N = in[N-1];
		for (k=N-1; k > 1; --k) out[k] = in[k-1];
		out[0] = in[0];
		out[1] = x_N;
	} else {
		float x_N = in[1];
		for (k=1; k < N-1; ++k) out[k] = in[k+1];
		out[0] = in[0];
		out[N-1] = x_N;
	}
}

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered)
{
	int Ncvec   = setup->Ncvec;
	int nf_odd = (setup->ifac[1] & 1);

	// temporary buffer is allocated on the stack if the scratch pointer is NULL
	int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
	VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
	float *buff[2];
	int ib;
	if (scratch == 0) scratch = scratch_on_stack;
	buff[0] = output;
	buff[1] = scratch;

	if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
	ib = (nf_odd ^ ordered ? 1 : 0);

	if (direction == PFFFT_FORWARD) {
		if (setup->transform == PFFFT_REAL) {
			ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
			                setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
		} else {
			ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
			                setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
		}
		if (ordered) {
			pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
			ib = !ib;
		}
	} else {
		if (input == buff[ib]) {
			ib = !ib; // may happen when input == output
		}
		if (ordered) {
			pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
			input = buff[!ib];
		}
		if (setup->transform == PFFFT_REAL) {
			ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
			                setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
		} else {
			ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
			                setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
		}
	}
	if (buff[ib] != output) {
		int k;
		// extra copy required -- this situation should only happen when input == output
		assert(input==output);
		for (k=0; k < Ncvec; ++k) {
			float a = buff[ib][2*k], b = buff[ib][2*k+1];
			output[2*k] = a;
			output[2*k+1] = b;
		}
		ib = !ib;
	}
	assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling)
{
	int i, Ncvec = s->Ncvec;

	if (s->transform == PFFFT_REAL) {
		// take care of the fftpack ordering
		ab[0] += a[0]*b[0]*scaling;
		ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
		++ab;
		++a;
		++b;
		--Ncvec;
	}
	for (i=0; i < Ncvec; ++i) {
		float ar, ai, br, bi;
		ar = a[2*i+0];
		ai = a[2*i+1];
		br = b[2*i+0];
		bi = b[2*i+1];
		VCPLXMUL(ar, ai, br, bi);
		ab[2*i+0] += ar*scaling;
		ab[2*i+1] += ai*scaling;
	}
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
	pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
	pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
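
/*
  Usage sketch (illustrative only; it assumes the usual pffft entry points
  declared in pffft.hxx -- pffft_new_setup, pffft_destroy_setup,
  pffft_aligned_malloc, pffft_aligned_free):

    int N = 1024;   // N must satisfy the pffft size constraints for PFFFT_REAL
    PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
    float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
    float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
    // ... fill 'in' with N samples ...
    pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);   // 'out' holds the ordered spectrum
    pffft_transform_ordered(s, out, in, work, PFFFT_BACKWARD);  // unscaled inverse: multiply by 1.f/N to recover the input
    pffft_aligned_free(work);
    pffft_aligned_free(out);
    pffft_aligned_free(in);
    pffft_destroy_setup(s);
*/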