1 /* -----------------------------------------------------------------------------
2 Software License for The Fraunhofer FDK AAC Codec Library for Android
3 
4 © Copyright  1995 - 2018 Fraunhofer-Gesellschaft zur Förderung der angewandten
5 Forschung e.V. All rights reserved.
6 
7  1.    INTRODUCTION
8 The Fraunhofer FDK AAC Codec Library for Android ("FDK AAC Codec") is software
9 that implements the MPEG Advanced Audio Coding ("AAC") encoding and decoding
10 scheme for digital audio. This FDK AAC Codec software is intended to be used on
11 a wide variety of Android devices.
12 
13 AAC's HE-AAC and HE-AAC v2 versions are regarded as today's most efficient
14 general perceptual audio codecs. AAC-ELD is considered the best-performing
15 full-bandwidth communications codec by independent studies and is widely
16 deployed. AAC has been standardized by ISO and IEC as part of the MPEG
17 specifications.
18 
19 Patent licenses for necessary patent claims for the FDK AAC Codec (including
20 those of Fraunhofer) may be obtained through Via Licensing
21 (www.vialicensing.com) or through the respective patent owners individually for
22 the purpose of encoding or decoding bit streams in products that are compliant
23 with the ISO/IEC MPEG audio standards. Please note that most manufacturers of
24 Android devices already license these patent claims through Via Licensing or
25 directly from the patent owners, and therefore FDK AAC Codec software may
26 already be covered under those patent licenses when it is used for those
27 licensed purposes only.
28 
29 Commercially-licensed AAC software libraries, including floating-point versions
30 with enhanced sound quality, are also available from Fraunhofer. Users are
31 encouraged to check the Fraunhofer website for additional applications
32 information and documentation.
33 
34 2.    COPYRIGHT LICENSE
35 
36 Redistribution and use in source and binary forms, with or without modification,
37 are permitted without payment of copyright license fees provided that you
38 satisfy the following conditions:
39 
40 You must retain the complete text of this software license in redistributions of
41 the FDK AAC Codec or your modifications thereto in source code form.
42 
43 You must retain the complete text of this software license in the documentation
44 and/or other materials provided with redistributions of the FDK AAC Codec or
45 your modifications thereto in binary form. You must make available free of
46 charge copies of the complete source code of the FDK AAC Codec and your
47 modifications thereto to recipients of copies in binary form.
48 
49 The name of Fraunhofer may not be used to endorse or promote products derived
50 from this library without prior written permission.
51 
52 You may not charge copyright license fees for anyone to use, copy or distribute
53 the FDK AAC Codec software or your modifications thereto.
54 
55 Your modified versions of the FDK AAC Codec must carry prominent notices stating
56 that you changed the software and the date of any change. For modified versions
57 of the FDK AAC Codec, the term "Fraunhofer FDK AAC Codec Library for Android"
58 must be replaced by the term "Third-Party Modified Version of the Fraunhofer FDK
59 AAC Codec Library for Android."
60 
61 3.    NO PATENT LICENSE
62 
63 NO EXPRESS OR IMPLIED LICENSES TO ANY PATENT CLAIMS, including without
64 limitation the patents of Fraunhofer, ARE GRANTED BY THIS SOFTWARE LICENSE.
65 Fraunhofer provides no warranty of patent non-infringement with respect to this
66 software.
67 
68 You may use this FDK AAC Codec software or modifications thereto only for
69 purposes that are authorized by appropriate patent licenses.
70 
71 4.    DISCLAIMER
72 
73 This FDK AAC Codec software is provided by Fraunhofer on behalf of the copyright
74 holders and contributors "AS IS" and WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES,
75 including but not limited to the implied warranties of merchantability and
76 fitness for a particular purpose. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
77 CONTRIBUTORS BE LIABLE for any direct, indirect, incidental, special, exemplary,
78 or consequential damages, including but not limited to procurement of substitute
79 goods or services; loss of use, data, or profits, or business interruption,
80 however caused and on any theory of liability, whether in contract, strict
81 liability, or tort (including negligence), arising in any way out of the use of
82 this software, even if advised of the possibility of such damage.
83 
84 5.    CONTACT INFORMATION
85 
86 Fraunhofer Institute for Integrated Circuits IIS
87 Attention: Audio and Multimedia Departments - FDK AAC LL
88 Am Wolfsmantel 33
89 91058 Erlangen, Germany
90 
91 www.iis.fraunhofer.de/amm
92 amm-info@iis.fraunhofer.de
93 ----------------------------------------------------------------------------- */
94 
95 /******************* Library for basic calculation routines ********************
96 
97    Author(s):   Josef Hoepfl, DSP Solutions
98 
99    Description: Fix point FFT
100 
101 *******************************************************************************/
102 
103 #include "fft_rad2.h"
104 #include "FDK_tools_rom.h"
105 
/* Twiddle factor W = cos(pi/4) = sin(pi/4) = sqrt(2)/2 in fixed-point
   notation (0x5a82799a). */
#define W_PiFOURTH STC(0x5a82799a)
//#define W_PiFOURTH ((FIXP_DBL)(0x5a82799a))
#ifndef SUMDIFF_PIFOURTH
/* Computes diff = (b - a) * W_PiFOURTH / 2 and sum = (b + a) * W_PiFOURTH / 2
   in one step (rotation by pi/4 combined with the butterfly add/sub).
   NOTE(review): deliberately a bare brace block, NOT do{}while(0) — call
   sites in this file invoke the macro without a trailing semicolon
   (e.g. inside fft_16), so the usual do/while wrapping would not compile. */
#define SUMDIFF_PIFOURTH(diff, sum, a, b) \
  {                                       \
    FIXP_DBL wa, wb;                      \
    wa = fMultDiv2(a, W_PiFOURTH);        \
    wb = fMultDiv2(b, W_PiFOURTH);        \
    diff = wb - wa;                       \
    sum = wb + wa;                        \
  }
/* Same as SUMDIFF_PIFOURTH, but the results are narrowed to 16-bit
   (FIXP_SGL) via FX_DBL2FX_SGL. */
#define SUMDIFF_PIFOURTH16(diff, sum, a, b)       \
  {                                               \
    FIXP_SGL wa, wb;                              \
    wa = FX_DBL2FX_SGL(fMultDiv2(a, W_PiFOURTH)); \
    wb = FX_DBL2FX_SGL(fMultDiv2(b, W_PiFOURTH)); \
    diff = wb - wa;                               \
    sum = wb + wa;                                \
  }
#endif
126 
/* SCALEFACTOR<N>: total scaling (right shift in bits) applied by the FFT of
   length N. For composite lengths the factor is the sum of the factors of
   the radix decomposition plus 2 guard bits for the combination stages
   (e.g. SCALEFACTOR6 = SCALEFACTOR2 + SCALEFACTOR3 + 2). */
#define SCALEFACTOR2048 10
#define SCALEFACTOR1024 9
#define SCALEFACTOR512 8
#define SCALEFACTOR256 7
#define SCALEFACTOR128 6
#define SCALEFACTOR64 5
#define SCALEFACTOR32 4
#define SCALEFACTOR16 3
#define SCALEFACTOR8 2
#define SCALEFACTOR4 1
#define SCALEFACTOR2 1

#define SCALEFACTOR3 1
#define SCALEFACTOR5 1
#define SCALEFACTOR6 (SCALEFACTOR2 + SCALEFACTOR3 + 2)
#define SCALEFACTOR7 2
#define SCALEFACTOR9 2
#define SCALEFACTOR10 5
#define SCALEFACTOR12 3
#define SCALEFACTOR15 3
#define SCALEFACTOR18 (SCALEFACTOR2 + SCALEFACTOR9 + 2)
#define SCALEFACTOR20 (SCALEFACTOR4 + SCALEFACTOR5 + 2)
#define SCALEFACTOR21 (SCALEFACTOR3 + SCALEFACTOR7 + 2)
#define SCALEFACTOR24 (SCALEFACTOR2 + SCALEFACTOR12 + 2)
#define SCALEFACTOR30 (SCALEFACTOR2 + SCALEFACTOR15 + 2)
#define SCALEFACTOR40 (SCALEFACTOR5 + SCALEFACTOR8 + 2)
#define SCALEFACTOR48 (SCALEFACTOR4 + SCALEFACTOR12 + 2)
#define SCALEFACTOR60 (SCALEFACTOR4 + SCALEFACTOR15 + 2)
#define SCALEFACTOR80 (SCALEFACTOR5 + SCALEFACTOR16 + 2)
#define SCALEFACTOR96 (SCALEFACTOR3 + SCALEFACTOR32 + 2)
#define SCALEFACTOR120 (SCALEFACTOR8 + SCALEFACTOR15 + 2)
#define SCALEFACTOR160 (SCALEFACTOR10 + SCALEFACTOR16 + 2)
#define SCALEFACTOR168 (SCALEFACTOR21 + SCALEFACTOR8 + 2)
#define SCALEFACTOR192 (SCALEFACTOR12 + SCALEFACTOR16 + 2)
#define SCALEFACTOR240 (SCALEFACTOR16 + SCALEFACTOR15 + 2)
#define SCALEFACTOR320 (SCALEFACTOR10 + SCALEFACTOR32 + 2)
#define SCALEFACTOR336 (SCALEFACTOR21 + SCALEFACTOR16 + 2)
#define SCALEFACTOR384 (SCALEFACTOR12 + SCALEFACTOR32 + 2)
#define SCALEFACTOR480 (SCALEFACTOR32 + SCALEFACTOR15 + 2)
166 
167 #include "fft.h"
168 
169 #ifndef FUNCTION_fft2
170 
171 /* Performs the FFT of length 2. Input vector unscaled, output vector scaled
172  * with factor 0.5 */
fft2(FIXP_DBL * RESTRICT pDat)173 static FDK_FORCEINLINE void fft2(FIXP_DBL *RESTRICT pDat) {
174   FIXP_DBL r1, i1;
175   FIXP_DBL r2, i2;
176 
177   /* real part */
178   r1 = pDat[2];
179   r2 = pDat[0];
180 
181   /* imaginary part */
182   i1 = pDat[3];
183   i2 = pDat[1];
184 
185   /* real part */
186   pDat[0] = (r2 + r1) >> 1;
187   pDat[2] = (r2 - r1) >> 1;
188 
189   /* imaginary part */
190   pDat[1] = (i2 + i1) >> 1;
191   pDat[3] = (i2 - i1) >> 1;
192 }
193 #endif /* FUNCTION_fft2 */
194 
195 #define C31 (STC(0x91261468)) /* FL2FXCONST_DBL(-0.86602540) = -sqrt(3)/2  */
196 
197 #ifndef FUNCTION_fft3
198 /* Performs the FFT of length 3 according to the algorithm after winograd. */
fft3(FIXP_DBL * RESTRICT pDat)199 static FDK_FORCEINLINE void fft3(FIXP_DBL *RESTRICT pDat) {
200   FIXP_DBL r1, r2;
201   FIXP_DBL s1, s2;
202   FIXP_DBL pD;
203 
204   /* real part */
205   r1 = pDat[2] + pDat[4];
206   r2 = fMultDiv2((pDat[2] - pDat[4]), C31);
207   pD = pDat[0] >> 1;
208   pDat[0] = pD + (r1 >> 1);
209   r1 = pD - (r1 >> 2);
210 
211   /* imaginary part */
212   s1 = pDat[3] + pDat[5];
213   s2 = fMultDiv2((pDat[3] - pDat[5]), C31);
214   pD = pDat[1] >> 1;
215   pDat[1] = pD + (s1 >> 1);
216   s1 = pD - (s1 >> 2);
217 
218   /* combination */
219   pDat[2] = r1 - s2;
220   pDat[4] = r1 + s2;
221   pDat[3] = s1 + r2;
222   pDat[5] = s1 - r2;
223 }
224 #endif /* #ifndef FUNCTION_fft3 */
225 
226 #define F5C(x) STC(x)
227 
228 #define C51 (F5C(0x79bc3854)) /* FL2FXCONST_DBL( 0.95105652)   */
229 #define C52 (F5C(0x9d839db0)) /* FL2FXCONST_DBL(-1.53884180/2) */
230 #define C53 (F5C(0xd18053ce)) /* FL2FXCONST_DBL(-0.36327126)   */
231 #define C54 (F5C(0x478dde64)) /* FL2FXCONST_DBL( 0.55901699)   */
232 #define C55 (F5C(0xb0000001)) /* FL2FXCONST_DBL(-1.25/2)       */
233 
/* performs the FFT of length 5 according to the algorithm after winograd */
/* This version works with a prescale of 2 instead of 3 */
/* \param pDat  5 interleaved complex samples (re, im); transformed in place,
                output scaled down relative to the input. */
static FDK_FORCEINLINE void fft5(FIXP_DBL *RESTRICT pDat) {
  FIXP_DBL r1, r2, r3, r4;
  FIXP_DBL s1, s2, s3, s4;
  FIXP_DBL t;

  /* real part: Winograd pre-additions on the pre-scaled (>>1) inputs */
  r1 = (pDat[2] + pDat[8]) >> 1;
  r4 = (pDat[2] - pDat[8]) >> 1;
  r3 = (pDat[4] + pDat[6]) >> 1;
  r2 = (pDat[4] - pDat[6]) >> 1;
  t = fMult((r1 - r3), C54);
  r1 = r1 + r3;
  pDat[0] = (pDat[0] >> 1) + r1;
  /* Bit shift left because of the constant C55 which was scaled with the factor
     0.5 because of the representation of the values as fracts */
  r1 = pDat[0] + (fMultDiv2(r1, C55) << (2));
  r3 = r1 - t;
  r1 = r1 + t;
  t = fMult((r4 + r2), C51);
  /* Bit shift left because of the constant C55 which was scaled with the factor
     0.5 because of the representation of the values as fracts */
  r4 = t + (fMultDiv2(r4, C52) << (2));
  r2 = t + fMult(r2, C53);

  /* imaginary part: identical structure on the odd (imag) array slots */
  s1 = (pDat[3] + pDat[9]) >> 1;
  s4 = (pDat[3] - pDat[9]) >> 1;
  s3 = (pDat[5] + pDat[7]) >> 1;
  s2 = (pDat[5] - pDat[7]) >> 1;
  t = fMult((s1 - s3), C54);
  s1 = s1 + s3;
  pDat[1] = (pDat[1] >> 1) + s1;
  /* Bit shift left because of the constant C55 which was scaled with the factor
     0.5 because of the representation of the values as fracts */
  s1 = pDat[1] + (fMultDiv2(s1, C55) << (2));
  s3 = s1 - t;
  s1 = s1 + t;
  t = fMult((s4 + s2), C51);
  /* Bit shift left because of the constant C55 which was scaled with the factor
     0.5 because of the representation of the values as fracts */
  s4 = t + (fMultDiv2(s4, C52) << (2));
  s2 = t + fMult(s2, C53);

  /* combination: post-additions mixing rotated real/imag terms into bins */
  pDat[2] = r1 + s2;
  pDat[8] = r1 - s2;
  pDat[4] = r3 - s4;
  pDat[6] = r3 + s4;

  pDat[3] = s1 - r2;
  pDat[9] = s1 + r2;
  pDat[5] = s3 + r4;
  pDat[7] = s3 - r4;
}
290 
/* NOTE(review): F5C and C51..C55 below repeat the identical definitions from
   the fft5 section above. Redefinition with an identical token sequence is
   legal C (C11 6.10.3), so this is benign; kept so the fft10 section stays
   self-contained. */
#define F5C(x) STC(x)

#define C51 (F5C(0x79bc3854)) /* FL2FXCONST_DBL( 0.95105652)   */
#define C52 (F5C(0x9d839db0)) /* FL2FXCONST_DBL(-1.53884180/2) */
#define C53 (F5C(0xd18053ce)) /* FL2FXCONST_DBL(-0.36327126)   */
#define C54 (F5C(0x478dde64)) /* FL2FXCONST_DBL( 0.55901699)   */
#define C55 (F5C(0xb0000001)) /* FL2FXCONST_DBL(-1.25/2)       */
298 /**
299  * \brief    Function performs a complex 10-point FFT
300  *           The FFT is performed inplace. The result of the FFT
301  *           is scaled by SCALEFACTOR10 bits.
302  *
303  *           WOPS FLC version:                    1093 cycles
304  *           WOPS with 32x16 bit multiplications:  196 cycles
305  *
306  * \param    [i/o] re    real input / output
307  * \param    [i/o] im    imag input / output
308  * \param    [i  ] s     stride real and imag input / output
309  *
310  * \return   void
311  */
fft10(FIXP_DBL * x)312 static void fft10(FIXP_DBL *x)  // FIXP_DBL *re, FIXP_DBL *im, FIXP_SGL s)
313 {
314   FIXP_DBL t;
315   FIXP_DBL x0, x1, x2, x3, x4;
316   FIXP_DBL r1, r2, r3, r4;
317   FIXP_DBL s1, s2, s3, s4;
318   FIXP_DBL y00, y01, y02, y03, y04, y05, y06, y07, y08, y09;
319   FIXP_DBL y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
320 
321   const int s = 1;  // stride factor
322 
323   /* 2 fft5 stages */
324 
325   /* real part */
326   x0 = (x[s * 0] >> SCALEFACTOR10);
327   x1 = (x[s * 4] >> SCALEFACTOR10);
328   x2 = (x[s * 8] >> SCALEFACTOR10);
329   x3 = (x[s * 12] >> SCALEFACTOR10);
330   x4 = (x[s * 16] >> SCALEFACTOR10);
331 
332   r1 = (x3 + x2);
333   r4 = (x3 - x2);
334   r3 = (x1 + x4);
335   r2 = (x1 - x4);
336   t = fMult((r1 - r3), C54);
337   r1 = (r1 + r3);
338   y00 = (x0 + r1);
339   r1 = (y00 + ((fMult(r1, C55) << 1)));
340   r3 = (r1 - t);
341   r1 = (r1 + t);
342   t = fMult((r4 + r2), C51);
343   r4 = (t + (fMult(r4, C52) << 1));
344   r2 = (t + fMult(r2, C53));
345 
346   /* imaginary part */
347   x0 = (x[s * 0 + 1] >> SCALEFACTOR10);
348   x1 = (x[s * 4 + 1] >> SCALEFACTOR10);
349   x2 = (x[s * 8 + 1] >> SCALEFACTOR10);
350   x3 = (x[s * 12 + 1] >> SCALEFACTOR10);
351   x4 = (x[s * 16 + 1] >> SCALEFACTOR10);
352 
353   s1 = (x3 + x2);
354   s4 = (x3 - x2);
355   s3 = (x1 + x4);
356   s2 = (x1 - x4);
357   t = fMult((s1 - s3), C54);
358   s1 = (s1 + s3);
359   y01 = (x0 + s1);
360   s1 = (y01 + (fMult(s1, C55) << 1));
361   s3 = (s1 - t);
362   s1 = (s1 + t);
363   t = fMult((s4 + s2), C51);
364   s4 = (t + (fMult(s4, C52) << 1));
365   s2 = (t + fMult(s2, C53));
366 
367   /* combination */
368   y04 = (r1 + s2);
369   y16 = (r1 - s2);
370   y08 = (r3 - s4);
371   y12 = (r3 + s4);
372 
373   y05 = (s1 - r2);
374   y17 = (s1 + r2);
375   y09 = (s3 + r4);
376   y13 = (s3 - r4);
377 
378   /* real part */
379   x0 = (x[s * 10] >> SCALEFACTOR10);
380   x1 = (x[s * 2] >> SCALEFACTOR10);
381   x2 = (x[s * 6] >> SCALEFACTOR10);
382   x3 = (x[s * 14] >> SCALEFACTOR10);
383   x4 = (x[s * 18] >> SCALEFACTOR10);
384 
385   r1 = (x1 + x4);
386   r4 = (x1 - x4);
387   r3 = (x3 + x2);
388   r2 = (x3 - x2);
389   t = fMult((r1 - r3), C54);
390   r1 = (r1 + r3);
391   y02 = (x0 + r1);
392   r1 = (y02 + ((fMult(r1, C55) << 1)));
393   r3 = (r1 - t);
394   r1 = (r1 + t);
395   t = fMult(((r4 + r2)), C51);
396   r4 = (t + (fMult(r4, C52) << 1));
397   r2 = (t + fMult(r2, C53));
398 
399   /* imaginary part */
400   x0 = (x[s * 10 + 1] >> SCALEFACTOR10);
401   x1 = (x[s * 2 + 1] >> SCALEFACTOR10);
402   x2 = (x[s * 6 + 1] >> SCALEFACTOR10);
403   x3 = (x[s * 14 + 1] >> SCALEFACTOR10);
404   x4 = (x[s * 18 + 1] >> SCALEFACTOR10);
405 
406   s1 = (x1 + x4);
407   s4 = (x1 - x4);
408   s3 = (x3 + x2);
409   s2 = (x3 - x2);
410   t = fMult((s1 - s3), C54);
411   s1 = (s1 + s3);
412   y03 = (x0 + s1);
413   s1 = (y03 + (fMult(s1, C55) << 1));
414   s3 = (s1 - t);
415   s1 = (s1 + t);
416   t = fMult((s4 + s2), C51);
417   s4 = (t + (fMult(s4, C52) << 1));
418   s2 = (t + fMult(s2, C53));
419 
420   /* combination */
421   y06 = (r1 + s2);
422   y18 = (r1 - s2);
423   y10 = (r3 - s4);
424   y14 = (r3 + s4);
425 
426   y07 = (s1 - r2);
427   y19 = (s1 + r2);
428   y11 = (s3 + r4);
429   y15 = (s3 - r4);
430 
431   /* 5 fft2 stages */
432   x[s * 0] = (y00 + y02);
433   x[s * 0 + 1] = (y01 + y03);
434   x[s * 10] = (y00 - y02);
435   x[s * 10 + 1] = (y01 - y03);
436 
437   x[s * 4] = (y04 + y06);
438   x[s * 4 + 1] = (y05 + y07);
439   x[s * 14] = (y04 - y06);
440   x[s * 14 + 1] = (y05 - y07);
441 
442   x[s * 8] = (y08 + y10);
443   x[s * 8 + 1] = (y09 + y11);
444   x[s * 18] = (y08 - y10);
445   x[s * 18 + 1] = (y09 - y11);
446 
447   x[s * 12] = (y12 + y14);
448   x[s * 12 + 1] = (y13 + y15);
449   x[s * 2] = (y12 - y14);
450   x[s * 2 + 1] = (y13 - y15);
451 
452   x[s * 16] = (y16 + y18);
453   x[s * 16 + 1] = (y17 + y19);
454   x[s * 6] = (y16 - y18);
455   x[s * 6 + 1] = (y17 - y19);
456 }
457 
458 #ifndef FUNCTION_fft12
459 #define FUNCTION_fft12
460 
461 #undef C31
462 #define C31 (STC(0x91261468)) /* FL2FXCONST_DBL(-0.86602540) = -sqrt(3)/2  */
463 
/* Performs the in-place FFT of length 12: four radix-3 kernels (with twiddle
   rotation via RotVectorReal12/RotVectorImag12) into a temporary buffer,
   followed by three length-4 FFTs writing back to pInput.
   \param pInput  12 interleaved complex samples (re, im), transformed in
                  place; outputs are scaled down relative to the input. */
static inline void fft12(FIXP_DBL *pInput) {
  FIXP_DBL aDst[24];
  FIXP_DBL *pSrc, *pDst;
  int i;

  pSrc = pInput;
  pDst = aDst;
  FIXP_DBL r1, r2, s1, s2, pD;

  /* First 3*2 samples are shifted right by 2 before output */
  r1 = pSrc[8] + pSrc[16];
  r2 = fMultDiv2((pSrc[8] - pSrc[16]), C31);
  pD = pSrc[0] >> 1;
  pDst[0] = (pD + (r1 >> 1)) >> 1;
  r1 = pD - (r1 >> 2);

  /* imaginary part */
  s1 = pSrc[9] + pSrc[17];
  s2 = fMultDiv2((pSrc[9] - pSrc[17]), C31);
  pD = pSrc[1] >> 1;
  pDst[1] = (pD + (s1 >> 1)) >> 1;
  s1 = pD - (s1 >> 2);

  /* combination */
  pDst[2] = (r1 - s2) >> 1;
  pDst[3] = (s1 + r2) >> 1;
  pDst[4] = (r1 + s2) >> 1;
  pDst[5] = (s1 - r2) >> 1;
  pSrc += 2;
  pDst += 6;

  /* Middle two radix-3 kernels: outputs 2..5 get a complex twiddle rotation */
  const FIXP_STB *pVecRe = RotVectorReal12;
  const FIXP_STB *pVecIm = RotVectorImag12;
  FIXP_DBL re, im;
  FIXP_STB vre, vim;
  for (i = 0; i < 2; i++) {
    /* sample 0,1 are shifted right by 2 before output */
    /* sample 2,3 4,5 are shifted right by 1 and complex multiplied before
     * output */

    r1 = pSrc[8] + pSrc[16];
    r2 = fMultDiv2((pSrc[8] - pSrc[16]), C31);
    pD = pSrc[0] >> 1;
    pDst[0] = (pD + (r1 >> 1)) >> 1;
    r1 = pD - (r1 >> 2);

    /* imaginary part */
    s1 = pSrc[9] + pSrc[17];
    s2 = fMultDiv2((pSrc[9] - pSrc[17]), C31);
    pD = pSrc[1] >> 1;
    pDst[1] = (pD + (s1 >> 1)) >> 1;
    s1 = pD - (s1 >> 2);

    /* combination */
    re = (r1 - s2) >> 0;
    im = (s1 + r2) >> 0;
    vre = *pVecRe++;
    vim = *pVecIm++;
    cplxMultDiv2(&pDst[3], &pDst[2], im, re, vre, vim);

    re = (r1 + s2) >> 0;
    im = (s1 - r2) >> 0;
    vre = *pVecRe++;
    vim = *pVecIm++;
    cplxMultDiv2(&pDst[5], &pDst[4], im, re, vre, vim);

    pDst += 6;
    pSrc += 2;
  }
  /* sample 0,1 are shifted right by 2 before output */
  /* sample 2,3 is shifted right by 1 and complex multiplied with (0.0,+1.0) */
  /* sample 4,5 is shifted right by 1 and complex multiplied with (-1.0,0.0) */
  r1 = pSrc[8] + pSrc[16];
  r2 = fMultDiv2((pSrc[8] - pSrc[16]), C31);
  pD = pSrc[0] >> 1;
  pDst[0] = (pD + (r1 >> 1)) >> 1;
  r1 = pD - (r1 >> 2);

  /* imaginary part */
  s1 = pSrc[9] + pSrc[17];
  s2 = fMultDiv2((pSrc[9] - pSrc[17]), C31);
  pD = pSrc[1] >> 1;
  pDst[1] = (pD + (s1 >> 1)) >> 1;
  s1 = pD - (s1 >> 2);

  /* combination */
  pDst[2] = (s1 + r2) >> 1;
  pDst[3] = (s2 - r1) >> 1;
  pDst[4] = -((r1 + s2) >> 1);
  pDst[5] = (r2 - s1) >> 1;

  /* Perform 3 times the fft of length 4. The input samples are at the address
  of aDst and the output samples are at the address of pInput. The input vector
  for the fft of length 4 is built of the interleaved samples in aDst, the
  output samples are stored consecutively at the address of pInput.
  */
  pSrc = aDst;
  pDst = pInput;
  for (i = 0; i < 3; i++) {
    /* inline FFT4 merged with incoming resorting loop */
    FIXP_DBL a00, a10, a20, a30, tmp0, tmp1;

    a00 = (pSrc[0] + pSrc[12]) >> 1; /* Re A + Re B */
    a10 = (pSrc[6] + pSrc[18]) >> 1; /* Re C + Re D */
    a20 = (pSrc[1] + pSrc[13]) >> 1; /* Im A + Im B */
    a30 = (pSrc[7] + pSrc[19]) >> 1; /* Im C + Im D */

    pDst[0] = a00 + a10; /* Re A' = Re A + Re B + Re C + Re D */
    pDst[1] = a20 + a30; /* Im A' = Im A + Im B + Im C + Im D */

    tmp0 = a00 - pSrc[12]; /* Re A - Re B */
    tmp1 = a20 - pSrc[13]; /* Im A - Im B */

    pDst[12] = a00 - a10; /* Re C' = Re A + Re B - Re C - Re D */
    pDst[13] = a20 - a30; /* Im C' = Im A + Im B - Im C - Im D */

    a10 = a10 - pSrc[18]; /* Re C - Re D */
    a30 = a30 - pSrc[19]; /* Im C - Im D */

    pDst[6] = tmp0 + a30;  /* Re B' = Re A - Re B + Im C - Im D */
    pDst[18] = tmp0 - a30; /* Re D' = Re A - Re B - Im C + Im D */
    pDst[7] = tmp1 - a10;  /* Im B' = Im A - Im B - Re C + Re D */
    pDst[19] = tmp1 + a10; /* Im D' = Im A - Im B + Re C - Re D */

    pSrc += 2;
    pDst += 2;
  }
}
592 #endif /* FUNCTION_fft12 */
593 
594 #ifndef FUNCTION_fft15
595 
596 #define N3 3
597 #define N5 5
598 #define N6 6
599 #define N15 15
600 
/* Performs the FFT of length 15. It is split into FFTs of length 3 and
 * length 5.
 * \param pInput  15 interleaved complex samples (re, im); transformed in
 *                place via two temporary buffers, output scaled down
 *                relative to the input. */
static inline void fft15(FIXP_DBL *pInput) {
  FIXP_DBL aDst[2 * N15];  /* output of the five length-3 FFTs */
  FIXP_DBL aDst1[2 * N15]; /* output of the three length-5 FFTs */
  int i, k, l;

  /* Sort input vector for fft's of length 3
  input3(0:2)   = [input(0) input(5) input(10)];
  input3(3:5)   = [input(3) input(8) input(13)];
  input3(6:8)   = [input(6) input(11) input(1)];
  input3(9:11)  = [input(9) input(14) input(4)];
  input3(12:14) = [input(12) input(2) input(7)]; */
  {
    const FIXP_DBL *pSrc = pInput;
    FIXP_DBL *RESTRICT pDst = aDst;
    /* Merge 3 loops into one, skip call of fft3 */
    for (i = 0, l = 0, k = 0; i < N5; i++, k += 6) {
      pDst[k + 0] = pSrc[l];
      pDst[k + 1] = pSrc[l + 1];
      l += 2 * N5;
      if (l >= (2 * N15)) l -= (2 * N15);

      pDst[k + 2] = pSrc[l];
      pDst[k + 3] = pSrc[l + 1];
      l += 2 * N5;
      if (l >= (2 * N15)) l -= (2 * N15);
      pDst[k + 4] = pSrc[l];
      pDst[k + 5] = pSrc[l + 1];
      l += (2 * N5) + (2 * N3);
      if (l >= (2 * N15)) l -= (2 * N15);

      /* fft3 merged with shift right by 2 loop */
      FIXP_DBL r1, r2, r3;
      FIXP_DBL s1, s2;
      /* real part */
      r1 = pDst[k + 2] + pDst[k + 4];
      r2 = fMult((pDst[k + 2] - pDst[k + 4]), C31);
      s1 = pDst[k + 0];
      pDst[k + 0] = (s1 + r1) >> 2;
      r1 = s1 - (r1 >> 1);

      /* imaginary part */
      s1 = pDst[k + 3] + pDst[k + 5];
      s2 = fMult((pDst[k + 3] - pDst[k + 5]), C31);
      r3 = pDst[k + 1];
      pDst[k + 1] = (r3 + s1) >> 2;
      s1 = r3 - (s1 >> 1);

      /* combination */
      pDst[k + 2] = (r1 - s2) >> 2;
      pDst[k + 4] = (r1 + s2) >> 2;
      pDst[k + 3] = (s1 + r2) >> 2;
      pDst[k + 5] = (s1 - r2) >> 2;
    }
  }
  /* Sort input vector for fft's of length 5
  input5(0:4)   = [output3(0) output3(3) output3(6) output3(9) output3(12)];
  input5(5:9)   = [output3(1) output3(4) output3(7) output3(10) output3(13)];
  input5(10:14) = [output3(2) output3(5) output3(8) output3(11) output3(14)]; */
  /* Merge 2 loops into one, brings about 10% */
  {
    const FIXP_DBL *pSrc = aDst;
    FIXP_DBL *RESTRICT pDst = aDst1;
    for (i = 0, l = 0, k = 0; i < N3; i++, k += 10) {
      l = 2 * i;
      pDst[k + 0] = pSrc[l + 0];
      pDst[k + 1] = pSrc[l + 1];
      pDst[k + 2] = pSrc[l + 0 + (2 * N3)];
      pDst[k + 3] = pSrc[l + 1 + (2 * N3)];
      pDst[k + 4] = pSrc[l + 0 + (4 * N3)];
      pDst[k + 5] = pSrc[l + 1 + (4 * N3)];
      pDst[k + 6] = pSrc[l + 0 + (6 * N3)];
      pDst[k + 7] = pSrc[l + 1 + (6 * N3)];
      pDst[k + 8] = pSrc[l + 0 + (8 * N3)];
      pDst[k + 9] = pSrc[l + 1 + (8 * N3)];
      fft5(&pDst[k]);
    }
  }
  /* Sort output vector of length 15
  output = [out5(0)  out5(6)  out5(12) out5(3)  out5(9)
            out5(10) out5(1)  out5(7)  out5(13) out5(4)
            out5(5)  out5(11) out5(2)  out5(8)  out5(14)]; */
  /* optimize clumsy loop, brings about 5% */
  {
    const FIXP_DBL *pSrc = aDst1;
    FIXP_DBL *RESTRICT pDst = pInput;
    for (i = 0, l = 0, k = 0; i < N3; i++, k += 10) {
      pDst[k + 0] = pSrc[l];
      pDst[k + 1] = pSrc[l + 1];
      l += (2 * N6);
      if (l >= (2 * N15)) l -= (2 * N15);
      pDst[k + 2] = pSrc[l];
      pDst[k + 3] = pSrc[l + 1];
      l += (2 * N6);
      if (l >= (2 * N15)) l -= (2 * N15);
      pDst[k + 4] = pSrc[l];
      pDst[k + 5] = pSrc[l + 1];
      l += (2 * N6);
      if (l >= (2 * N15)) l -= (2 * N15);
      pDst[k + 6] = pSrc[l];
      pDst[k + 7] = pSrc[l + 1];
      l += (2 * N6);
      if (l >= (2 * N15)) l -= (2 * N15);
      pDst[k + 8] = pSrc[l];
      pDst[k + 9] = pSrc[l + 1];
      l += 2; /* no modulo check needed, it cannot occur */
    }
  }
}
711 #endif /* FUNCTION_fft15 */
712 
713 /*
714  Select shift placement.
715  Some processors like ARM may shift "for free" in combination with an addition
716  or substraction, but others don't so either combining shift with +/- or reduce
717  the total amount or shift operations is optimal
718  */
719 #if !defined(__arm__)
720 #define SHIFT_A >> 1
721 #define SHIFT_B
722 #else
723 #define SHIFT_A
724 #define SHIFT_B >> 1
725 #endif
726 
727 #ifndef FUNCTION_fft_16 /* we check, if fft_16 (FIXP_DBL *) is not yet defined \
728                          */
729 
/* This defines prevents this array to be declared twice, if 16-bit fft is
 * enabled too */
#define FUNCTION_DATA_fft_16_w16
/* Twiddle factors for the 16-point FFT as (re, im) pairs:
   0x7641af3d ~ 0.92388 and 0x30fbc54d ~ 0.38268, i.e. cos/sin of pi/8 —
   presumably the non-trivial twiddles of the final radix stage; confirm
   against the fft_16 usage below. */
static const FIXP_STP fft16_w16[2] = {STCP(0x7641af3d, 0x30fbc54d),
                                      STCP(0x30fbc54d, 0x7641af3d)};
735 
736 LNK_SECTION_CODE_L1
fft_16(FIXP_DBL * RESTRICT x)737 inline void fft_16(FIXP_DBL *RESTRICT x) {
738   FIXP_DBL vr, ur;
739   FIXP_DBL vr2, ur2;
740   FIXP_DBL vr3, ur3;
741   FIXP_DBL vr4, ur4;
742   FIXP_DBL vi, ui;
743   FIXP_DBL vi2, ui2;
744   FIXP_DBL vi3, ui3;
745 
746   vr = (x[0] >> 1) + (x[16] >> 1);       /* Re A + Re B */
747   ur = (x[1] >> 1) + (x[17] >> 1);       /* Im A + Im B */
748   vi = (x[8] SHIFT_A) + (x[24] SHIFT_A); /* Re C + Re D */
749   ui = (x[9] SHIFT_A) + (x[25] SHIFT_A); /* Im C + Im D */
750   x[0] = vr + (vi SHIFT_B);              /* Re A' = ReA + ReB +ReC + ReD */
751   x[1] = ur + (ui SHIFT_B);              /* Im A' = sum of imag values */
752 
753   vr2 = (x[4] >> 1) + (x[20] >> 1); /* Re A + Re B */
754   ur2 = (x[5] >> 1) + (x[21] >> 1); /* Im A + Im B */
755 
756   x[4] = vr - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
757   x[5] = ur - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */
758   vr -= x[16];              /* Re A - Re B */
759   vi = (vi SHIFT_B)-x[24];  /* Re C - Re D */
760   ur -= x[17];              /* Im A - Im B */
761   ui = (ui SHIFT_B)-x[25];  /* Im C - Im D */
762 
763   vr3 = (x[2] >> 1) + (x[18] >> 1); /* Re A + Re B */
764   ur3 = (x[3] >> 1) + (x[19] >> 1); /* Im A + Im B */
765 
766   x[2] = ui + vr; /* Re B' = Im C - Im D  + Re A - Re B */
767   x[3] = ur - vi; /* Im B'= -Re C + Re D + Im A - Im B */
768 
769   vr4 = (x[6] >> 1) + (x[22] >> 1); /* Re A + Re B */
770   ur4 = (x[7] >> 1) + (x[23] >> 1); /* Im A + Im B */
771 
772   x[6] = vr - ui; /* Re D' = -Im C + Im D + Re A - Re B */
773   x[7] = vi + ur; /* Im D'= Re C - Re D + Im A - Im B */
774 
775   vi2 = (x[12] SHIFT_A) + (x[28] SHIFT_A); /* Re C + Re D */
776   ui2 = (x[13] SHIFT_A) + (x[29] SHIFT_A); /* Im C + Im D */
777   x[8] = vr2 + (vi2 SHIFT_B);              /* Re A' = ReA + ReB +ReC + ReD */
778   x[9] = ur2 + (ui2 SHIFT_B);              /* Im A' = sum of imag values */
779   x[12] = vr2 - (vi2 SHIFT_B);             /* Re C' = -(ReC+ReD) + (ReA+ReB) */
780   x[13] = ur2 - (ui2 SHIFT_B);             /* Im C' = -Im C -Im D +Im A +Im B */
781   vr2 -= x[20];                            /* Re A - Re B */
782   ur2 -= x[21];                            /* Im A - Im B */
783   vi2 = (vi2 SHIFT_B)-x[28];               /* Re C - Re D */
784   ui2 = (ui2 SHIFT_B)-x[29];               /* Im C - Im D */
785 
786   vi = (x[10] SHIFT_A) + (x[26] SHIFT_A); /* Re C + Re D */
787   ui = (x[11] SHIFT_A) + (x[27] SHIFT_A); /* Im C + Im D */
788 
789   x[10] = ui2 + vr2; /* Re B' = Im C - Im D  + Re A - Re B */
790   x[11] = ur2 - vi2; /* Im B'= -Re C + Re D + Im A - Im B */
791 
792   vi3 = (x[14] SHIFT_A) + (x[30] SHIFT_A); /* Re C + Re D */
793   ui3 = (x[15] SHIFT_A) + (x[31] SHIFT_A); /* Im C + Im D */
794 
795   x[14] = vr2 - ui2; /* Re D' = -Im C + Im D + Re A - Re B */
796   x[15] = vi2 + ur2; /* Im D'= Re C - Re D + Im A - Im B */
797 
798   x[16] = vr3 + (vi SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
799   x[17] = ur3 + (ui SHIFT_B); /* Im A' = sum of imag values */
800   x[20] = vr3 - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
801   x[21] = ur3 - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */
802   vr3 -= x[18];               /* Re A - Re B */
803   ur3 -= x[19];               /* Im A - Im B */
804   vi = (vi SHIFT_B)-x[26];    /* Re C - Re D */
805   ui = (ui SHIFT_B)-x[27];    /* Im C - Im D */
806   x[18] = ui + vr3;           /* Re B' = Im C - Im D  + Re A - Re B */
807   x[19] = ur3 - vi;           /* Im B'= -Re C + Re D + Im A - Im B */
808 
809   x[24] = vr4 + (vi3 SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
810   x[28] = vr4 - (vi3 SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
811   x[25] = ur4 + (ui3 SHIFT_B); /* Im A' = sum of imag values */
812   x[29] = ur4 - (ui3 SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */
813   vr4 -= x[22];                /* Re A - Re B */
814   ur4 -= x[23];                /* Im A - Im B */
815 
816   x[22] = vr3 - ui; /* Re D' = -Im C + Im D + Re A - Re B */
817   x[23] = vi + ur3; /* Im D'= Re C - Re D + Im A - Im B */
818 
819   vi3 = (vi3 SHIFT_B)-x[30]; /* Re C - Re D */
820   ui3 = (ui3 SHIFT_B)-x[31]; /* Im C - Im D */
821   x[26] = ui3 + vr4;         /* Re B' = Im C - Im D  + Re A - Re B */
822   x[30] = vr4 - ui3;         /* Re D' = -Im C + Im D + Re A - Re B */
823   x[27] = ur4 - vi3;         /* Im B'= -Re C + Re D + Im A - Im B */
824   x[31] = vi3 + ur4;         /* Im D'= Re C - Re D + Im A - Im B */
825 
826   // xt1 =  0
827   // xt2 =  8
828   vr = x[8];
829   vi = x[9];
830   ur = x[0] >> 1;
831   ui = x[1] >> 1;
832   x[0] = ur + (vr >> 1);
833   x[1] = ui + (vi >> 1);
834   x[8] = ur - (vr >> 1);
835   x[9] = ui - (vi >> 1);
836 
837   // xt1 =  4
838   // xt2 = 12
839   vr = x[13];
840   vi = x[12];
841   ur = x[4] >> 1;
842   ui = x[5] >> 1;
843   x[4] = ur + (vr >> 1);
844   x[5] = ui - (vi >> 1);
845   x[12] = ur - (vr >> 1);
846   x[13] = ui + (vi >> 1);
847 
848   // xt1 = 16
849   // xt2 = 24
850   vr = x[24];
851   vi = x[25];
852   ur = x[16] >> 1;
853   ui = x[17] >> 1;
854   x[16] = ur + (vr >> 1);
855   x[17] = ui + (vi >> 1);
856   x[24] = ur - (vr >> 1);
857   x[25] = ui - (vi >> 1);
858 
859   // xt1 = 20
860   // xt2 = 28
861   vr = x[29];
862   vi = x[28];
863   ur = x[20] >> 1;
864   ui = x[21] >> 1;
865   x[20] = ur + (vr >> 1);
866   x[21] = ui - (vi >> 1);
867   x[28] = ur - (vr >> 1);
868   x[29] = ui + (vi >> 1);
869 
870   // xt1 =  2
871   // xt2 = 10
872   SUMDIFF_PIFOURTH(vi, vr, x[10], x[11])
873   // vr = fMultDiv2((x[11] + x[10]),W_PiFOURTH);
874   // vi = fMultDiv2((x[11] - x[10]),W_PiFOURTH);
875   ur = x[2];
876   ui = x[3];
877   x[2] = (ur >> 1) + vr;
878   x[3] = (ui >> 1) + vi;
879   x[10] = (ur >> 1) - vr;
880   x[11] = (ui >> 1) - vi;
881 
882   // xt1 =  6
883   // xt2 = 14
884   SUMDIFF_PIFOURTH(vr, vi, x[14], x[15])
885   ur = x[6];
886   ui = x[7];
887   x[6] = (ur >> 1) + vr;
888   x[7] = (ui >> 1) - vi;
889   x[14] = (ur >> 1) - vr;
890   x[15] = (ui >> 1) + vi;
891 
892   // xt1 = 18
893   // xt2 = 26
894   SUMDIFF_PIFOURTH(vi, vr, x[26], x[27])
895   ur = x[18];
896   ui = x[19];
897   x[18] = (ur >> 1) + vr;
898   x[19] = (ui >> 1) + vi;
899   x[26] = (ur >> 1) - vr;
900   x[27] = (ui >> 1) - vi;
901 
902   // xt1 = 22
903   // xt2 = 30
904   SUMDIFF_PIFOURTH(vr, vi, x[30], x[31])
905   ur = x[22];
906   ui = x[23];
907   x[22] = (ur >> 1) + vr;
908   x[23] = (ui >> 1) - vi;
909   x[30] = (ur >> 1) - vr;
910   x[31] = (ui >> 1) + vi;
911 
912   // xt1 =  0
913   // xt2 = 16
914   vr = x[16];
915   vi = x[17];
916   ur = x[0] >> 1;
917   ui = x[1] >> 1;
918   x[0] = ur + (vr >> 1);
919   x[1] = ui + (vi >> 1);
920   x[16] = ur - (vr >> 1);
921   x[17] = ui - (vi >> 1);
922 
923   // xt1 =  8
924   // xt2 = 24
925   vi = x[24];
926   vr = x[25];
927   ur = x[8] >> 1;
928   ui = x[9] >> 1;
929   x[8] = ur + (vr >> 1);
930   x[9] = ui - (vi >> 1);
931   x[24] = ur - (vr >> 1);
932   x[25] = ui + (vi >> 1);
933 
934   // xt1 =  2
935   // xt2 = 18
936   cplxMultDiv2(&vi, &vr, x[19], x[18], fft16_w16[0]);
937   ur = x[2];
938   ui = x[3];
939   x[2] = (ur >> 1) + vr;
940   x[3] = (ui >> 1) + vi;
941   x[18] = (ur >> 1) - vr;
942   x[19] = (ui >> 1) - vi;
943 
944   // xt1 = 10
945   // xt2 = 26
946   cplxMultDiv2(&vr, &vi, x[27], x[26], fft16_w16[0]);
947   ur = x[10];
948   ui = x[11];
949   x[10] = (ur >> 1) + vr;
950   x[11] = (ui >> 1) - vi;
951   x[26] = (ur >> 1) - vr;
952   x[27] = (ui >> 1) + vi;
953 
954   // xt1 =  4
955   // xt2 = 20
956   SUMDIFF_PIFOURTH(vi, vr, x[20], x[21])
957   ur = x[4];
958   ui = x[5];
959   x[4] = (ur >> 1) + vr;
960   x[5] = (ui >> 1) + vi;
961   x[20] = (ur >> 1) - vr;
962   x[21] = (ui >> 1) - vi;
963 
964   // xt1 = 12
965   // xt2 = 28
966   SUMDIFF_PIFOURTH(vr, vi, x[28], x[29])
967   ur = x[12];
968   ui = x[13];
969   x[12] = (ur >> 1) + vr;
970   x[13] = (ui >> 1) - vi;
971   x[28] = (ur >> 1) - vr;
972   x[29] = (ui >> 1) + vi;
973 
974   // xt1 =  6
975   // xt2 = 22
976   cplxMultDiv2(&vi, &vr, x[23], x[22], fft16_w16[1]);
977   ur = x[6];
978   ui = x[7];
979   x[6] = (ur >> 1) + vr;
980   x[7] = (ui >> 1) + vi;
981   x[22] = (ur >> 1) - vr;
982   x[23] = (ui >> 1) - vi;
983 
984   // xt1 = 14
985   // xt2 = 30
986   cplxMultDiv2(&vr, &vi, x[31], x[30], fft16_w16[1]);
987   ur = x[14];
988   ui = x[15];
989   x[14] = (ur >> 1) + vr;
990   x[15] = (ui >> 1) - vi;
991   x[30] = (ur >> 1) - vr;
992   x[31] = (ui >> 1) + vi;
993 }
994 #endif /* FUNCTION_fft_16 */
995 
#ifndef FUNCTION_fft_32
/* Twiddle factors for fft_32, stored as Q31 cos/sin pairs (STCP(re, im)).
   The values correspond to cos/sin(2*pi*k/32) for k = 2, 6, 1, 3, 5, 7
   (e.g. 0x7641af3d ~ 0.92388 = cos(pi/8), 0x30fbc54d ~ 0.38268 = sin(pi/8)).
   The k = 4 twiddle (pi/4) is handled separately via W_PiFOURTH. */
static const FIXP_STP fft32_w32[6] = {
    STCP(0x7641af3d, 0x30fbc54d), STCP(0x30fbc54d, 0x7641af3d),
    STCP(0x7d8a5f40, 0x18f8b83c), STCP(0x6a6d98a4, 0x471cece7),
    STCP(0x471cece7, 0x6a6d98a4), STCP(0x18f8b83c, 0x7d8a5f40)};
/* cos(pi/4) == sin(pi/4) ~ 0.70710678 in Q31. */
#define W_PiFOURTH STC(0x5a82799a)
1002 
/**
 * \brief In-place complex FFT of length 32 over interleaved re/im pairs
 *        (64 FIXP_DBL values in _x). Implemented as a fused radix-4 first
 *        stage ("1+2 stage radix 4") followed by hand-unrolled radix-2
 *        butterfly stages using the fft32_w32[] twiddles and W_PiFOURTH.
 *        The data is scaled down internally; the caller compensates via
 *        SCALEFACTOR32 (see fft()).
 *        NOTE(review): SHIFT_A / SHIFT_B are configuration-dependent
 *        pre/post-shift macros defined elsewhere in this file.
 */
LNK_SECTION_CODE_L1
inline void fft_32(FIXP_DBL *const _x) {
  /*
   * 1+2 stage radix 4
   */

  /////////////////////////////////////////////////////////////////////////////////////////
  /* Stage 1+2: four interleaved radix-4 butterflies over the whole buffer. */
  {
    FIXP_DBL *const x = _x;
    FIXP_DBL vi, ui;
    FIXP_DBL vi2, ui2;
    FIXP_DBL vi3, ui3;
    FIXP_DBL vr, ur;
    FIXP_DBL vr2, ur2;
    FIXP_DBL vr3, ur3;
    FIXP_DBL vr4, ur4;

    // i = 0
    vr = (x[0] + x[32]) >> 1;     /* Re A + Re B */
    ur = (x[1] + x[33]) >> 1;     /* Im A + Im B */
    vi = (x[16] + x[48]) SHIFT_A; /* Re C + Re D */
    ui = (x[17] + x[49]) SHIFT_A; /* Im C + Im D */

    x[0] = vr + (vi SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[1] = ur + (ui SHIFT_B); /* Im A' = sum of imag values */

    vr2 = (x[4] + x[36]) >> 1; /* Re A + Re B */
    ur2 = (x[5] + x[37]) >> 1; /* Im A + Im B */

    x[4] = vr - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[5] = ur - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr -= x[32];             /* Re A - Re B */
    ur -= x[33];             /* Im A - Im B */
    vi = (vi SHIFT_B)-x[48]; /* Re C - Re D */
    ui = (ui SHIFT_B)-x[49]; /* Im C - Im D */

    vr3 = (x[2] + x[34]) >> 1; /* Re A + Re B */
    ur3 = (x[3] + x[35]) >> 1; /* Im A + Im B */

    x[2] = ui + vr; /* Re B' = Im C - Im D  + Re A - Re B */
    x[3] = ur - vi; /* Im B'= -Re C + Re D + Im A - Im B */

    vr4 = (x[6] + x[38]) >> 1; /* Re A + Re B */
    ur4 = (x[7] + x[39]) >> 1; /* Im A + Im B */

    x[6] = vr - ui; /* Re D' = -Im C + Im D + Re A - Re B */
    x[7] = vi + ur; /* Im D'= Re C - Re D + Im A - Im B */

    // i=16
    vi = (x[20] + x[52]) SHIFT_A; /* Re C + Re D */
    ui = (x[21] + x[53]) SHIFT_A; /* Im C + Im D */

    x[16] = vr2 + (vi SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[17] = ur2 + (ui SHIFT_B); /* Im A' = sum of imag values */
    x[20] = vr2 - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[21] = ur2 - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr2 -= x[36];            /* Re A - Re B */
    ur2 -= x[37];            /* Im A - Im B */
    vi = (vi SHIFT_B)-x[52]; /* Re C - Re D */
    ui = (ui SHIFT_B)-x[53]; /* Im C - Im D */

    vi2 = (x[18] + x[50]) SHIFT_A; /* Re C + Re D */
    ui2 = (x[19] + x[51]) SHIFT_A; /* Im C + Im D */

    x[18] = ui + vr2; /* Re B' = Im C - Im D  + Re A - Re B */
    x[19] = ur2 - vi; /* Im B'= -Re C + Re D + Im A - Im B */

    vi3 = (x[22] + x[54]) SHIFT_A; /* Re C + Re D */
    ui3 = (x[23] + x[55]) SHIFT_A; /* Im C + Im D */

    x[22] = vr2 - ui; /* Re D' = -Im C + Im D + Re A - Re B */
    x[23] = vi + ur2; /* Im D'= Re C - Re D + Im A - Im B */

    // i = 32

    x[32] = vr3 + (vi2 SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[33] = ur3 + (ui2 SHIFT_B); /* Im A' = sum of imag values */
    x[36] = vr3 - (vi2 SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[37] = ur3 - (ui2 SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr3 -= x[34];              /* Re A - Re B */
    ur3 -= x[35];              /* Im A - Im B */
    vi2 = (vi2 SHIFT_B)-x[50]; /* Re C - Re D */
    ui2 = (ui2 SHIFT_B)-x[51]; /* Im C - Im D */

    x[34] = ui2 + vr3; /* Re B' = Im C - Im D  + Re A - Re B */
    x[35] = ur3 - vi2; /* Im B'= -Re C + Re D + Im A - Im B */

    // i=48

    x[48] = vr4 + (vi3 SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[52] = vr4 - (vi3 SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[49] = ur4 + (ui3 SHIFT_B); /* Im A' = sum of imag values */
    x[53] = ur4 - (ui3 SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr4 -= x[38]; /* Re A - Re B */
    ur4 -= x[39]; /* Im A - Im B */

    x[38] = vr3 - ui2; /* Re D' = -Im C + Im D + Re A - Re B */
    x[39] = vi2 + ur3; /* Im D'= Re C - Re D + Im A - Im B */

    vi3 = (vi3 SHIFT_B)-x[54]; /* Re C - Re D */
    ui3 = (ui3 SHIFT_B)-x[55]; /* Im C - Im D */

    x[50] = ui3 + vr4; /* Re B' = Im C - Im D  + Re A - Re B */
    x[54] = vr4 - ui3; /* Re D' = -Im C + Im D + Re A - Re B */
    x[51] = ur4 - vi3; /* Im B'= -Re C + Re D + Im A - Im B */
    x[55] = vi3 + ur4; /* Im D'= Re C - Re D + Im A - Im B */

    // i=8
    vr = (x[8] + x[40]) >> 1;     /* Re A + Re B */
    ur = (x[9] + x[41]) >> 1;     /* Im A + Im B */
    vi = (x[24] + x[56]) SHIFT_A; /* Re C + Re D */
    ui = (x[25] + x[57]) SHIFT_A; /* Im C + Im D */

    x[8] = vr + (vi SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[9] = ur + (ui SHIFT_B); /* Im A' = sum of imag values */

    vr2 = (x[12] + x[44]) >> 1; /* Re A + Re B */
    ur2 = (x[13] + x[45]) >> 1; /* Im A + Im B */

    x[12] = vr - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[13] = ur - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr -= x[40];             /* Re A - Re B */
    ur -= x[41];             /* Im A - Im B */
    vi = (vi SHIFT_B)-x[56]; /* Re C - Re D */
    ui = (ui SHIFT_B)-x[57]; /* Im C - Im D */

    vr3 = (x[10] + x[42]) >> 1; /* Re A + Re B */
    ur3 = (x[11] + x[43]) >> 1; /* Im A + Im B */

    x[10] = ui + vr; /* Re B' = Im C - Im D  + Re A - Re B */
    x[11] = ur - vi; /* Im B'= -Re C + Re D + Im A - Im B */

    vr4 = (x[14] + x[46]) >> 1; /* Re A + Re B */
    ur4 = (x[15] + x[47]) >> 1; /* Im A + Im B */

    x[14] = vr - ui; /* Re D' = -Im C + Im D + Re A - Re B */
    x[15] = vi + ur; /* Im D'= Re C - Re D + Im A - Im B */

    // i=24
    vi = (x[28] + x[60]) SHIFT_A; /* Re C + Re D */
    ui = (x[29] + x[61]) SHIFT_A; /* Im C + Im D */

    x[24] = vr2 + (vi SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[28] = vr2 - (vi SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[25] = ur2 + (ui SHIFT_B); /* Im A' = sum of imag values */
    x[29] = ur2 - (ui SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr2 -= x[44];            /* Re A - Re B */
    ur2 -= x[45];            /* Im A - Im B */
    vi = (vi SHIFT_B)-x[60]; /* Re C - Re D */
    ui = (ui SHIFT_B)-x[61]; /* Im C - Im D */

    vi2 = (x[26] + x[58]) SHIFT_A; /* Re C + Re D */
    ui2 = (x[27] + x[59]) SHIFT_A; /* Im C + Im D */

    x[26] = ui + vr2; /* Re B' = Im C - Im D  + Re A - Re B */
    x[27] = ur2 - vi; /* Im B'= -Re C + Re D + Im A - Im B */

    vi3 = (x[30] + x[62]) SHIFT_A; /* Re C + Re D */
    ui3 = (x[31] + x[63]) SHIFT_A; /* Im C + Im D */

    x[30] = vr2 - ui; /* Re D' = -Im C + Im D + Re A - Re B */
    x[31] = vi + ur2; /* Im D'= Re C - Re D + Im A - Im B */

    // i=40

    x[40] = vr3 + (vi2 SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[44] = vr3 - (vi2 SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[41] = ur3 + (ui2 SHIFT_B); /* Im A' = sum of imag values */
    x[45] = ur3 - (ui2 SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr3 -= x[42];              /* Re A - Re B */
    ur3 -= x[43];              /* Im A - Im B */
    vi2 = (vi2 SHIFT_B)-x[58]; /* Re C - Re D */
    ui2 = (ui2 SHIFT_B)-x[59]; /* Im C - Im D */

    x[42] = ui2 + vr3; /* Re B' = Im C - Im D  + Re A - Re B */
    x[43] = ur3 - vi2; /* Im B'= -Re C + Re D + Im A - Im B */

    // i=56

    x[56] = vr4 + (vi3 SHIFT_B); /* Re A' = ReA + ReB +ReC + ReD */
    x[60] = vr4 - (vi3 SHIFT_B); /* Re C' = -(ReC+ReD) + (ReA+ReB) */
    x[57] = ur4 + (ui3 SHIFT_B); /* Im A' = sum of imag values */
    x[61] = ur4 - (ui3 SHIFT_B); /* Im C' = -Im C -Im D +Im A +Im B */

    vr4 -= x[46]; /* Re A - Re B */
    ur4 -= x[47]; /* Im A - Im B */

    x[46] = vr3 - ui2; /* Re D' = -Im C + Im D + Re A - Re B */
    x[47] = vi2 + ur3; /* Im D'= Re C - Re D + Im A - Im B */

    vi3 = (vi3 SHIFT_B)-x[62]; /* Re C - Re D */
    ui3 = (ui3 SHIFT_B)-x[63]; /* Im C - Im D */

    x[58] = ui3 + vr4; /* Re B' = Im C - Im D  + Re A - Re B */
    x[62] = vr4 - ui3; /* Re D' = -Im C + Im D + Re A - Re B */
    x[59] = ur4 - vi3; /* Im B'= -Re C + Re D + Im A - Im B */
    x[63] = vi3 + ur4; /* Im D'= Re C - Re D + Im A - Im B */
  }

  /* Next stage: identical butterfly pattern applied to four consecutive
     sub-blocks of 8 complex samples each (xt advances by 16 values). */
  {
    FIXP_DBL *xt = _x;

    int j = 4;
    do {
      FIXP_DBL vi, ui, vr, ur;

      vr = xt[8];
      vi = xt[9];
      ur = xt[0] >> 1;
      ui = xt[1] >> 1;
      xt[0] = ur + (vr >> 1);
      xt[1] = ui + (vi >> 1);
      xt[8] = ur - (vr >> 1);
      xt[9] = ui - (vi >> 1);

      vr = xt[13];
      vi = xt[12];
      ur = xt[4] >> 1;
      ui = xt[5] >> 1;
      xt[4] = ur + (vr >> 1);
      xt[5] = ui - (vi >> 1);
      xt[12] = ur - (vr >> 1);
      xt[13] = ui + (vi >> 1);

      SUMDIFF_PIFOURTH(vi, vr, xt[10], xt[11])
      ur = xt[2];
      ui = xt[3];
      xt[2] = (ur >> 1) + vr;
      xt[3] = (ui >> 1) + vi;
      xt[10] = (ur >> 1) - vr;
      xt[11] = (ui >> 1) - vi;

      SUMDIFF_PIFOURTH(vr, vi, xt[14], xt[15])
      ur = xt[6];
      ui = xt[7];

      xt[6] = (ur >> 1) + vr;
      xt[7] = (ui >> 1) - vi;
      xt[14] = (ur >> 1) - vr;
      xt[15] = (ui >> 1) + vi;
      xt += 16;
    } while (--j != 0);
  }

  /* Final stages: radix-2 butterflies across the half/quarter strides,
     applying the fft32_w32[] twiddles via cplxMultDiv2. */
  {
    FIXP_DBL *const x = _x;
    FIXP_DBL vi, ui, vr, ur;

    vr = x[16];
    vi = x[17];
    ur = x[0] >> 1;
    ui = x[1] >> 1;
    x[0] = ur + (vr >> 1);
    x[1] = ui + (vi >> 1);
    x[16] = ur - (vr >> 1);
    x[17] = ui - (vi >> 1);

    vi = x[24];
    vr = x[25];
    ur = x[8] >> 1;
    ui = x[9] >> 1;
    x[8] = ur + (vr >> 1);
    x[9] = ui - (vi >> 1);
    x[24] = ur - (vr >> 1);
    x[25] = ui + (vi >> 1);

    vr = x[48];
    vi = x[49];
    ur = x[32] >> 1;
    ui = x[33] >> 1;
    x[32] = ur + (vr >> 1);
    x[33] = ui + (vi >> 1);
    x[48] = ur - (vr >> 1);
    x[49] = ui - (vi >> 1);

    vi = x[56];
    vr = x[57];
    ur = x[40] >> 1;
    ui = x[41] >> 1;
    x[40] = ur + (vr >> 1);
    x[41] = ui - (vi >> 1);
    x[56] = ur - (vr >> 1);
    x[57] = ui + (vi >> 1);

    cplxMultDiv2(&vi, &vr, x[19], x[18], fft32_w32[0]);
    ur = x[2];
    ui = x[3];
    x[2] = (ur >> 1) + vr;
    x[3] = (ui >> 1) + vi;
    x[18] = (ur >> 1) - vr;
    x[19] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[27], x[26], fft32_w32[0]);
    ur = x[10];
    ui = x[11];
    x[10] = (ur >> 1) + vr;
    x[11] = (ui >> 1) - vi;
    x[26] = (ur >> 1) - vr;
    x[27] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[51], x[50], fft32_w32[0]);
    ur = x[34];
    ui = x[35];
    x[34] = (ur >> 1) + vr;
    x[35] = (ui >> 1) + vi;
    x[50] = (ur >> 1) - vr;
    x[51] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[59], x[58], fft32_w32[0]);
    ur = x[42];
    ui = x[43];
    x[42] = (ur >> 1) + vr;
    x[43] = (ui >> 1) - vi;
    x[58] = (ur >> 1) - vr;
    x[59] = (ui >> 1) + vi;

    SUMDIFF_PIFOURTH(vi, vr, x[20], x[21])
    ur = x[4];
    ui = x[5];
    x[4] = (ur >> 1) + vr;
    x[5] = (ui >> 1) + vi;
    x[20] = (ur >> 1) - vr;
    x[21] = (ui >> 1) - vi;

    SUMDIFF_PIFOURTH(vr, vi, x[28], x[29])
    ur = x[12];
    ui = x[13];
    x[12] = (ur >> 1) + vr;
    x[13] = (ui >> 1) - vi;
    x[28] = (ur >> 1) - vr;
    x[29] = (ui >> 1) + vi;

    SUMDIFF_PIFOURTH(vi, vr, x[52], x[53])
    ur = x[36];
    ui = x[37];
    x[36] = (ur >> 1) + vr;
    x[37] = (ui >> 1) + vi;
    x[52] = (ur >> 1) - vr;
    x[53] = (ui >> 1) - vi;

    SUMDIFF_PIFOURTH(vr, vi, x[60], x[61])
    ur = x[44];
    ui = x[45];
    x[44] = (ur >> 1) + vr;
    x[45] = (ui >> 1) - vi;
    x[60] = (ur >> 1) - vr;
    x[61] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[23], x[22], fft32_w32[1]);
    ur = x[6];
    ui = x[7];
    x[6] = (ur >> 1) + vr;
    x[7] = (ui >> 1) + vi;
    x[22] = (ur >> 1) - vr;
    x[23] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[31], x[30], fft32_w32[1]);
    ur = x[14];
    ui = x[15];
    x[14] = (ur >> 1) + vr;
    x[15] = (ui >> 1) - vi;
    x[30] = (ur >> 1) - vr;
    x[31] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[55], x[54], fft32_w32[1]);
    ur = x[38];
    ui = x[39];
    x[38] = (ur >> 1) + vr;
    x[39] = (ui >> 1) + vi;
    x[54] = (ur >> 1) - vr;
    x[55] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[63], x[62], fft32_w32[1]);
    ur = x[46];
    ui = x[47];

    x[46] = (ur >> 1) + vr;
    x[47] = (ui >> 1) - vi;
    x[62] = (ur >> 1) - vr;
    x[63] = (ui >> 1) + vi;

    vr = x[32];
    vi = x[33];
    ur = x[0] >> 1;
    ui = x[1] >> 1;
    x[0] = ur + (vr >> 1);
    x[1] = ui + (vi >> 1);
    x[32] = ur - (vr >> 1);
    x[33] = ui - (vi >> 1);

    vi = x[48];
    vr = x[49];
    ur = x[16] >> 1;
    ui = x[17] >> 1;
    x[16] = ur + (vr >> 1);
    x[17] = ui - (vi >> 1);
    x[48] = ur - (vr >> 1);
    x[49] = ui + (vi >> 1);

    cplxMultDiv2(&vi, &vr, x[35], x[34], fft32_w32[2]);
    ur = x[2];
    ui = x[3];
    x[2] = (ur >> 1) + vr;
    x[3] = (ui >> 1) + vi;
    x[34] = (ur >> 1) - vr;
    x[35] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[51], x[50], fft32_w32[2]);
    ur = x[18];
    ui = x[19];
    x[18] = (ur >> 1) + vr;
    x[19] = (ui >> 1) - vi;
    x[50] = (ur >> 1) - vr;
    x[51] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[37], x[36], fft32_w32[0]);
    ur = x[4];
    ui = x[5];
    x[4] = (ur >> 1) + vr;
    x[5] = (ui >> 1) + vi;
    x[36] = (ur >> 1) - vr;
    x[37] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[53], x[52], fft32_w32[0]);
    ur = x[20];
    ui = x[21];
    x[20] = (ur >> 1) + vr;
    x[21] = (ui >> 1) - vi;
    x[52] = (ur >> 1) - vr;
    x[53] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[39], x[38], fft32_w32[3]);
    ur = x[6];
    ui = x[7];
    x[6] = (ur >> 1) + vr;
    x[7] = (ui >> 1) + vi;
    x[38] = (ur >> 1) - vr;
    x[39] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[55], x[54], fft32_w32[3]);
    ur = x[22];
    ui = x[23];
    x[22] = (ur >> 1) + vr;
    x[23] = (ui >> 1) - vi;
    x[54] = (ur >> 1) - vr;
    x[55] = (ui >> 1) + vi;

    SUMDIFF_PIFOURTH(vi, vr, x[40], x[41])
    ur = x[8];
    ui = x[9];
    x[8] = (ur >> 1) + vr;
    x[9] = (ui >> 1) + vi;
    x[40] = (ur >> 1) - vr;
    x[41] = (ui >> 1) - vi;

    SUMDIFF_PIFOURTH(vr, vi, x[56], x[57])
    ur = x[24];
    ui = x[25];
    x[24] = (ur >> 1) + vr;
    x[25] = (ui >> 1) - vi;
    x[56] = (ur >> 1) - vr;
    x[57] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[43], x[42], fft32_w32[4]);
    ur = x[10];
    ui = x[11];

    x[10] = (ur >> 1) + vr;
    x[11] = (ui >> 1) + vi;
    x[42] = (ur >> 1) - vr;
    x[43] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[59], x[58], fft32_w32[4]);
    ur = x[26];
    ui = x[27];
    x[26] = (ur >> 1) + vr;
    x[27] = (ui >> 1) - vi;
    x[58] = (ur >> 1) - vr;
    x[59] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[45], x[44], fft32_w32[1]);
    ur = x[12];
    ui = x[13];
    x[12] = (ur >> 1) + vr;
    x[13] = (ui >> 1) + vi;
    x[44] = (ur >> 1) - vr;
    x[45] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[61], x[60], fft32_w32[1]);
    ur = x[28];
    ui = x[29];
    x[28] = (ur >> 1) + vr;
    x[29] = (ui >> 1) - vi;
    x[60] = (ur >> 1) - vr;
    x[61] = (ui >> 1) + vi;

    cplxMultDiv2(&vi, &vr, x[47], x[46], fft32_w32[5]);
    ur = x[14];
    ui = x[15];
    x[14] = (ur >> 1) + vr;
    x[15] = (ui >> 1) + vi;
    x[46] = (ur >> 1) - vr;
    x[47] = (ui >> 1) - vi;

    cplxMultDiv2(&vr, &vi, x[63], x[62], fft32_w32[5]);
    ur = x[30];
    ui = x[31];
    x[30] = (ur >> 1) + vr;
    x[31] = (ui >> 1) - vi;
    x[62] = (ur >> 1) - vr;
    x[63] = (ui >> 1) + vi;
  }
}
#endif /* #ifndef FUNCTION_fft_32 */
1523 #endif /* #ifndef FUNCTION_fft_32 */
1524 
1525 /**
1526  * \brief Apply rotation vectors to a data buffer.
1527  * \param cl length of each row of input data.
1528  * \param l total length of input data.
1529  * \param pVecRe real part of rotation coefficient vector.
1530  * \param pVecIm imaginary part of rotation coefficient vector.
1531  */
1532 
1533 /*
1534    This defines patches each inaccurate 0x7FFF i.e. 0.9999 and uses 0x8000
1535    (-1.0) instead. At the end, the sign of the result is inverted
1536 */
1537 #define noFFT_APPLY_ROT_VECTOR_HQ
1538 
1539 #ifndef FUNCTION_fft_apply_rot_vector__FIXP_DBL
fft_apply_rot_vector(FIXP_DBL * RESTRICT pData,const int cl,const int l,const FIXP_STB * pVecRe,const FIXP_STB * pVecIm)1540 static inline void fft_apply_rot_vector(FIXP_DBL *RESTRICT pData, const int cl,
1541                                         const int l, const FIXP_STB *pVecRe,
1542                                         const FIXP_STB *pVecIm) {
1543   FIXP_DBL re, im;
1544   FIXP_STB vre, vim;
1545 
1546   int i, c;
1547 
1548   for (i = 0; i < cl; i++) {
1549     re = pData[2 * i];
1550     im = pData[2 * i + 1];
1551 
1552     pData[2 * i] = re >> 2;     /* * 0.25 */
1553     pData[2 * i + 1] = im >> 2; /* * 0.25 */
1554   }
1555   for (; i < l; i += cl) {
1556     re = pData[2 * i];
1557     im = pData[2 * i + 1];
1558 
1559     pData[2 * i] = re >> 2;     /* * 0.25 */
1560     pData[2 * i + 1] = im >> 2; /* * 0.25 */
1561 
1562     for (c = i + 1; c < i + cl; c++) {
1563       re = pData[2 * c] >> 1;
1564       im = pData[2 * c + 1] >> 1;
1565       vre = *pVecRe++;
1566       vim = *pVecIm++;
1567 
1568       cplxMultDiv2(&pData[2 * c + 1], &pData[2 * c], im, re, vre, vim);
1569     }
1570   }
1571 }
1572 #endif /* FUNCTION_fft_apply_rot_vector__FIXP_DBL */
1573 
/* select either switch case of function pointer. */
//#define FFT_TWO_STAGE_SWITCH_CASE
#ifndef FUNCTION_fftN2_func
/**
 * \brief Two-stage mixed-radix (Cooley-Tukey) FFT core: computes an FFT of
 *        size length = dim1 * dim2 by performing dim2 FFTs of length dim1,
 *        a rotation (twiddle) pass, and dim1 FFTs of length dim2.
 * \param pInput interleaved re/im input samples; also receives the output.
 * \param length total number of complex samples (must equal dim1 * dim2).
 * \param dim1 length of the first-stage sub-FFTs.
 * \param dim2 length of the second-stage sub-FFTs.
 * \param fft1 FFT function of length dim1 (used unless
 *             FFT_TWO_STAGE_SWITCH_CASE selects the switch dispatch).
 * \param fft2 FFT function of length dim2 (used unless
 *             FFT_TWO_STAGE_SWITCH_CASE selects the switch dispatch).
 * \param RotVectorReal real parts of the rotation coefficients.
 * \param RotVectorImag imaginary parts of the rotation coefficients.
 * \param aDst scratch buffer holding 2*length values (see fftN2 macro).
 * \param aDst2 scratch buffer holding 2*dim2 values (see fftN2 macro).
 */
static inline void fftN2_func(FIXP_DBL *pInput, const int length,
                              const int dim1, const int dim2,
                              void (*const fft1)(FIXP_DBL *),
                              void (*const fft2)(FIXP_DBL *),
                              const FIXP_STB *RotVectorReal,
                              const FIXP_STB *RotVectorImag, FIXP_DBL *aDst,
                              FIXP_DBL *aDst2) {
  /* The real part of the input samples are at the addresses with even indices
  and the imaginary part of the input samples are at the addresses with odd
  indices. The output samples are stored at the address of pInput
  */
  FIXP_DBL *pSrc, *pDst, *pDstOut;
  int i;

  FDK_ASSERT(length == dim1 * dim2);

  /* Perform dim2 times the fft of length dim1. The input samples are at the
  address of pSrc and the output samples are at the address of pDst. The input
  vector for the fft of length dim1 is built of the interleaved samples in pSrc,
  the output samples are stored consecutively.
  */
  pSrc = pInput;
  pDst = aDst;
  for (i = 0; i < dim2; i++) {
    /* Gather every dim2-th complex sample into a contiguous sub-FFT input. */
    for (int j = 0; j < dim1; j++) {
      pDst[2 * j] = pSrc[2 * j * dim2];
      pDst[2 * j + 1] = pSrc[2 * j * dim2 + 1];
    }

      /* fft of size dim1 */
#ifndef FFT_TWO_STAGE_SWITCH_CASE
    fft1(pDst);
#else
    switch (dim1) {
      case 2:
        fft2(pDst);
        break;
      case 3:
        fft3(pDst);
        break;
      case 4:
        fft_4(pDst);
        break;
      /* case 5: fft5(pDst); break; */
      /* case 8: fft_8(pDst); break; */
      case 12:
        fft12(pDst);
        break;
      /* case 15: fft15(pDst); break; */
      case 16:
        fft_16(pDst);
        break;
      case 32:
        fft_32(pDst);
        break;
        /*case 64: fft_64(pDst); break;*/
        /* case 128: fft_128(pDst); break; */
    }
#endif
    pSrc += 2;
    pDst = pDst + 2 * dim1;
  }

  /* Perform the modulation of the output of the fft of length dim1 */
  pSrc = aDst;
  fft_apply_rot_vector(pSrc, dim1, length, RotVectorReal, RotVectorImag);

  /* Perform dim1 times the fft of length dim2. The input samples are at the
  address of aDst and the output samples are at the address of pInput. The input
  vector for the fft of length dim2 is built of the interleaved samples in aDst,
  the output samples are stored consecutively at the address of pInput.
  */
  pSrc = aDst;
  pDst = aDst2;
  pDstOut = pInput;
  for (i = 0; i < dim1; i++) {
    /* Gather every dim1-th complex sample into a contiguous sub-FFT input. */
    for (int j = 0; j < dim2; j++) {
      pDst[2 * j] = pSrc[2 * j * dim1];
      pDst[2 * j + 1] = pSrc[2 * j * dim1 + 1];
    }

#ifndef FFT_TWO_STAGE_SWITCH_CASE
    fft2(pDst);
#else
    switch (dim2) {
      case 4:
        fft_4(pDst);
        break;
      case 9:
        fft9(pDst);
        break;
      case 12:
        fft12(pDst);
        break;
      case 15:
        fft15(pDst);
        break;
      case 16:
        fft_16(pDst);
        break;
      case 32:
        fft_32(pDst);
        break;
    }
#endif

    /* Scatter the sub-FFT result back with stride dim1 (transposed output). */
    for (int j = 0; j < dim2; j++) {
      pDstOut[2 * j * dim1] = pDst[2 * j];
      pDstOut[2 * j * dim1 + 1] = pDst[2 * j + 1];
    }
    pSrc += 2;
    pDstOut += 2;
  }
}
#endif /* FUNCTION_fftN2_function */
1691 #endif /* FUNCTION_fftN2_function */
1692 
/* Convenience wrapper for a two-stage FFT of size length = dim1 * dim2:
   allocates the two scratch buffers and forwards to fftN2_func(). Implemented
   as a macro, presumably so the C_AALLOC_SCRATCH_START/END allocation macros
   expand in the caller's scope — TODO(review): confirm against their
   definitions. */
#define fftN2(DATA_TYPE, pInput, length, dim1, dim2, fft_func1, fft_func2, \
              RotVectorReal, RotVectorImag)                                \
  {                                                                        \
    C_AALLOC_SCRATCH_START(aDst, DATA_TYPE, 2 * length)                    \
    C_AALLOC_SCRATCH_START(aDst2, DATA_TYPE, 2 * dim2)                     \
    fftN2_func(pInput, length, dim1, dim2, fft_func1, fft_func2,           \
               RotVectorReal, RotVectorImag, aDst, aDst2);                 \
    C_AALLOC_SCRATCH_END(aDst2, DATA_TYPE, 2 * dim2)                       \
    C_AALLOC_SCRATCH_END(aDst, DATA_TYPE, 2 * length)                      \
  }
1703 
1704   /*!
1705    *
1706    *  \brief  complex FFT of length 12,18,24,30,48,60,96, 192, 240, 384, 480
1707    *  \param  pInput contains the input signal prescaled right by 2
1708    *          pInput contains the output signal scaled by SCALEFACTOR<#length>
1709    *          The output signal does not have any fixed headroom
1710    *  \return void
1711    *
1712    */
1713 
#ifndef FUNCTION_fft6
/* Complex FFT of length 6 = 2 * 3 (two-stage, see fftN2). */
static inline void fft6(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 6, 2, 3, fft2, fft3, RotVectorReal6, RotVectorImag6);
}
#endif /* #ifndef FUNCTION_fft6 */
1719 
#ifndef FUNCTION_fft12
/* Complex FFT of length 12 = 3 * 4 (two-stage, see fftN2). */
static inline void fft12(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 12, 3, 4, fft3, fft_4, RotVectorReal12,
        RotVectorImag12); /* 16,58 */
}
#endif /* #ifndef FUNCTION_fft12 */
1726 
#ifndef FUNCTION_fft20
/* Complex FFT of length 20 = 4 * 5 (two-stage, see fftN2). */
static inline void fft20(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 20, 4, 5, fft_4, fft5, RotVectorReal20,
        RotVectorImag20);
}
#endif /* FUNCTION_fft20 */
1733 
/* Complex FFT of length 24 = 2 * 12 (two-stage, see fftN2). */
static inline void fft24(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 24, 2, 12, fft2, fft12, RotVectorReal24,
        RotVectorImag24); /* 16,73 */
}
1738 
/* Complex FFT of length 48 = 4 * 12 (two-stage, see fftN2). */
static inline void fft48(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 48, 4, 12, fft_4, fft12, RotVectorReal48,
        RotVectorImag48); /* 16,32 */
}
1743 
#ifndef FUNCTION_fft60
/* Complex FFT of length 60 = 4 * 15 (two-stage, see fftN2). */
static inline void fft60(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 60, 4, 15, fft_4, fft15, RotVectorReal60,
        RotVectorImag60); /* 15,51 */
}
#endif /* FUNCTION_fft60 */
1750 
#ifndef FUNCTION_fft80
/* Complex FFT of length 80 = 5 * 16 (two-stage, see fftN2). */
static inline void fft80(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 80, 5, 16, fft5, fft_16, RotVectorReal80,
        RotVectorImag80); /*  */
}
#endif
1757 
#ifndef FUNCTION_fft96
/* Complex FFT of length 96 = 3 * 32 (two-stage, see fftN2). */
static inline void fft96(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 96, 3, 32, fft3, fft_32, RotVectorReal96,
        RotVectorImag96); /* 15,47 */
}
#endif /* FUNCTION_fft96*/
1764 
#ifndef FUNCTION_fft120
/* Complex FFT of length 120 = 8 * 15 (two-stage, see fftN2). */
static inline void fft120(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 120, 8, 15, fft_8, fft15, RotVectorReal120,
        RotVectorImag120);
}
#endif /* FUNCTION_fft120 */
1771 
#ifndef FUNCTION_fft192
/* Complex FFT of length 192 = 16 * 12 (two-stage, see fftN2). */
static inline void fft192(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 192, 16, 12, fft_16, fft12, RotVectorReal192,
        RotVectorImag192); /* 15,50 */
}
#endif
1778 
#ifndef FUNCTION_fft240
/* Complex FFT of length 240 = 16 * 15 (two-stage, see fftN2). */
static inline void fft240(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 240, 16, 15, fft_16, fft15, RotVectorReal240,
        RotVectorImag240); /* 15.44 */
}
#endif
1785 
#ifndef FUNCTION_fft384
/* Complex FFT of length 384 = 12 * 32 (two-stage, see fftN2). */
static inline void fft384(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 384, 12, 32, fft12, fft_32, RotVectorReal384,
        RotVectorImag384); /* 16.02 */
}
#endif /* FUNCTION_fft384 */
1792 
#ifndef FUNCTION_fft480
/* Complex FFT of length 480 = 32 * 15 (two-stage, see fftN2). */
static inline void fft480(FIXP_DBL *pInput) {
  fftN2(FIXP_DBL, pInput, 480, 32, 15, fft_32, fft15, RotVectorReal480,
        RotVectorImag480); /* 15.84 */
}
#endif /* FUNCTION_fft480 */
1799 
fft(int length,FIXP_DBL * pInput,INT * pScalefactor)1800 void fft(int length, FIXP_DBL *pInput, INT *pScalefactor) {
1801   /* Ensure, that the io-ptr is always (at least 8-byte) aligned */
1802   C_ALLOC_ALIGNED_CHECK(pInput);
1803 
1804   if (length == 32) {
1805     fft_32(pInput);
1806     *pScalefactor += SCALEFACTOR32;
1807   } else {
1808     switch (length) {
1809       case 16:
1810         fft_16(pInput);
1811         *pScalefactor += SCALEFACTOR16;
1812         break;
1813       case 8:
1814         fft_8(pInput);
1815         *pScalefactor += SCALEFACTOR8;
1816         break;
1817       case 2:
1818         fft2(pInput);
1819         *pScalefactor += SCALEFACTOR2;
1820         break;
1821       case 3:
1822         fft3(pInput);
1823         *pScalefactor += SCALEFACTOR3;
1824         break;
1825       case 4:
1826         fft_4(pInput);
1827         *pScalefactor += SCALEFACTOR4;
1828         break;
1829       case 5:
1830         fft5(pInput);
1831         *pScalefactor += SCALEFACTOR5;
1832         break;
1833       case 6:
1834         fft6(pInput);
1835         *pScalefactor += SCALEFACTOR6;
1836         break;
1837       case 10:
1838         fft10(pInput);
1839         *pScalefactor += SCALEFACTOR10;
1840         break;
1841       case 12:
1842         fft12(pInput);
1843         *pScalefactor += SCALEFACTOR12;
1844         break;
1845       case 15:
1846         fft15(pInput);
1847         *pScalefactor += SCALEFACTOR15;
1848         break;
1849       case 20:
1850         fft20(pInput);
1851         *pScalefactor += SCALEFACTOR20;
1852         break;
1853       case 24:
1854         fft24(pInput);
1855         *pScalefactor += SCALEFACTOR24;
1856         break;
1857       case 48:
1858         fft48(pInput);
1859         *pScalefactor += SCALEFACTOR48;
1860         break;
1861       case 60:
1862         fft60(pInput);
1863         *pScalefactor += SCALEFACTOR60;
1864         break;
1865       case 64:
1866         dit_fft(pInput, 6, SineTable512, 512);
1867         *pScalefactor += SCALEFACTOR64;
1868         break;
1869       case 80:
1870         fft80(pInput);
1871         *pScalefactor += SCALEFACTOR80;
1872         break;
1873       case 96:
1874         fft96(pInput);
1875         *pScalefactor += SCALEFACTOR96;
1876         break;
1877       case 120:
1878         fft120(pInput);
1879         *pScalefactor += SCALEFACTOR120;
1880         break;
1881       case 128:
1882         dit_fft(pInput, 7, SineTable512, 512);
1883         *pScalefactor += SCALEFACTOR128;
1884         break;
1885       case 192:
1886         fft192(pInput);
1887         *pScalefactor += SCALEFACTOR192;
1888         break;
1889       case 240:
1890         fft240(pInput);
1891         *pScalefactor += SCALEFACTOR240;
1892         break;
1893       case 256:
1894         dit_fft(pInput, 8, SineTable512, 512);
1895         *pScalefactor += SCALEFACTOR256;
1896         break;
1897       case 384:
1898         fft384(pInput);
1899         *pScalefactor += SCALEFACTOR384;
1900         break;
1901       case 480:
1902         fft480(pInput);
1903         *pScalefactor += SCALEFACTOR480;
1904         break;
1905       case 512:
1906         dit_fft(pInput, 9, SineTable512, 512);
1907         *pScalefactor += SCALEFACTOR512;
1908         break;
1909       default:
1910         FDK_ASSERT(0); /* FFT length not supported! */
1911         break;
1912     }
1913   }
1914 }
1915 
ifft(int length,FIXP_DBL * pInput,INT * scalefactor)1916 void ifft(int length, FIXP_DBL *pInput, INT *scalefactor) {
1917   switch (length) {
1918     default:
1919       FDK_ASSERT(0); /* IFFT length not supported! */
1920       break;
1921   }
1922 }
1923