1 /*
2  * http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
3  * Copyright Takuya OOURA, 1996-2001
4  *
5  * You may use, copy, modify and distribute this code for any purpose (include
6  * commercial use) and without fee. Please refer to this package when you modify
7  * this code.
8  *
9  * Changes by the WebRTC authors:
10  *    - Trivial type modifications.
11  *    - Minimal code subset to do rdft of length 128.
12  *    - Optimizations because of known length.
13  *    - Removed the global variables by moving the code in to a class in order
14  *      to make it thread safe.
15  *
16  *  All changes are covered by the WebRTC license and IP grant:
17  *  Use of this source code is governed by a BSD-style license
18  *  that can be found in the LICENSE file in the root of the source
19  *  tree. An additional intellectual property rights grant can be found
20  *  in the file PATENTS.  All contributing project authors may
21  *  be found in the AUTHORS file in the root of the source tree.
22  */
23 
24 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
25 
26 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
27 #include "rtc_base/system/arch.h"
28 #include "system_wrappers/include/cpu_features_wrapper.h"
29 
30 namespace webrtc {
31 
32 namespace {
33 
34 #if !(defined(MIPS_FPU_LE) || defined(WEBRTC_HAS_NEON))
cft1st_128_C(float * a)35 static void cft1st_128_C(float* a) {
36   const int n = 128;
37   int j, k1, k2;
38   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
39   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
40 
41   // The processing of the first set of elements was simplified in C to avoid
42   // some operations (multiplication by zero or one, addition of two elements
43   // multiplied by the same weight, ...).
44   x0r = a[0] + a[2];
45   x0i = a[1] + a[3];
46   x1r = a[0] - a[2];
47   x1i = a[1] - a[3];
48   x2r = a[4] + a[6];
49   x2i = a[5] + a[7];
50   x3r = a[4] - a[6];
51   x3i = a[5] - a[7];
52   a[0] = x0r + x2r;
53   a[1] = x0i + x2i;
54   a[4] = x0r - x2r;
55   a[5] = x0i - x2i;
56   a[2] = x1r - x3i;
57   a[3] = x1i + x3r;
58   a[6] = x1r + x3i;
59   a[7] = x1i - x3r;
60   wk1r = rdft_w[2];
61   x0r = a[8] + a[10];
62   x0i = a[9] + a[11];
63   x1r = a[8] - a[10];
64   x1i = a[9] - a[11];
65   x2r = a[12] + a[14];
66   x2i = a[13] + a[15];
67   x3r = a[12] - a[14];
68   x3i = a[13] - a[15];
69   a[8] = x0r + x2r;
70   a[9] = x0i + x2i;
71   a[12] = x2i - x0i;
72   a[13] = x0r - x2r;
73   x0r = x1r - x3i;
74   x0i = x1i + x3r;
75   a[10] = wk1r * (x0r - x0i);
76   a[11] = wk1r * (x0r + x0i);
77   x0r = x3i + x1r;
78   x0i = x3r - x1i;
79   a[14] = wk1r * (x0i - x0r);
80   a[15] = wk1r * (x0i + x0r);
81   k1 = 0;
82   for (j = 16; j < n; j += 16) {
83     k1 += 2;
84     k2 = 2 * k1;
85     wk2r = rdft_w[k1 + 0];
86     wk2i = rdft_w[k1 + 1];
87     wk1r = rdft_w[k2 + 0];
88     wk1i = rdft_w[k2 + 1];
89     wk3r = rdft_wk3ri_first[k1 + 0];
90     wk3i = rdft_wk3ri_first[k1 + 1];
91     x0r = a[j + 0] + a[j + 2];
92     x0i = a[j + 1] + a[j + 3];
93     x1r = a[j + 0] - a[j + 2];
94     x1i = a[j + 1] - a[j + 3];
95     x2r = a[j + 4] + a[j + 6];
96     x2i = a[j + 5] + a[j + 7];
97     x3r = a[j + 4] - a[j + 6];
98     x3i = a[j + 5] - a[j + 7];
99     a[j + 0] = x0r + x2r;
100     a[j + 1] = x0i + x2i;
101     x0r -= x2r;
102     x0i -= x2i;
103     a[j + 4] = wk2r * x0r - wk2i * x0i;
104     a[j + 5] = wk2r * x0i + wk2i * x0r;
105     x0r = x1r - x3i;
106     x0i = x1i + x3r;
107     a[j + 2] = wk1r * x0r - wk1i * x0i;
108     a[j + 3] = wk1r * x0i + wk1i * x0r;
109     x0r = x1r + x3i;
110     x0i = x1i - x3r;
111     a[j + 6] = wk3r * x0r - wk3i * x0i;
112     a[j + 7] = wk3r * x0i + wk3i * x0r;
113     wk1r = rdft_w[k2 + 2];
114     wk1i = rdft_w[k2 + 3];
115     wk3r = rdft_wk3ri_second[k1 + 0];
116     wk3i = rdft_wk3ri_second[k1 + 1];
117     x0r = a[j + 8] + a[j + 10];
118     x0i = a[j + 9] + a[j + 11];
119     x1r = a[j + 8] - a[j + 10];
120     x1i = a[j + 9] - a[j + 11];
121     x2r = a[j + 12] + a[j + 14];
122     x2i = a[j + 13] + a[j + 15];
123     x3r = a[j + 12] - a[j + 14];
124     x3i = a[j + 13] - a[j + 15];
125     a[j + 8] = x0r + x2r;
126     a[j + 9] = x0i + x2i;
127     x0r -= x2r;
128     x0i -= x2i;
129     a[j + 12] = -wk2i * x0r - wk2r * x0i;
130     a[j + 13] = -wk2i * x0i + wk2r * x0r;
131     x0r = x1r - x3i;
132     x0i = x1i + x3r;
133     a[j + 10] = wk1r * x0r - wk1i * x0i;
134     a[j + 11] = wk1r * x0i + wk1i * x0r;
135     x0r = x1r + x3i;
136     x0i = x1i - x3r;
137     a[j + 14] = wk3r * x0r - wk3i * x0i;
138     a[j + 15] = wk3r * x0i + wk3i * x0r;
139   }
140 }
141 
cftmdl_128_C(float * a)142 static void cftmdl_128_C(float* a) {
143   const int l = 8;
144   const int n = 128;
145   const int m = 32;
146   int j0, j1, j2, j3, k, k1, k2, m2;
147   float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
148   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
149 
150   for (j0 = 0; j0 < l; j0 += 2) {
151     j1 = j0 + 8;
152     j2 = j0 + 16;
153     j3 = j0 + 24;
154     x0r = a[j0 + 0] + a[j1 + 0];
155     x0i = a[j0 + 1] + a[j1 + 1];
156     x1r = a[j0 + 0] - a[j1 + 0];
157     x1i = a[j0 + 1] - a[j1 + 1];
158     x2r = a[j2 + 0] + a[j3 + 0];
159     x2i = a[j2 + 1] + a[j3 + 1];
160     x3r = a[j2 + 0] - a[j3 + 0];
161     x3i = a[j2 + 1] - a[j3 + 1];
162     a[j0 + 0] = x0r + x2r;
163     a[j0 + 1] = x0i + x2i;
164     a[j2 + 0] = x0r - x2r;
165     a[j2 + 1] = x0i - x2i;
166     a[j1 + 0] = x1r - x3i;
167     a[j1 + 1] = x1i + x3r;
168     a[j3 + 0] = x1r + x3i;
169     a[j3 + 1] = x1i - x3r;
170   }
171   wk1r = rdft_w[2];
172   for (j0 = m; j0 < l + m; j0 += 2) {
173     j1 = j0 + 8;
174     j2 = j0 + 16;
175     j3 = j0 + 24;
176     x0r = a[j0 + 0] + a[j1 + 0];
177     x0i = a[j0 + 1] + a[j1 + 1];
178     x1r = a[j0 + 0] - a[j1 + 0];
179     x1i = a[j0 + 1] - a[j1 + 1];
180     x2r = a[j2 + 0] + a[j3 + 0];
181     x2i = a[j2 + 1] + a[j3 + 1];
182     x3r = a[j2 + 0] - a[j3 + 0];
183     x3i = a[j2 + 1] - a[j3 + 1];
184     a[j0 + 0] = x0r + x2r;
185     a[j0 + 1] = x0i + x2i;
186     a[j2 + 0] = x2i - x0i;
187     a[j2 + 1] = x0r - x2r;
188     x0r = x1r - x3i;
189     x0i = x1i + x3r;
190     a[j1 + 0] = wk1r * (x0r - x0i);
191     a[j1 + 1] = wk1r * (x0r + x0i);
192     x0r = x3i + x1r;
193     x0i = x3r - x1i;
194     a[j3 + 0] = wk1r * (x0i - x0r);
195     a[j3 + 1] = wk1r * (x0i + x0r);
196   }
197   k1 = 0;
198   m2 = 2 * m;
199   for (k = m2; k < n; k += m2) {
200     k1 += 2;
201     k2 = 2 * k1;
202     wk2r = rdft_w[k1 + 0];
203     wk2i = rdft_w[k1 + 1];
204     wk1r = rdft_w[k2 + 0];
205     wk1i = rdft_w[k2 + 1];
206     wk3r = rdft_wk3ri_first[k1 + 0];
207     wk3i = rdft_wk3ri_first[k1 + 1];
208     for (j0 = k; j0 < l + k; j0 += 2) {
209       j1 = j0 + 8;
210       j2 = j0 + 16;
211       j3 = j0 + 24;
212       x0r = a[j0 + 0] + a[j1 + 0];
213       x0i = a[j0 + 1] + a[j1 + 1];
214       x1r = a[j0 + 0] - a[j1 + 0];
215       x1i = a[j0 + 1] - a[j1 + 1];
216       x2r = a[j2 + 0] + a[j3 + 0];
217       x2i = a[j2 + 1] + a[j3 + 1];
218       x3r = a[j2 + 0] - a[j3 + 0];
219       x3i = a[j2 + 1] - a[j3 + 1];
220       a[j0 + 0] = x0r + x2r;
221       a[j0 + 1] = x0i + x2i;
222       x0r -= x2r;
223       x0i -= x2i;
224       a[j2 + 0] = wk2r * x0r - wk2i * x0i;
225       a[j2 + 1] = wk2r * x0i + wk2i * x0r;
226       x0r = x1r - x3i;
227       x0i = x1i + x3r;
228       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
229       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
230       x0r = x1r + x3i;
231       x0i = x1i - x3r;
232       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
233       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
234     }
235     wk1r = rdft_w[k2 + 2];
236     wk1i = rdft_w[k2 + 3];
237     wk3r = rdft_wk3ri_second[k1 + 0];
238     wk3i = rdft_wk3ri_second[k1 + 1];
239     for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
240       j1 = j0 + 8;
241       j2 = j0 + 16;
242       j3 = j0 + 24;
243       x0r = a[j0 + 0] + a[j1 + 0];
244       x0i = a[j0 + 1] + a[j1 + 1];
245       x1r = a[j0 + 0] - a[j1 + 0];
246       x1i = a[j0 + 1] - a[j1 + 1];
247       x2r = a[j2 + 0] + a[j3 + 0];
248       x2i = a[j2 + 1] + a[j3 + 1];
249       x3r = a[j2 + 0] - a[j3 + 0];
250       x3i = a[j2 + 1] - a[j3 + 1];
251       a[j0 + 0] = x0r + x2r;
252       a[j0 + 1] = x0i + x2i;
253       x0r -= x2r;
254       x0i -= x2i;
255       a[j2 + 0] = -wk2i * x0r - wk2r * x0i;
256       a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
257       x0r = x1r - x3i;
258       x0i = x1i + x3r;
259       a[j1 + 0] = wk1r * x0r - wk1i * x0i;
260       a[j1 + 1] = wk1r * x0i + wk1i * x0r;
261       x0r = x1r + x3i;
262       x0i = x1i - x3r;
263       a[j3 + 0] = wk3r * x0r - wk3i * x0i;
264       a[j3 + 1] = wk3r * x0i + wk3i * x0r;
265     }
266   }
267 }
268 
rftfsub_128_C(float * a)269 static void rftfsub_128_C(float* a) {
270   const float* c = rdft_w + 32;
271   int j1, j2, k1, k2;
272   float wkr, wki, xr, xi, yr, yi;
273 
274   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
275     k2 = 128 - j2;
276     k1 = 32 - j1;
277     wkr = 0.5f - c[k1];
278     wki = c[j1];
279     xr = a[j2 + 0] - a[k2 + 0];
280     xi = a[j2 + 1] + a[k2 + 1];
281     yr = wkr * xr - wki * xi;
282     yi = wkr * xi + wki * xr;
283     a[j2 + 0] -= yr;
284     a[j2 + 1] -= yi;
285     a[k2 + 0] += yr;
286     a[k2 + 1] -= yi;
287   }
288 }
289 
rftbsub_128_C(float * a)290 static void rftbsub_128_C(float* a) {
291   const float* c = rdft_w + 32;
292   int j1, j2, k1, k2;
293   float wkr, wki, xr, xi, yr, yi;
294 
295   a[1] = -a[1];
296   for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
297     k2 = 128 - j2;
298     k1 = 32 - j1;
299     wkr = 0.5f - c[k1];
300     wki = c[j1];
301     xr = a[j2 + 0] - a[k2 + 0];
302     xi = a[j2 + 1] + a[k2 + 1];
303     yr = wkr * xr + wki * xi;
304     yi = wkr * xi - wki * xr;
305     a[j2 + 0] = a[j2 + 0] - yr;
306     a[j2 + 1] = yi - a[j2 + 1];
307     a[k2 + 0] = yr + a[k2 + 0];
308     a[k2 + 1] = yi - a[k2 + 1];
309   }
310   a[65] = -a[65];
311 }
312 #endif
313 
314 }  // namespace
315 
OouraFft(bool sse2_available)316 OouraFft::OouraFft(bool sse2_available) {
317 #if defined(WEBRTC_ARCH_X86_FAMILY)
318   use_sse2_ = sse2_available;
319 #else
320   use_sse2_ = false;
321 #endif
322 }
323 
OouraFft()324 OouraFft::OouraFft() {
325 #if defined(WEBRTC_ARCH_X86_FAMILY)
326   use_sse2_ = (GetCPUInfo(kSSE2) != 0);
327 #else
328   use_sse2_ = false;
329 #endif
330 }
331 
332 OouraFft::~OouraFft() = default;
333 
Fft(float * a) const334 void OouraFft::Fft(float* a) const {
335   float xi;
336   bitrv2_128(a);
337   cftfsub_128(a);
338   rftfsub_128(a);
339   xi = a[0] - a[1];
340   a[0] += a[1];
341   a[1] = xi;
342 }
InverseFft(float * a) const343 void OouraFft::InverseFft(float* a) const {
344   a[1] = 0.5f * (a[0] - a[1]);
345   a[0] -= a[1];
346   rftbsub_128(a);
347   bitrv2_128(a);
348   cftbsub_128(a);
349 }
350 
cft1st_128(float * a) const351 void OouraFft::cft1st_128(float* a) const {
352 #if defined(MIPS_FPU_LE)
353   cft1st_128_mips(a);
354 #elif defined(WEBRTC_HAS_NEON)
355   cft1st_128_neon(a);
356 #elif defined(WEBRTC_ARCH_X86_FAMILY)
357   if (use_sse2_) {
358     cft1st_128_SSE2(a);
359   } else {
360     cft1st_128_C(a);
361   }
362 #else
363   cft1st_128_C(a);
364 #endif
365 }
cftmdl_128(float * a) const366 void OouraFft::cftmdl_128(float* a) const {
367 #if defined(MIPS_FPU_LE)
368   cftmdl_128_mips(a);
369 #elif defined(WEBRTC_HAS_NEON)
370   cftmdl_128_neon(a);
371 #elif defined(WEBRTC_ARCH_X86_FAMILY)
372   if (use_sse2_) {
373     cftmdl_128_SSE2(a);
374   } else {
375     cftmdl_128_C(a);
376   }
377 #else
378   cftmdl_128_C(a);
379 #endif
380 }
rftfsub_128(float * a) const381 void OouraFft::rftfsub_128(float* a) const {
382 #if defined(MIPS_FPU_LE)
383   rftfsub_128_mips(a);
384 #elif defined(WEBRTC_HAS_NEON)
385   rftfsub_128_neon(a);
386 #elif defined(WEBRTC_ARCH_X86_FAMILY)
387   if (use_sse2_) {
388     rftfsub_128_SSE2(a);
389   } else {
390     rftfsub_128_C(a);
391   }
392 #else
393   rftfsub_128_C(a);
394 #endif
395 }
396 
rftbsub_128(float * a) const397 void OouraFft::rftbsub_128(float* a) const {
398 #if defined(MIPS_FPU_LE)
399   rftbsub_128_mips(a);
400 #elif defined(WEBRTC_HAS_NEON)
401   rftbsub_128_neon(a);
402 #elif defined(WEBRTC_ARCH_X86_FAMILY)
403   if (use_sse2_) {
404     rftbsub_128_SSE2(a);
405   } else {
406     rftbsub_128_C(a);
407   }
408 #else
409   rftbsub_128_C(a);
410 #endif
411 }
412 
cftbsub_128(float * a) const413 void OouraFft::cftbsub_128(float* a) const {
414   int j, j1, j2, j3, l;
415   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
416 
417   cft1st_128(a);
418   cftmdl_128(a);
419   l = 32;
420 
421   for (j = 0; j < l; j += 2) {
422     j1 = j + l;
423     j2 = j1 + l;
424     j3 = j2 + l;
425     x0r = a[j] + a[j1];
426     x0i = -a[j + 1] - a[j1 + 1];
427     x1r = a[j] - a[j1];
428     x1i = -a[j + 1] + a[j1 + 1];
429     x2r = a[j2] + a[j3];
430     x2i = a[j2 + 1] + a[j3 + 1];
431     x3r = a[j2] - a[j3];
432     x3i = a[j2 + 1] - a[j3 + 1];
433     a[j] = x0r + x2r;
434     a[j + 1] = x0i - x2i;
435     a[j2] = x0r - x2r;
436     a[j2 + 1] = x0i + x2i;
437     a[j1] = x1r - x3i;
438     a[j1 + 1] = x1i - x3r;
439     a[j3] = x1r + x3i;
440     a[j3 + 1] = x1i + x3r;
441   }
442 }
443 
cftfsub_128(float * a) const444 void OouraFft::cftfsub_128(float* a) const {
445   int j, j1, j2, j3, l;
446   float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
447 
448   cft1st_128(a);
449   cftmdl_128(a);
450   l = 32;
451   for (j = 0; j < l; j += 2) {
452     j1 = j + l;
453     j2 = j1 + l;
454     j3 = j2 + l;
455     x0r = a[j] + a[j1];
456     x0i = a[j + 1] + a[j1 + 1];
457     x1r = a[j] - a[j1];
458     x1i = a[j + 1] - a[j1 + 1];
459     x2r = a[j2] + a[j3];
460     x2i = a[j2 + 1] + a[j3 + 1];
461     x3r = a[j2] - a[j3];
462     x3i = a[j2 + 1] - a[j3 + 1];
463     a[j] = x0r + x2r;
464     a[j + 1] = x0i + x2i;
465     a[j2] = x0r - x2r;
466     a[j2 + 1] = x0i - x2i;
467     a[j1] = x1r - x3i;
468     a[j1 + 1] = x1i + x3r;
469     a[j3] = x1r + x3i;
470     a[j3 + 1] = x1i - x3r;
471   }
472 }
473 
bitrv2_128(float * a) const474 void OouraFft::bitrv2_128(float* a) const {
475   /*
476       Following things have been attempted but are no faster:
477       (a) Storing the swap indexes in a LUT (index calculations are done
478           for 'free' while waiting on memory/L1).
479       (b) Consolidate the load/store of two consecutive floats by a 64 bit
480           integer (execution is memory/L1 bound).
481       (c) Do a mix of floats and 64 bit integer to maximize register
482           utilization (execution is memory/L1 bound).
483       (d) Replacing ip[i] by ((k<<31)>>25) + ((k >> 1)<<5).
484       (e) Hard-coding of the offsets to completely eliminates index
485           calculations.
486   */
487 
488   unsigned int j, j1, k, k1;
489   float xr, xi, yr, yi;
490 
491   const int ip[4] = {0, 64, 32, 96};
492   for (k = 0; k < 4; k++) {
493     for (j = 0; j < k; j++) {
494       j1 = 2 * j + ip[k];
495       k1 = 2 * k + ip[j];
496       xr = a[j1 + 0];
497       xi = a[j1 + 1];
498       yr = a[k1 + 0];
499       yi = a[k1 + 1];
500       a[j1 + 0] = yr;
501       a[j1 + 1] = yi;
502       a[k1 + 0] = xr;
503       a[k1 + 1] = xi;
504       j1 += 8;
505       k1 += 16;
506       xr = a[j1 + 0];
507       xi = a[j1 + 1];
508       yr = a[k1 + 0];
509       yi = a[k1 + 1];
510       a[j1 + 0] = yr;
511       a[j1 + 1] = yi;
512       a[k1 + 0] = xr;
513       a[k1 + 1] = xi;
514       j1 += 8;
515       k1 -= 8;
516       xr = a[j1 + 0];
517       xi = a[j1 + 1];
518       yr = a[k1 + 0];
519       yi = a[k1 + 1];
520       a[j1 + 0] = yr;
521       a[j1 + 1] = yi;
522       a[k1 + 0] = xr;
523       a[k1 + 1] = xi;
524       j1 += 8;
525       k1 += 16;
526       xr = a[j1 + 0];
527       xi = a[j1 + 1];
528       yr = a[k1 + 0];
529       yi = a[k1 + 1];
530       a[j1 + 0] = yr;
531       a[j1 + 1] = yi;
532       a[k1 + 0] = xr;
533       a[k1 + 1] = xi;
534     }
535     j1 = 2 * k + 8 + ip[k];
536     k1 = j1 + 8;
537     xr = a[j1 + 0];
538     xi = a[j1 + 1];
539     yr = a[k1 + 0];
540     yi = a[k1 + 1];
541     a[j1 + 0] = yr;
542     a[j1 + 1] = yi;
543     a[k1 + 0] = xr;
544     a[k1 + 1] = xi;
545   }
546 }
547 
548 }  // namespace webrtc
549