1 /*
2  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13 
14 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
15 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
16 #include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
17 #include "rtc_base/system/arch.h"
18 
19 namespace webrtc {
20 
21 #if defined(WEBRTC_ARCH_X86_FAMILY)
22 
23 namespace {
24 // These intrinsics were unavailable before VS 2008.
25 // TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
// Bit-preserving reinterpretation of an integer vector as four packed floats.
// No lane is converted: the same 128 bits are simply read back as __m128.
static __inline __m128 _mm_castsi128_ps(__m128i a) {
  const __m128* as_ps = (const __m128*)&a;
  return *as_ps;
}
// Bit-preserving reinterpretation in the other direction: the 128 bits of a
// packed-float vector are read back as __m128i.
static __inline __m128i _mm_castps_si128(__m128 a) {
  const __m128i* as_si128 = (const __m128i*)&a;
  return *as_si128;
}
#endif
34 
35 }  // namespace
36 
cft1st_128_SSE2(float * a)37 void cft1st_128_SSE2(float* a) {
38   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
39   int j, k2;
40 
41   for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
42     __m128 a00v = _mm_loadu_ps(&a[j + 0]);
43     __m128 a04v = _mm_loadu_ps(&a[j + 4]);
44     __m128 a08v = _mm_loadu_ps(&a[j + 8]);
45     __m128 a12v = _mm_loadu_ps(&a[j + 12]);
46     __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
47     __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
48     __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
49     __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
50 
51     const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
52     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
53     const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
54     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
55     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
56     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
57     __m128 x0v = _mm_add_ps(a01v, a23v);
58     const __m128 x1v = _mm_sub_ps(a01v, a23v);
59     const __m128 x2v = _mm_add_ps(a45v, a67v);
60     const __m128 x3v = _mm_sub_ps(a45v, a67v);
61     __m128 x0w;
62     a01v = _mm_add_ps(x0v, x2v);
63     x0v = _mm_sub_ps(x0v, x2v);
64     x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
65     {
66       const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
67       const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
68       a45v = _mm_add_ps(a45_0v, a45_1v);
69     }
70     {
71       __m128 a23_0v, a23_1v;
72       const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
73       const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
74       x0v = _mm_add_ps(x1v, x3s);
75       x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
76       a23_0v = _mm_mul_ps(wk1rv, x0v);
77       a23_1v = _mm_mul_ps(wk1iv, x0w);
78       a23v = _mm_add_ps(a23_0v, a23_1v);
79 
80       x0v = _mm_sub_ps(x1v, x3s);
81       x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
82     }
83     {
84       const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
85       const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
86       a67v = _mm_add_ps(a67_0v, a67_1v);
87     }
88 
89     a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
90     a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
91     a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
92     a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
93     _mm_storeu_ps(&a[j + 0], a00v);
94     _mm_storeu_ps(&a[j + 4], a04v);
95     _mm_storeu_ps(&a[j + 8], a08v);
96     _mm_storeu_ps(&a[j + 12], a12v);
97   }
98 }
99 
cftmdl_128_SSE2(float * a)100 void cftmdl_128_SSE2(float* a) {
101   const int l = 8;
102   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
103   int j0;
104 
105   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
106   for (j0 = 0; j0 < l; j0 += 2) {
107     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
108     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
109     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
110     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
111     const __m128 a_00_32 =
112         _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
113                        _MM_SHUFFLE(1, 0, 1, 0));
114     const __m128 a_08_40 =
115         _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
116                        _MM_SHUFFLE(1, 0, 1, 0));
117     __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
118     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
119 
120     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
121     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
122     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
123     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
124     const __m128 a_16_48 =
125         _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
126                        _MM_SHUFFLE(1, 0, 1, 0));
127     const __m128 a_24_56 =
128         _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
129                        _MM_SHUFFLE(1, 0, 1, 0));
130     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
131     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
132 
133     const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
134     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
135 
136     const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
137         _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
138     const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
139     const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
140     const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
141 
142     const __m128 yy0 =
143         _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
144     const __m128 yy1 =
145         _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
146     const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
147     const __m128 yy3 = _mm_add_ps(yy0, yy2);
148     const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
149 
150     _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
151     _mm_storel_epi64(
152         (__m128i*)&a[j0 + 32],
153         _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
154 
155     _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
156     _mm_storel_epi64(
157         (__m128i*)&a[j0 + 48],
158         _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
159     a[j0 + 48] = -a[j0 + 48];
160 
161     _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
162     _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
163 
164     _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
165     _mm_storel_epi64(
166         (__m128i*)&a[j0 + 56],
167         _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
168   }
169 
170   {
171     int k = 64;
172     int k1 = 2;
173     int k2 = 2 * k1;
174     const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
175     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
176     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
177     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
178     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
179     wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
180     for (j0 = k; j0 < l + k; j0 += 2) {
181       const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
182       const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
183       const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
184       const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
185       const __m128 a_00_32 =
186           _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
187                          _MM_SHUFFLE(1, 0, 1, 0));
188       const __m128 a_08_40 =
189           _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
190                          _MM_SHUFFLE(1, 0, 1, 0));
191       __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
192       const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
193 
194       const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
195       const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
196       const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
197       const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
198       const __m128 a_16_48 =
199           _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
200                          _MM_SHUFFLE(1, 0, 1, 0));
201       const __m128 a_24_56 =
202           _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
203                          _MM_SHUFFLE(1, 0, 1, 0));
204       const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
205       const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
206 
207       const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
208       const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
209       const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
210       const __m128 xx3 = _mm_mul_ps(
211           wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
212                                                     _MM_SHUFFLE(2, 3, 0, 1))));
213       const __m128 xx4 = _mm_add_ps(xx2, xx3);
214 
215       const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
216           _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
217       const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
218       const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
219       const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
220 
221       const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
222       const __m128 xx11 = _mm_mul_ps(
223           wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
224                                                     _MM_SHUFFLE(2, 3, 0, 1))));
225       const __m128 xx12 = _mm_add_ps(xx10, xx11);
226 
227       const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
228       const __m128 xx21 = _mm_mul_ps(
229           wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
230                                                     _MM_SHUFFLE(2, 3, 0, 1))));
231       const __m128 xx22 = _mm_add_ps(xx20, xx21);
232 
233       _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
234       _mm_storel_epi64(
235           (__m128i*)&a[j0 + 32],
236           _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
237 
238       _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
239       _mm_storel_epi64(
240           (__m128i*)&a[j0 + 48],
241           _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
242 
243       _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
244       _mm_storel_epi64(
245           (__m128i*)&a[j0 + 40],
246           _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
247 
248       _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
249       _mm_storel_epi64(
250           (__m128i*)&a[j0 + 56],
251           _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
252     }
253   }
254 }
255 
rftfsub_128_SSE2(float * a)256 void rftfsub_128_SSE2(float* a) {
257   const float* c = rdft_w + 32;
258   int j1, j2, k1, k2;
259   float wkr, wki, xr, xi, yr, yi;
260 
261   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
262                                                           0.5f};
263   const __m128 mm_half = _mm_load_ps(k_half);
264 
265   // Vectorized code (four at once).
266   //    Note: commented number are indexes for the first iteration of the loop.
267   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
268     // Load 'wk'.
269     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
270     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
271     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
272     const __m128 wkr_ =
273         _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
274     const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
275     // Load and shuffle 'a'.
276     const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
277     const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
278     const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
279     const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
280     const __m128 a_j2_p0 = _mm_shuffle_ps(
281         a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
282     const __m128 a_j2_p1 = _mm_shuffle_ps(
283         a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
284     const __m128 a_k2_p0 = _mm_shuffle_ps(
285         a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
286     const __m128 a_k2_p1 = _mm_shuffle_ps(
287         a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
288     // Calculate 'x'.
289     const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
290     // 2-126, 4-124, 6-122, 8-120,
291     const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
292     // 3-127, 5-125, 7-123, 9-121,
293     // Calculate product into 'y'.
294     //    yr = wkr * xr - wki * xi;
295     //    yi = wkr * xi + wki * xr;
296     const __m128 a_ = _mm_mul_ps(wkr_, xr_);
297     const __m128 b_ = _mm_mul_ps(wki_, xi_);
298     const __m128 c_ = _mm_mul_ps(wkr_, xi_);
299     const __m128 d_ = _mm_mul_ps(wki_, xr_);
300     const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
301     const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
302                                             // Update 'a'.
303                                             //    a[j2 + 0] -= yr;
304                                             //    a[j2 + 1] -= yi;
305                                             //    a[k2 + 0] += yr;
306     //    a[k2 + 1] -= yi;
307     const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
308     const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
309     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
310     const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
311     // Shuffle in right order and store.
312     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
313     //   2,   3,   4,   5,
314     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
315     //   6,   7,   8,   9,
316     const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
317     // 122, 123, 120, 121,
318     const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
319     // 126, 127, 124, 125,
320     const __m128 a_k2_0n = _mm_shuffle_ps(
321         a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
322     const __m128 a_k2_4n = _mm_shuffle_ps(
323         a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
324     _mm_storeu_ps(&a[0 + j2], a_j2_0n);
325     _mm_storeu_ps(&a[4 + j2], a_j2_4n);
326     _mm_storeu_ps(&a[122 - j2], a_k2_0n);
327     _mm_storeu_ps(&a[126 - j2], a_k2_4n);
328   }
329   // Scalar code for the remaining items.
330   for (; j2 < 64; j1 += 1, j2 += 2) {
331     k2 = 128 - j2;
332     k1 = 32 - j1;
333     wkr = 0.5f - c[k1];
334     wki = c[j1];
335     xr = a[j2 + 0] - a[k2 + 0];
336     xi = a[j2 + 1] + a[k2 + 1];
337     yr = wkr * xr - wki * xi;
338     yi = wkr * xi + wki * xr;
339     a[j2 + 0] -= yr;
340     a[j2 + 1] -= yi;
341     a[k2 + 0] += yr;
342     a[k2 + 1] -= yi;
343   }
344 }
345 
rftbsub_128_SSE2(float * a)346 void rftbsub_128_SSE2(float* a) {
347   const float* c = rdft_w + 32;
348   int j1, j2, k1, k2;
349   float wkr, wki, xr, xi, yr, yi;
350 
351   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
352                                                           0.5f};
353   const __m128 mm_half = _mm_load_ps(k_half);
354 
355   a[1] = -a[1];
356   // Vectorized code (four at once).
357   //    Note: commented number are indexes for the first iteration of the loop.
358   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
359     // Load 'wk'.
360     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
361     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
362     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
363     const __m128 wkr_ =
364         _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
365     const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
366     // Load and shuffle 'a'.
367     const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
368     const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
369     const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
370     const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
371     const __m128 a_j2_p0 = _mm_shuffle_ps(
372         a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
373     const __m128 a_j2_p1 = _mm_shuffle_ps(
374         a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
375     const __m128 a_k2_p0 = _mm_shuffle_ps(
376         a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
377     const __m128 a_k2_p1 = _mm_shuffle_ps(
378         a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
379     // Calculate 'x'.
380     const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
381     // 2-126, 4-124, 6-122, 8-120,
382     const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
383     // 3-127, 5-125, 7-123, 9-121,
384     // Calculate product into 'y'.
385     //    yr = wkr * xr + wki * xi;
386     //    yi = wkr * xi - wki * xr;
387     const __m128 a_ = _mm_mul_ps(wkr_, xr_);
388     const __m128 b_ = _mm_mul_ps(wki_, xi_);
389     const __m128 c_ = _mm_mul_ps(wkr_, xi_);
390     const __m128 d_ = _mm_mul_ps(wki_, xr_);
391     const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
392     const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
393                                             // Update 'a'.
394                                             //    a[j2 + 0] = a[j2 + 0] - yr;
395                                             //    a[j2 + 1] = yi - a[j2 + 1];
396                                             //    a[k2 + 0] = yr + a[k2 + 0];
397     //    a[k2 + 1] = yi - a[k2 + 1];
398     const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
399     const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
400     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
401     const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
402     // Shuffle in right order and store.
403     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
404     //   2,   3,   4,   5,
405     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
406     //   6,   7,   8,   9,
407     const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
408     // 122, 123, 120, 121,
409     const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
410     // 126, 127, 124, 125,
411     const __m128 a_k2_0n = _mm_shuffle_ps(
412         a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
413     const __m128 a_k2_4n = _mm_shuffle_ps(
414         a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
415     _mm_storeu_ps(&a[0 + j2], a_j2_0n);
416     _mm_storeu_ps(&a[4 + j2], a_j2_4n);
417     _mm_storeu_ps(&a[122 - j2], a_k2_0n);
418     _mm_storeu_ps(&a[126 - j2], a_k2_4n);
419   }
420   // Scalar code for the remaining items.
421   for (; j2 < 64; j1 += 1, j2 += 2) {
422     k2 = 128 - j2;
423     k1 = 32 - j1;
424     wkr = 0.5f - c[k1];
425     wki = c[j1];
426     xr = a[j2 + 0] - a[k2 + 0];
427     xi = a[j2 + 1] + a[k2 + 1];
428     yr = wkr * xr + wki * xi;
429     yi = wkr * xi - wki * xr;
430     a[j2 + 0] = a[j2 + 0] - yr;
431     a[j2 + 1] = yi - a[j2 + 1];
432     a[k2 + 0] = yr + a[k2 + 0];
433     a[k2 + 1] = yi - a[k2 + 1];
434   }
435   a[65] = -a[65];
436 }
437 #endif
438 
439 }  // namespace webrtc
440