1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "modules/audio_coding/codecs/isac/fix/source/codec.h"
14 #include "modules/audio_coding/codecs/isac/fix/source/fft.h"
15 #include "modules/audio_coding/codecs/isac/fix/source/settings.h"
16 
17 // Tables are defined in transform_tables.c file.
18 // Cosine table 1 in Q14.
19 extern const int16_t WebRtcIsacfix_kCosTab1[FRAMESAMPLES/2];
20 // Sine table 1 in Q14.
21 extern const int16_t WebRtcIsacfix_kSinTab1[FRAMESAMPLES/2];
22 // Sine table 2 in Q14.
23 extern const int16_t WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4];
24 
ComplexMulAndFindMaxNeon(int16_t * inre1Q9,int16_t * inre2Q9,int32_t * outreQ16,int32_t * outimQ16)25 static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
26                                                int16_t* inre2Q9,
27                                                int32_t* outreQ16,
28                                                int32_t* outimQ16) {
29   int k;
30   const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
31   const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
32   // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
33   // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
34   int32_t fact  = 16921 << 5;
35   int32x4_t factq = vdupq_n_s32(fact);
36   uint32x4_t max_r = vdupq_n_u32(0);
37   uint32x4_t max_i = vdupq_n_u32(0);
38 
39   for (k = 0; k < FRAMESAMPLES/2; k += 8) {
40     int16x8_t tmpr = vld1q_s16(kCosTab);
41     int16x8_t tmpi = vld1q_s16(kSinTab);
42     int16x8_t inre1 = vld1q_s16(inre1Q9);
43     int16x8_t inre2 = vld1q_s16(inre2Q9);
44     kCosTab += 8;
45     kSinTab += 8;
46     inre1Q9 += 8;
47     inre2Q9 += 8;
48 
49     // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
50     int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
51     int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
52     tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
53     tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
54 #if defined(WEBRTC_ARCH_ARM64)
55     int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
56     int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
57     tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
58     tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
59 #else
60     int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
61     int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
62     tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
63     tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
64 #endif
65 
66     int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
67     int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
68     int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
69     int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
70     vst1q_s32(outreQ16, outr_0);
71     outreQ16 += 4;
72     vst1q_s32(outreQ16, outr_1);
73     outreQ16 += 4;
74     vst1q_s32(outimQ16, outi_0);
75     outimQ16 += 4;
76     vst1q_s32(outimQ16, outi_1);
77     outimQ16 += 4;
78 
79     // Find the absolute maximum in the vectors.
80     tmp0 = vabsq_s32(outr_0);
81     tmp1 = vabsq_s32(outr_1);
82     tmp2 = vabsq_s32(outi_0);
83     tmp3 = vabsq_s32(outi_1);
84     // vabs doesn't change the value of 0x80000000.
85     // Use u32 so we don't lose the value 0x80000000.
86     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
87     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
88     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
89     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
90   }
91 
92   max_r = vmaxq_u32(max_r, max_i);
93 #if defined(WEBRTC_ARCH_ARM64)
94   uint32_t maximum = vmaxvq_u32(max_r);
95 #else
96   uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
97   max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
98   uint32_t maximum = vget_lane_u32(max32x2_r, 0);
99 #endif
100 
101   return (int32_t)maximum;
102 }
103 
PreShiftW32toW16Neon(int32_t * inre,int32_t * inim,int16_t * outre,int16_t * outim,int32_t sh)104 static inline void PreShiftW32toW16Neon(int32_t* inre,
105                                         int32_t* inim,
106                                         int16_t* outre,
107                                         int16_t* outim,
108                                         int32_t sh) {
109   int k;
110   int32x4_t sh32x4 = vdupq_n_s32(sh);
111   for (k = 0; k < FRAMESAMPLES/2; k += 16) {
112     int32x4x4_t inre32x4x4 = vld4q_s32(inre);
113     int32x4x4_t inim32x4x4 = vld4q_s32(inim);
114     inre += 16;
115     inim += 16;
116     inre32x4x4.val[0] = vrshlq_s32(inre32x4x4.val[0], sh32x4);
117     inre32x4x4.val[1] = vrshlq_s32(inre32x4x4.val[1], sh32x4);
118     inre32x4x4.val[2] = vrshlq_s32(inre32x4x4.val[2], sh32x4);
119     inre32x4x4.val[3] = vrshlq_s32(inre32x4x4.val[3], sh32x4);
120     inim32x4x4.val[0] = vrshlq_s32(inim32x4x4.val[0], sh32x4);
121     inim32x4x4.val[1] = vrshlq_s32(inim32x4x4.val[1], sh32x4);
122     inim32x4x4.val[2] = vrshlq_s32(inim32x4x4.val[2], sh32x4);
123     inim32x4x4.val[3] = vrshlq_s32(inim32x4x4.val[3], sh32x4);
124     int16x4x4_t outre16x4x4;
125     int16x4x4_t outim16x4x4;
126     outre16x4x4.val[0]  = vmovn_s32(inre32x4x4.val[0]);
127     outre16x4x4.val[1]  = vmovn_s32(inre32x4x4.val[1]);
128     outre16x4x4.val[2]  = vmovn_s32(inre32x4x4.val[2]);
129     outre16x4x4.val[3]  = vmovn_s32(inre32x4x4.val[3]);
130     outim16x4x4.val[0]  = vmovn_s32(inim32x4x4.val[0]);
131     outim16x4x4.val[1]  = vmovn_s32(inim32x4x4.val[1]);
132     outim16x4x4.val[2]  = vmovn_s32(inim32x4x4.val[2]);
133     outim16x4x4.val[3]  = vmovn_s32(inim32x4x4.val[3]);
134     vst4_s16(outre, outre16x4x4);
135     vst4_s16(outim, outim16x4x4);
136     outre += 16;
137     outim += 16;
138   }
139 }
140 
PostShiftAndSeparateNeon(int16_t * inre,int16_t * inim,int16_t * outre,int16_t * outim,int32_t sh)141 static inline void PostShiftAndSeparateNeon(int16_t* inre,
142                                             int16_t* inim,
143                                             int16_t* outre,
144                                             int16_t* outim,
145                                             int32_t sh) {
146   int k;
147   int16_t* inre1 = inre;
148   int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
149   int16_t* inim1 = inim;
150   int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
151   int16_t* outre1 = outre;
152   int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
153   int16_t* outim1 = outim;
154   int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
155   const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
156   const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4];
157   // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)",
158   // ">> 14" and then ">> 9" as in the C code.
159   int32x4_t shift = vdupq_n_s32(-sh - 23);
160 
161   for (k = 0; k < FRAMESAMPLES/4; k += 4) {
162     int16x4_t tmpi = vld1_s16(kSinTab1);
163     kSinTab1 += 4;
164     int16x4_t tmpr = vld1_s16(kSinTab2);
165     kSinTab2 -= 4;
166     int16x4_t inre_0 = vld1_s16(inre1);
167     inre1 += 4;
168     int16x4_t inre_1 = vld1_s16(inre2);
169     inre2 -= 4;
170     int16x4_t inim_0 = vld1_s16(inim1);
171     inim1 += 4;
172     int16x4_t inim_1 = vld1_s16(inim2);
173     inim2 -= 4;
174     tmpr = vneg_s16(tmpr);
175     inre_1 = vrev64_s16(inre_1);
176     inim_1 = vrev64_s16(inim_1);
177     tmpr = vrev64_s16(tmpr);
178 
179     int16x4_t xr = vqadd_s16(inre_0, inre_1);
180     int16x4_t xi = vqsub_s16(inim_0, inim_1);
181     int16x4_t yr = vqadd_s16(inim_0, inim_1);
182     int16x4_t yi = vqsub_s16(inre_1, inre_0);
183 
184     int32x4_t outr0 = vmull_s16(tmpr, xr);
185     int32x4_t outi0 = vmull_s16(tmpi, xr);
186     int32x4_t outr1 = vmull_s16(tmpi, yr);
187     int32x4_t outi1 = vmull_s16(tmpi, yi);
188     outr0 = vmlsl_s16(outr0, tmpi, xi);
189     outi0 = vmlal_s16(outi0, tmpr, xi);
190     outr1 = vmlal_s16(outr1, tmpr, yi);
191     outi1 = vmlsl_s16(outi1, tmpr, yr);
192 
193     outr0 = vshlq_s32(outr0, shift);
194     outi0 = vshlq_s32(outi0, shift);
195     outr1 = vshlq_s32(outr1, shift);
196     outi1 = vshlq_s32(outi1, shift);
197     outr1 = vnegq_s32(outr1);
198 
199     int16x4_t outre_0  = vmovn_s32(outr0);
200     int16x4_t outim_0  = vmovn_s32(outi0);
201     int16x4_t outre_1  = vmovn_s32(outr1);
202     int16x4_t outim_1  = vmovn_s32(outi1);
203     outre_1 = vrev64_s16(outre_1);
204     outim_1 = vrev64_s16(outim_1);
205 
206     vst1_s16(outre1, outre_0);
207     outre1 += 4;
208     vst1_s16(outim1, outim_0);
209     outim1 += 4;
210     vst1_s16(outre2, outre_1);
211     outre2 -= 4;
212     vst1_s16(outim2, outim_1);
213     outim2 -= 4;
214   }
215 }
216 
WebRtcIsacfix_Time2SpecNeon(int16_t * inre1Q9,int16_t * inre2Q9,int16_t * outreQ7,int16_t * outimQ7)217 void WebRtcIsacfix_Time2SpecNeon(int16_t* inre1Q9,
218                                  int16_t* inre2Q9,
219                                  int16_t* outreQ7,
220                                  int16_t* outimQ7) {
221   int32_t tmpreQ16[FRAMESAMPLES/2], tmpimQ16[FRAMESAMPLES/2];
222   int32_t max;
223   int32_t sh;
224 
225   // Multiply with complex exponentials and combine into one complex vector.
226   // And find the maximum.
227   max = ComplexMulAndFindMaxNeon(inre1Q9, inre2Q9, tmpreQ16, tmpimQ16);
228 
229   sh = (int32_t)WebRtcSpl_NormW32(max);
230   sh = sh - 24;
231 
232   // If sh becomes >= 0, then we should shift sh steps to the left,
233   // and the domain will become Q(16 + sh).
234   // If sh becomes < 0, then we should shift -sh steps to the right,
235   // and the domain will become Q(16 + sh).
236   PreShiftW32toW16Neon(tmpreQ16, tmpimQ16, inre1Q9, inre2Q9, sh);
237 
238   // Get DFT.
239   WebRtcIsacfix_FftRadix16Fastest(inre1Q9, inre2Q9, -1);
240 
241   // If sh >= 0, shift sh steps to the right,
242   // If sh < 0, shift -sh steps to the left.
243   // Use symmetry to separate into two complex vectors
244   // and center frames in time around zero.
245   PostShiftAndSeparateNeon(inre1Q9, inre2Q9, outreQ7, outimQ7, sh);
246 }
247 
TransformAndFindMaxNeon(int16_t * inre,int16_t * inim,int32_t * outre,int32_t * outim)248 static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
249                                               int16_t* inim,
250                                               int32_t* outre,
251                                               int32_t* outim) {
252   int k;
253   int16_t* inre1 = inre;
254   int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
255   int16_t* inim1 = inim;
256   int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
257   int32_t* outre1 = outre;
258   int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
259   int32_t* outim1 = outim;
260   int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
261   const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
262   const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
263   uint32x4_t max_r = vdupq_n_u32(0);
264   uint32x4_t max_i = vdupq_n_u32(0);
265 
266   // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code.
267   for (k = 0; k < FRAMESAMPLES/4; k += 4) {
268     int16x4_t tmpi = vld1_s16(kSinTab1);
269     kSinTab1 += 4;
270     int16x4_t tmpr = vld1_s16(kSinTab2);
271     kSinTab2 -= 4;
272     int16x4_t inre_0 = vld1_s16(inre1);
273     inre1 += 4;
274     int16x4_t inre_1 = vld1_s16(inre2);
275     inre2 -= 4;
276     int16x4_t inim_0 = vld1_s16(inim1);
277     inim1 += 4;
278     int16x4_t inim_1 = vld1_s16(inim2);
279     inim2 -= 4;
280     tmpr = vneg_s16(tmpr);
281     inre_1 = vrev64_s16(inre_1);
282     inim_1 = vrev64_s16(inim_1);
283     tmpr = vrev64_s16(tmpr);
284 
285     int32x4_t xr = vmull_s16(tmpr, inre_0);
286     int32x4_t xi = vmull_s16(tmpr, inim_0);
287     int32x4_t yr = vmull_s16(tmpr, inim_1);
288     int32x4_t yi = vmull_s16(tmpi, inim_1);
289     xr = vmlal_s16(xr, tmpi, inim_0);
290     xi = vmlsl_s16(xi, tmpi, inre_0);
291     yr = vmlal_s16(yr, tmpi, inre_1);
292     yi = vmlsl_s16(yi, tmpr, inre_1);
293     yr = vnegq_s32(yr);
294 
295     xr = vshrq_n_s32(xr, 5);
296     xi = vshrq_n_s32(xi, 5);
297     yr = vshrq_n_s32(yr, 5);
298     yi = vshrq_n_s32(yi, 5);
299 
300     int32x4_t outr0 = vsubq_s32(xr, yi);
301     int32x4_t outr1 = vaddq_s32(xr, yi);
302     int32x4_t outi0 = vaddq_s32(xi, yr);
303     int32x4_t outi1 = vsubq_s32(yr, xi);
304 
305     // Find the absolute maximum in the vectors.
306     int32x4_t tmp0 = vabsq_s32(outr0);
307     int32x4_t tmp1 = vabsq_s32(outr1);
308     int32x4_t tmp2 = vabsq_s32(outi0);
309     int32x4_t tmp3 = vabsq_s32(outi1);
310     // vabs doesn't change the value of 0x80000000.
311     // Use u32 so we don't lose the value 0x80000000.
312     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
313     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
314     max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
315     max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
316 
317     // Store the vectors.
318     outr1 = vrev64q_s32(outr1);
319     outi1 = vrev64q_s32(outi1);
320     int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
321     int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));
322 
323     vst1q_s32(outre1, outr0);
324     outre1 += 4;
325     vst1q_s32(outim1, outi0);
326     outim1 += 4;
327     vst1q_s32(outre2, outr_1);
328     outre2 -= 4;
329     vst1q_s32(outim2, outi_1);
330     outim2 -= 4;
331   }
332 
333   max_r = vmaxq_u32(max_r, max_i);
334 #if defined(WEBRTC_ARCH_ARM64)
335   uint32_t maximum = vmaxvq_u32(max_r);
336 #else
337   uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
338   max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
339   uint32_t maximum = vget_lane_u32(max32x2_r, 0);
340 #endif
341 
342   return (int32_t)maximum;
343 }
344 
PostShiftAndDivideAndDemodulateNeon(int16_t * inre,int16_t * inim,int32_t * outre1,int32_t * outre2,int32_t sh)345 static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
346                                                        int16_t* inim,
347                                                        int32_t* outre1,
348                                                        int32_t* outre2,
349                                                        int32_t sh) {
350   int k;
351   int16_t* p_inre = inre;
352   int16_t* p_inim = inim;
353   int32_t* p_outre1 = outre1;
354   int32_t* p_outre2 = outre2;
355   const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
356   const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
357   int32x4_t shift = vdupq_n_s32(-sh - 16);
358   // Divide through by the normalizing constant:
359   // scale all values with 1/240, i.e. with 273 in Q16.
360   // 273/65536 ~= 0.0041656
361   // 1/240 ~= 0.0041666
362   int16x8_t scale = vdupq_n_s16(273);
363   // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
364   int factQ19 = 31727 << 16;
365   int32x4_t fact = vdupq_n_s32(factQ19);
366 
367   for (k = 0; k < FRAMESAMPLES/2; k += 8) {
368     int16x8_t inre16x8 = vld1q_s16(p_inre);
369     int16x8_t inim16x8 = vld1q_s16(p_inim);
370     p_inre += 8;
371     p_inim += 8;
372     int16x8_t tmpr = vld1q_s16(kCosTab);
373     int16x8_t tmpi = vld1q_s16(kSinTab);
374     kCosTab += 8;
375     kSinTab += 8;
376     // By vshl and vmull, we effectively did "<< (-sh - 16)",
377     // instead of "<< (-sh)" and ">> 16" as in the C code.
378     int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
379     int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
380 #if defined(WEBRTC_ARCH_ARM64)
381     int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
382     int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
383 #else
384     int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
385                                    vget_high_s16(scale));
386     int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
387                                    vget_high_s16(scale));
388 #endif
389 
390     outre1_0 = vshlq_s32(outre1_0, shift);
391     outre1_1 = vshlq_s32(outre1_1, shift);
392     outre2_0 = vshlq_s32(outre2_0, shift);
393     outre2_1 = vshlq_s32(outre2_1, shift);
394 
395     // Demodulate and separate.
396     int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
397     int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
398 #if defined(WEBRTC_ARCH_ARM64)
399     int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
400     int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
401 #else
402     int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
403     int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
404 #endif
405 
406     int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
407     int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
408     int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
409     int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
410     xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
411     xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
412     xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
413     xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));
414 
415 #if defined(WEBRTC_ARCH_ARM64)
416     int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
417     int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
418     int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
419     int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
420     xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
421     xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
422     xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
423     xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
424 #else
425     int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
426     int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
427     int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
428     int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
429     xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
430     xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
431     xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
432     xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
433 #endif
434 
435     outre1_0 = vcombine_s32(vrshrn_n_s64(xr0, 10), vrshrn_n_s64(xr1, 10));
436     outre2_0 = vcombine_s32(vrshrn_n_s64(xi0, 10), vrshrn_n_s64(xi1, 10));
437     outre1_1 = vcombine_s32(vrshrn_n_s64(xr2, 10), vrshrn_n_s64(xr3, 10));
438     outre2_1 = vcombine_s32(vrshrn_n_s64(xi2, 10), vrshrn_n_s64(xi3, 10));
439     outre1_0 = vqdmulhq_s32(outre1_0, fact);
440     outre2_0 = vqdmulhq_s32(outre2_0, fact);
441     outre1_1 = vqdmulhq_s32(outre1_1, fact);
442     outre2_1 = vqdmulhq_s32(outre2_1, fact);
443 
444     vst1q_s32(p_outre1, outre1_0);
445     p_outre1 += 4;
446     vst1q_s32(p_outre1, outre1_1);
447     p_outre1 += 4;
448     vst1q_s32(p_outre2, outre2_0);
449     p_outre2 += 4;
450     vst1q_s32(p_outre2, outre2_1);
451     p_outre2 += 4;
452   }
453 }
454 
// NEON spectrum-to-time transform.
// |inreQ7| and |inimQ7| hold the input spectrum; both are overwritten, first
// with the pre-shifted 16-bit values and then in place by the IFFT.
// |outre1Q16| and |outre2Q16| serve as 32-bit scratch for the first stage
// and receive the final output.
void WebRtcIsacfix_Spec2TimeNeon(int16_t* inreQ7,
                                 int16_t* inimQ7,
                                 int32_t* outre1Q16,
                                 int32_t* outre2Q16) {
  const int32_t peak =
      TransformAndFindMaxNeon(inreQ7, inimQ7, outre1Q16, outre2Q16);

  // If sh becomes >= 0, then we should shift sh steps to the left,
  // and the domain will become Q(16 + sh).
  // If sh becomes < 0, then we should shift -sh steps to the right,
  // and the domain will become Q(16 + sh).
  const int32_t sh = (int32_t)WebRtcSpl_NormW32(peak) - 24;

  // "Fastest" vectors.
  PreShiftW32toW16Neon(outre1Q16, outre2Q16, inreQ7, inimQ7, sh);

  // Get IDFT.
  WebRtcIsacfix_FftRadix16Fastest(inreQ7, inimQ7, 1);

  PostShiftAndDivideAndDemodulateNeon(inreQ7, inimQ7, outre1Q16, outre2Q16, sh);
}
480