1 /*
2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/aecm/aecm_core.h"
12 
13 #include "modules/audio_processing/aecm/echo_control_mobile.h"
14 #include "modules/audio_processing/utility/delay_estimator_wrapper.h"
15 #include "rtc_base/checks.h"
16 #include "rtc_base/numerics/safe_conversions.h"
17 
// Window coefficients shared by WindowAndFFT() and InverseFFTAndWindow():
// 65 values rising from 0 to 16384 (per the name, the square root of a
// Hanning window). Both users scale the sample-times-coefficient product down
// by 2^14, so 16384 acts as unity gain. The first half of a block walks the
// table forwards from entry 0, the second half walks it backwards from
// entry 64.
static const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
  0, 399, 798, 1196, 1594, 1990, 2386, 2780, 3172,
  3562, 3951, 4337, 4720, 5101, 5478, 5853, 6224,
  6591, 6954, 7313, 7668, 8019, 8364, 8705, 9040,
  9370, 9695, 10013, 10326, 10633, 10933, 11227, 11514,
  11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553,
  13773, 13985, 14189, 14384, 14571, 14749, 14918, 15079,
  15231, 15373, 15506, 15631, 15746, 15851, 15947, 16034,
  16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384
};
28 
// NOTE(review): neither constant is referenced in this part of the file.
// Judging by the names they parameterize a noise estimate (its Q-domain and
// an increment counter) - confirm against their users.
static const int16_t kNoiseEstQDomain = 15;
static const int16_t kNoiseEstIncCount = 5;
31 
// Scatter table read by WindowAndFFT(): 64 pairs of byte offsets into the
// int16 |fft| work buffer. For k = 0..63, entries [2k] and [2k + 1] are the
// destinations of windowed input samples k and k + 64. Every offset is a
// multiple of 4, so samples land in every second int16 slot and the slots in
// between keep the zeros written by the preceding memset.
// The offsets follow 7-bit bit-reversed sample order (sample 1 goes to
// 4-byte slot 64, sample 2 to slot 32, ...).
// NOTE(review): presumably this replaces a separate bit-reversal pass before
// WebRtcSpl_ComplexFFT() - confirm.
static int16_t coefTable[] = {
   0,   4, 256, 260, 128, 132, 384, 388,
  64,  68, 320, 324, 192, 196, 448, 452,
  32,  36, 288, 292, 160, 164, 416, 420,
  96, 100, 352, 356, 224, 228, 480, 484,
  16,  20, 272, 276, 144, 148, 400, 404,
  80,  84, 336, 340, 208, 212, 464, 468,
  48,  52, 304, 308, 176, 180, 432, 436,
 112, 116, 368, 372, 240, 244, 496, 500,
   8,  12, 264, 268, 136, 140, 392, 396,
  72,  76, 328, 332, 200, 204, 456, 460,
  40,  44, 296, 300, 168, 172, 424, 428,
 104, 108, 360, 364, 232, 236, 488, 492,
  24,  28, 280, 284, 152, 156, 408, 412,
  88,  92, 344, 348, 216, 220, 472, 476,
  56,  60, 312, 316, 184, 188, 440, 444,
 120, 124, 376, 380, 248, 252, 504, 508
};
50 
// Scatter table read by InverseFFTAndWindow(): 64 pairs of byte offsets into
// the int16 |fft| work buffer, one pair per input bin efw[0..63]. The first
// offset of a pair receives the bin with its imaginary part negated, the
// second offset receives the bin unchanged. Each offset addresses two
// consecutive int16 values (the two halves of one ComplexInt16).
// The second offset of the first pair is 512, i.e. int16 indices 256 and 257.
// NOTE(review): the caller's |fft| buffer therefore needs two writable
// elements beyond index 255 - confirm at the call site.
static int16_t coefTable_ifft[] = {
    0, 512, 256, 508, 128, 252, 384, 380,
   64, 124, 320, 444, 192, 188, 448, 316,
   32,  60, 288, 476, 160, 220, 416, 348,
   96,  92, 352, 412, 224, 156, 480, 284,
   16,  28, 272, 492, 144, 236, 400, 364,
   80, 108, 336, 428, 208, 172, 464, 300,
   48,  44, 304, 460, 176, 204, 432, 332,
  112,  76, 368, 396, 240, 140, 496, 268,
    8,  12, 264, 500, 136, 244, 392, 372,
   72, 116, 328, 436, 200, 180, 456, 308,
   40,  52, 296, 468, 168, 212, 424, 340,
  104,  84, 360, 404, 232, 148, 488, 276,
   24,  20, 280, 484, 152, 228, 408, 356,
   88, 100, 344, 420, 216, 164, 472, 292,
   56,  36, 312, 452, 184, 196, 440, 324,
  120,  68, 376, 388, 248, 132, 504, 260
};
69 
// Forward declaration; the definition lies outside this part of the file.
static void ComfortNoise(AecmCore* aecm,
                         const uint16_t* dfa,
                         ComplexInt16* out,
                         const int16_t* lambda);
74 
// Windows one block of time-domain samples, runs a forward complex FFT on it
// and copies the result, with every second int16 negated, to |freq_signal|.
//
// aecm                 not used by this implementation.
// fft                  int16 work buffer; PART_LEN4 elements are cleared and
//                      then used as FFT input/output.
// time_signal          input samples; elements 0..127 are read.
// freq_signal          output; 128 int16 values (256 bytes) are written.
// time_signal_scaling  each sample-times-window product is shifted by
//                      (time_signal_scaling - 14): left when that is >= 0,
//                      arithmetically right otherwise. Only the low 16 bits
//                      of the result are stored.
static void WindowAndFFT(AecmCore* aecm,
                         int16_t* fft,
                         const int16_t* time_signal,
                         ComplexInt16* freq_signal,
                         int time_signal_scaling) {
  int i, j;
  int32_t tmp1, tmp2, tmp3, tmp4;
  int16_t* pfrfi;
  ComplexInt16* pfreq_signal;
  int16_t  f_coef, s_coef;
  // These hold addresses inside the asm block below, which presumes 32-bit
  // pointers.
  int32_t load_ptr, store_ptr1, store_ptr2, shift, shift1;
  int32_t hann, hann1, coefs;

  // The scatter below only writes every second int16 slot; the others must
  // be zero.
  memset(fft, 0, sizeof(int16_t) * PART_LEN4);

  // FFT of signal
  // Each of the 64 iterations windows sample k (window walked forwards) and
  // sample k + 64 (window walked backwards from entry 64) and stores them at
  // the two byte offsets read from coefTable.
  // Under ".set noreorder" the instruction following a branch (indented by
  // one extra space) is its delay slot and executes whether or not the branch
  // is taken.
  __asm __volatile (
    ".set        push                                                    \n\t"
    ".set        noreorder                                               \n\t"
    "addiu       %[shift],          %[time_signal_scaling], -14          \n\t"
    "addiu       %[i],              $zero,                  64           \n\t"
    "addiu       %[load_ptr],       %[time_signal],         0            \n\t"
    "addiu       %[hann],           %[hanning],             0            \n\t"
    "addiu       %[hann1],          %[hanning],             128          \n\t"
    "addiu       %[coefs],          %[coefTable],           0            \n\t"
    "bltz        %[shift],          2f                                   \n\t"
    " negu       %[shift1],         %[shift]                             \n\t"
    // shift >= 0: scale the products up with a left shift.
   "1:                                                                   \n\t"
    "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
    "lh          %[tmp2],           0(%[hann])                           \n\t"
    "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
    "lh          %[tmp4],           0(%[hann1])                          \n\t"
    "addiu       %[i],              %[i],                   -1           \n\t"
    "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
    "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
    "lh          %[f_coef],         0(%[coefs])                          \n\t"
    "lh          %[s_coef],         2(%[coefs])                          \n\t"
    "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
    "addiu       %[hann],           %[hann],                2            \n\t"
    "addiu       %[hann1],          %[hann1],               -2           \n\t"
    "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
    "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
    "sllv        %[tmp1],           %[tmp1],                %[shift]     \n\t"
    "sllv        %[tmp3],           %[tmp3],                %[shift]     \n\t"
    "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
    "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
    "bgtz        %[i],              1b                                   \n\t"
    " addiu      %[coefs],          %[coefs],               4            \n\t"
    "b           3f                                                      \n\t"
    " nop                                                                \n\t"
    // shift < 0: same loop, but scale down with an arithmetic right shift by
    // shift1 = -shift.
   "2:                                                                   \n\t"
    "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
    "lh          %[tmp2],           0(%[hann])                           \n\t"
    "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
    "lh          %[tmp4],           0(%[hann1])                          \n\t"
    "addiu       %[i],              %[i],                   -1           \n\t"
    "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
    "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
    "lh          %[f_coef],         0(%[coefs])                          \n\t"
    "lh          %[s_coef],         2(%[coefs])                          \n\t"
    "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
    "addiu       %[hann],           %[hann],                2            \n\t"
    "addiu       %[hann1],          %[hann1],               -2           \n\t"
    "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
    "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
    "srav        %[tmp1],           %[tmp1],                %[shift1]    \n\t"
    "srav        %[tmp3],           %[tmp3],                %[shift1]    \n\t"
    "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
    "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
    "bgtz        %[i],              2b                                   \n\t"
    " addiu      %[coefs],          %[coefs],               4            \n\t"
   "3:                                                                   \n\t"
    ".set        pop                                                     \n\t"
    : [load_ptr] "=&r" (load_ptr), [shift] "=&r" (shift), [hann] "=&r" (hann),
      [hann1] "=&r" (hann1), [shift1] "=&r" (shift1), [coefs] "=&r" (coefs),
      [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
      [tmp4] "=&r" (tmp4), [i] "=&r" (i), [f_coef] "=&r" (f_coef),
      [s_coef] "=&r" (s_coef), [store_ptr1] "=&r" (store_ptr1),
      [store_ptr2] "=&r" (store_ptr2)
    : [time_signal] "r" (time_signal), [coefTable] "r" (coefTable),
      [time_signal_scaling] "r" (time_signal_scaling),
      [hanning] "r" (WebRtcAecm_kSqrtHanning), [fft] "r" (fft)
    : "memory", "hi", "lo"
  );

  WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
  pfrfi = fft;
  pfreq_signal = freq_signal;

  // Copy 128 int16 values from |fft| to |freq_signal|, eight per iteration,
  // negating the ones at odd positions (byte offsets 2, 6, 10, 14), i.e. the
  // imaginary parts if ComplexInt16 is laid out as {real, imag}.
  __asm __volatile (
    ".set        push                                                     \n\t"
    ".set        noreorder                                                \n\t"
    "addiu       %[j],              $zero,                 128            \n\t"
   "1:                                                                    \n\t"
    "lh          %[tmp1],           0(%[pfrfi])                           \n\t"
    "lh          %[tmp2],           2(%[pfrfi])                           \n\t"
    "lh          %[tmp3],           4(%[pfrfi])                           \n\t"
    "lh          %[tmp4],           6(%[pfrfi])                           \n\t"
    "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
    "sh          %[tmp1],           0(%[pfreq_signal])                    \n\t"
    "sh          %[tmp2],           2(%[pfreq_signal])                    \n\t"
    "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
    "sh          %[tmp3],           4(%[pfreq_signal])                    \n\t"
    "sh          %[tmp4],           6(%[pfreq_signal])                    \n\t"
    "lh          %[tmp1],           8(%[pfrfi])                           \n\t"
    "lh          %[tmp2],           10(%[pfrfi])                          \n\t"
    "lh          %[tmp3],           12(%[pfrfi])                          \n\t"
    "lh          %[tmp4],           14(%[pfrfi])                          \n\t"
    "addiu       %[j],              %[j],                  -8             \n\t"
    "subu        %[tmp2],           $zero,                 %[tmp2]        \n\t"
    "sh          %[tmp1],           8(%[pfreq_signal])                    \n\t"
    "sh          %[tmp2],           10(%[pfreq_signal])                   \n\t"
    "subu        %[tmp4],           $zero,                 %[tmp4]        \n\t"
    "sh          %[tmp3],           12(%[pfreq_signal])                   \n\t"
    "sh          %[tmp4],           14(%[pfreq_signal])                   \n\t"
    "addiu       %[pfreq_signal],   %[pfreq_signal],       16             \n\t"
    "bgtz        %[j],              1b                                    \n\t"
    " addiu      %[pfrfi],          %[pfrfi],              16             \n\t"
    ".set        pop                                                      \n\t"
    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [tmp3] "=&r" (tmp3),
      [j] "=&r" (j), [pfrfi] "+r" (pfrfi), [pfreq_signal] "+r" (pfreq_signal),
      [tmp4] "=&r" (tmp4)
    :
    : "memory"
  );
}
201 
// Converts the spectrum |efw| back to the time domain, windows the result,
// overlap-adds it with aecm->outBuf to produce |output|, and finally shifts
// the AECM history buffers by PART_LEN samples.
//
// aecm          dfaCleanQDomain is read; outBuf is read and then rewritten;
//               in xBuf, dBufNoisy and (optionally) dBufClean the PART_LEN
//               samples starting at index PART_LEN are copied to the start.
// fft           int16 work buffer, overwritten. The scatter loop stores at
//               byte offset 512 (coefTable_ifft[1]), so int16 elements 256
//               and 257 must be writable.
// efw           input bins; efw[0..63] and efw[PART_LEN] are read.
// output        receives 64 output samples.
// nearendClean  only compared against NULL: dBufClean is shifted when it is
//               non-NULL.
static void InverseFFTAndWindow(AecmCore* aecm,
                                int16_t* fft,
                                ComplexInt16* efw,
                                int16_t* output,
                                const int16_t* nearendClean) {
  int i, outCFFT;
  int32_t tmp1, tmp2, tmp3, tmp4, tmp_re, tmp_im;
  int16_t* pcoefTable_ifft = coefTable_ifft;
  int16_t* pfft = fft;
  int16_t* ppfft = fft;
  ComplexInt16* pefw = efw;
  int32_t out_aecm;
  int16_t* paecm_buf = aecm->outBuf;
  const int16_t* p_kSqrtHanning = WebRtcAecm_kSqrtHanning;
  const int16_t* pp_kSqrtHanning = &WebRtcAecm_kSqrtHanning[PART_LEN];
  int16_t* output1 = output;

  // Scatter efw[0..63] into |fft|, four bins per iteration. For each bin the
  // first coefTable_ifft offset receives (real, -imag) and the second
  // receives (real, imag).
  // Under ".set noreorder" the instruction following a branch (indented by
  // one extra space) is its delay slot and executes whether or not the branch
  // is taken.
  // NOTE(review): |fft| is never modified here yet is declared "+r". As a
  // plain input it could presumably be given the same register as |pfft|,
  // which starts out equal to it and is overwritten in the loop - keep it
  // read-write unless that is ruled out.
  __asm __volatile (
    ".set      push                                                        \n\t"
    ".set      noreorder                                                   \n\t"
    "addiu     %[i],                $zero,                   64            \n\t"
   "1:                                                                     \n\t"
    "lh        %[tmp1],             0(%[pcoefTable_ifft])                  \n\t"
    "lh        %[tmp2],             2(%[pcoefTable_ifft])                  \n\t"
    "lh        %[tmp_re],           0(%[pefw])                             \n\t"
    "lh        %[tmp_im],           2(%[pefw])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "lh        %[tmp1],             4(%[pcoefTable_ifft])                  \n\t"
    "lh        %[tmp2],             6(%[pcoefTable_ifft])                  \n\t"
    "lh        %[tmp_re],           4(%[pefw])                             \n\t"
    "lh        %[tmp_im],           6(%[pefw])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "lh        %[tmp1],             8(%[pcoefTable_ifft])                  \n\t"
    "lh        %[tmp2],             10(%[pcoefTable_ifft])                 \n\t"
    "lh        %[tmp_re],           8(%[pefw])                             \n\t"
    "lh        %[tmp_im],           10(%[pefw])                            \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "lh        %[tmp1],             12(%[pcoefTable_ifft])                 \n\t"
    "lh        %[tmp2],             14(%[pcoefTable_ifft])                 \n\t"
    "lh        %[tmp_re],           12(%[pefw])                            \n\t"
    "lh        %[tmp_im],           14(%[pefw])                            \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp2]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "addu      %[pfft],             %[fft],                  %[tmp1]       \n\t"
    "sh        %[tmp_re],           0(%[pfft])                             \n\t"
    "subu      %[tmp_im],           $zero,                   %[tmp_im]     \n\t"
    "sh        %[tmp_im],           2(%[pfft])                             \n\t"
    "addiu     %[pcoefTable_ifft],  %[pcoefTable_ifft],      16            \n\t"
    "addiu     %[i],                %[i],                    -4            \n\t"
    "bgtz      %[i],                1b                                     \n\t"
    " addiu    %[pefw],             %[pefw],                 16            \n\t"
    ".set      pop                                                         \n\t"
    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
      [i] "=&r" (i), [tmp_re] "=&r" (tmp_re), [tmp_im] "=&r" (tmp_im),
      [pefw] "+r" (pefw), [pcoefTable_ifft] "+r" (pcoefTable_ifft),
      [fft] "+r" (fft)
    :
    : "memory"
  );

  // Bin PART_LEN is not covered by the loop above; store it, with the
  // imaginary part negated, in int16 slots 2 and 3.
  fft[2] = efw[PART_LEN].real;
  fft[3] = -efw[PART_LEN].imag;

  outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
  pfft = fft;

  // Compact |fft| in place: keep int16 elements 0, 2, 4, ... of the IFFT
  // output so that 128 values become contiguous (four per iteration; the read
  // pointer advances twice as fast as the write pointer).
  __asm __volatile (
    ".set       push                                               \n\t"
    ".set       noreorder                                          \n\t"
    "addiu      %[i],            $zero,               128          \n\t"
   "1:                                                             \n\t"
    "lh         %[tmp1],         0(%[ppfft])                       \n\t"
    "lh         %[tmp2],         4(%[ppfft])                       \n\t"
    "lh         %[tmp3],         8(%[ppfft])                       \n\t"
    "lh         %[tmp4],         12(%[ppfft])                      \n\t"
    "addiu      %[i],            %[i],                -4           \n\t"
    "sh         %[tmp1],         0(%[pfft])                        \n\t"
    "sh         %[tmp2],         2(%[pfft])                        \n\t"
    "sh         %[tmp3],         4(%[pfft])                        \n\t"
    "sh         %[tmp4],         6(%[pfft])                        \n\t"
    "addiu      %[ppfft],        %[ppfft],            16           \n\t"
    "bgtz       %[i],            1b                                \n\t"
    " addiu     %[pfft],         %[pfft],             8            \n\t"
    ".set       pop                                                \n\t"
    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
      [i] "=&r" (i), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
      [ppfft] "+r" (ppfft)
    :
    : "memory"
  );

  pfft = fft;
  // Rescaling shift applied to the windowed samples below: the value returned
  // by the IFFT minus dfaCleanQDomain. Non-negative means shift left,
  // negative means arithmetic shift right by the magnitude.
  out_aecm = (int32_t)(outCFFT - aecm->dfaCleanQDomain);

  // Window and overlap-add, two samples per iteration (32 iterations):
  //  - first half: sample * window, rounded (+8192) and shifted right by 14,
  //    rescaled by out_aecm, added to outBuf, saturated to int16 and written
  //    to both |fft| and |output|;
  //  - second half (128 bytes further into |fft|, window walked backwards):
  //    sample * window shifted right by 14 without rounding, rescaled by
  //    out_aecm, saturated to int16 and written to outBuf.
  // The [WebRtcAecm_kSqrtHanning] input operand is not referenced by the
  // template.
  __asm __volatile (
    ".set       push                                                       \n\t"
    ".set       noreorder                                                  \n\t"
    "addiu      %[i],                $zero,                  64            \n\t"
   "11:                                                                    \n\t"
    "lh         %[tmp1],             0(%[pfft])                            \n\t"
    "lh         %[tmp2],             0(%[p_kSqrtHanning])                  \n\t"
    "addiu      %[i],                %[i],                   -2            \n\t"
    "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    "lh         %[tmp3],             2(%[pfft])                            \n\t"
    "lh         %[tmp4],             2(%[p_kSqrtHanning])                  \n\t"
    "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
    "addiu      %[tmp1],             %[tmp1],                8192          \n\t"
    "sra        %[tmp1],             %[tmp1],                14            \n\t"
    "addiu      %[tmp3],             %[tmp3],                8192          \n\t"
    "sra        %[tmp3],             %[tmp3],                14            \n\t"
    "bgez       %[out_aecm],         1f                                    \n\t"
    " negu      %[tmp2],             %[out_aecm]                           \n\t"
    "srav       %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    "b          2f                                                         \n\t"
    " srav      %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
   "1:                                                                     \n\t"
    "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
    "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
   "2:                                                                     \n\t"
    "lh         %[tmp4],             0(%[paecm_buf])                       \n\t"
    "lh         %[tmp2],             2(%[paecm_buf])                       \n\t"
    "addu       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
    "addu       %[tmp1],             %[tmp1],                %[tmp4]       \n\t"
    // Saturate tmp1 and tmp3 to the int16 range.
#if defined(MIPS_DSP_R1_LE)
    "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
    "sra        %[tmp1],             %[tmp1],                16            \n\t"
    "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
    "sra        %[tmp3],             %[tmp3],                16            \n\t"
#else  // #if defined(MIPS_DSP_R1_LE)
    "sra        %[tmp4],             %[tmp1],                31            \n\t"
    "sra        %[tmp2],             %[tmp1],                15            \n\t"
    "beq        %[tmp4],             %[tmp2],                3f            \n\t"
    " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
    "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
   "3:                                                                     \n\t"
    "sra        %[tmp2],             %[tmp3],                31            \n\t"
    "sra        %[tmp4],             %[tmp3],                15            \n\t"
    "beq        %[tmp2],             %[tmp4],                4f            \n\t"
    " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
    "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
   "4:                                                                     \n\t"
#endif  // #if defined(MIPS_DSP_R1_LE)
    "sh         %[tmp1],             0(%[pfft])                            \n\t"
    "sh         %[tmp1],             0(%[output1])                         \n\t"
    "sh         %[tmp3],             2(%[pfft])                            \n\t"
    "sh         %[tmp3],             2(%[output1])                         \n\t"
    "lh         %[tmp1],             128(%[pfft])                          \n\t"
    "lh         %[tmp2],             0(%[pp_kSqrtHanning])                 \n\t"
    "mul        %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
    "lh         %[tmp3],             130(%[pfft])                          \n\t"
    "lh         %[tmp4],             -2(%[pp_kSqrtHanning])                \n\t"
    "mul        %[tmp3],             %[tmp3],                %[tmp4]       \n\t"
    "sra        %[tmp1],             %[tmp1],                14            \n\t"
    "sra        %[tmp3],             %[tmp3],                14            \n\t"
    "bgez       %[out_aecm],         5f                                    \n\t"
    " negu      %[tmp2],             %[out_aecm]                           \n\t"
    "srav       %[tmp3],             %[tmp3],                %[tmp2]       \n\t"
    "b          6f                                                         \n\t"
    " srav      %[tmp1],             %[tmp1],                %[tmp2]       \n\t"
   "5:                                                                     \n\t"
    "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   \n\t"
    "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   \n\t"
   "6:                                                                     \n\t"
    // Saturate tmp1 and tmp3 to the int16 range.
#if defined(MIPS_DSP_R1_LE)
    "shll_s.w   %[tmp1],             %[tmp1],                16            \n\t"
    "sra        %[tmp1],             %[tmp1],                16            \n\t"
    "shll_s.w   %[tmp3],             %[tmp3],                16            \n\t"
    "sra        %[tmp3],             %[tmp3],                16            \n\t"
#else  // #if defined(MIPS_DSP_R1_LE)
    "sra        %[tmp4],             %[tmp1],                31            \n\t"
    "sra        %[tmp2],             %[tmp1],                15            \n\t"
    "beq        %[tmp4],             %[tmp2],                7f            \n\t"
    " ori       %[tmp2],             $zero,                  0x7fff        \n\t"
    "xor        %[tmp1],             %[tmp2],                %[tmp4]       \n\t"
   "7:                                                                     \n\t"
    "sra        %[tmp2],             %[tmp3],                31            \n\t"
    "sra        %[tmp4],             %[tmp3],                15            \n\t"
    "beq        %[tmp2],             %[tmp4],                8f            \n\t"
    " ori       %[tmp4],             $zero,                  0x7fff        \n\t"
    "xor        %[tmp3],             %[tmp4],                %[tmp2]       \n\t"
   "8:                                                                     \n\t"
#endif  // #if defined(MIPS_DSP_R1_LE)
    "sh         %[tmp1],             0(%[paecm_buf])                       \n\t"
    "sh         %[tmp3],             2(%[paecm_buf])                       \n\t"
    "addiu      %[output1],          %[output1],             4             \n\t"
    "addiu      %[paecm_buf],        %[paecm_buf],           4             \n\t"
    "addiu      %[pfft],             %[pfft],                4             \n\t"
    "addiu      %[p_kSqrtHanning],   %[p_kSqrtHanning],      4             \n\t"
    "bgtz       %[i],                11b                                   \n\t"
    " addiu     %[pp_kSqrtHanning],  %[pp_kSqrtHanning],     -4            \n\t"
    ".set       pop                                                        \n\t"
    : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), [pfft] "+r" (pfft),
      [output1] "+r" (output1), [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
      [paecm_buf] "+r" (paecm_buf), [i] "=&r" (i),
      [pp_kSqrtHanning] "+r" (pp_kSqrtHanning),
      [p_kSqrtHanning] "+r" (p_kSqrtHanning)
    : [out_aecm] "r" (out_aecm),
      [WebRtcAecm_kSqrtHanning] "r" (WebRtcAecm_kSqrtHanning)
    : "hi", "lo","memory"
  );

  // Copy the current block to the old position
  // (aecm->outBuf is shifted elsewhere)
  memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(int16_t) * PART_LEN);
  memcpy(aecm->dBufNoisy,
         aecm->dBufNoisy + PART_LEN,
         sizeof(int16_t) * PART_LEN);
  if (nearendClean != NULL) {
    memcpy(aecm->dBufClean,
           aecm->dBufClean + PART_LEN,
           sizeof(int16_t) * PART_LEN);
  }
}
434 
WebRtcAecm_CalcLinearEnergies_mips(AecmCore * aecm,const uint16_t * far_spectrum,int32_t * echo_est,uint32_t * far_energy,uint32_t * echo_energy_adapt,uint32_t * echo_energy_stored)435 void WebRtcAecm_CalcLinearEnergies_mips(AecmCore* aecm,
436                                         const uint16_t* far_spectrum,
437                                         int32_t* echo_est,
438                                         uint32_t* far_energy,
439                                         uint32_t* echo_energy_adapt,
440                                         uint32_t* echo_energy_stored) {
441   int i;
442   uint32_t par1 = (*far_energy);
443   uint32_t par2 = (*echo_energy_adapt);
444   uint32_t par3 = (*echo_energy_stored);
445   int16_t* ch_stored_p = &(aecm->channelStored[0]);
446   int16_t* ch_adapt_p = &(aecm->channelAdapt16[0]);
447   uint16_t* spectrum_p = (uint16_t*)(&(far_spectrum[0]));
448   int32_t* echo_p = &(echo_est[0]);
449   int32_t temp0, stored0, echo0, adept0, spectrum0;
450   int32_t stored1, adept1, spectrum1, echo1, temp1;
451 
452   // Get energy for the delayed far end signal and estimated
453   // echo using both stored and adapted channels.
454   for (i = 0; i < PART_LEN; i+= 4) {
455     __asm __volatile (
456       ".set           push                                            \n\t"
457       ".set           noreorder                                       \n\t"
458       "lh             %[stored0],     0(%[ch_stored_p])               \n\t"
459       "lhu            %[adept0],      0(%[ch_adapt_p])                \n\t"
460       "lhu            %[spectrum0],   0(%[spectrum_p])                \n\t"
461       "lh             %[stored1],     2(%[ch_stored_p])               \n\t"
462       "lhu            %[adept1],      2(%[ch_adapt_p])                \n\t"
463       "lhu            %[spectrum1],   2(%[spectrum_p])                \n\t"
464       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
465       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
466       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
467       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
468       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
469       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
470       "addiu          %[echo_p],      %[echo_p],      16              \n\t"
471       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
472       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
473       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
474       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
475       "usw            %[echo0],       -16(%[echo_p])                  \n\t"
476       "usw            %[echo1],       -12(%[echo_p])                  \n\t"
477       "lh             %[stored0],     4(%[ch_stored_p])               \n\t"
478       "lhu            %[adept0],      4(%[ch_adapt_p])                \n\t"
479       "lhu            %[spectrum0],   4(%[spectrum_p])                \n\t"
480       "lh             %[stored1],     6(%[ch_stored_p])               \n\t"
481       "lhu            %[adept1],      6(%[ch_adapt_p])                \n\t"
482       "lhu            %[spectrum1],   6(%[spectrum_p])                \n\t"
483       "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
484       "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
485       "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
486       "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
487       "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
488       "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
489       "addiu          %[ch_stored_p], %[ch_stored_p], 8               \n\t"
490       "addiu          %[ch_adapt_p],  %[ch_adapt_p],  8               \n\t"
491       "addiu          %[spectrum_p],  %[spectrum_p],  8               \n\t"
492       "addu           %[par3],        %[par3],        %[echo0]        \n\t"
493       "addu           %[par2],        %[par2],        %[temp0]        \n\t"
494       "addu           %[par3],        %[par3],        %[echo1]        \n\t"
495       "addu           %[par2],        %[par2],        %[temp1]        \n\t"
496       "usw            %[echo0],       -8(%[echo_p])                   \n\t"
497       "usw            %[echo1],       -4(%[echo_p])                   \n\t"
498       ".set           pop                                             \n\t"
499       : [temp0] "=&r" (temp0), [stored0] "=&r" (stored0),
500         [adept0] "=&r" (adept0), [spectrum0] "=&r" (spectrum0),
501         [echo0] "=&r" (echo0), [echo_p] "+r" (echo_p), [par3] "+r" (par3),
502         [par1] "+r" (par1), [par2] "+r" (par2), [stored1] "=&r" (stored1),
503         [adept1] "=&r" (adept1), [echo1] "=&r" (echo1),
504         [spectrum1] "=&r" (spectrum1), [temp1] "=&r" (temp1),
505         [ch_stored_p] "+r" (ch_stored_p), [ch_adapt_p] "+r" (ch_adapt_p),
506         [spectrum_p] "+r" (spectrum_p)
507       :
508       : "hi", "lo", "memory"
509     );
510   }
511 
512   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
513                                              far_spectrum[PART_LEN]);
514   par1 += (uint32_t)(far_spectrum[PART_LEN]);
515   par2 += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
516   par3 += (uint32_t)echo_est[PART_LEN];
517 
518   (*far_energy) = par1;
519   (*echo_energy_adapt) = par2;
520   (*echo_energy_stored) = par3;
521 }
522 
523 #if defined(MIPS_DSP_R1_LE)
WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore * aecm,const uint16_t * far_spectrum,int32_t * echo_est)524 void WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore* aecm,
525                                           const uint16_t* far_spectrum,
526                                           int32_t* echo_est) {
527   int i;
528   int16_t* temp1;
529   uint16_t* temp8;
530   int32_t temp0, temp2, temp3, temp4, temp5, temp6;
531   int32_t* temp7 = &(echo_est[0]);
532   temp1 = &(aecm->channelStored[0]);
533   temp8 = (uint16_t*)(&far_spectrum[0]);
534 
535   // During startup we store the channel every block.
536   memcpy(aecm->channelStored, aecm->channelAdapt16,
537          sizeof(int16_t) * PART_LEN1);
538   // Recalculate echo estimate
539   for (i = 0; i < PART_LEN; i += 4) {
540     __asm __volatile (
541       "ulw            %[temp0],   0(%[temp8])               \n\t"
542       "ulw            %[temp2],   0(%[temp1])               \n\t"
543       "ulw            %[temp4],   4(%[temp8])               \n\t"
544       "ulw            %[temp5],   4(%[temp1])               \n\t"
545       "muleq_s.w.phl  %[temp3],   %[temp2],     %[temp0]    \n\t"
546       "muleq_s.w.phr  %[temp0],   %[temp2],     %[temp0]    \n\t"
547       "muleq_s.w.phl  %[temp6],   %[temp5],     %[temp4]    \n\t"
548       "muleq_s.w.phr  %[temp4],   %[temp5],     %[temp4]    \n\t"
549       "addiu          %[temp7],   %[temp7],     16          \n\t"
550       "addiu          %[temp1],   %[temp1],     8           \n\t"
551       "addiu          %[temp8],   %[temp8],     8           \n\t"
552       "sra            %[temp3],   %[temp3],     1           \n\t"
553       "sra            %[temp0],   %[temp0],     1           \n\t"
554       "sra            %[temp6],   %[temp6],     1           \n\t"
555       "sra            %[temp4],   %[temp4],     1           \n\t"
556       "usw            %[temp3],   -12(%[temp7])             \n\t"
557       "usw            %[temp0],   -16(%[temp7])             \n\t"
558       "usw            %[temp6],   -4(%[temp7])              \n\t"
559       "usw            %[temp4],   -8(%[temp7])              \n\t"
560       : [temp0] "=&r" (temp0), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
561         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
562         [temp1] "+r" (temp1), [temp8] "+r" (temp8), [temp7] "+r" (temp7)
563       :
564       : "hi", "lo", "memory"
565     );
566   }
567   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
568                                       far_spectrum[i]);
569 }
570 
WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore * aecm)571 void WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore* aecm) {
572   int i;
573   int32_t* temp3;
574   int16_t* temp0;
575   int32_t temp1, temp2, temp4, temp5;
576 
577   temp0 = &(aecm->channelStored[0]);
578   temp3 = &(aecm->channelAdapt32[0]);
579 
580   // The stored channel has a significantly lower MSE than the adaptive one for
581   // two consecutive calculations. Reset the adaptive channel.
582   memcpy(aecm->channelAdapt16,
583          aecm->channelStored,
584          sizeof(int16_t) * PART_LEN1);
585 
586   // Restore the W32 channel
587   for (i = 0; i < PART_LEN; i += 4) {
588     __asm __volatile (
589       "ulw            %[temp1], 0(%[temp0])           \n\t"
590       "ulw            %[temp4], 4(%[temp0])           \n\t"
591       "preceq.w.phl   %[temp2], %[temp1]              \n\t"
592       "preceq.w.phr   %[temp1], %[temp1]              \n\t"
593       "preceq.w.phl   %[temp5], %[temp4]              \n\t"
594       "preceq.w.phr   %[temp4], %[temp4]              \n\t"
595       "addiu          %[temp0], %[temp0], 8           \n\t"
596       "usw            %[temp2], 4(%[temp3])           \n\t"
597       "usw            %[temp1], 0(%[temp3])           \n\t"
598       "usw            %[temp5], 12(%[temp3])          \n\t"
599       "usw            %[temp4], 8(%[temp3])           \n\t"
600       "addiu          %[temp3], %[temp3], 16          \n\t"
601       : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
602         [temp4] "=&r" (temp4), [temp5] "=&r" (temp5),
603         [temp3] "+r" (temp3), [temp0] "+r" (temp0)
604       :
605       : "memory"
606     );
607   }
608 
609   aecm->channelAdapt32[i] = (int32_t)aecm->channelStored[i] << 16;
610 }
611 #endif  // #if defined(MIPS_DSP_R1_LE)
612 
613 // Transforms a time domain signal into the frequency domain, outputting the
614 // complex valued signal, absolute value and sum of absolute values.
615 //
616 // time_signal          [in]    Pointer to time domain signal
617 // freq_signal_real     [out]   Pointer to real part of frequency domain array
618 // freq_signal_imag     [out]   Pointer to imaginary part of frequency domain
619 //                              array
620 // freq_signal_abs      [out]   Pointer to absolute value of frequency domain
621 //                              array
622 // freq_signal_sum_abs  [out]   Pointer to the sum of all absolute values in
623 //                              the frequency domain array
624 // return value                 The Q-domain of current frequency values
625 //
TimeToFrequencyDomain(AecmCore * aecm,const int16_t * time_signal,ComplexInt16 * freq_signal,uint16_t * freq_signal_abs,uint32_t * freq_signal_sum_abs)626 static int TimeToFrequencyDomain(AecmCore* aecm,
627                                  const int16_t* time_signal,
628                                  ComplexInt16* freq_signal,
629                                  uint16_t* freq_signal_abs,
630                                  uint32_t* freq_signal_sum_abs) {
631   int i = 0;
632   int time_signal_scaling = 0;
633 
634   // In fft_buf, +16 for 32-byte alignment.
635   int16_t fft_buf[PART_LEN4 + 16];
636   int16_t *fft = (int16_t *) (((uintptr_t) fft_buf + 31) & ~31);
637 
638   int16_t tmp16no1;
639 #if !defined(MIPS_DSP_R2_LE)
640   int32_t tmp32no1;
641   int32_t tmp32no2;
642   int16_t tmp16no2;
643 #else
644   int32_t tmp32no10, tmp32no11, tmp32no12, tmp32no13;
645   int32_t tmp32no20, tmp32no21, tmp32no22, tmp32no23;
646   int16_t* freqp;
647   uint16_t* freqabsp;
648   uint32_t freqt0, freqt1, freqt2, freqt3;
649   uint32_t freqs;
650 #endif
651 
652 #ifdef AECM_DYNAMIC_Q
653   tmp16no1 = WebRtcSpl_MaxAbsValueW16(time_signal, PART_LEN2);
654   time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
655 #endif
656 
657   WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
658 
659   // Extract imaginary and real part,
660   // calculate the magnitude for all frequency bins
661   freq_signal[0].imag = 0;
662   freq_signal[PART_LEN].imag = 0;
663   freq_signal[PART_LEN].real = fft[PART_LEN2];
664   freq_signal_abs[0] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[0].real);
665   freq_signal_abs[PART_LEN] = (uint16_t)WEBRTC_SPL_ABS_W16(
666     freq_signal[PART_LEN].real);
667   (*freq_signal_sum_abs) = (uint32_t)(freq_signal_abs[0]) +
668     (uint32_t)(freq_signal_abs[PART_LEN]);
669 
670 #if !defined(MIPS_DSP_R2_LE)
671   for (i = 1; i < PART_LEN; i++) {
672     if (freq_signal[i].real == 0)
673     {
674       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
675         freq_signal[i].imag);
676     }
677     else if (freq_signal[i].imag == 0)
678     {
679       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(
680         freq_signal[i].real);
681     }
682     else
683     {
684       // Approximation for magnitude of complex fft output
685       // magn = sqrt(real^2 + imag^2)
686       // magn ~= alpha * max(|imag|,|real|) + beta * min(|imag|,|real|)
687       //
688       // The parameters alpha and beta are stored in Q15
689       tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
690       tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
691       tmp32no1 = tmp16no1 * tmp16no1;
692       tmp32no2 = tmp16no2 * tmp16no2;
693       tmp32no2 = WebRtcSpl_AddSatW32(tmp32no1, tmp32no2);
694       tmp32no1 = WebRtcSpl_SqrtFloor(tmp32no2);
695 
696       freq_signal_abs[i] = (uint16_t)tmp32no1;
697     }
698     (*freq_signal_sum_abs) += (uint32_t)freq_signal_abs[i];
699   }
700 #else // #if !defined(MIPS_DSP_R2_LE)
701   freqs = (uint32_t)(freq_signal_abs[0]) +
702           (uint32_t)(freq_signal_abs[PART_LEN]);
703   freqp = &(freq_signal[1].real);
704 
705   __asm __volatile (
706     "lw             %[freqt0],      0(%[freqp])             \n\t"
707     "lw             %[freqt1],      4(%[freqp])             \n\t"
708     "lw             %[freqt2],      8(%[freqp])             \n\t"
709     "mult           $ac0,           $zero,      $zero       \n\t"
710     "mult           $ac1,           $zero,      $zero       \n\t"
711     "mult           $ac2,           $zero,      $zero       \n\t"
712     "dpaq_s.w.ph    $ac0,           %[freqt0],  %[freqt0]   \n\t"
713     "dpaq_s.w.ph    $ac1,           %[freqt1],  %[freqt1]   \n\t"
714     "dpaq_s.w.ph    $ac2,           %[freqt2],  %[freqt2]   \n\t"
715     "addiu          %[freqp],       %[freqp],   12          \n\t"
716     "extr.w         %[tmp32no20],   $ac0,       1           \n\t"
717     "extr.w         %[tmp32no21],   $ac1,       1           \n\t"
718     "extr.w         %[tmp32no22],   $ac2,       1           \n\t"
719     : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
720       [freqt2] "=&r" (freqt2), [freqp] "+r" (freqp),
721       [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
722       [tmp32no22] "=r" (tmp32no22)
723     :
724     : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo"
725   );
726 
727   tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
728   tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
729   tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
730   freq_signal_abs[1] = (uint16_t)tmp32no10;
731   freq_signal_abs[2] = (uint16_t)tmp32no11;
732   freq_signal_abs[3] = (uint16_t)tmp32no12;
733   freqs += (uint32_t)tmp32no10;
734   freqs += (uint32_t)tmp32no11;
735   freqs += (uint32_t)tmp32no12;
736   freqabsp = &(freq_signal_abs[4]);
737   for (i = 4; i < PART_LEN; i+=4)
738   {
739     __asm __volatile (
740       "ulw            %[freqt0],      0(%[freqp])                 \n\t"
741       "ulw            %[freqt1],      4(%[freqp])                 \n\t"
742       "ulw            %[freqt2],      8(%[freqp])                 \n\t"
743       "ulw            %[freqt3],      12(%[freqp])                \n\t"
744       "mult           $ac0,           $zero,          $zero       \n\t"
745       "mult           $ac1,           $zero,          $zero       \n\t"
746       "mult           $ac2,           $zero,          $zero       \n\t"
747       "mult           $ac3,           $zero,          $zero       \n\t"
748       "dpaq_s.w.ph    $ac0,           %[freqt0],      %[freqt0]   \n\t"
749       "dpaq_s.w.ph    $ac1,           %[freqt1],      %[freqt1]   \n\t"
750       "dpaq_s.w.ph    $ac2,           %[freqt2],      %[freqt2]   \n\t"
751       "dpaq_s.w.ph    $ac3,           %[freqt3],      %[freqt3]   \n\t"
752       "addiu          %[freqp],       %[freqp],       16          \n\t"
753       "addiu          %[freqabsp],    %[freqabsp],    8           \n\t"
754       "extr.w         %[tmp32no20],   $ac0,           1           \n\t"
755       "extr.w         %[tmp32no21],   $ac1,           1           \n\t"
756       "extr.w         %[tmp32no22],   $ac2,           1           \n\t"
757       "extr.w         %[tmp32no23],   $ac3,           1           \n\t"
758       : [freqt0] "=&r" (freqt0), [freqt1] "=&r" (freqt1),
759         [freqt2] "=&r" (freqt2), [freqt3] "=&r" (freqt3),
760         [tmp32no20] "=r" (tmp32no20), [tmp32no21] "=r" (tmp32no21),
761         [tmp32no22] "=r" (tmp32no22), [tmp32no23] "=r" (tmp32no23),
762         [freqabsp] "+r" (freqabsp), [freqp] "+r" (freqp)
763       :
764       : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
765         "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
766     );
767 
768     tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
769     tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
770     tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
771     tmp32no13 = WebRtcSpl_SqrtFloor(tmp32no23);
772 
773     __asm __volatile (
774       "sh             %[tmp32no10],   -8(%[freqabsp])                 \n\t"
775       "sh             %[tmp32no11],   -6(%[freqabsp])                 \n\t"
776       "sh             %[tmp32no12],   -4(%[freqabsp])                 \n\t"
777       "sh             %[tmp32no13],   -2(%[freqabsp])                 \n\t"
778       "addu           %[freqs],       %[freqs],       %[tmp32no10]    \n\t"
779       "addu           %[freqs],       %[freqs],       %[tmp32no11]    \n\t"
780       "addu           %[freqs],       %[freqs],       %[tmp32no12]    \n\t"
781       "addu           %[freqs],       %[freqs],       %[tmp32no13]    \n\t"
782       : [freqs] "+r" (freqs)
783       : [tmp32no10] "r" (tmp32no10), [tmp32no11] "r" (tmp32no11),
784         [tmp32no12] "r" (tmp32no12), [tmp32no13] "r" (tmp32no13),
785         [freqabsp] "r" (freqabsp)
786       : "memory"
787     );
788   }
789 
790   (*freq_signal_sum_abs) = freqs;
791 #endif
792 
793   return time_signal_scaling;
794 }
795 
WebRtcAecm_ProcessBlock(AecmCore * aecm,const int16_t * farend,const int16_t * nearendNoisy,const int16_t * nearendClean,int16_t * output)796 int WebRtcAecm_ProcessBlock(AecmCore* aecm,
797                             const int16_t* farend,
798                             const int16_t* nearendNoisy,
799                             const int16_t* nearendClean,
800                             int16_t* output) {
801   int i;
802   uint32_t xfaSum;
803   uint32_t dfaNoisySum;
804   uint32_t dfaCleanSum;
805   uint32_t echoEst32Gained;
806   uint32_t tmpU32;
807   int32_t tmp32no1;
808 
809   uint16_t xfa[PART_LEN1];
810   uint16_t dfaNoisy[PART_LEN1];
811   uint16_t dfaClean[PART_LEN1];
812   uint16_t* ptrDfaClean = dfaClean;
813   const uint16_t* far_spectrum_ptr = NULL;
814 
815   // 32 byte aligned buffers (with +8 or +16).
816   int16_t fft_buf[PART_LEN4 + 2 + 16]; // +2 to make a loop safe.
817   int32_t echoEst32_buf[PART_LEN1 + 8];
818   int32_t dfw_buf[PART_LEN2 + 8];
819   int32_t efw_buf[PART_LEN2 + 8];
820 
821   int16_t* fft = (int16_t*)(((uint32_t)fft_buf + 31) & ~ 31);
822   int32_t* echoEst32 = (int32_t*)(((uint32_t)echoEst32_buf + 31) & ~ 31);
823   ComplexInt16* dfw = (ComplexInt16*)(((uint32_t)dfw_buf + 31) & ~31);
824   ComplexInt16* efw = (ComplexInt16*)(((uint32_t)efw_buf + 31) & ~31);
825 
826   int16_t hnl[PART_LEN1];
827   int16_t numPosCoef = 0;
828   int delay;
829   int16_t tmp16no1;
830   int16_t tmp16no2;
831   int16_t mu;
832   int16_t supGain;
833   int16_t zeros32, zeros16;
834   int16_t zerosDBufNoisy, zerosDBufClean, zerosXBuf;
835   int far_q;
836   int16_t resolutionDiff, qDomainDiff, dfa_clean_q_domain_diff;
837 
838   const int kMinPrefBand = 4;
839   const int kMaxPrefBand = 24;
840   int32_t avgHnl32 = 0;
841 
842   int32_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
843   int16_t* ptr;
844   int16_t* ptr1;
845   int16_t* er_ptr;
846   int16_t* dr_ptr;
847 
848   ptr = &hnl[0];
849   ptr1 = &hnl[0];
850   er_ptr = &efw[0].real;
851   dr_ptr = &dfw[0].real;
852 
853   // Determine startup state. There are three states:
854   // (0) the first CONV_LEN blocks
855   // (1) another CONV_LEN blocks
856   // (2) the rest
857 
858   if (aecm->startupState < 2) {
859     aecm->startupState = (aecm->totCount >= CONV_LEN) +
860                          (aecm->totCount >= CONV_LEN2);
861   }
862   // END: Determine startup state
863 
864   // Buffer near and far end signals
865   memcpy(aecm->xBuf + PART_LEN, farend, sizeof(int16_t) * PART_LEN);
866   memcpy(aecm->dBufNoisy + PART_LEN,
867          nearendNoisy,
868          sizeof(int16_t) * PART_LEN);
869   if (nearendClean != NULL) {
870     memcpy(aecm->dBufClean + PART_LEN,
871            nearendClean,
872            sizeof(int16_t) * PART_LEN);
873   }
874 
875   // Transform far end signal from time domain to frequency domain.
876   far_q = TimeToFrequencyDomain(aecm,
877                                 aecm->xBuf,
878                                 dfw,
879                                 xfa,
880                                 &xfaSum);
881 
882   // Transform noisy near end signal from time domain to frequency domain.
883   zerosDBufNoisy = TimeToFrequencyDomain(aecm,
884                                          aecm->dBufNoisy,
885                                          dfw,
886                                          dfaNoisy,
887                                          &dfaNoisySum);
888   aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
889   aecm->dfaNoisyQDomain = (int16_t)zerosDBufNoisy;
890 
891   if (nearendClean == NULL) {
892     ptrDfaClean = dfaNoisy;
893     aecm->dfaCleanQDomainOld = aecm->dfaNoisyQDomainOld;
894     aecm->dfaCleanQDomain = aecm->dfaNoisyQDomain;
895     dfaCleanSum = dfaNoisySum;
896   } else {
897     // Transform clean near end signal from time domain to frequency domain.
898     zerosDBufClean = TimeToFrequencyDomain(aecm,
899                                            aecm->dBufClean,
900                                            dfw,
901                                            dfaClean,
902                                            &dfaCleanSum);
903     aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
904     aecm->dfaCleanQDomain = (int16_t)zerosDBufClean;
905   }
906 
907   // Get the delay
908   // Save far-end history and estimate delay
909   WebRtcAecm_UpdateFarHistory(aecm, xfa, far_q);
910 
911   if (WebRtc_AddFarSpectrumFix(aecm->delay_estimator_farend, xfa, PART_LEN1,
912                                far_q) == -1) {
913     return -1;
914   }
915   delay = WebRtc_DelayEstimatorProcessFix(aecm->delay_estimator,
916                                           dfaNoisy,
917                                           PART_LEN1,
918                                           zerosDBufNoisy);
919   if (delay == -1) {
920     return -1;
921   }
922   else if (delay == -2) {
923     // If the delay is unknown, we assume zero.
924     // NOTE: this will have to be adjusted if we ever add lookahead.
925     delay = 0;
926   }
927 
928   if (aecm->fixedDelay >= 0) {
929     // Use fixed delay
930     delay = aecm->fixedDelay;
931   }
932 
933   // Get aligned far end spectrum
934   far_spectrum_ptr = WebRtcAecm_AlignedFarend(aecm, &far_q, delay);
935   zerosXBuf = (int16_t) far_q;
936 
937   if (far_spectrum_ptr == NULL) {
938     return -1;
939   }
940 
941   // Calculate log(energy) and update energy threshold levels
942   WebRtcAecm_CalcEnergies(aecm,
943                           far_spectrum_ptr,
944                           zerosXBuf,
945                           dfaNoisySum,
946                           echoEst32);
947   // Calculate stepsize
948   mu = WebRtcAecm_CalcStepSize(aecm);
949 
950   // Update counters
951   aecm->totCount++;
952 
953   // This is the channel estimation algorithm.
954   // It is base on NLMS but has a variable step length,
955   // which was calculated above.
956   WebRtcAecm_UpdateChannel(aecm,
957                            far_spectrum_ptr,
958                            zerosXBuf,
959                            dfaNoisy,
960                            mu,
961                            echoEst32);
962 
963   supGain = WebRtcAecm_CalcSuppressionGain(aecm);
964 
965   // Calculate Wiener filter hnl[]
966   for (i = 0; i < PART_LEN1; i++) {
967     // Far end signal through channel estimate in Q8
968     // How much can we shift right to preserve resolution
969     tmp32no1 = echoEst32[i] - aecm->echoFilt[i];
970     aecm->echoFilt[i] +=
971         rtc::dchecked_cast<int32_t>((int64_t{tmp32no1} * 50) >> 8);
972 
973     zeros32 = WebRtcSpl_NormW32(aecm->echoFilt[i]) + 1;
974     zeros16 = WebRtcSpl_NormW16(supGain) + 1;
975     if (zeros32 + zeros16 > 16) {
976       // Multiplication is safe
977       // Result in
978       // Q(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN+aecm->xfaQDomainBuf[diff])
979       echoEst32Gained = WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i],
980                                               (uint16_t)supGain);
981       resolutionDiff = 14 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
982       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
983     } else {
984       tmp16no1 = 17 - zeros32 - zeros16;
985       resolutionDiff = 14 + tmp16no1 - RESOLUTION_CHANNEL16 -
986                        RESOLUTION_SUPGAIN;
987       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
988       if (zeros32 > tmp16no1) {
989         echoEst32Gained = WEBRTC_SPL_UMUL_32_16(
990                             (uint32_t)aecm->echoFilt[i],
991                             supGain >> tmp16no1);
992       } else {
993         // Result in Q-(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN-16)
994         echoEst32Gained = (aecm->echoFilt[i] >> tmp16no1) * supGain;
995       }
996     }
997 
998     zeros16 = WebRtcSpl_NormW16(aecm->nearFilt[i]);
999     RTC_DCHECK_GE(zeros16, 0);  // |zeros16| is a norm, hence non-negative.
1000     dfa_clean_q_domain_diff = aecm->dfaCleanQDomain - aecm->dfaCleanQDomainOld;
1001     if (zeros16 < dfa_clean_q_domain_diff && aecm->nearFilt[i]) {
1002       tmp16no1 = aecm->nearFilt[i] << zeros16;
1003       qDomainDiff = zeros16 - dfa_clean_q_domain_diff;
1004       tmp16no2 = ptrDfaClean[i] >> -qDomainDiff;
1005     } else {
1006       tmp16no1 = dfa_clean_q_domain_diff < 0
1007           ? aecm->nearFilt[i] >> -dfa_clean_q_domain_diff
1008           : aecm->nearFilt[i] << dfa_clean_q_domain_diff;
1009       qDomainDiff = 0;
1010       tmp16no2 = ptrDfaClean[i];
1011     }
1012 
1013     tmp32no1 = (int32_t)(tmp16no2 - tmp16no1);
1014     tmp16no2 = (int16_t)(tmp32no1 >> 4);
1015     tmp16no2 += tmp16no1;
1016     zeros16 = WebRtcSpl_NormW16(tmp16no2);
1017     if ((tmp16no2) & (-qDomainDiff > zeros16)) {
1018       aecm->nearFilt[i] = WEBRTC_SPL_WORD16_MAX;
1019     } else {
1020       aecm->nearFilt[i] = qDomainDiff < 0 ? tmp16no2 << -qDomainDiff
1021                                           : tmp16no2 >> qDomainDiff;
1022     }
1023 
1024     // Wiener filter coefficients, resulting hnl in Q14
1025     if (echoEst32Gained == 0) {
1026       hnl[i] = ONE_Q14;
1027       numPosCoef++;
1028     } else if (aecm->nearFilt[i] == 0) {
1029       hnl[i] = 0;
1030     } else {
1031       // Multiply the suppression gain
1032       // Rounding
1033       echoEst32Gained += (uint32_t)(aecm->nearFilt[i] >> 1);
1034       tmpU32 = WebRtcSpl_DivU32U16(echoEst32Gained,
1035                                    (uint16_t)aecm->nearFilt[i]);
1036 
1037       // Current resolution is
1038       // Q-(RESOLUTION_CHANNEL + RESOLUTION_SUPGAIN
1039       //    - max(0, 17 - zeros16 - zeros32))
1040       // Make sure we are in Q14
1041       tmp32no1 = (int32_t)WEBRTC_SPL_SHIFT_W32(tmpU32, resolutionDiff);
1042       if (tmp32no1 > ONE_Q14) {
1043         hnl[i] = 0;
1044       } else if (tmp32no1 < 0) {
1045         hnl[i] = ONE_Q14;
1046         numPosCoef++;
1047       } else {
1048         // 1-echoEst/dfa
1049         hnl[i] = ONE_Q14 - (int16_t)tmp32no1;
1050         if (hnl[i] <= 0) {
1051           hnl[i] = 0;
1052         } else {
1053           numPosCoef++;
1054         }
1055       }
1056     }
1057   }
1058 
1059   // Only in wideband. Prevent the gain in upper band from being larger than
1060   // in lower band.
1061   if (aecm->mult == 2) {
1062     // TODO(bjornv): Investigate if the scaling of hnl[i] below can cause
1063     //               speech distortion in double-talk.
1064     for (i = 0; i < (PART_LEN1 >> 3); i++) {
1065       __asm __volatile (
1066         "lh         %[temp1],       0(%[ptr1])                  \n\t"
1067         "lh         %[temp2],       2(%[ptr1])                  \n\t"
1068         "lh         %[temp3],       4(%[ptr1])                  \n\t"
1069         "lh         %[temp4],       6(%[ptr1])                  \n\t"
1070         "lh         %[temp5],       8(%[ptr1])                  \n\t"
1071         "lh         %[temp6],       10(%[ptr1])                 \n\t"
1072         "lh         %[temp7],       12(%[ptr1])                 \n\t"
1073         "lh         %[temp8],       14(%[ptr1])                 \n\t"
1074         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1075         "mul        %[temp2],       %[temp2],       %[temp2]    \n\t"
1076         "mul        %[temp3],       %[temp3],       %[temp3]    \n\t"
1077         "mul        %[temp4],       %[temp4],       %[temp4]    \n\t"
1078         "mul        %[temp5],       %[temp5],       %[temp5]    \n\t"
1079         "mul        %[temp6],       %[temp6],       %[temp6]    \n\t"
1080         "mul        %[temp7],       %[temp7],       %[temp7]    \n\t"
1081         "mul        %[temp8],       %[temp8],       %[temp8]    \n\t"
1082         "sra        %[temp1],       %[temp1],       14          \n\t"
1083         "sra        %[temp2],       %[temp2],       14          \n\t"
1084         "sra        %[temp3],       %[temp3],       14          \n\t"
1085         "sra        %[temp4],       %[temp4],       14          \n\t"
1086         "sra        %[temp5],       %[temp5],       14          \n\t"
1087         "sra        %[temp6],       %[temp6],       14          \n\t"
1088         "sra        %[temp7],       %[temp7],       14          \n\t"
1089         "sra        %[temp8],       %[temp8],       14          \n\t"
1090         "sh         %[temp1],       0(%[ptr1])                  \n\t"
1091         "sh         %[temp2],       2(%[ptr1])                  \n\t"
1092         "sh         %[temp3],       4(%[ptr1])                  \n\t"
1093         "sh         %[temp4],       6(%[ptr1])                  \n\t"
1094         "sh         %[temp5],       8(%[ptr1])                  \n\t"
1095         "sh         %[temp6],       10(%[ptr1])                 \n\t"
1096         "sh         %[temp7],       12(%[ptr1])                 \n\t"
1097         "sh         %[temp8],       14(%[ptr1])                 \n\t"
1098         "addiu      %[ptr1],        %[ptr1],        16          \n\t"
1099         : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1100           [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [temp6] "=&r" (temp6),
1101           [temp7] "=&r" (temp7), [temp8] "=&r" (temp8), [ptr1] "+r" (ptr1)
1102         :
1103         : "memory", "hi", "lo"
1104       );
1105     }
1106     for(i = 0; i < (PART_LEN1 & 7); i++) {
1107       __asm __volatile (
1108         "lh         %[temp1],       0(%[ptr1])                  \n\t"
1109         "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1110         "sra        %[temp1],       %[temp1],       14          \n\t"
1111         "sh         %[temp1],       0(%[ptr1])                  \n\t"
1112         "addiu      %[ptr1],        %[ptr1],        2           \n\t"
1113         : [temp1] "=&r" (temp1), [ptr1] "+r" (ptr1)
1114         :
1115         : "memory", "hi", "lo"
1116       );
1117     }
1118 
1119     for (i = kMinPrefBand; i <= kMaxPrefBand; i++) {
1120       avgHnl32 += (int32_t)hnl[i];
1121     }
1122 
1123     RTC_DCHECK_GT(kMaxPrefBand - kMinPrefBand + 1, 0);
1124     avgHnl32 /= (kMaxPrefBand - kMinPrefBand + 1);
1125 
1126     for (i = kMaxPrefBand; i < PART_LEN1; i++) {
1127       if (hnl[i] > (int16_t)avgHnl32) {
1128         hnl[i] = (int16_t)avgHnl32;
1129       }
1130     }
1131   }
1132 
1133   // Calculate NLP gain, result is in Q14
1134   if (aecm->nlpFlag) {
1135     if (numPosCoef < 3) {
1136       for (i = 0; i < PART_LEN1; i++) {
1137         efw[i].real = 0;
1138         efw[i].imag = 0;
1139         hnl[i] = 0;
1140       }
1141     } else {
1142       for (i = 0; i < PART_LEN1; i++) {
1143 #if defined(MIPS_DSP_R1_LE)
1144         __asm __volatile (
1145           ".set       push                                        \n\t"
1146           ".set       noreorder                                   \n\t"
1147           "lh         %[temp1],       0(%[ptr])                   \n\t"
1148           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1149           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1150           "beqz       %[temp4],       3f                          \n\t"
1151           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1152           "slti       %[temp5],       %[temp1],       3277        \n\t"
1153           "bnez       %[temp5],       2f                          \n\t"
1154           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1155           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1156           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1157           "shra_r.w   %[temp2],       %[temp2],       14          \n\t"
1158           "shra_r.w   %[temp3],       %[temp3],       14          \n\t"
1159           "b          4f                                          \n\t"
1160           " nop                                                   \n\t"
1161          "2:                                                      \n\t"
1162           "addu       %[temp1],       $zero,          $zero       \n\t"
1163           "addu       %[temp2],       $zero,          $zero       \n\t"
1164           "addu       %[temp3],       $zero,          $zero       \n\t"
1165           "b          1f                                          \n\t"
1166           " nop                                                   \n\t"
1167          "3:                                                      \n\t"
1168           "addiu      %[temp1],       $0,             0x4000      \n\t"
1169          "1:                                                      \n\t"
1170           "sh         %[temp1],       0(%[ptr])                   \n\t"
1171          "4:                                                      \n\t"
1172           "sh         %[temp2],       0(%[er_ptr])                \n\t"
1173           "sh         %[temp3],       2(%[er_ptr])                \n\t"
1174           "addiu      %[ptr],         %[ptr],         2           \n\t"
1175           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1176           ".set       pop                                         \n\t"
1177           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1178             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1179             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1180           :
1181           : "memory", "hi", "lo"
1182         );
1183 #else
1184         __asm __volatile (
1185           ".set       push                                        \n\t"
1186           ".set       noreorder                                   \n\t"
1187           "lh         %[temp1],       0(%[ptr])                   \n\t"
1188           "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1189           "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1190           "beqz       %[temp4],       3f                          \n\t"
1191           " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1192           "slti       %[temp5],       %[temp1],       3277        \n\t"
1193           "bnez       %[temp5],       2f                          \n\t"
1194           " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1195           "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1196           "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1197           "addiu      %[temp2],       %[temp2],       0x2000      \n\t"
1198           "addiu      %[temp3],       %[temp3],       0x2000      \n\t"
1199           "sra        %[temp2],       %[temp2],       14          \n\t"
1200           "sra        %[temp3],       %[temp3],       14          \n\t"
1201           "b          4f                                          \n\t"
1202           " nop                                                   \n\t"
1203          "2:                                                      \n\t"
1204           "addu       %[temp1],       $zero,          $zero       \n\t"
1205           "addu       %[temp2],       $zero,          $zero       \n\t"
1206           "addu       %[temp3],       $zero,          $zero       \n\t"
1207           "b          1f                                          \n\t"
1208           " nop                                                   \n\t"
1209          "3:                                                      \n\t"
1210           "addiu      %[temp1],       $0,             0x4000      \n\t"
1211          "1:                                                      \n\t"
1212           "sh         %[temp1],       0(%[ptr])                   \n\t"
1213          "4:                                                      \n\t"
1214           "sh         %[temp2],       0(%[er_ptr])                \n\t"
1215           "sh         %[temp3],       2(%[er_ptr])                \n\t"
1216           "addiu      %[ptr],         %[ptr],         2           \n\t"
1217           "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1218           ".set       pop                                         \n\t"
1219           : [temp1] "=&r" (temp1), [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
1220             [temp4] "=&r" (temp4), [temp5] "=&r" (temp5), [ptr] "+r" (ptr),
1221             [er_ptr] "+r" (er_ptr), [dr_ptr] "+r" (dr_ptr)
1222           :
1223           : "memory", "hi", "lo"
1224         );
1225 #endif
1226       }
1227     }
1228   }
1229   else {
1230     // multiply with Wiener coefficients
1231     for (i = 0; i < PART_LEN1; i++) {
1232       efw[i].real = (int16_t)
1233                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real,
1234                                                             hnl[i],
1235                                                             14));
1236       efw[i].imag = (int16_t)
1237                       (WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag,
1238                                                             hnl[i],
1239                                                             14));
1240     }
1241   }
1242 
1243   if (aecm->cngMode == AecmTrue) {
1244     ComfortNoise(aecm, ptrDfaClean, efw, hnl);
1245   }
1246 
1247   InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
1248 
1249   return 0;
1250 }
1251 
1252 // Generate comfort noise and add to output signal.
ComfortNoise(AecmCore * aecm,const uint16_t * dfa,ComplexInt16 * out,const int16_t * lambda)1253 static void ComfortNoise(AecmCore* aecm,
1254                          const uint16_t* dfa,
1255                          ComplexInt16* out,
1256                          const int16_t* lambda) {
1257   int16_t i;
1258   int16_t tmp16, tmp161, tmp162, tmp163, nrsh1, nrsh2;
1259   int32_t tmp32, tmp321, tnoise, tnoise1;
1260   int32_t tmp322, tmp323, *tmp1;
1261   int16_t* dfap;
1262   int16_t* lambdap;
1263   const int32_t c2049 = 2049;
1264   const int32_t c359 = 359;
1265   const int32_t c114 = ONE_Q14;
1266 
1267   int16_t randW16[PART_LEN];
1268   int16_t uReal[PART_LEN1];
1269   int16_t uImag[PART_LEN1];
1270   int32_t outLShift32;
1271 
1272   int16_t shiftFromNearToNoise = kNoiseEstQDomain - aecm->dfaCleanQDomain;
1273   int16_t minTrackShift = 9;
1274 
1275   RTC_DCHECK_GE(shiftFromNearToNoise, 0);
1276   RTC_DCHECK_LT(shiftFromNearToNoise, 16);
1277 
1278   if (aecm->noiseEstCtr < 100) {
1279     // Track the minimum more quickly initially.
1280     aecm->noiseEstCtr++;
1281     minTrackShift = 6;
1282   }
1283 
1284   // Generate a uniform random array on [0 2^15-1].
1285   WebRtcSpl_RandUArray(randW16, PART_LEN, &aecm->seed);
1286   int16_t* randW16p = (int16_t*)randW16;
1287 #if defined (MIPS_DSP_R1_LE)
1288   int16_t* kCosTablep = (int16_t*)WebRtcAecm_kCosTable;
1289   int16_t* kSinTablep = (int16_t*)WebRtcAecm_kSinTable;
1290 #endif   // #if defined(MIPS_DSP_R1_LE)
1291   tmp1 = (int32_t*)aecm->noiseEst + 1;
1292   dfap = (int16_t*)dfa + 1;
1293   lambdap = (int16_t*)lambda + 1;
1294   // Estimate noise power.
1295   for (i = 1; i < PART_LEN1; i+=2) {
1296   // Shift to the noise domain.
1297     __asm __volatile (
1298       "lh     %[tmp32],       0(%[dfap])                              \n\t"
1299       "lw     %[tnoise],      0(%[tmp1])                              \n\t"
1300       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1301       : [tmp32] "=&r" (tmp32), [outLShift32] "=r" (outLShift32),
1302         [tnoise] "=&r" (tnoise)
1303       : [tmp1] "r" (tmp1), [dfap] "r" (dfap),
1304         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1305       : "memory"
1306     );
1307 
1308     if (outLShift32 < tnoise) {
1309       // Reset "too low" counter
1310       aecm->noiseEstTooLowCtr[i] = 0;
1311       // Track the minimum.
1312       if (tnoise < (1 << minTrackShift)) {
1313         // For small values, decrease noiseEst[i] every
1314         // |kNoiseEstIncCount| block. The regular approach below can not
1315         // go further down due to truncation.
1316         aecm->noiseEstTooHighCtr[i]++;
1317         if (aecm->noiseEstTooHighCtr[i] >= kNoiseEstIncCount) {
1318           tnoise--;
1319           aecm->noiseEstTooHighCtr[i] = 0;  // Reset the counter
1320         }
1321       } else {
1322         __asm __volatile (
1323           "subu   %[tmp32],       %[tnoise],      %[outLShift32]      \n\t"
1324           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1325           "subu   %[tnoise],      %[tnoise],      %[tmp32]            \n\t"
1326           : [tmp32] "=&r" (tmp32), [tnoise] "+r" (tnoise)
1327           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1328         );
1329       }
1330     } else {
1331       // Reset "too high" counter
1332       aecm->noiseEstTooHighCtr[i] = 0;
1333       // Ramp slowly upwards until we hit the minimum again.
1334       if ((tnoise >> 19) <= 0) {
1335         if ((tnoise >> 11) > 0) {
1336           // Large enough for relative increase
1337           __asm __volatile (
1338             "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1339             "sra    %[tnoise],  %[tnoise],  11          \n\t"
1340             : [tnoise] "+r" (tnoise)
1341             : [c2049] "r" (c2049)
1342             : "hi", "lo"
1343           );
1344         } else {
1345           // Make incremental increases based on size every
1346           // |kNoiseEstIncCount| block
1347           aecm->noiseEstTooLowCtr[i]++;
1348           if (aecm->noiseEstTooLowCtr[i] >= kNoiseEstIncCount) {
1349             __asm __volatile (
1350               "sra    %[tmp32],   %[tnoise],  9           \n\t"
1351               "addi   %[tnoise],  %[tnoise],  1           \n\t"
1352               "addu   %[tnoise],  %[tnoise],  %[tmp32]    \n\t"
1353               : [tnoise] "+r" (tnoise), [tmp32] "=&r" (tmp32)
1354               :
1355             );
1356             aecm->noiseEstTooLowCtr[i] = 0; // Reset counter
1357           }
1358         }
1359       } else {
1360         // Avoid overflow.
1361         // Multiplication with 2049 will cause wrap around. Scale
1362         // down first and then multiply
1363         __asm __volatile (
1364           "sra    %[tnoise],  %[tnoise],  11          \n\t"
1365           "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1366           : [tnoise] "+r" (tnoise)
1367           : [c2049] "r" (c2049)
1368           : "hi", "lo"
1369         );
1370       }
1371     }
1372 
1373     // Shift to the noise domain.
1374     __asm __volatile (
1375       "lh     %[tmp32],       2(%[dfap])                              \n\t"
1376       "lw     %[tnoise1],     4(%[tmp1])                              \n\t"
1377       "addiu  %[dfap],        %[dfap],    4                           \n\t"
1378       "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1379       : [tmp32] "=&r" (tmp32), [dfap] "+r" (dfap),
1380         [outLShift32] "=r" (outLShift32), [tnoise1] "=&r" (tnoise1)
1381       : [tmp1] "r" (tmp1), [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1382       : "memory"
1383     );
1384 
1385     if (outLShift32 < tnoise1) {
1386       // Reset "too low" counter
1387       aecm->noiseEstTooLowCtr[i + 1] = 0;
1388       // Track the minimum.
1389       if (tnoise1 < (1 << minTrackShift)) {
1390         // For small values, decrease noiseEst[i] every
1391         // |kNoiseEstIncCount| block. The regular approach below can not
1392         // go further down due to truncation.
1393         aecm->noiseEstTooHighCtr[i + 1]++;
1394         if (aecm->noiseEstTooHighCtr[i + 1] >= kNoiseEstIncCount) {
1395           tnoise1--;
1396           aecm->noiseEstTooHighCtr[i + 1] = 0; // Reset the counter
1397         }
1398       } else {
1399         __asm __volatile (
1400           "subu   %[tmp32],       %[tnoise1],     %[outLShift32]      \n\t"
1401           "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1402           "subu   %[tnoise1],     %[tnoise1],     %[tmp32]            \n\t"
1403           : [tmp32] "=&r" (tmp32), [tnoise1] "+r" (tnoise1)
1404           : [outLShift32] "r" (outLShift32), [minTrackShift] "r" (minTrackShift)
1405         );
1406       }
1407     } else {
1408       // Reset "too high" counter
1409       aecm->noiseEstTooHighCtr[i + 1] = 0;
1410       // Ramp slowly upwards until we hit the minimum again.
1411       if ((tnoise1 >> 19) <= 0) {
1412         if ((tnoise1 >> 11) > 0) {
1413           // Large enough for relative increase
1414           __asm __volatile (
1415             "mul    %[tnoise1], %[tnoise1], %[c2049]   \n\t"
1416             "sra    %[tnoise1], %[tnoise1], 11         \n\t"
1417             : [tnoise1] "+r" (tnoise1)
1418             : [c2049] "r" (c2049)
1419             : "hi", "lo"
1420           );
1421         } else {
1422           // Make incremental increases based on size every
1423           // |kNoiseEstIncCount| block
1424           aecm->noiseEstTooLowCtr[i + 1]++;
1425           if (aecm->noiseEstTooLowCtr[i + 1] >= kNoiseEstIncCount) {
1426             __asm __volatile (
1427               "sra    %[tmp32],   %[tnoise1], 9           \n\t"
1428               "addi   %[tnoise1], %[tnoise1], 1           \n\t"
1429               "addu   %[tnoise1], %[tnoise1], %[tmp32]    \n\t"
1430               : [tnoise1] "+r" (tnoise1), [tmp32] "=&r" (tmp32)
1431               :
1432             );
1433             aecm->noiseEstTooLowCtr[i + 1] = 0; // Reset counter
1434           }
1435         }
1436       } else {
1437         // Avoid overflow.
1438         // Multiplication with 2049 will cause wrap around. Scale
1439         // down first and then multiply
1440         __asm __volatile (
1441           "sra    %[tnoise1], %[tnoise1], 11          \n\t"
1442           "mul    %[tnoise1], %[tnoise1], %[c2049]    \n\t"
1443           : [tnoise1] "+r" (tnoise1)
1444           : [c2049] "r" (c2049)
1445           : "hi", "lo"
1446         );
1447       }
1448     }
1449 
1450     __asm __volatile (
1451       "lh     %[tmp16],   0(%[lambdap])                           \n\t"
1452       "lh     %[tmp161],  2(%[lambdap])                           \n\t"
1453       "sw     %[tnoise],  0(%[tmp1])                              \n\t"
1454       "sw     %[tnoise1], 4(%[tmp1])                              \n\t"
1455       "subu   %[tmp16],   %[c114],        %[tmp16]                \n\t"
1456       "subu   %[tmp161],  %[c114],        %[tmp161]               \n\t"
1457       "srav   %[tmp32],   %[tnoise],      %[shiftFromNearToNoise] \n\t"
1458       "srav   %[tmp321],  %[tnoise1],     %[shiftFromNearToNoise] \n\t"
1459       "addiu  %[lambdap], %[lambdap],     4                       \n\t"
1460       "addiu  %[tmp1],    %[tmp1],        8                       \n\t"
1461       : [tmp16] "=&r" (tmp16), [tmp161] "=&r" (tmp161), [tmp1] "+r" (tmp1),
1462         [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321), [lambdap] "+r" (lambdap)
1463       : [tnoise] "r" (tnoise), [tnoise1] "r" (tnoise1), [c114] "r" (c114),
1464         [shiftFromNearToNoise] "r" (shiftFromNearToNoise)
1465       : "memory"
1466     );
1467 
1468     if (tmp32 > 32767) {
1469       tmp32 = 32767;
1470       aecm->noiseEst[i] = tmp32 << shiftFromNearToNoise;
1471     }
1472     if (tmp321 > 32767) {
1473       tmp321 = 32767;
1474       aecm->noiseEst[i+1] = tmp321 << shiftFromNearToNoise;
1475     }
1476 
1477     __asm __volatile (
1478       "mul    %[tmp32],   %[tmp32],       %[tmp16]                \n\t"
1479       "mul    %[tmp321],  %[tmp321],      %[tmp161]               \n\t"
1480       "sra    %[nrsh1],   %[tmp32],       14                      \n\t"
1481       "sra    %[nrsh2],   %[tmp321],      14                      \n\t"
1482       : [nrsh1] "=&r" (nrsh1), [nrsh2] "=r" (nrsh2)
1483       : [tmp16] "r" (tmp16), [tmp161] "r" (tmp161), [tmp32] "r" (tmp32),
1484         [tmp321] "r" (tmp321)
1485       : "memory", "hi", "lo"
1486     );
1487 
1488     __asm __volatile (
1489       "lh     %[tmp32],       0(%[randW16p])              \n\t"
1490       "lh     %[tmp321],      2(%[randW16p])              \n\t"
1491       "addiu  %[randW16p],    %[randW16p],    4           \n\t"
1492       "mul    %[tmp32],       %[tmp32],       %[c359]     \n\t"
1493       "mul    %[tmp321],      %[tmp321],      %[c359]     \n\t"
1494       "sra    %[tmp16],       %[tmp32],       15          \n\t"
1495       "sra    %[tmp161],      %[tmp321],      15          \n\t"
1496       : [randW16p] "+r" (randW16p), [tmp32] "=&r" (tmp32),
1497         [tmp16] "=r" (tmp16), [tmp161] "=r" (tmp161), [tmp321] "=&r" (tmp321)
1498       : [c359] "r" (c359)
1499       : "memory", "hi", "lo"
1500     );
1501 
1502 #if !defined(MIPS_DSP_R1_LE)
1503     tmp32 = WebRtcAecm_kCosTable[tmp16];
1504     tmp321 = WebRtcAecm_kSinTable[tmp16];
1505     tmp322 = WebRtcAecm_kCosTable[tmp161];
1506     tmp323 = WebRtcAecm_kSinTable[tmp161];
1507 #else
1508     __asm __volatile (
1509       "sll    %[tmp16],       %[tmp16],                   1           \n\t"
1510       "sll    %[tmp161],      %[tmp161],                  1           \n\t"
1511       "lhx    %[tmp32],       %[tmp16](%[kCosTablep])                 \n\t"
1512       "lhx    %[tmp321],      %[tmp16](%[kSinTablep])                 \n\t"
1513       "lhx    %[tmp322],      %[tmp161](%[kCosTablep])                \n\t"
1514       "lhx    %[tmp323],      %[tmp161](%[kSinTablep])                \n\t"
1515       : [tmp32] "=&r" (tmp32), [tmp321] "=&r" (tmp321),
1516         [tmp322] "=&r" (tmp322), [tmp323] "=&r" (tmp323)
1517       : [kCosTablep] "r" (kCosTablep), [tmp16] "r" (tmp16),
1518         [tmp161] "r" (tmp161), [kSinTablep] "r" (kSinTablep)
1519       : "memory"
1520     );
1521 #endif
1522     __asm __volatile (
1523       "mul    %[tmp32],       %[tmp32],                   %[nrsh1]    \n\t"
1524       "negu   %[tmp162],      %[nrsh1]                                \n\t"
1525       "mul    %[tmp322],      %[tmp322],                  %[nrsh2]    \n\t"
1526       "negu   %[tmp163],      %[nrsh2]                                \n\t"
1527       "sra    %[tmp32],       %[tmp32],                   13          \n\t"
1528       "mul    %[tmp321],      %[tmp321],                  %[tmp162]   \n\t"
1529       "sra    %[tmp322],      %[tmp322],                  13          \n\t"
1530       "mul    %[tmp323],      %[tmp323],                  %[tmp163]   \n\t"
1531       "sra    %[tmp321],      %[tmp321],                  13          \n\t"
1532       "sra    %[tmp323],      %[tmp323],                  13          \n\t"
1533       : [tmp32] "+r" (tmp32), [tmp321] "+r" (tmp321), [tmp162] "=&r" (tmp162),
1534         [tmp322] "+r" (tmp322), [tmp323] "+r" (tmp323), [tmp163] "=&r" (tmp163)
1535       : [nrsh1] "r" (nrsh1), [nrsh2] "r" (nrsh2)
1536       : "hi", "lo"
1537     );
1538     // Tables are in Q13.
1539     uReal[i] = (int16_t)tmp32;
1540     uImag[i] = (int16_t)tmp321;
1541     uReal[i + 1] = (int16_t)tmp322;
1542     uImag[i + 1] = (int16_t)tmp323;
1543   }
1544 
1545   int32_t tt, sgn;
1546   tt = out[0].real;
1547   sgn = ((int)tt) >> 31;
1548   out[0].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1549   tt = out[0].imag;
1550   sgn = ((int)tt) >> 31;
1551   out[0].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1552   for (i = 1; i < PART_LEN; i++) {
1553     tt = out[i].real + uReal[i];
1554     sgn = ((int)tt) >> 31;
1555     out[i].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1556     tt = out[i].imag + uImag[i];
1557     sgn = ((int)tt) >> 31;
1558     out[i].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1559   }
1560   tt = out[PART_LEN].real + uReal[PART_LEN];
1561   sgn = ((int)tt) >> 31;
1562   out[PART_LEN].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1563   tt = out[PART_LEN].imag;
1564   sgn = ((int)tt) >> 31;
1565   out[PART_LEN].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1566 }
1567