1 /*
2  *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/aecm/aecm_core.h"
12 #include "modules/audio_processing/aecm/echo_control_mobile.h"
13 #include "modules/audio_processing/utility/delay_estimator_wrapper.h"
14 #include "rtc_base/checks.h"
15 #include "rtc_base/numerics/safe_conversions.h"
16 
17 namespace webrtc {
18 
19 namespace {
20 
// Rising half of the square-root Hanning window: 65 fixed-point coefficients
// climbing from 0 to 16384 (1 << 14). The windowing code in this file
// multiplies samples by these values; InverseFFTAndWindow() then shifts the
// product right by 14, so the coefficients act as Q14 fractions. The falling
// half of the window is obtained by walking the table backwards from its last
// entry.
21 static const ALIGN8_BEG int16_t WebRtcAecm_kSqrtHanning[] ALIGN8_END = {
22     0,     399,   798,   1196,  1594,  1990,  2386,  2780,  3172,  3562,  3951,
23     4337,  4720,  5101,  5478,  5853,  6224,  6591,  6954,  7313,  7668,  8019,
24     8364,  8705,  9040,  9370,  9695,  10013, 10326, 10633, 10933, 11227, 11514,
25     11795, 12068, 12335, 12594, 12845, 13089, 13325, 13553, 13773, 13985, 14189,
26     14384, 14571, 14749, 14918, 15079, 15231, 15373, 15506, 15631, 15746, 15851,
27     15947, 16034, 16111, 16179, 16237, 16286, 16325, 16354, 16373, 16384};
28 
// Noise-estimate tuning constants. Neither is referenced in the part of the
// file reviewed here.
// NOTE(review): presumably consumed by ComfortNoise() - confirm against its
// definition.
// Judging by the name, the Q-domain (number of fractional bits) of the noise
// estimate - TODO confirm.
29 static const int16_t kNoiseEstQDomain = 15;
// Judging by the name, a block count that paces upward adjustments of the
// noise estimate - TODO confirm.
30 static const int16_t kNoiseEstIncCount = 5;
31 
// Scatter table used by WindowAndFFT(): 128 byte offsets into the int16_t FFT
// work buffer, consumed two per loop iteration. For loop index k, entry 2k is
// the offset at which the windowed sample time_signal[k] is stored, and entry
// 2k + 1 is the offset at which the windowed sample time_signal[k + 64] is
// stored. Every offset is a multiple of 4, so only every other int16_t slot
// of the work buffer is written through this table.
// NOTE(review): the permutation looks like the bit-reversed input ordering
// expected by WebRtcSpl_ComplexFFT - confirm.
32 static int16_t coefTable[] = {
33     0,   4,   256, 260, 128, 132, 384, 388, 64,  68,  320, 324, 192, 196, 448,
34     452, 32,  36,  288, 292, 160, 164, 416, 420, 96,  100, 352, 356, 224, 228,
35     480, 484, 16,  20,  272, 276, 144, 148, 400, 404, 80,  84,  336, 340, 208,
36     212, 464, 468, 48,  52,  304, 308, 176, 180, 432, 436, 112, 116, 368, 372,
37     240, 244, 496, 500, 8,   12,  264, 268, 136, 140, 392, 396, 72,  76,  328,
38     332, 200, 204, 456, 460, 40,  44,  296, 300, 168, 172, 424, 428, 104, 108,
39     360, 364, 232, 236, 488, 492, 24,  28,  280, 284, 152, 156, 408, 412, 88,
40     92,  344, 348, 216, 220, 472, 476, 56,  60,  312, 316, 184, 188, 440, 444,
41     120, 124, 376, 380, 248, 252, 504, 508};
42 
// Scatter table used by InverseFFTAndWindow(): 128 byte offsets into the
// int16_t FFT work buffer, consumed two per input bin. For bin k, entry 2k is
// the offset at which (real, -imag) is stored, and entry 2k + 1 is the offset
// at which (real, imag) is stored.
// The largest offset is 512 and two int16_t values are written there, so the
// work buffer handed to InverseFFTAndWindow() must hold at least 258 int16_t
// elements.
43 static int16_t coefTable_ifft[] = {
44     0,   512, 256, 508, 128, 252, 384, 380, 64,  124, 320, 444, 192, 188, 448,
45     316, 32,  60,  288, 476, 160, 220, 416, 348, 96,  92,  352, 412, 224, 156,
46     480, 284, 16,  28,  272, 492, 144, 236, 400, 364, 80,  108, 336, 428, 208,
47     172, 464, 300, 48,  44,  304, 460, 176, 204, 432, 332, 112, 76,  368, 396,
48     240, 140, 496, 268, 8,   12,  264, 500, 136, 244, 392, 372, 72,  116, 328,
49     436, 200, 180, 456, 308, 40,  52,  296, 468, 168, 212, 424, 340, 104, 84,
50     360, 404, 232, 148, 488, 276, 24,  20,  280, 484, 152, 228, 408, 356, 88,
51     100, 344, 420, 216, 164, 472, 292, 56,  36,  312, 452, 184, 196, 440, 324,
52     120, 68,  376, 388, 248, 132, 504, 260};
53 
54 }  // namespace
55 
// Forward declaration only; the definition is not in the part of the file
// reviewed here.
// NOTE(review): judging by the name and parameters, it adds comfort noise to
// the spectrum |out| based on the near-end magnitudes |dfa| and the per-bin
// gains |lambda| - confirm against the definition.
56 static void ComfortNoise(AecmCore* aecm,
57                          const uint16_t* dfa,
58                          ComplexInt16* out,
59                          const int16_t* lambda);
60 
// Windows one block of time-domain samples with the sqrt-Hanning table,
// passes it through WebRtcSpl_ComplexFFT and writes the result, with the
// imaginary parts negated, to |freq_signal|. MIPS inline-assembly
// implementation.
//
//   aecm                - not referenced by this function.
//   fft                 - int16_t work buffer. Its first PART_LEN4 elements
//                         are cleared, the windowed samples are scattered
//                         into it through coefTable, and the transform result
//                         is read back from it.
//   time_signal         - input samples; elements [0, 128) are read.
//   freq_signal         - output; 128 int16_t values (64 real/imag pairs) are
//                         written. Assumes ComplexInt16 is a packed
//                         {int16_t real; int16_t imag} pair - TODO confirm.
//   time_signal_scaling - each windowed product is shifted left by
//                         (time_signal_scaling - 14) bits, or arithmetically
//                         right by the negated amount when that is negative.
//
// Both asm blocks run under ".set noreorder": the instruction written after
// each branch (indented by one extra space inside the string) sits in the
// branch delay slot and executes whether or not the branch is taken.
WindowAndFFT(AecmCore * aecm,int16_t * fft,const int16_t * time_signal,ComplexInt16 * freq_signal,int time_signal_scaling)61 static void WindowAndFFT(AecmCore* aecm,
62                          int16_t* fft,
63                          const int16_t* time_signal,
64                          ComplexInt16* freq_signal,
65                          int time_signal_scaling) {
66   int i, j;
67   int32_t tmp1, tmp2, tmp3, tmp4;
68   int16_t* pfrfi;
69   ComplexInt16* pfreq_signal;
70   int16_t f_coef, s_coef;
71   int32_t load_ptr, store_ptr1, store_ptr2, shift, shift1;
72   int32_t hann, hann1, coefs;
73 
  // Clear the buffer first: the scatter loop below only writes the slots
  // named by coefTable (offsets that are multiples of 4), so the int16_t
  // slots in between keep the zero written here.
74   memset(fft, 0, sizeof(int16_t) * PART_LEN4);
75 
76   // FFT of signal
  // Windowing + scatter, 64 iterations. Iteration k handles two samples:
  //   time_signal[k]      * WebRtcAecm_kSqrtHanning[k]       -> coefTable[2k]
  //   time_signal[k + 64] * WebRtcAecm_kSqrtHanning[64 - k]  -> coefTable[2k+1]
  // (|hann| walks the table forwards, |hann1| starts 128 bytes in and walks
  // backwards). shift = time_signal_scaling - 14 selects one of two otherwise
  // identical loops: label 1 shifts the products left (shift >= 0), label 2
  // shifts them arithmetically right by shift1 = -shift. Only the low 16 bits
  // of each shifted product are stored (sh).
77   __asm __volatile(
78       ".set        push                                                    \n\t"
79       ".set        noreorder                                               \n\t"
80       "addiu       %[shift],          %[time_signal_scaling], -14          \n\t"
81       "addiu       %[i],              $zero,                  64           \n\t"
82       "addiu       %[load_ptr],       %[time_signal],         0            \n\t"
83       "addiu       %[hann],           %[hanning],             0            \n\t"
84       "addiu       %[hann1],          %[hanning],             128          \n\t"
85       "addiu       %[coefs],          %[coefTable],           0            \n\t"
86       "bltz        %[shift],          2f                                   \n\t"
87       " negu       %[shift1],         %[shift]                             \n\t"
      // Loop for shift >= 0 (left shift).
88       "1:                                                                   "
89       "\n\t"
90       "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
91       "lh          %[tmp2],           0(%[hann])                           \n\t"
92       "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
93       "lh          %[tmp4],           0(%[hann1])                          \n\t"
94       "addiu       %[i],              %[i],                   -1           \n\t"
95       "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
96       "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
97       "lh          %[f_coef],         0(%[coefs])                          \n\t"
98       "lh          %[s_coef],         2(%[coefs])                          \n\t"
99       "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
100       "addiu       %[hann],           %[hann],                2            \n\t"
101       "addiu       %[hann1],          %[hann1],               -2           \n\t"
102       "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
103       "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
104       "sllv        %[tmp1],           %[tmp1],                %[shift]     \n\t"
105       "sllv        %[tmp3],           %[tmp3],                %[shift]     \n\t"
106       "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
107       "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
108       "bgtz        %[i],              1b                                   \n\t"
109       " addiu      %[coefs],          %[coefs],               4            \n\t"
110       "b           3f                                                      \n\t"
111       " nop                                                                \n\t"
      // Loop for shift < 0 (arithmetic right shift by shift1).
112       "2:                                                                   "
113       "\n\t"
114       "lh          %[tmp1],           0(%[load_ptr])                       \n\t"
115       "lh          %[tmp2],           0(%[hann])                           \n\t"
116       "lh          %[tmp3],           128(%[load_ptr])                     \n\t"
117       "lh          %[tmp4],           0(%[hann1])                          \n\t"
118       "addiu       %[i],              %[i],                   -1           \n\t"
119       "mul         %[tmp1],           %[tmp1],                %[tmp2]      \n\t"
120       "mul         %[tmp3],           %[tmp3],                %[tmp4]      \n\t"
121       "lh          %[f_coef],         0(%[coefs])                          \n\t"
122       "lh          %[s_coef],         2(%[coefs])                          \n\t"
123       "addiu       %[load_ptr],       %[load_ptr],            2            \n\t"
124       "addiu       %[hann],           %[hann],                2            \n\t"
125       "addiu       %[hann1],          %[hann1],               -2           \n\t"
126       "addu        %[store_ptr1],     %[fft],                 %[f_coef]    \n\t"
127       "addu        %[store_ptr2],     %[fft],                 %[s_coef]    \n\t"
128       "srav        %[tmp1],           %[tmp1],                %[shift1]    \n\t"
129       "srav        %[tmp3],           %[tmp3],                %[shift1]    \n\t"
130       "sh          %[tmp1],           0(%[store_ptr1])                     \n\t"
131       "sh          %[tmp3],           0(%[store_ptr2])                     \n\t"
132       "bgtz        %[i],              2b                                   \n\t"
133       " addiu      %[coefs],          %[coefs],               4            \n\t"
134       "3:                                                                   "
135       "\n\t"
136       ".set        pop                                                     \n\t"
137       : [load_ptr] "=&r"(load_ptr), [shift] "=&r"(shift), [hann] "=&r"(hann),
138         [hann1] "=&r"(hann1), [shift1] "=&r"(shift1), [coefs] "=&r"(coefs),
139         [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
140         [tmp4] "=&r"(tmp4), [i] "=&r"(i), [f_coef] "=&r"(f_coef),
141         [s_coef] "=&r"(s_coef), [store_ptr1] "=&r"(store_ptr1),
142         [store_ptr2] "=&r"(store_ptr2)
143       : [time_signal] "r"(time_signal), [coefTable] "r"(coefTable),
144         [time_signal_scaling] "r"(time_signal_scaling),
145         [hanning] "r"(WebRtcAecm_kSqrtHanning), [fft] "r"(fft)
146       : "memory", "hi", "lo");
147 
  // NOTE(review): WebRtcSpl_ComplexFFT is declared elsewhere; the code below
  // reads its result back from |fft|, so it is presumably an in-place
  // transform - confirm.
148   WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1);
149   pfrfi = fft;
150   pfreq_signal = freq_signal;
151 
  // Copy 128 int16_t values (64 real/imag pairs) from |fft| to |freq_signal|,
  // negating every second value (the imaginary part). The loop is unrolled to
  // move 8 values per iteration; |j| counts the remaining values down from
  // 128 in steps of 8.
152   __asm __volatile(
153       ".set        push                                                     "
154       "\n\t"
155       ".set        noreorder                                                "
156       "\n\t"
157       "addiu       %[j],              $zero,                 128            "
158       "\n\t"
159       "1:                                                                    "
160       "\n\t"
161       "lh          %[tmp1],           0(%[pfrfi])                           "
162       "\n\t"
163       "lh          %[tmp2],           2(%[pfrfi])                           "
164       "\n\t"
165       "lh          %[tmp3],           4(%[pfrfi])                           "
166       "\n\t"
167       "lh          %[tmp4],           6(%[pfrfi])                           "
168       "\n\t"
169       "subu        %[tmp2],           $zero,                 %[tmp2]        "
170       "\n\t"
171       "sh          %[tmp1],           0(%[pfreq_signal])                    "
172       "\n\t"
173       "sh          %[tmp2],           2(%[pfreq_signal])                    "
174       "\n\t"
175       "subu        %[tmp4],           $zero,                 %[tmp4]        "
176       "\n\t"
177       "sh          %[tmp3],           4(%[pfreq_signal])                    "
178       "\n\t"
179       "sh          %[tmp4],           6(%[pfreq_signal])                    "
180       "\n\t"
181       "lh          %[tmp1],           8(%[pfrfi])                           "
182       "\n\t"
183       "lh          %[tmp2],           10(%[pfrfi])                          "
184       "\n\t"
185       "lh          %[tmp3],           12(%[pfrfi])                          "
186       "\n\t"
187       "lh          %[tmp4],           14(%[pfrfi])                          "
188       "\n\t"
189       "addiu       %[j],              %[j],                  -8             "
190       "\n\t"
191       "subu        %[tmp2],           $zero,                 %[tmp2]        "
192       "\n\t"
193       "sh          %[tmp1],           8(%[pfreq_signal])                    "
194       "\n\t"
195       "sh          %[tmp2],           10(%[pfreq_signal])                   "
196       "\n\t"
197       "subu        %[tmp4],           $zero,                 %[tmp4]        "
198       "\n\t"
199       "sh          %[tmp3],           12(%[pfreq_signal])                   "
200       "\n\t"
201       "sh          %[tmp4],           14(%[pfreq_signal])                   "
202       "\n\t"
203       "addiu       %[pfreq_signal],   %[pfreq_signal],       16             "
204       "\n\t"
205       "bgtz        %[j],              1b                                    "
206       "\n\t"
207       " addiu      %[pfrfi],          %[pfrfi],              16             "
208       "\n\t"
209       ".set        pop                                                      "
210       "\n\t"
211       : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
212         [j] "=&r"(j), [pfrfi] "+r"(pfrfi), [pfreq_signal] "+r"(pfreq_signal),
213         [tmp4] "=&r"(tmp4)
214       :
215       : "memory");
216 }
217 
// Rebuilds the FFT work buffer from the spectrum |efw|, runs
// WebRtcSpl_ComplexIFFT on it, windows the result with the sqrt-Hanning
// table, adds the saved tail in aecm->outBuf and writes the finished samples
// to |output|. MIPS inline-assembly implementation.
//
//   aecm         - outBuf is read (added to the first half) and then
//                  overwritten with the windowed second half;
//                  dfaCleanQDomain is read to derive the output shift; the
//                  upper halves of xBuf, dBufNoisy and (when |nearendClean|
//                  is non-NULL) dBufClean are copied down by PART_LEN.
//   fft          - int16_t work buffer. Must hold at least 258 elements,
//                  because coefTable_ifft contains byte offset 512 and two
//                  int16_t values are stored there.
//   efw          - input spectrum; the first 64 real/imag pairs are read by
//                  the asm loop and element PART_LEN by the C code. Assumes
//                  ComplexInt16 is a packed {int16_t real; int16_t imag}
//                  pair - TODO confirm.
//   output       - receives 64 int16_t samples.
//   nearendClean - only compared against NULL here.
//
// All asm blocks run under ".set noreorder": the instruction written after
// each branch (indented by one extra space inside the string) sits in the
// branch delay slot and executes whether or not the branch is taken.
InverseFFTAndWindow(AecmCore * aecm,int16_t * fft,ComplexInt16 * efw,int16_t * output,const int16_t * nearendClean)218 static void InverseFFTAndWindow(AecmCore* aecm,
219                                 int16_t* fft,
220                                 ComplexInt16* efw,
221                                 int16_t* output,
222                                 const int16_t* nearendClean) {
223   int i, outCFFT;
224   int32_t tmp1, tmp2, tmp3, tmp4, tmp_re, tmp_im;
225   int16_t* pcoefTable_ifft = coefTable_ifft;
226   int16_t* pfft = fft;
227   int16_t* ppfft = fft;
228   ComplexInt16* pefw = efw;
229   int32_t out_aecm;
230   int16_t* paecm_buf = aecm->outBuf;
231   const int16_t* p_kSqrtHanning = WebRtcAecm_kSqrtHanning;
232   const int16_t* pp_kSqrtHanning = &WebRtcAecm_kSqrtHanning[PART_LEN];
233   int16_t* output1 = output;
234 
  // Scatter the spectrum into |fft|. For each of the first 64 bins, two
  // offsets are read from coefTable_ifft: the bin is stored unchanged at the
  // second offset and with its imaginary part negated at the first offset.
  // The loop is unrolled four bins per iteration; |i| counts down from 64 in
  // steps of 4.
235   __asm __volatile(
236       ".set      push                                                        "
237       "\n\t"
238       ".set      noreorder                                                   "
239       "\n\t"
240       "addiu     %[i],                $zero,                   64            "
241       "\n\t"
242       "1:                                                                     "
243       "\n\t"
244       "lh        %[tmp1],             0(%[pcoefTable_ifft])                  "
245       "\n\t"
246       "lh        %[tmp2],             2(%[pcoefTable_ifft])                  "
247       "\n\t"
248       "lh        %[tmp_re],           0(%[pefw])                             "
249       "\n\t"
250       "lh        %[tmp_im],           2(%[pefw])                             "
251       "\n\t"
252       "addu      %[pfft],             %[fft],                  %[tmp2]       "
253       "\n\t"
254       "sh        %[tmp_re],           0(%[pfft])                             "
255       "\n\t"
256       "sh        %[tmp_im],           2(%[pfft])                             "
257       "\n\t"
258       "addu      %[pfft],             %[fft],                  %[tmp1]       "
259       "\n\t"
260       "sh        %[tmp_re],           0(%[pfft])                             "
261       "\n\t"
262       "subu      %[tmp_im],           $zero,                   %[tmp_im]     "
263       "\n\t"
264       "sh        %[tmp_im],           2(%[pfft])                             "
265       "\n\t"
266       "lh        %[tmp1],             4(%[pcoefTable_ifft])                  "
267       "\n\t"
268       "lh        %[tmp2],             6(%[pcoefTable_ifft])                  "
269       "\n\t"
270       "lh        %[tmp_re],           4(%[pefw])                             "
271       "\n\t"
272       "lh        %[tmp_im],           6(%[pefw])                             "
273       "\n\t"
274       "addu      %[pfft],             %[fft],                  %[tmp2]       "
275       "\n\t"
276       "sh        %[tmp_re],           0(%[pfft])                             "
277       "\n\t"
278       "sh        %[tmp_im],           2(%[pfft])                             "
279       "\n\t"
280       "addu      %[pfft],             %[fft],                  %[tmp1]       "
281       "\n\t"
282       "sh        %[tmp_re],           0(%[pfft])                             "
283       "\n\t"
284       "subu      %[tmp_im],           $zero,                   %[tmp_im]     "
285       "\n\t"
286       "sh        %[tmp_im],           2(%[pfft])                             "
287       "\n\t"
288       "lh        %[tmp1],             8(%[pcoefTable_ifft])                  "
289       "\n\t"
290       "lh        %[tmp2],             10(%[pcoefTable_ifft])                 "
291       "\n\t"
292       "lh        %[tmp_re],           8(%[pefw])                             "
293       "\n\t"
294       "lh        %[tmp_im],           10(%[pefw])                            "
295       "\n\t"
296       "addu      %[pfft],             %[fft],                  %[tmp2]       "
297       "\n\t"
298       "sh        %[tmp_re],           0(%[pfft])                             "
299       "\n\t"
300       "sh        %[tmp_im],           2(%[pfft])                             "
301       "\n\t"
302       "addu      %[pfft],             %[fft],                  %[tmp1]       "
303       "\n\t"
304       "sh        %[tmp_re],           0(%[pfft])                             "
305       "\n\t"
306       "subu      %[tmp_im],           $zero,                   %[tmp_im]     "
307       "\n\t"
308       "sh        %[tmp_im],           2(%[pfft])                             "
309       "\n\t"
310       "lh        %[tmp1],             12(%[pcoefTable_ifft])                 "
311       "\n\t"
312       "lh        %[tmp2],             14(%[pcoefTable_ifft])                 "
313       "\n\t"
314       "lh        %[tmp_re],           12(%[pefw])                            "
315       "\n\t"
316       "lh        %[tmp_im],           14(%[pefw])                            "
317       "\n\t"
318       "addu      %[pfft],             %[fft],                  %[tmp2]       "
319       "\n\t"
320       "sh        %[tmp_re],           0(%[pfft])                             "
321       "\n\t"
322       "sh        %[tmp_im],           2(%[pfft])                             "
323       "\n\t"
324       "addu      %[pfft],             %[fft],                  %[tmp1]       "
325       "\n\t"
326       "sh        %[tmp_re],           0(%[pfft])                             "
327       "\n\t"
328       "subu      %[tmp_im],           $zero,                   %[tmp_im]     "
329       "\n\t"
330       "sh        %[tmp_im],           2(%[pfft])                             "
331       "\n\t"
332       "addiu     %[pcoefTable_ifft],  %[pcoefTable_ifft],      16            "
333       "\n\t"
334       "addiu     %[i],                %[i],                    -4            "
335       "\n\t"
336       "bgtz      %[i],                1b                                     "
337       "\n\t"
338       " addiu    %[pefw],             %[pefw],                 16            "
339       "\n\t"
340       ".set      pop                                                         "
341       "\n\t"
      // NOTE(review): %[fft] is only read by the template above, yet it is
      // declared as a read-write ("+r") operand.
342       : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [pfft] "+r"(pfft), [i] "=&r"(i),
343         [tmp_re] "=&r"(tmp_re), [tmp_im] "=&r"(tmp_im), [pefw] "+r"(pefw),
344         [pcoefTable_ifft] "+r"(pcoefTable_ifft), [fft] "+r"(fft)
345       :
346       : "memory");
347 
  // Bin PART_LEN is handled in C: it goes to int16_t slots 2 and 3 (byte
  // offset 4, which does not appear in coefTable_ifft), with the imaginary
  // part negated.
348   fft[2] = efw[PART_LEN].real;
349   fft[3] = -efw[PART_LEN].imag;
350 
  // NOTE(review): WebRtcSpl_ComplexIFFT is declared elsewhere; its return
  // value is used below as a scaling exponent together with
  // aecm->dfaCleanQDomain - confirm against its documentation.
351   outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1);
352   pfft = fft;
353 
  // Compact |fft| in place: keep every other int16_t (the values at even
  // indices), i.e. fft[k] = fft[2 * k] for k in [0, 128). Four values are
  // moved per iteration; |i| counts down from 128 in steps of 4.
354   __asm __volatile(
355       ".set       push                                               \n\t"
356       ".set       noreorder                                          \n\t"
357       "addiu      %[i],            $zero,               128          \n\t"
358       "1:                                                             \n\t"
359       "lh         %[tmp1],         0(%[ppfft])                       \n\t"
360       "lh         %[tmp2],         4(%[ppfft])                       \n\t"
361       "lh         %[tmp3],         8(%[ppfft])                       \n\t"
362       "lh         %[tmp4],         12(%[ppfft])                      \n\t"
363       "addiu      %[i],            %[i],                -4           \n\t"
364       "sh         %[tmp1],         0(%[pfft])                        \n\t"
365       "sh         %[tmp2],         2(%[pfft])                        \n\t"
366       "sh         %[tmp3],         4(%[pfft])                        \n\t"
367       "sh         %[tmp4],         6(%[pfft])                        \n\t"
368       "addiu      %[ppfft],        %[ppfft],            16           \n\t"
369       "bgtz       %[i],            1b                                \n\t"
370       " addiu     %[pfft],         %[pfft],             8            \n\t"
371       ".set       pop                                                \n\t"
372       : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [pfft] "+r"(pfft), [i] "=&r"(i),
373         [tmp3] "=&r"(tmp3), [tmp4] "=&r"(tmp4), [ppfft] "+r"(ppfft)
374       :
375       : "memory");
376 
377   pfft = fft;
  // Shift applied to the windowed samples below: left when non-negative,
  // arithmetic right by the negated amount otherwise.
378   out_aecm = (int32_t)(outCFFT - aecm->dfaCleanQDomain);
379 
  // Window, overlap-add and saturate; two samples per iteration, |i| counts
  // down from 64 in steps of 2. For sample index k:
  //   first half : t = (fft[k] * hanning[k] + 8192) >> 14, shifted by
  //                out_aecm, plus aecm->outBuf[k], saturated to int16_t and
  //                stored to both fft[k] and output[k];
  //   second half: t = (fft[64 + k] * hanning[PART_LEN - k]) >> 14 (no
  //                rounding term), shifted by out_aecm, saturated to int16_t
  //                and stored to aecm->outBuf[k].
  // |p_kSqrtHanning| walks the table forwards, |pp_kSqrtHanning| backwards.
380   __asm __volatile(
381       ".set       push                                                       "
382       "\n\t"
383       ".set       noreorder                                                  "
384       "\n\t"
385       "addiu      %[i],                $zero,                  64            "
386       "\n\t"
387       "11:                                                                    "
388       "\n\t"
389       "lh         %[tmp1],             0(%[pfft])                            "
390       "\n\t"
391       "lh         %[tmp2],             0(%[p_kSqrtHanning])                  "
392       "\n\t"
393       "addiu      %[i],                %[i],                   -2            "
394       "\n\t"
395       "mul        %[tmp1],             %[tmp1],                %[tmp2]       "
396       "\n\t"
397       "lh         %[tmp3],             2(%[pfft])                            "
398       "\n\t"
399       "lh         %[tmp4],             2(%[p_kSqrtHanning])                  "
400       "\n\t"
401       "mul        %[tmp3],             %[tmp3],                %[tmp4]       "
402       "\n\t"
403       "addiu      %[tmp1],             %[tmp1],                8192          "
404       "\n\t"
405       "sra        %[tmp1],             %[tmp1],                14            "
406       "\n\t"
407       "addiu      %[tmp3],             %[tmp3],                8192          "
408       "\n\t"
409       "sra        %[tmp3],             %[tmp3],                14            "
410       "\n\t"
411       "bgez       %[out_aecm],         1f                                    "
412       "\n\t"
413       " negu      %[tmp2],             %[out_aecm]                           "
414       "\n\t"
415       "srav       %[tmp1],             %[tmp1],                %[tmp2]       "
416       "\n\t"
417       "b          2f                                                         "
418       "\n\t"
419       " srav      %[tmp3],             %[tmp3],                %[tmp2]       "
420       "\n\t"
421       "1:                                                                     "
422       "\n\t"
423       "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   "
424       "\n\t"
425       "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   "
426       "\n\t"
427       "2:                                                                     "
428       "\n\t"
429       "lh         %[tmp4],             0(%[paecm_buf])                       "
430       "\n\t"
431       "lh         %[tmp2],             2(%[paecm_buf])                       "
432       "\n\t"
433       "addu       %[tmp3],             %[tmp3],                %[tmp2]       "
434       "\n\t"
435       "addu       %[tmp1],             %[tmp1],                %[tmp4]       "
436       "\n\t"
      // Saturate tmp1 and tmp3 to the int16_t range. With the DSP ASE a
      // saturating left shift by 16 followed by an arithmetic right shift by
      // 16 does it; otherwise the sign (>> 31) is compared with (>> 15) and,
      // when they differ, the value is replaced by 0x7fff XOR sign.
437 #if defined(MIPS_DSP_R1_LE)
438       "shll_s.w   %[tmp1],             %[tmp1],                16            "
439       "\n\t"
440       "sra        %[tmp1],             %[tmp1],                16            "
441       "\n\t"
442       "shll_s.w   %[tmp3],             %[tmp3],                16            "
443       "\n\t"
444       "sra        %[tmp3],             %[tmp3],                16            "
445       "\n\t"
446 #else   // #if defined(MIPS_DSP_R1_LE)
447       "sra        %[tmp4],             %[tmp1],                31            "
448       "\n\t"
449       "sra        %[tmp2],             %[tmp1],                15            "
450       "\n\t"
451       "beq        %[tmp4],             %[tmp2],                3f            "
452       "\n\t"
453       " ori       %[tmp2],             $zero,                  0x7fff        "
454       "\n\t"
455       "xor        %[tmp1],             %[tmp2],                %[tmp4]       "
456       "\n\t"
457       "3:                                                                     "
458       "\n\t"
459       "sra        %[tmp2],             %[tmp3],                31            "
460       "\n\t"
461       "sra        %[tmp4],             %[tmp3],                15            "
462       "\n\t"
463       "beq        %[tmp2],             %[tmp4],                4f            "
464       "\n\t"
465       " ori       %[tmp4],             $zero,                  0x7fff        "
466       "\n\t"
467       "xor        %[tmp3],             %[tmp4],                %[tmp2]       "
468       "\n\t"
469       "4:                                                                     "
470       "\n\t"
471 #endif  // #if defined(MIPS_DSP_R1_LE)
472       "sh         %[tmp1],             0(%[pfft])                            "
473       "\n\t"
474       "sh         %[tmp1],             0(%[output1])                         "
475       "\n\t"
476       "sh         %[tmp3],             2(%[pfft])                            "
477       "\n\t"
478       "sh         %[tmp3],             2(%[output1])                         "
479       "\n\t"
      // Second half of the block: offsets 128/130 are 64 int16_t further on.
480       "lh         %[tmp1],             128(%[pfft])                          "
481       "\n\t"
482       "lh         %[tmp2],             0(%[pp_kSqrtHanning])                 "
483       "\n\t"
484       "mul        %[tmp1],             %[tmp1],                %[tmp2]       "
485       "\n\t"
486       "lh         %[tmp3],             130(%[pfft])                          "
487       "\n\t"
488       "lh         %[tmp4],             -2(%[pp_kSqrtHanning])                "
489       "\n\t"
490       "mul        %[tmp3],             %[tmp3],                %[tmp4]       "
491       "\n\t"
492       "sra        %[tmp1],             %[tmp1],                14            "
493       "\n\t"
494       "sra        %[tmp3],             %[tmp3],                14            "
495       "\n\t"
496       "bgez       %[out_aecm],         5f                                    "
497       "\n\t"
498       " negu      %[tmp2],             %[out_aecm]                           "
499       "\n\t"
500       "srav       %[tmp3],             %[tmp3],                %[tmp2]       "
501       "\n\t"
502       "b          6f                                                         "
503       "\n\t"
504       " srav      %[tmp1],             %[tmp1],                %[tmp2]       "
505       "\n\t"
506       "5:                                                                     "
507       "\n\t"
508       "sllv       %[tmp1],             %[tmp1],                %[out_aecm]   "
509       "\n\t"
510       "sllv       %[tmp3],             %[tmp3],                %[out_aecm]   "
511       "\n\t"
512       "6:                                                                     "
513       "\n\t"
      // Same int16_t saturation as above, applied to the second-half values.
514 #if defined(MIPS_DSP_R1_LE)
515       "shll_s.w   %[tmp1],             %[tmp1],                16            "
516       "\n\t"
517       "sra        %[tmp1],             %[tmp1],                16            "
518       "\n\t"
519       "shll_s.w   %[tmp3],             %[tmp3],                16            "
520       "\n\t"
521       "sra        %[tmp3],             %[tmp3],                16            "
522       "\n\t"
523 #else   // #if defined(MIPS_DSP_R1_LE)
524       "sra        %[tmp4],             %[tmp1],                31            "
525       "\n\t"
526       "sra        %[tmp2],             %[tmp1],                15            "
527       "\n\t"
528       "beq        %[tmp4],             %[tmp2],                7f            "
529       "\n\t"
530       " ori       %[tmp2],             $zero,                  0x7fff        "
531       "\n\t"
532       "xor        %[tmp1],             %[tmp2],                %[tmp4]       "
533       "\n\t"
534       "7:                                                                     "
535       "\n\t"
536       "sra        %[tmp2],             %[tmp3],                31            "
537       "\n\t"
538       "sra        %[tmp4],             %[tmp3],                15            "
539       "\n\t"
540       "beq        %[tmp2],             %[tmp4],                8f            "
541       "\n\t"
542       " ori       %[tmp4],             $zero,                  0x7fff        "
543       "\n\t"
544       "xor        %[tmp3],             %[tmp4],                %[tmp2]       "
545       "\n\t"
546       "8:                                                                     "
547       "\n\t"
548 #endif  // #if defined(MIPS_DSP_R1_LE)
549       "sh         %[tmp1],             0(%[paecm_buf])                       "
550       "\n\t"
551       "sh         %[tmp3],             2(%[paecm_buf])                       "
552       "\n\t"
553       "addiu      %[output1],          %[output1],             4             "
554       "\n\t"
555       "addiu      %[paecm_buf],        %[paecm_buf],           4             "
556       "\n\t"
557       "addiu      %[pfft],             %[pfft],                4             "
558       "\n\t"
559       "addiu      %[p_kSqrtHanning],   %[p_kSqrtHanning],      4             "
560       "\n\t"
561       "bgtz       %[i],                11b                                   "
562       "\n\t"
563       " addiu     %[pp_kSqrtHanning],  %[pp_kSqrtHanning],     -4            "
564       "\n\t"
565       ".set       pop                                                        "
566       "\n\t"
567       : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [pfft] "+r"(pfft),
568         [output1] "+r"(output1), [tmp3] "=&r"(tmp3), [tmp4] "=&r"(tmp4),
569         [paecm_buf] "+r"(paecm_buf), [i] "=&r"(i),
570         [pp_kSqrtHanning] "+r"(pp_kSqrtHanning),
571         [p_kSqrtHanning] "+r"(p_kSqrtHanning)
      // NOTE(review): the [WebRtcAecm_kSqrtHanning] input below is never
      // referenced by the template; the table is reached through
      // p_kSqrtHanning / pp_kSqrtHanning instead.
572       : [out_aecm] "r"(out_aecm),
573         [WebRtcAecm_kSqrtHanning] "r"(WebRtcAecm_kSqrtHanning)
574       : "hi", "lo", "memory");
575 
576   // Copy the current block to the old position
577   // (aecm->outBuf is shifted elsewhere)
578   memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(int16_t) * PART_LEN);
579   memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN,
580          sizeof(int16_t) * PART_LEN);
  // The clean near-end buffer is only shifted when a clean signal was given.
581   if (nearendClean != NULL) {
582     memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN,
583            sizeof(int16_t) * PART_LEN);
584   }
585 }
586 
WebRtcAecm_CalcLinearEnergies_mips(AecmCore * aecm,const uint16_t * far_spectrum,int32_t * echo_est,uint32_t * far_energy,uint32_t * echo_energy_adapt,uint32_t * echo_energy_stored)587 void WebRtcAecm_CalcLinearEnergies_mips(AecmCore* aecm,
588                                         const uint16_t* far_spectrum,
589                                         int32_t* echo_est,
590                                         uint32_t* far_energy,
591                                         uint32_t* echo_energy_adapt,
592                                         uint32_t* echo_energy_stored) {
593   int i;
594   uint32_t par1 = (*far_energy);
595   uint32_t par2 = (*echo_energy_adapt);
596   uint32_t par3 = (*echo_energy_stored);
597   int16_t* ch_stored_p = &(aecm->channelStored[0]);
598   int16_t* ch_adapt_p = &(aecm->channelAdapt16[0]);
599   uint16_t* spectrum_p = (uint16_t*)(&(far_spectrum[0]));
600   int32_t* echo_p = &(echo_est[0]);
601   int32_t temp0, stored0, echo0, adept0, spectrum0;
602   int32_t stored1, adept1, spectrum1, echo1, temp1;
603 
604   // Get energy for the delayed far end signal and estimated
605   // echo using both stored and adapted channels.
606   for (i = 0; i < PART_LEN; i += 4) {
607     __asm __volatile(
608         ".set           push                                            \n\t"
609         ".set           noreorder                                       \n\t"
610         "lh             %[stored0],     0(%[ch_stored_p])               \n\t"
611         "lhu            %[adept0],      0(%[ch_adapt_p])                \n\t"
612         "lhu            %[spectrum0],   0(%[spectrum_p])                \n\t"
613         "lh             %[stored1],     2(%[ch_stored_p])               \n\t"
614         "lhu            %[adept1],      2(%[ch_adapt_p])                \n\t"
615         "lhu            %[spectrum1],   2(%[spectrum_p])                \n\t"
616         "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
617         "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
618         "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
619         "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
620         "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
621         "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
622         "addiu          %[echo_p],      %[echo_p],      16              \n\t"
623         "addu           %[par3],        %[par3],        %[echo0]        \n\t"
624         "addu           %[par2],        %[par2],        %[temp0]        \n\t"
625         "addu           %[par3],        %[par3],        %[echo1]        \n\t"
626         "addu           %[par2],        %[par2],        %[temp1]        \n\t"
627         "usw            %[echo0],       -16(%[echo_p])                  \n\t"
628         "usw            %[echo1],       -12(%[echo_p])                  \n\t"
629         "lh             %[stored0],     4(%[ch_stored_p])               \n\t"
630         "lhu            %[adept0],      4(%[ch_adapt_p])                \n\t"
631         "lhu            %[spectrum0],   4(%[spectrum_p])                \n\t"
632         "lh             %[stored1],     6(%[ch_stored_p])               \n\t"
633         "lhu            %[adept1],      6(%[ch_adapt_p])                \n\t"
634         "lhu            %[spectrum1],   6(%[spectrum_p])                \n\t"
635         "mul            %[echo0],       %[stored0],     %[spectrum0]    \n\t"
636         "mul            %[temp0],       %[adept0],      %[spectrum0]    \n\t"
637         "mul            %[echo1],       %[stored1],     %[spectrum1]    \n\t"
638         "mul            %[temp1],       %[adept1],      %[spectrum1]    \n\t"
639         "addu           %[par1],        %[par1],        %[spectrum0]    \n\t"
640         "addu           %[par1],        %[par1],        %[spectrum1]    \n\t"
641         "addiu          %[ch_stored_p], %[ch_stored_p], 8               \n\t"
642         "addiu          %[ch_adapt_p],  %[ch_adapt_p],  8               \n\t"
643         "addiu          %[spectrum_p],  %[spectrum_p],  8               \n\t"
644         "addu           %[par3],        %[par3],        %[echo0]        \n\t"
645         "addu           %[par2],        %[par2],        %[temp0]        \n\t"
646         "addu           %[par3],        %[par3],        %[echo1]        \n\t"
647         "addu           %[par2],        %[par2],        %[temp1]        \n\t"
648         "usw            %[echo0],       -8(%[echo_p])                   \n\t"
649         "usw            %[echo1],       -4(%[echo_p])                   \n\t"
650         ".set           pop                                             \n\t"
651         : [temp0] "=&r"(temp0), [stored0] "=&r"(stored0),
652           [adept0] "=&r"(adept0), [spectrum0] "=&r"(spectrum0),
653           [echo0] "=&r"(echo0), [echo_p] "+r"(echo_p), [par3] "+r"(par3),
654           [par1] "+r"(par1), [par2] "+r"(par2), [stored1] "=&r"(stored1),
655           [adept1] "=&r"(adept1), [echo1] "=&r"(echo1),
656           [spectrum1] "=&r"(spectrum1), [temp1] "=&r"(temp1),
657           [ch_stored_p] "+r"(ch_stored_p), [ch_adapt_p] "+r"(ch_adapt_p),
658           [spectrum_p] "+r"(spectrum_p)
659         :
660         : "hi", "lo", "memory");
661   }
662 
663   echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
664                                              far_spectrum[PART_LEN]);
665   par1 += (uint32_t)(far_spectrum[PART_LEN]);
666   par2 += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
667   par3 += (uint32_t)echo_est[PART_LEN];
668 
669   (*far_energy) = par1;
670   (*echo_energy_adapt) = par2;
671   (*echo_energy_stored) = par3;
672 }
673 
674 #if defined(MIPS_DSP_R1_LE)
WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore * aecm,const uint16_t * far_spectrum,int32_t * echo_est)675 void WebRtcAecm_StoreAdaptiveChannel_mips(AecmCore* aecm,
676                                           const uint16_t* far_spectrum,
677                                           int32_t* echo_est) {
678   int i;
679   int16_t* temp1;
680   uint16_t* temp8;
681   int32_t temp0, temp2, temp3, temp4, temp5, temp6;
682   int32_t* temp7 = &(echo_est[0]);
683   temp1 = &(aecm->channelStored[0]);
684   temp8 = (uint16_t*)(&far_spectrum[0]);
685 
686   // During startup we store the channel every block.
687   memcpy(aecm->channelStored, aecm->channelAdapt16,
688          sizeof(int16_t) * PART_LEN1);
689   // Recalculate echo estimate
690   for (i = 0; i < PART_LEN; i += 4) {
691     __asm __volatile(
692         "ulw            %[temp0],   0(%[temp8])               \n\t"
693         "ulw            %[temp2],   0(%[temp1])               \n\t"
694         "ulw            %[temp4],   4(%[temp8])               \n\t"
695         "ulw            %[temp5],   4(%[temp1])               \n\t"
696         "muleq_s.w.phl  %[temp3],   %[temp2],     %[temp0]    \n\t"
697         "muleq_s.w.phr  %[temp0],   %[temp2],     %[temp0]    \n\t"
698         "muleq_s.w.phl  %[temp6],   %[temp5],     %[temp4]    \n\t"
699         "muleq_s.w.phr  %[temp4],   %[temp5],     %[temp4]    \n\t"
700         "addiu          %[temp7],   %[temp7],     16          \n\t"
701         "addiu          %[temp1],   %[temp1],     8           \n\t"
702         "addiu          %[temp8],   %[temp8],     8           \n\t"
703         "sra            %[temp3],   %[temp3],     1           \n\t"
704         "sra            %[temp0],   %[temp0],     1           \n\t"
705         "sra            %[temp6],   %[temp6],     1           \n\t"
706         "sra            %[temp4],   %[temp4],     1           \n\t"
707         "usw            %[temp3],   -12(%[temp7])             \n\t"
708         "usw            %[temp0],   -16(%[temp7])             \n\t"
709         "usw            %[temp6],   -4(%[temp7])              \n\t"
710         "usw            %[temp4],   -8(%[temp7])              \n\t"
711         : [temp0] "=&r"(temp0), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
712           [temp4] "=&r"(temp4), [temp5] "=&r"(temp5), [temp6] "=&r"(temp6),
713           [temp1] "+r"(temp1), [temp8] "+r"(temp8), [temp7] "+r"(temp7)
714         :
715         : "hi", "lo", "memory");
716   }
717   echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]);
718 }
719 
WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore * aecm)720 void WebRtcAecm_ResetAdaptiveChannel_mips(AecmCore* aecm) {
721   int i;
722   int32_t* temp3;
723   int16_t* temp0;
724   int32_t temp1, temp2, temp4, temp5;
725 
726   temp0 = &(aecm->channelStored[0]);
727   temp3 = &(aecm->channelAdapt32[0]);
728 
729   // The stored channel has a significantly lower MSE than the adaptive one for
730   // two consecutive calculations. Reset the adaptive channel.
731   memcpy(aecm->channelAdapt16, aecm->channelStored,
732          sizeof(int16_t) * PART_LEN1);
733 
734   // Restore the W32 channel
735   for (i = 0; i < PART_LEN; i += 4) {
736     __asm __volatile(
737         "ulw            %[temp1], 0(%[temp0])           \n\t"
738         "ulw            %[temp4], 4(%[temp0])           \n\t"
739         "preceq.w.phl   %[temp2], %[temp1]              \n\t"
740         "preceq.w.phr   %[temp1], %[temp1]              \n\t"
741         "preceq.w.phl   %[temp5], %[temp4]              \n\t"
742         "preceq.w.phr   %[temp4], %[temp4]              \n\t"
743         "addiu          %[temp0], %[temp0], 8           \n\t"
744         "usw            %[temp2], 4(%[temp3])           \n\t"
745         "usw            %[temp1], 0(%[temp3])           \n\t"
746         "usw            %[temp5], 12(%[temp3])          \n\t"
747         "usw            %[temp4], 8(%[temp3])           \n\t"
748         "addiu          %[temp3], %[temp3], 16          \n\t"
749         : [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp4] "=&r"(temp4),
750           [temp5] "=&r"(temp5), [temp3] "+r"(temp3), [temp0] "+r"(temp0)
751         :
752         : "memory");
753   }
754 
755   aecm->channelAdapt32[i] = (int32_t)aecm->channelStored[i] << 16;
756 }
757 #endif  // #if defined(MIPS_DSP_R1_LE)
758 
759 // Transforms a time domain signal into the frequency domain, outputting the
760 // complex valued signal, absolute value and sum of absolute values.
761 //
762 // time_signal          [in]    Pointer to time domain signal
763 // freq_signal_real     [out]   Pointer to real part of frequency domain array
764 // freq_signal_imag     [out]   Pointer to imaginary part of frequency domain
765 //                              array
766 // freq_signal_abs      [out]   Pointer to absolute value of frequency domain
767 //                              array
768 // freq_signal_sum_abs  [out]   Pointer to the sum of all absolute values in
769 //                              the frequency domain array
770 // return value                 The Q-domain of current frequency values
771 //
TimeToFrequencyDomain(AecmCore * aecm,const int16_t * time_signal,ComplexInt16 * freq_signal,uint16_t * freq_signal_abs,uint32_t * freq_signal_sum_abs)772 static int TimeToFrequencyDomain(AecmCore* aecm,
773                                  const int16_t* time_signal,
774                                  ComplexInt16* freq_signal,
775                                  uint16_t* freq_signal_abs,
776                                  uint32_t* freq_signal_sum_abs) {
777   int i = 0;
778   int time_signal_scaling = 0;
779 
780   // In fft_buf, +16 for 32-byte alignment.
781   int16_t fft_buf[PART_LEN4 + 16];
782   int16_t* fft = (int16_t*)(((uintptr_t)fft_buf + 31) & ~31);
783 
784   int16_t tmp16no1;
785 #if !defined(MIPS_DSP_R2_LE)
786   int32_t tmp32no1;
787   int32_t tmp32no2;
788   int16_t tmp16no2;
789 #else
790   int32_t tmp32no10, tmp32no11, tmp32no12, tmp32no13;
791   int32_t tmp32no20, tmp32no21, tmp32no22, tmp32no23;
792   int16_t* freqp;
793   uint16_t* freqabsp;
794   uint32_t freqt0, freqt1, freqt2, freqt3;
795   uint32_t freqs;
796 #endif
797 
798 #ifdef AECM_DYNAMIC_Q
799   tmp16no1 = WebRtcSpl_MaxAbsValueW16(time_signal, PART_LEN2);
800   time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
801 #endif
802 
803   WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
804 
805   // Extract imaginary and real part,
806   // calculate the magnitude for all frequency bins
807   freq_signal[0].imag = 0;
808   freq_signal[PART_LEN].imag = 0;
809   freq_signal[PART_LEN].real = fft[PART_LEN2];
810   freq_signal_abs[0] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[0].real);
811   freq_signal_abs[PART_LEN] =
812       (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[PART_LEN].real);
813   (*freq_signal_sum_abs) =
814       (uint32_t)(freq_signal_abs[0]) + (uint32_t)(freq_signal_abs[PART_LEN]);
815 
816 #if !defined(MIPS_DSP_R2_LE)
817   for (i = 1; i < PART_LEN; i++) {
818     if (freq_signal[i].real == 0) {
819       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
820     } else if (freq_signal[i].imag == 0) {
821       freq_signal_abs[i] = (uint16_t)WEBRTC_SPL_ABS_W16(freq_signal[i].real);
822     } else {
823       // Approximation for magnitude of complex fft output
824       // magn = sqrt(real^2 + imag^2)
825       // magn ~= alpha * max(|imag|,|real|) + beta * min(|imag|,|real|)
826       //
827       // The parameters alpha and beta are stored in Q15
828       tmp16no1 = WEBRTC_SPL_ABS_W16(freq_signal[i].real);
829       tmp16no2 = WEBRTC_SPL_ABS_W16(freq_signal[i].imag);
830       tmp32no1 = tmp16no1 * tmp16no1;
831       tmp32no2 = tmp16no2 * tmp16no2;
832       tmp32no2 = WebRtcSpl_AddSatW32(tmp32no1, tmp32no2);
833       tmp32no1 = WebRtcSpl_SqrtFloor(tmp32no2);
834 
835       freq_signal_abs[i] = (uint16_t)tmp32no1;
836     }
837     (*freq_signal_sum_abs) += (uint32_t)freq_signal_abs[i];
838   }
839 #else  // #if !defined(MIPS_DSP_R2_LE)
840   freqs =
841       (uint32_t)(freq_signal_abs[0]) + (uint32_t)(freq_signal_abs[PART_LEN]);
842   freqp = &(freq_signal[1].real);
843 
844   __asm __volatile(
845       "lw             %[freqt0],      0(%[freqp])             \n\t"
846       "lw             %[freqt1],      4(%[freqp])             \n\t"
847       "lw             %[freqt2],      8(%[freqp])             \n\t"
848       "mult           $ac0,           $zero,      $zero       \n\t"
849       "mult           $ac1,           $zero,      $zero       \n\t"
850       "mult           $ac2,           $zero,      $zero       \n\t"
851       "dpaq_s.w.ph    $ac0,           %[freqt0],  %[freqt0]   \n\t"
852       "dpaq_s.w.ph    $ac1,           %[freqt1],  %[freqt1]   \n\t"
853       "dpaq_s.w.ph    $ac2,           %[freqt2],  %[freqt2]   \n\t"
854       "addiu          %[freqp],       %[freqp],   12          \n\t"
855       "extr.w         %[tmp32no20],   $ac0,       1           \n\t"
856       "extr.w         %[tmp32no21],   $ac1,       1           \n\t"
857       "extr.w         %[tmp32no22],   $ac2,       1           \n\t"
858       : [freqt0] "=&r"(freqt0), [freqt1] "=&r"(freqt1), [freqt2] "=&r"(freqt2),
859         [freqp] "+r"(freqp), [tmp32no20] "=r"(tmp32no20),
860         [tmp32no21] "=r"(tmp32no21), [tmp32no22] "=r"(tmp32no22)
861       :
862       : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo");
863 
864   tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
865   tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
866   tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
867   freq_signal_abs[1] = (uint16_t)tmp32no10;
868   freq_signal_abs[2] = (uint16_t)tmp32no11;
869   freq_signal_abs[3] = (uint16_t)tmp32no12;
870   freqs += (uint32_t)tmp32no10;
871   freqs += (uint32_t)tmp32no11;
872   freqs += (uint32_t)tmp32no12;
873   freqabsp = &(freq_signal_abs[4]);
874   for (i = 4; i < PART_LEN; i += 4) {
875     __asm __volatile(
876         "ulw            %[freqt0],      0(%[freqp])                 \n\t"
877         "ulw            %[freqt1],      4(%[freqp])                 \n\t"
878         "ulw            %[freqt2],      8(%[freqp])                 \n\t"
879         "ulw            %[freqt3],      12(%[freqp])                \n\t"
880         "mult           $ac0,           $zero,          $zero       \n\t"
881         "mult           $ac1,           $zero,          $zero       \n\t"
882         "mult           $ac2,           $zero,          $zero       \n\t"
883         "mult           $ac3,           $zero,          $zero       \n\t"
884         "dpaq_s.w.ph    $ac0,           %[freqt0],      %[freqt0]   \n\t"
885         "dpaq_s.w.ph    $ac1,           %[freqt1],      %[freqt1]   \n\t"
886         "dpaq_s.w.ph    $ac2,           %[freqt2],      %[freqt2]   \n\t"
887         "dpaq_s.w.ph    $ac3,           %[freqt3],      %[freqt3]   \n\t"
888         "addiu          %[freqp],       %[freqp],       16          \n\t"
889         "addiu          %[freqabsp],    %[freqabsp],    8           \n\t"
890         "extr.w         %[tmp32no20],   $ac0,           1           \n\t"
891         "extr.w         %[tmp32no21],   $ac1,           1           \n\t"
892         "extr.w         %[tmp32no22],   $ac2,           1           \n\t"
893         "extr.w         %[tmp32no23],   $ac3,           1           \n\t"
894         : [freqt0] "=&r"(freqt0), [freqt1] "=&r"(freqt1),
895           [freqt2] "=&r"(freqt2), [freqt3] "=&r"(freqt3),
896           [tmp32no20] "=r"(tmp32no20), [tmp32no21] "=r"(tmp32no21),
897           [tmp32no22] "=r"(tmp32no22), [tmp32no23] "=r"(tmp32no23),
898           [freqabsp] "+r"(freqabsp), [freqp] "+r"(freqp)
899         :
900         : "memory", "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
901           "$ac3hi", "$ac3lo");
902 
903     tmp32no10 = WebRtcSpl_SqrtFloor(tmp32no20);
904     tmp32no11 = WebRtcSpl_SqrtFloor(tmp32no21);
905     tmp32no12 = WebRtcSpl_SqrtFloor(tmp32no22);
906     tmp32no13 = WebRtcSpl_SqrtFloor(tmp32no23);
907 
908     __asm __volatile(
909         "sh             %[tmp32no10],   -8(%[freqabsp])                 \n\t"
910         "sh             %[tmp32no11],   -6(%[freqabsp])                 \n\t"
911         "sh             %[tmp32no12],   -4(%[freqabsp])                 \n\t"
912         "sh             %[tmp32no13],   -2(%[freqabsp])                 \n\t"
913         "addu           %[freqs],       %[freqs],       %[tmp32no10]    \n\t"
914         "addu           %[freqs],       %[freqs],       %[tmp32no11]    \n\t"
915         "addu           %[freqs],       %[freqs],       %[tmp32no12]    \n\t"
916         "addu           %[freqs],       %[freqs],       %[tmp32no13]    \n\t"
917         : [freqs] "+r"(freqs)
918         : [tmp32no10] "r"(tmp32no10), [tmp32no11] "r"(tmp32no11),
919           [tmp32no12] "r"(tmp32no12), [tmp32no13] "r"(tmp32no13),
920           [freqabsp] "r"(freqabsp)
921         : "memory");
922   }
923 
924   (*freq_signal_sum_abs) = freqs;
925 #endif
926 
927   return time_signal_scaling;
928 }
929 
WebRtcAecm_ProcessBlock(AecmCore * aecm,const int16_t * farend,const int16_t * nearendNoisy,const int16_t * nearendClean,int16_t * output)930 int WebRtcAecm_ProcessBlock(AecmCore* aecm,
931                             const int16_t* farend,
932                             const int16_t* nearendNoisy,
933                             const int16_t* nearendClean,
934                             int16_t* output) {
935   int i;
936   uint32_t xfaSum;
937   uint32_t dfaNoisySum;
938   uint32_t dfaCleanSum;
939   uint32_t echoEst32Gained;
940   uint32_t tmpU32;
941   int32_t tmp32no1;
942 
943   uint16_t xfa[PART_LEN1];
944   uint16_t dfaNoisy[PART_LEN1];
945   uint16_t dfaClean[PART_LEN1];
946   uint16_t* ptrDfaClean = dfaClean;
947   const uint16_t* far_spectrum_ptr = NULL;
948 
949   // 32 byte aligned buffers (with +8 or +16).
950   int16_t fft_buf[PART_LEN4 + 2 + 16];  // +2 to make a loop safe.
951   int32_t echoEst32_buf[PART_LEN1 + 8];
952   int32_t dfw_buf[PART_LEN2 + 8];
953   int32_t efw_buf[PART_LEN2 + 8];
954 
955   int16_t* fft = (int16_t*)(((uint32_t)fft_buf + 31) & ~31);
956   int32_t* echoEst32 = (int32_t*)(((uint32_t)echoEst32_buf + 31) & ~31);
957   ComplexInt16* dfw = (ComplexInt16*)(((uint32_t)dfw_buf + 31) & ~31);
958   ComplexInt16* efw = (ComplexInt16*)(((uint32_t)efw_buf + 31) & ~31);
959 
960   int16_t hnl[PART_LEN1];
961   int16_t numPosCoef = 0;
962   int delay;
963   int16_t tmp16no1;
964   int16_t tmp16no2;
965   int16_t mu;
966   int16_t supGain;
967   int16_t zeros32, zeros16;
968   int16_t zerosDBufNoisy, zerosDBufClean, zerosXBuf;
969   int far_q;
970   int16_t resolutionDiff, qDomainDiff, dfa_clean_q_domain_diff;
971 
972   const int kMinPrefBand = 4;
973   const int kMaxPrefBand = 24;
974   int32_t avgHnl32 = 0;
975 
976   int32_t temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
977   int16_t* ptr;
978   int16_t* ptr1;
979   int16_t* er_ptr;
980   int16_t* dr_ptr;
981 
982   ptr = &hnl[0];
983   ptr1 = &hnl[0];
984   er_ptr = &efw[0].real;
985   dr_ptr = &dfw[0].real;
986 
987   // Determine startup state. There are three states:
988   // (0) the first CONV_LEN blocks
989   // (1) another CONV_LEN blocks
990   // (2) the rest
991 
992   if (aecm->startupState < 2) {
993     aecm->startupState =
994         (aecm->totCount >= CONV_LEN) + (aecm->totCount >= CONV_LEN2);
995   }
996   // END: Determine startup state
997 
998   // Buffer near and far end signals
999   memcpy(aecm->xBuf + PART_LEN, farend, sizeof(int16_t) * PART_LEN);
1000   memcpy(aecm->dBufNoisy + PART_LEN, nearendNoisy, sizeof(int16_t) * PART_LEN);
1001   if (nearendClean != NULL) {
1002     memcpy(aecm->dBufClean + PART_LEN, nearendClean,
1003            sizeof(int16_t) * PART_LEN);
1004   }
1005 
1006   // Transform far end signal from time domain to frequency domain.
1007   far_q = TimeToFrequencyDomain(aecm, aecm->xBuf, dfw, xfa, &xfaSum);
1008 
1009   // Transform noisy near end signal from time domain to frequency domain.
1010   zerosDBufNoisy =
1011       TimeToFrequencyDomain(aecm, aecm->dBufNoisy, dfw, dfaNoisy, &dfaNoisySum);
1012   aecm->dfaNoisyQDomainOld = aecm->dfaNoisyQDomain;
1013   aecm->dfaNoisyQDomain = (int16_t)zerosDBufNoisy;
1014 
1015   if (nearendClean == NULL) {
1016     ptrDfaClean = dfaNoisy;
1017     aecm->dfaCleanQDomainOld = aecm->dfaNoisyQDomainOld;
1018     aecm->dfaCleanQDomain = aecm->dfaNoisyQDomain;
1019     dfaCleanSum = dfaNoisySum;
1020   } else {
1021     // Transform clean near end signal from time domain to frequency domain.
1022     zerosDBufClean = TimeToFrequencyDomain(aecm, aecm->dBufClean, dfw, dfaClean,
1023                                            &dfaCleanSum);
1024     aecm->dfaCleanQDomainOld = aecm->dfaCleanQDomain;
1025     aecm->dfaCleanQDomain = (int16_t)zerosDBufClean;
1026   }
1027 
1028   // Get the delay
1029   // Save far-end history and estimate delay
1030   WebRtcAecm_UpdateFarHistory(aecm, xfa, far_q);
1031 
1032   if (WebRtc_AddFarSpectrumFix(aecm->delay_estimator_farend, xfa, PART_LEN1,
1033                                far_q) == -1) {
1034     return -1;
1035   }
1036   delay = WebRtc_DelayEstimatorProcessFix(aecm->delay_estimator, dfaNoisy,
1037                                           PART_LEN1, zerosDBufNoisy);
1038   if (delay == -1) {
1039     return -1;
1040   } else if (delay == -2) {
1041     // If the delay is unknown, we assume zero.
1042     // NOTE: this will have to be adjusted if we ever add lookahead.
1043     delay = 0;
1044   }
1045 
1046   if (aecm->fixedDelay >= 0) {
1047     // Use fixed delay
1048     delay = aecm->fixedDelay;
1049   }
1050 
1051   // Get aligned far end spectrum
1052   far_spectrum_ptr = WebRtcAecm_AlignedFarend(aecm, &far_q, delay);
1053   zerosXBuf = (int16_t)far_q;
1054 
1055   if (far_spectrum_ptr == NULL) {
1056     return -1;
1057   }
1058 
1059   // Calculate log(energy) and update energy threshold levels
1060   WebRtcAecm_CalcEnergies(aecm, far_spectrum_ptr, zerosXBuf, dfaNoisySum,
1061                           echoEst32);
1062   // Calculate stepsize
1063   mu = WebRtcAecm_CalcStepSize(aecm);
1064 
1065   // Update counters
1066   aecm->totCount++;
1067 
1068   // This is the channel estimation algorithm.
1069   // It is base on NLMS but has a variable step length,
1070   // which was calculated above.
1071   WebRtcAecm_UpdateChannel(aecm, far_spectrum_ptr, zerosXBuf, dfaNoisy, mu,
1072                            echoEst32);
1073 
1074   supGain = WebRtcAecm_CalcSuppressionGain(aecm);
1075 
1076   // Calculate Wiener filter hnl[]
1077   for (i = 0; i < PART_LEN1; i++) {
1078     // Far end signal through channel estimate in Q8
1079     // How much can we shift right to preserve resolution
1080     tmp32no1 = echoEst32[i] - aecm->echoFilt[i];
1081     aecm->echoFilt[i] +=
1082         rtc::dchecked_cast<int32_t>((int64_t{tmp32no1} * 50) >> 8);
1083 
1084     zeros32 = WebRtcSpl_NormW32(aecm->echoFilt[i]) + 1;
1085     zeros16 = WebRtcSpl_NormW16(supGain) + 1;
1086     if (zeros32 + zeros16 > 16) {
1087       // Multiplication is safe
1088       // Result in
1089       // Q(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN+aecm->xfaQDomainBuf[diff])
1090       echoEst32Gained =
1091           WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i], (uint16_t)supGain);
1092       resolutionDiff = 14 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
1093       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
1094     } else {
1095       tmp16no1 = 17 - zeros32 - zeros16;
1096       resolutionDiff =
1097           14 + tmp16no1 - RESOLUTION_CHANNEL16 - RESOLUTION_SUPGAIN;
1098       resolutionDiff += (aecm->dfaCleanQDomain - zerosXBuf);
1099       if (zeros32 > tmp16no1) {
1100         echoEst32Gained = WEBRTC_SPL_UMUL_32_16((uint32_t)aecm->echoFilt[i],
1101                                                 supGain >> tmp16no1);
1102       } else {
1103         // Result in Q-(RESOLUTION_CHANNEL+RESOLUTION_SUPGAIN-16)
1104         echoEst32Gained = (aecm->echoFilt[i] >> tmp16no1) * supGain;
1105       }
1106     }
1107 
1108     zeros16 = WebRtcSpl_NormW16(aecm->nearFilt[i]);
1109     RTC_DCHECK_GE(zeros16, 0);  // |zeros16| is a norm, hence non-negative.
1110     dfa_clean_q_domain_diff = aecm->dfaCleanQDomain - aecm->dfaCleanQDomainOld;
1111     if (zeros16 < dfa_clean_q_domain_diff && aecm->nearFilt[i]) {
1112       tmp16no1 = aecm->nearFilt[i] << zeros16;
1113       qDomainDiff = zeros16 - dfa_clean_q_domain_diff;
1114       tmp16no2 = ptrDfaClean[i] >> -qDomainDiff;
1115     } else {
1116       tmp16no1 = dfa_clean_q_domain_diff < 0
1117                      ? aecm->nearFilt[i] >> -dfa_clean_q_domain_diff
1118                      : aecm->nearFilt[i] << dfa_clean_q_domain_diff;
1119       qDomainDiff = 0;
1120       tmp16no2 = ptrDfaClean[i];
1121     }
1122 
1123     tmp32no1 = (int32_t)(tmp16no2 - tmp16no1);
1124     tmp16no2 = (int16_t)(tmp32no1 >> 4);
1125     tmp16no2 += tmp16no1;
1126     zeros16 = WebRtcSpl_NormW16(tmp16no2);
1127     if ((tmp16no2) & (-qDomainDiff > zeros16)) {
1128       aecm->nearFilt[i] = WEBRTC_SPL_WORD16_MAX;
1129     } else {
1130       aecm->nearFilt[i] =
1131           qDomainDiff < 0 ? tmp16no2 << -qDomainDiff : tmp16no2 >> qDomainDiff;
1132     }
1133 
1134     // Wiener filter coefficients, resulting hnl in Q14
1135     if (echoEst32Gained == 0) {
1136       hnl[i] = ONE_Q14;
1137       numPosCoef++;
1138     } else if (aecm->nearFilt[i] == 0) {
1139       hnl[i] = 0;
1140     } else {
1141       // Multiply the suppression gain
1142       // Rounding
1143       echoEst32Gained += (uint32_t)(aecm->nearFilt[i] >> 1);
1144       tmpU32 =
1145           WebRtcSpl_DivU32U16(echoEst32Gained, (uint16_t)aecm->nearFilt[i]);
1146 
1147       // Current resolution is
1148       // Q-(RESOLUTION_CHANNEL + RESOLUTION_SUPGAIN
1149       //    - max(0, 17 - zeros16 - zeros32))
1150       // Make sure we are in Q14
1151       tmp32no1 = (int32_t)WEBRTC_SPL_SHIFT_W32(tmpU32, resolutionDiff);
1152       if (tmp32no1 > ONE_Q14) {
1153         hnl[i] = 0;
1154       } else if (tmp32no1 < 0) {
1155         hnl[i] = ONE_Q14;
1156         numPosCoef++;
1157       } else {
1158         // 1-echoEst/dfa
1159         hnl[i] = ONE_Q14 - (int16_t)tmp32no1;
1160         if (hnl[i] <= 0) {
1161           hnl[i] = 0;
1162         } else {
1163           numPosCoef++;
1164         }
1165       }
1166     }
1167   }
1168 
1169   // Only in wideband. Prevent the gain in upper band from being larger than
1170   // in lower band.
1171   if (aecm->mult == 2) {
1172     // TODO(bjornv): Investigate if the scaling of hnl[i] below can cause
1173     //               speech distortion in double-talk.
1174     for (i = 0; i < (PART_LEN1 >> 3); i++) {
1175       __asm __volatile(
1176           "lh         %[temp1],       0(%[ptr1])                  \n\t"
1177           "lh         %[temp2],       2(%[ptr1])                  \n\t"
1178           "lh         %[temp3],       4(%[ptr1])                  \n\t"
1179           "lh         %[temp4],       6(%[ptr1])                  \n\t"
1180           "lh         %[temp5],       8(%[ptr1])                  \n\t"
1181           "lh         %[temp6],       10(%[ptr1])                 \n\t"
1182           "lh         %[temp7],       12(%[ptr1])                 \n\t"
1183           "lh         %[temp8],       14(%[ptr1])                 \n\t"
1184           "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1185           "mul        %[temp2],       %[temp2],       %[temp2]    \n\t"
1186           "mul        %[temp3],       %[temp3],       %[temp3]    \n\t"
1187           "mul        %[temp4],       %[temp4],       %[temp4]    \n\t"
1188           "mul        %[temp5],       %[temp5],       %[temp5]    \n\t"
1189           "mul        %[temp6],       %[temp6],       %[temp6]    \n\t"
1190           "mul        %[temp7],       %[temp7],       %[temp7]    \n\t"
1191           "mul        %[temp8],       %[temp8],       %[temp8]    \n\t"
1192           "sra        %[temp1],       %[temp1],       14          \n\t"
1193           "sra        %[temp2],       %[temp2],       14          \n\t"
1194           "sra        %[temp3],       %[temp3],       14          \n\t"
1195           "sra        %[temp4],       %[temp4],       14          \n\t"
1196           "sra        %[temp5],       %[temp5],       14          \n\t"
1197           "sra        %[temp6],       %[temp6],       14          \n\t"
1198           "sra        %[temp7],       %[temp7],       14          \n\t"
1199           "sra        %[temp8],       %[temp8],       14          \n\t"
1200           "sh         %[temp1],       0(%[ptr1])                  \n\t"
1201           "sh         %[temp2],       2(%[ptr1])                  \n\t"
1202           "sh         %[temp3],       4(%[ptr1])                  \n\t"
1203           "sh         %[temp4],       6(%[ptr1])                  \n\t"
1204           "sh         %[temp5],       8(%[ptr1])                  \n\t"
1205           "sh         %[temp6],       10(%[ptr1])                 \n\t"
1206           "sh         %[temp7],       12(%[ptr1])                 \n\t"
1207           "sh         %[temp8],       14(%[ptr1])                 \n\t"
1208           "addiu      %[ptr1],        %[ptr1],        16          \n\t"
1209           : [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
1210             [temp4] "=&r"(temp4), [temp5] "=&r"(temp5), [temp6] "=&r"(temp6),
1211             [temp7] "=&r"(temp7), [temp8] "=&r"(temp8), [ptr1] "+r"(ptr1)
1212           :
1213           : "memory", "hi", "lo");
1214     }
1215     for (i = 0; i < (PART_LEN1 & 7); i++) {
1216       __asm __volatile(
1217           "lh         %[temp1],       0(%[ptr1])                  \n\t"
1218           "mul        %[temp1],       %[temp1],       %[temp1]    \n\t"
1219           "sra        %[temp1],       %[temp1],       14          \n\t"
1220           "sh         %[temp1],       0(%[ptr1])                  \n\t"
1221           "addiu      %[ptr1],        %[ptr1],        2           \n\t"
1222           : [temp1] "=&r"(temp1), [ptr1] "+r"(ptr1)
1223           :
1224           : "memory", "hi", "lo");
1225     }
1226 
1227     for (i = kMinPrefBand; i <= kMaxPrefBand; i++) {
1228       avgHnl32 += (int32_t)hnl[i];
1229     }
1230 
1231     RTC_DCHECK_GT(kMaxPrefBand - kMinPrefBand + 1, 0);
1232     avgHnl32 /= (kMaxPrefBand - kMinPrefBand + 1);
1233 
1234     for (i = kMaxPrefBand; i < PART_LEN1; i++) {
1235       if (hnl[i] > (int16_t)avgHnl32) {
1236         hnl[i] = (int16_t)avgHnl32;
1237       }
1238     }
1239   }
1240 
1241   // Calculate NLP gain, result is in Q14
1242   if (aecm->nlpFlag) {
1243     if (numPosCoef < 3) {
1244       for (i = 0; i < PART_LEN1; i++) {
1245         efw[i].real = 0;
1246         efw[i].imag = 0;
1247         hnl[i] = 0;
1248       }
1249     } else {
1250       for (i = 0; i < PART_LEN1; i++) {
1251 #if defined(MIPS_DSP_R1_LE)
1252         __asm __volatile(
1253             ".set       push                                        \n\t"
1254             ".set       noreorder                                   \n\t"
1255             "lh         %[temp1],       0(%[ptr])                   \n\t"
1256             "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1257             "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1258             "beqz       %[temp4],       3f                          \n\t"
1259             " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1260             "slti       %[temp5],       %[temp1],       3277        \n\t"
1261             "bnez       %[temp5],       2f                          \n\t"
1262             " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1263             "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1264             "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1265             "shra_r.w   %[temp2],       %[temp2],       14          \n\t"
1266             "shra_r.w   %[temp3],       %[temp3],       14          \n\t"
1267             "b          4f                                          \n\t"
1268             " nop                                                   \n\t"
1269             "2:                                                      \n\t"
1270             "addu       %[temp1],       $zero,          $zero       \n\t"
1271             "addu       %[temp2],       $zero,          $zero       \n\t"
1272             "addu       %[temp3],       $zero,          $zero       \n\t"
1273             "b          1f                                          \n\t"
1274             " nop                                                   \n\t"
1275             "3:                                                      \n\t"
1276             "addiu      %[temp1],       $0,             0x4000      \n\t"
1277             "1:                                                      \n\t"
1278             "sh         %[temp1],       0(%[ptr])                   \n\t"
1279             "4:                                                      \n\t"
1280             "sh         %[temp2],       0(%[er_ptr])                \n\t"
1281             "sh         %[temp3],       2(%[er_ptr])                \n\t"
1282             "addiu      %[ptr],         %[ptr],         2           \n\t"
1283             "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1284             ".set       pop                                         \n\t"
1285             : [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
1286               [temp4] "=&r"(temp4), [temp5] "=&r"(temp5), [ptr] "+r"(ptr),
1287               [er_ptr] "+r"(er_ptr), [dr_ptr] "+r"(dr_ptr)
1288             :
1289             : "memory", "hi", "lo");
1290 #else
1291         __asm __volatile(
1292             ".set       push                                        \n\t"
1293             ".set       noreorder                                   \n\t"
1294             "lh         %[temp1],       0(%[ptr])                   \n\t"
1295             "lh         %[temp2],       0(%[dr_ptr])                \n\t"
1296             "slti       %[temp4],       %[temp1],       0x4001      \n\t"
1297             "beqz       %[temp4],       3f                          \n\t"
1298             " lh        %[temp3],       2(%[dr_ptr])                \n\t"
1299             "slti       %[temp5],       %[temp1],       3277        \n\t"
1300             "bnez       %[temp5],       2f                          \n\t"
1301             " addiu     %[dr_ptr],      %[dr_ptr],      4           \n\t"
1302             "mul        %[temp2],       %[temp2],       %[temp1]    \n\t"
1303             "mul        %[temp3],       %[temp3],       %[temp1]    \n\t"
1304             "addiu      %[temp2],       %[temp2],       0x2000      \n\t"
1305             "addiu      %[temp3],       %[temp3],       0x2000      \n\t"
1306             "sra        %[temp2],       %[temp2],       14          \n\t"
1307             "sra        %[temp3],       %[temp3],       14          \n\t"
1308             "b          4f                                          \n\t"
1309             " nop                                                   \n\t"
1310             "2:                                                      \n\t"
1311             "addu       %[temp1],       $zero,          $zero       \n\t"
1312             "addu       %[temp2],       $zero,          $zero       \n\t"
1313             "addu       %[temp3],       $zero,          $zero       \n\t"
1314             "b          1f                                          \n\t"
1315             " nop                                                   \n\t"
1316             "3:                                                      \n\t"
1317             "addiu      %[temp1],       $0,             0x4000      \n\t"
1318             "1:                                                      \n\t"
1319             "sh         %[temp1],       0(%[ptr])                   \n\t"
1320             "4:                                                      \n\t"
1321             "sh         %[temp2],       0(%[er_ptr])                \n\t"
1322             "sh         %[temp3],       2(%[er_ptr])                \n\t"
1323             "addiu      %[ptr],         %[ptr],         2           \n\t"
1324             "addiu      %[er_ptr],      %[er_ptr],      4           \n\t"
1325             ".set       pop                                         \n\t"
1326             : [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
1327               [temp4] "=&r"(temp4), [temp5] "=&r"(temp5), [ptr] "+r"(ptr),
1328               [er_ptr] "+r"(er_ptr), [dr_ptr] "+r"(dr_ptr)
1329             :
1330             : "memory", "hi", "lo");
1331 #endif
1332       }
1333     }
1334   } else {
1335     // multiply with Wiener coefficients
1336     for (i = 0; i < PART_LEN1; i++) {
1337       efw[i].real = (int16_t)(
1338           WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].real, hnl[i], 14));
1339       efw[i].imag = (int16_t)(
1340           WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(dfw[i].imag, hnl[i], 14));
1341     }
1342   }
1343 
1344   if (aecm->cngMode == AecmTrue) {
1345     ComfortNoise(aecm, ptrDfaClean, efw, hnl);
1346   }
1347 
1348   InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
1349 
1350   return 0;
1351 }
1352 
1353 // Generate comfort noise and add to output signal.
ComfortNoise(AecmCore * aecm,const uint16_t * dfa,ComplexInt16 * out,const int16_t * lambda)1354 static void ComfortNoise(AecmCore* aecm,
1355                          const uint16_t* dfa,
1356                          ComplexInt16* out,
1357                          const int16_t* lambda) {
1358   int16_t i;
1359   int16_t tmp16, tmp161, tmp162, tmp163, nrsh1, nrsh2;
1360   int32_t tmp32, tmp321, tnoise, tnoise1;
1361   int32_t tmp322, tmp323, *tmp1;
1362   int16_t* dfap;
1363   int16_t* lambdap;
1364   const int32_t c2049 = 2049;
1365   const int32_t c359 = 359;
1366   const int32_t c114 = ONE_Q14;
1367 
1368   int16_t randW16[PART_LEN];
1369   int16_t uReal[PART_LEN1];
1370   int16_t uImag[PART_LEN1];
1371   int32_t outLShift32;
1372 
1373   int16_t shiftFromNearToNoise = kNoiseEstQDomain - aecm->dfaCleanQDomain;
1374   int16_t minTrackShift = 9;
1375 
1376   RTC_DCHECK_GE(shiftFromNearToNoise, 0);
1377   RTC_DCHECK_LT(shiftFromNearToNoise, 16);
1378 
1379   if (aecm->noiseEstCtr < 100) {
1380     // Track the minimum more quickly initially.
1381     aecm->noiseEstCtr++;
1382     minTrackShift = 6;
1383   }
1384 
1385   // Generate a uniform random array on [0 2^15-1].
1386   WebRtcSpl_RandUArray(randW16, PART_LEN, &aecm->seed);
1387   int16_t* randW16p = (int16_t*)randW16;
1388 #if defined(MIPS_DSP_R1_LE)
1389   int16_t* kCosTablep = (int16_t*)WebRtcAecm_kCosTable;
1390   int16_t* kSinTablep = (int16_t*)WebRtcAecm_kSinTable;
1391 #endif  // #if defined(MIPS_DSP_R1_LE)
1392   tmp1 = (int32_t*)aecm->noiseEst + 1;
1393   dfap = (int16_t*)dfa + 1;
1394   lambdap = (int16_t*)lambda + 1;
1395   // Estimate noise power.
1396   for (i = 1; i < PART_LEN1; i += 2) {
1397     // Shift to the noise domain.
1398     __asm __volatile(
1399         "lh     %[tmp32],       0(%[dfap])                              \n\t"
1400         "lw     %[tnoise],      0(%[tmp1])                              \n\t"
1401         "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1402         : [tmp32] "=&r"(tmp32), [outLShift32] "=r"(outLShift32),
1403           [tnoise] "=&r"(tnoise)
1404         : [tmp1] "r"(tmp1), [dfap] "r"(dfap),
1405           [shiftFromNearToNoise] "r"(shiftFromNearToNoise)
1406         : "memory");
1407 
1408     if (outLShift32 < tnoise) {
1409       // Reset "too low" counter
1410       aecm->noiseEstTooLowCtr[i] = 0;
1411       // Track the minimum.
1412       if (tnoise < (1 << minTrackShift)) {
1413         // For small values, decrease noiseEst[i] every
1414         // |kNoiseEstIncCount| block. The regular approach below can not
1415         // go further down due to truncation.
1416         aecm->noiseEstTooHighCtr[i]++;
1417         if (aecm->noiseEstTooHighCtr[i] >= kNoiseEstIncCount) {
1418           tnoise--;
1419           aecm->noiseEstTooHighCtr[i] = 0;  // Reset the counter
1420         }
1421       } else {
1422         __asm __volatile(
1423             "subu   %[tmp32],       %[tnoise],      %[outLShift32]      \n\t"
1424             "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1425             "subu   %[tnoise],      %[tnoise],      %[tmp32]            \n\t"
1426             : [tmp32] "=&r"(tmp32), [tnoise] "+r"(tnoise)
1427             :
1428             [outLShift32] "r"(outLShift32), [minTrackShift] "r"(minTrackShift));
1429       }
1430     } else {
1431       // Reset "too high" counter
1432       aecm->noiseEstTooHighCtr[i] = 0;
1433       // Ramp slowly upwards until we hit the minimum again.
1434       if ((tnoise >> 19) <= 0) {
1435         if ((tnoise >> 11) > 0) {
1436           // Large enough for relative increase
1437           __asm __volatile(
1438               "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1439               "sra    %[tnoise],  %[tnoise],  11          \n\t"
1440               : [tnoise] "+r"(tnoise)
1441               : [c2049] "r"(c2049)
1442               : "hi", "lo");
1443         } else {
1444           // Make incremental increases based on size every
1445           // |kNoiseEstIncCount| block
1446           aecm->noiseEstTooLowCtr[i]++;
1447           if (aecm->noiseEstTooLowCtr[i] >= kNoiseEstIncCount) {
1448             __asm __volatile(
1449                 "sra    %[tmp32],   %[tnoise],  9           \n\t"
1450                 "addi   %[tnoise],  %[tnoise],  1           \n\t"
1451                 "addu   %[tnoise],  %[tnoise],  %[tmp32]    \n\t"
1452                 : [tnoise] "+r"(tnoise), [tmp32] "=&r"(tmp32)
1453                 :);
1454             aecm->noiseEstTooLowCtr[i] = 0;  // Reset counter
1455           }
1456         }
1457       } else {
1458         // Avoid overflow.
1459         // Multiplication with 2049 will cause wrap around. Scale
1460         // down first and then multiply
1461         __asm __volatile(
1462             "sra    %[tnoise],  %[tnoise],  11          \n\t"
1463             "mul    %[tnoise],  %[tnoise],  %[c2049]    \n\t"
1464             : [tnoise] "+r"(tnoise)
1465             : [c2049] "r"(c2049)
1466             : "hi", "lo");
1467       }
1468     }
1469 
1470     // Shift to the noise domain.
1471     __asm __volatile(
1472         "lh     %[tmp32],       2(%[dfap])                              \n\t"
1473         "lw     %[tnoise1],     4(%[tmp1])                              \n\t"
1474         "addiu  %[dfap],        %[dfap],    4                           \n\t"
1475         "sllv   %[outLShift32], %[tmp32],   %[shiftFromNearToNoise]     \n\t"
1476         : [tmp32] "=&r"(tmp32), [dfap] "+r"(dfap),
1477           [outLShift32] "=r"(outLShift32), [tnoise1] "=&r"(tnoise1)
1478         : [tmp1] "r"(tmp1), [shiftFromNearToNoise] "r"(shiftFromNearToNoise)
1479         : "memory");
1480 
1481     if (outLShift32 < tnoise1) {
1482       // Reset "too low" counter
1483       aecm->noiseEstTooLowCtr[i + 1] = 0;
1484       // Track the minimum.
1485       if (tnoise1 < (1 << minTrackShift)) {
1486         // For small values, decrease noiseEst[i] every
1487         // |kNoiseEstIncCount| block. The regular approach below can not
1488         // go further down due to truncation.
1489         aecm->noiseEstTooHighCtr[i + 1]++;
1490         if (aecm->noiseEstTooHighCtr[i + 1] >= kNoiseEstIncCount) {
1491           tnoise1--;
1492           aecm->noiseEstTooHighCtr[i + 1] = 0;  // Reset the counter
1493         }
1494       } else {
1495         __asm __volatile(
1496             "subu   %[tmp32],       %[tnoise1],     %[outLShift32]      \n\t"
1497             "srav   %[tmp32],       %[tmp32],       %[minTrackShift]    \n\t"
1498             "subu   %[tnoise1],     %[tnoise1],     %[tmp32]            \n\t"
1499             : [tmp32] "=&r"(tmp32), [tnoise1] "+r"(tnoise1)
1500             :
1501             [outLShift32] "r"(outLShift32), [minTrackShift] "r"(minTrackShift));
1502       }
1503     } else {
1504       // Reset "too high" counter
1505       aecm->noiseEstTooHighCtr[i + 1] = 0;
1506       // Ramp slowly upwards until we hit the minimum again.
1507       if ((tnoise1 >> 19) <= 0) {
1508         if ((tnoise1 >> 11) > 0) {
1509           // Large enough for relative increase
1510           __asm __volatile(
1511               "mul    %[tnoise1], %[tnoise1], %[c2049]   \n\t"
1512               "sra    %[tnoise1], %[tnoise1], 11         \n\t"
1513               : [tnoise1] "+r"(tnoise1)
1514               : [c2049] "r"(c2049)
1515               : "hi", "lo");
1516         } else {
1517           // Make incremental increases based on size every
1518           // |kNoiseEstIncCount| block
1519           aecm->noiseEstTooLowCtr[i + 1]++;
1520           if (aecm->noiseEstTooLowCtr[i + 1] >= kNoiseEstIncCount) {
1521             __asm __volatile(
1522                 "sra    %[tmp32],   %[tnoise1], 9           \n\t"
1523                 "addi   %[tnoise1], %[tnoise1], 1           \n\t"
1524                 "addu   %[tnoise1], %[tnoise1], %[tmp32]    \n\t"
1525                 : [tnoise1] "+r"(tnoise1), [tmp32] "=&r"(tmp32)
1526                 :);
1527             aecm->noiseEstTooLowCtr[i + 1] = 0;  // Reset counter
1528           }
1529         }
1530       } else {
1531         // Avoid overflow.
1532         // Multiplication with 2049 will cause wrap around. Scale
1533         // down first and then multiply
1534         __asm __volatile(
1535             "sra    %[tnoise1], %[tnoise1], 11          \n\t"
1536             "mul    %[tnoise1], %[tnoise1], %[c2049]    \n\t"
1537             : [tnoise1] "+r"(tnoise1)
1538             : [c2049] "r"(c2049)
1539             : "hi", "lo");
1540       }
1541     }
1542 
1543     __asm __volatile(
1544         "lh     %[tmp16],   0(%[lambdap])                           \n\t"
1545         "lh     %[tmp161],  2(%[lambdap])                           \n\t"
1546         "sw     %[tnoise],  0(%[tmp1])                              \n\t"
1547         "sw     %[tnoise1], 4(%[tmp1])                              \n\t"
1548         "subu   %[tmp16],   %[c114],        %[tmp16]                \n\t"
1549         "subu   %[tmp161],  %[c114],        %[tmp161]               \n\t"
1550         "srav   %[tmp32],   %[tnoise],      %[shiftFromNearToNoise] \n\t"
1551         "srav   %[tmp321],  %[tnoise1],     %[shiftFromNearToNoise] \n\t"
1552         "addiu  %[lambdap], %[lambdap],     4                       \n\t"
1553         "addiu  %[tmp1],    %[tmp1],        8                       \n\t"
1554         : [tmp16] "=&r"(tmp16), [tmp161] "=&r"(tmp161), [tmp1] "+r"(tmp1),
1555           [tmp32] "=&r"(tmp32), [tmp321] "=&r"(tmp321), [lambdap] "+r"(lambdap)
1556         : [tnoise] "r"(tnoise), [tnoise1] "r"(tnoise1), [c114] "r"(c114),
1557           [shiftFromNearToNoise] "r"(shiftFromNearToNoise)
1558         : "memory");
1559 
1560     if (tmp32 > 32767) {
1561       tmp32 = 32767;
1562       aecm->noiseEst[i] = tmp32 << shiftFromNearToNoise;
1563     }
1564     if (tmp321 > 32767) {
1565       tmp321 = 32767;
1566       aecm->noiseEst[i + 1] = tmp321 << shiftFromNearToNoise;
1567     }
1568 
1569     __asm __volatile(
1570         "mul    %[tmp32],   %[tmp32],       %[tmp16]                \n\t"
1571         "mul    %[tmp321],  %[tmp321],      %[tmp161]               \n\t"
1572         "sra    %[nrsh1],   %[tmp32],       14                      \n\t"
1573         "sra    %[nrsh2],   %[tmp321],      14                      \n\t"
1574         : [nrsh1] "=&r"(nrsh1), [nrsh2] "=r"(nrsh2)
1575         : [tmp16] "r"(tmp16), [tmp161] "r"(tmp161), [tmp32] "r"(tmp32),
1576           [tmp321] "r"(tmp321)
1577         : "memory", "hi", "lo");
1578 
1579     __asm __volatile(
1580         "lh     %[tmp32],       0(%[randW16p])              \n\t"
1581         "lh     %[tmp321],      2(%[randW16p])              \n\t"
1582         "addiu  %[randW16p],    %[randW16p],    4           \n\t"
1583         "mul    %[tmp32],       %[tmp32],       %[c359]     \n\t"
1584         "mul    %[tmp321],      %[tmp321],      %[c359]     \n\t"
1585         "sra    %[tmp16],       %[tmp32],       15          \n\t"
1586         "sra    %[tmp161],      %[tmp321],      15          \n\t"
1587         : [randW16p] "+r"(randW16p), [tmp32] "=&r"(tmp32), [tmp16] "=r"(tmp16),
1588           [tmp161] "=r"(tmp161), [tmp321] "=&r"(tmp321)
1589         : [c359] "r"(c359)
1590         : "memory", "hi", "lo");
1591 
1592 #if !defined(MIPS_DSP_R1_LE)
1593     tmp32 = WebRtcAecm_kCosTable[tmp16];
1594     tmp321 = WebRtcAecm_kSinTable[tmp16];
1595     tmp322 = WebRtcAecm_kCosTable[tmp161];
1596     tmp323 = WebRtcAecm_kSinTable[tmp161];
1597 #else
1598     __asm __volatile(
1599         "sll    %[tmp16],       %[tmp16],                   1           \n\t"
1600         "sll    %[tmp161],      %[tmp161],                  1           \n\t"
1601         "lhx    %[tmp32],       %[tmp16](%[kCosTablep])                 \n\t"
1602         "lhx    %[tmp321],      %[tmp16](%[kSinTablep])                 \n\t"
1603         "lhx    %[tmp322],      %[tmp161](%[kCosTablep])                \n\t"
1604         "lhx    %[tmp323],      %[tmp161](%[kSinTablep])                \n\t"
1605         : [tmp32] "=&r"(tmp32), [tmp321] "=&r"(tmp321), [tmp322] "=&r"(tmp322),
1606           [tmp323] "=&r"(tmp323)
1607         : [kCosTablep] "r"(kCosTablep), [tmp16] "r"(tmp16),
1608           [tmp161] "r"(tmp161), [kSinTablep] "r"(kSinTablep)
1609         : "memory");
1610 #endif
1611     __asm __volatile(
1612         "mul    %[tmp32],       %[tmp32],                   %[nrsh1]    \n\t"
1613         "negu   %[tmp162],      %[nrsh1]                                \n\t"
1614         "mul    %[tmp322],      %[tmp322],                  %[nrsh2]    \n\t"
1615         "negu   %[tmp163],      %[nrsh2]                                \n\t"
1616         "sra    %[tmp32],       %[tmp32],                   13          \n\t"
1617         "mul    %[tmp321],      %[tmp321],                  %[tmp162]   \n\t"
1618         "sra    %[tmp322],      %[tmp322],                  13          \n\t"
1619         "mul    %[tmp323],      %[tmp323],                  %[tmp163]   \n\t"
1620         "sra    %[tmp321],      %[tmp321],                  13          \n\t"
1621         "sra    %[tmp323],      %[tmp323],                  13          \n\t"
1622         : [tmp32] "+r"(tmp32), [tmp321] "+r"(tmp321), [tmp162] "=&r"(tmp162),
1623           [tmp322] "+r"(tmp322), [tmp323] "+r"(tmp323), [tmp163] "=&r"(tmp163)
1624         : [nrsh1] "r"(nrsh1), [nrsh2] "r"(nrsh2)
1625         : "hi", "lo");
1626     // Tables are in Q13.
1627     uReal[i] = (int16_t)tmp32;
1628     uImag[i] = (int16_t)tmp321;
1629     uReal[i + 1] = (int16_t)tmp322;
1630     uImag[i + 1] = (int16_t)tmp323;
1631   }
1632 
1633   int32_t tt, sgn;
1634   tt = out[0].real;
1635   sgn = ((int)tt) >> 31;
1636   out[0].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1637   tt = out[0].imag;
1638   sgn = ((int)tt) >> 31;
1639   out[0].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1640   for (i = 1; i < PART_LEN; i++) {
1641     tt = out[i].real + uReal[i];
1642     sgn = ((int)tt) >> 31;
1643     out[i].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1644     tt = out[i].imag + uImag[i];
1645     sgn = ((int)tt) >> 31;
1646     out[i].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1647   }
1648   tt = out[PART_LEN].real + uReal[PART_LEN];
1649   sgn = ((int)tt) >> 31;
1650   out[PART_LEN].real = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1651   tt = out[PART_LEN].imag;
1652   sgn = ((int)tt) >> 31;
1653   out[PART_LEN].imag = sgn == (int16_t)(tt >> 15) ? (int16_t)tt : (16384 ^ sgn);
1654 }
1655 
1656 }  // namespace webrtc
1657