1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "common_audio/signal_processing/include/real_fft.h"
14 #include "modules/audio_processing/aecm/aecm_core.h"
15 #include "rtc_base/checks.h"
16 
17 namespace webrtc {
18 
19 namespace {
20 
21 // TODO(kma): Re-write the corresponding assembly file, the offset
22 // generating script and makefile, to replace these C functions.
23 
// Horizontally adds the four 32-bit lanes of `v` and stores the sum
// (wrapping modulo 2^32) into `*ptr`. The previous value of `*ptr` is
// overwritten, not accumulated into.
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  // AArch64 provides a single across-vector add.
  *ptr = vaddvq_u32(v);
#else
  // Fold the upper half onto the lower half, then pairwise-add the two
  // remaining lanes; lane 0 then holds the total.
  const uint32x2_t halves = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  const uint32x2_t total = vpadd_u32(halves, halves);
  *ptr = vget_lane_u32(total, 0);
#endif
}
34 
35 }  // namespace
36 
// NEON version of the linear energy computation: fills `echo_est` and
// computes the energy of the delayed far-end signal and of the estimated
// echo for both the stored and the adapted channel.
//
// Scalar equivalent:
//   for (i = 0; i < PART_LEN1; i++) {
//     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
//                                         far_spectrum[i]);
//     (*far_energy) += (uint32_t)(far_spectrum[i]);
//     *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
//     (*echo_energy_stored) += (uint32_t)echo_est[i];
//   }
//
// The three energy outputs are overwritten by the vector sums (AddLanes
// assigns through its pointer) before the final bin is added, so whatever
// the caller left in them is discarded.
//
// NOTE(review): the vector path multiplies the channel values as unsigned
// 16-bit lanes, while the tail goes through WEBRTC_SPL_MUL_16_U16;
// presumably the channel values are never negative -- confirm.
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  const int16_t* const stored = aecm->channelStored;
  const int16_t* const adapt = aecm->channelAdapt16;

  // Four-lane running sums; collapsed to scalars after the loop.
  uint32x4_t far_sum = vdupq_n_u32(0);
  uint32x4_t adapt_sum = vdupq_n_u32(0);
  uint32x4_t stored_sum = vdupq_n_u32(0);

  // Vector part: the first PART_LEN bins, eight per iteration.
  for (int i = 0; i < PART_LEN; i += 8) {
    const uint16x8_t spectrum = vld1q_u16(far_spectrum + i);
    const uint16x8_t adapt_u = vreinterpretq_u16_s16(vld1q_s16(adapt + i));
    const uint16x8_t stored_u = vreinterpretq_u16_s16(vld1q_s16(stored + i));
    const uint16x4_t spectrum_lo = vget_low_u16(spectrum);
    const uint16x4_t spectrum_hi = vget_high_u16(spectrum);

    // Far-end energy: widen each 16-bit bin to 32 bits while adding.
    far_sum = vaddw_u16(far_sum, spectrum_lo);
    far_sum = vaddw_u16(far_sum, spectrum_hi);

    // Echo estimate from the stored channel, written out as int32.
    const uint32x4_t est_lo = vmull_u16(vget_low_u16(stored_u), spectrum_lo);
    const uint32x4_t est_hi = vmull_u16(vget_high_u16(stored_u), spectrum_hi);
    vst1q_s32(echo_est + i, vreinterpretq_s32_u32(est_lo));
    vst1q_s32(echo_est + i + 4, vreinterpretq_s32_u32(est_hi));

    stored_sum = vaddq_u32(est_lo, stored_sum);
    stored_sum = vaddq_u32(est_hi, stored_sum);

    // Adapted-channel echo energy: multiply-accumulate, nothing stored.
    adapt_sum = vmlal_u16(adapt_sum, vget_low_u16(adapt_u), spectrum_lo);
    adapt_sum = vmlal_u16(adapt_sum, vget_high_u16(adapt_u), spectrum_hi);
  }

  AddLanes(far_energy, far_sum);
  AddLanes(echo_energy_stored, stored_sum);
  AddLanes(echo_energy_adapt, adapt_sum);

  // Scalar tail: the one remaining bin at index PART_LEN.
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
108 
// NEON version of storing the adaptive channel: copies
// aecm->channelAdapt16 into aecm->channelStored and recomputes `echo_est`
// from the copied channel and `far_spectrum`.
//
// Scalar equivalent:
//   memcpy(aecm->channelStored, aecm->channelAdapt16,
//          sizeof(int16_t) * PART_LEN1);
//   for (i = 0; i < PART_LEN1; i++) {
//     echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
//                                         far_spectrum[i]);
//   }
//
// Debug builds check that `echo_est` is 32-byte aligned and that both
// channel buffers are 16-byte aligned.
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm,
                                         const uint16_t* far_spectrum,
                                         int32_t* echo_est) {
  RTC_DCHECK_EQ(0, (uintptr_t)echo_est % 32);
  RTC_DCHECK_EQ(0, (uintptr_t)aecm->channelStored % 16);
  RTC_DCHECK_EQ(0, (uintptr_t)aecm->channelAdapt16 % 16);

  const int16_t* const adapt = aecm->channelAdapt16;
  int16_t* const stored = aecm->channelStored;

  // Vector part: the first PART_LEN bins, eight per iteration.
  for (int i = 0; i < PART_LEN; i += 8) {
    const uint16x8_t spectrum = vld1q_u16(far_spectrum + i);
    const int16x8_t channel = vld1q_s16(adapt + i);

    // Copy the adapted channel into the stored channel.
    vst1q_s16(stored + i, channel);

    // Widening 16x16 -> 32 bit products of spectrum and channel.
    const uint16x8_t channel_u = vreinterpretq_u16_s16(channel);
    const uint32x4_t est_lo =
        vmull_u16(vget_low_u16(spectrum), vget_low_u16(channel_u));
    const uint32x4_t est_hi =
        vmull_u16(vget_high_u16(spectrum), vget_high_u16(channel_u));

    vst1q_s32(echo_est + i, vreinterpretq_s32_u32(est_lo));
    vst1q_s32(echo_est + i + 4, vreinterpretq_s32_u32(est_hi));
  }

  // Scalar tail: the one remaining bin at index PART_LEN.
  aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN];
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
}
167 
// NEON version of resetting the adaptive channel: copies
// aecm->channelStored into aecm->channelAdapt16 and writes the same values,
// scaled by 2^16, into aecm->channelAdapt32.
//
// Scalar equivalent:
//   for (i = 0; i < PART_LEN1; i++) {
//     aecm->channelAdapt16[i] = aecm->channelStored[i];
//     aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
//                (int32_t)aecm->channelStored[i], 16);
//   }
//
// Debug builds check that the 16-bit channel buffers are 16-byte aligned
// and that channelAdapt32 is 32-byte aligned.
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore* aecm) {
  RTC_DCHECK_EQ(0, (uintptr_t)aecm->channelStored % 16);
  RTC_DCHECK_EQ(0, (uintptr_t)aecm->channelAdapt16 % 16);
  RTC_DCHECK_EQ(0, (uintptr_t)aecm->channelAdapt32 % 32);

  const int16_t* const stored = aecm->channelStored;
  int16_t* const adapt16 = aecm->channelAdapt16;
  int32_t* const adapt32 = aecm->channelAdapt32;

  // Vector part: the first PART_LEN bins, eight per iteration.
  for (int i = 0; i < PART_LEN; i += 8) {
    const int16x8_t channel = vld1q_s16(stored + i);
    vst1q_s16(adapt16 + i, channel);

    // Widen each 16-bit value to 32 bits and shift it left by 16.
    const int32x4_t wide_lo = vshll_n_s16(vget_low_s16(channel), 16);
    const int32x4_t wide_hi = vshll_n_s16(vget_high_s16(channel), 16);

    vst1q_s32(adapt32 + i, wide_lo);
    vst1q_s32(adapt32 + i + 4, wide_hi);
  }

  // Scalar tail: the one remaining bin at index PART_LEN.
  aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN];
  // Multiply rather than `<< 16`: left-shifting a negative signed value is
  // undefined behaviour before C++20, whereas the product is defined for
  // every int16_t and matches what the vector path above produces.
  aecm->channelAdapt32[PART_LEN] =
      (int32_t)aecm->channelStored[PART_LEN] * (1 << 16);
}
205 
206 }  // namespace webrtc
207