/*
 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

#include <algorithm>
#include <array>
#include <cmath>

#include "rtc_base/checks.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"

namespace webrtc {
namespace rnn_vad {

using rnnoise::kWeightsScale;

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseWeights;
using rnnoise::kInputDenseBias;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::RectifiedLinearUnit;
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;

FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}

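// Computes the output of the fully-connected layer as
//   y[o] = f(kWeightsScale * (b[o] + sum_i x[i] * W[i][o])),
// where f is the activation function. The weights and bias terms come from
// rnnoise and are quantized to int8_t; kWeightsScale maps the accumulated
// dot product back to the float domain before the activation is applied.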
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.webrtc.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  for (size_t o = 0; o < output_size_; ++o) {
    output_[o] = bias_[o];
    // TODO(bugs.webrtc.org/9076): Benchmark how different layouts for
    // |weights_| change the performance across different platforms.
    for (size_t i = 0; i < input_size_; ++i) {
      output_[o] += input[i] * weights_[i * output_size_ + o];
    }
    output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
  }
}

GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      recurrent_weights_(recurrent_weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
      << "sufficient.";
  RTC_DCHECK_EQ(3 * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(3 * output_size_ * output_size_, recurrent_weights_.size())
      << "Mismatching output size and recurrent weight coefficients array"
      << " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}

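// Computes the GRU output for the current input x given the previous state h.
// With .* denoting the element-wise product, the computation is:
//   update gate: u = sigmoid(kWeightsScale * (W_u x + R_u h + b_u))
//   reset gate:  r = sigmoid(kWeightsScale * (W_r x + R_r h + b_r))
//   candidate:   c = f(kWeightsScale * (W_c x + R_c (r .* h) + b_c))
//   new state:   h' = u .* h + (1 - u) .* c
// where f is the activation function passed to the constructor. The update,
// reset and candidate parameters are stored one after the other within each
// row of |bias_|, |weights_| and |recurrent_weights_|, hence the stride and
// offset used below.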
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.webrtc.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  // Stride and offset used to read parameter arrays.
  const size_t stride = 3 * output_size_;
  size_t offset = 0;

  // Compute update gates.
  std::array<float, kRecurrentLayersMaxUnits> update;
  for (size_t o = 0; o < output_size_; ++o) {
    update[o] = bias_[o];
    // TODO(bugs.webrtc.org/9076): Benchmark how different layouts for
    // |weights_| and |recurrent_weights_| change the performance across
    // different platforms.
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      update[o] += input[i] * weights_[i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      update[o] += state_[s] * recurrent_weights_[s * stride + o];
    }
    update[o] = SigmoidApproximated(kWeightsScale * update[o]);
  }

  // Compute reset gates.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> reset;
  for (size_t o = 0; o < output_size_; ++o) {
    reset[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      reset[o] += input[i] * weights_[offset + i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
    }
    reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
  }

  // Compute output.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> output;
  for (size_t o = 0; o < output_size_; ++o) {
    output[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      output[o] += input[i] * weights_[offset + i * stride + o];
    }
    // Add state through the reset gates.
    for (size_t s = 0; s < output_size_; ++s) {
      output[o] +=
          state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
    }
    output[o] = (*activation_function_)(kWeightsScale * output[o]);
    // Update output through the update gates.
    output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
  }

  // Update the state. Not done in the previous loop since that would pollute
  // the current state and lead to incorrect output values.
  std::copy(output.begin(), output.end(), state_.begin());
}

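// The network chains the three layers defined by the rnnoise weights: a
// fully-connected input layer with (approximated) tanh activation, a GRU
// hidden layer with ReLU activation and a fully-connected output layer with
// (approximated) sigmoid activation from which the voice probability is read.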
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    RectifiedLinearUnit),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}

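// Returns the voice probability for the current feature vector by feeding it
// forward through the three layers. When |is_silence| is true, inference is
// skipped: the recurrent state is reset and 0 is returned.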
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}

}  // namespace rnn_vad
}  // namespace webrtc