1 /*
2  *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/audio_processing/agc2/rnn_vad/rnn.h"
12 
13 #include <algorithm>
14 #include <array>
15 #include <cmath>
16 
17 #include "rtc_base/checks.h"
18 #include "third_party/rnnoise/src/rnn_activations.h"
19 #include "third_party/rnnoise/src/rnn_vad_weights.h"
20 
21 namespace webrtc {
22 namespace rnn_vad {
23 
// Quantized weights are stored as int8_t and rescaled to float with this
// factor before applying the activation function.
using rnnoise::kWeightsScale;

// Input (fully-connected) layer parameters. The layer consumes one feature
// vector per frame, so its input size must equal kFeatureVectorSize.
using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseWeights;
using rnnoise::kInputDenseBias;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

// Hidden (GRU) layer parameters. The state vector is statically allocated
// with kRecurrentLayersMaxUnits entries, hence the upper bound below.
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

// Output (fully-connected) layer parameters; produces the VAD probability.
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

// Activation functions borrowed from RNNoise.
using rnnoise::RectifiedLinearUnit;
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;
50 
// Fully-connected layer over quantized (int8_t) weights and bias terms.
// |weights| is expected to hold |input_size| * |output_size| coefficients laid
// out so that weights_[i * output_size_ + o] is the coefficient for input i
// and output unit o (see ComputeOutput()). |activation_function| is applied
// to each scaled pre-activation value. The views must outlive this object.
FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      activation_function_(activation_function) {
  // |output_| is a fixed-size array; the requested size must fit in it.
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}
70 
// Defined out-of-line to keep the (header-declared) class lightweight.
FullyConnectedLayer::~FullyConnectedLayer() = default;
72 
GetOutput() const73 rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
74   return rtc::ArrayView<const float>(output_.data(), output_size_);
75 }
76 
ComputeOutput(rtc::ArrayView<const float> input)77 void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
78   // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
79   // operations.
80   for (size_t o = 0; o < output_size_; ++o) {
81     output_[o] = bias_[o];
82     // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
83     // |weights_| change the performance across different platforms.
84     for (size_t i = 0; i < input_size_; ++i) {
85       output_[o] += input[i] * weights_[i * output_size_ + o];
86     }
87     output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
88   }
89 }
90 
// Gated recurrent unit (GRU) layer over quantized (int8_t) parameters. The
// bias, input weight and recurrent weight views each pack the parameters for
// the three gates (update, reset, output) back to back, hence the factor 3 in
// the size checks below. The views must outlive this object. The recurrent
// state is zero-initialized via Reset().
GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      recurrent_weights_(recurrent_weights),
      activation_function_(activation_function) {
  // |state_| is a fixed-size array; the requested size must fit in it.
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
      << "sufficient.";
  RTC_DCHECK_EQ(3 * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(3 * input_size_ * output_size_, recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
      << " size.";
  Reset();
}
116 
// Defined out-of-line to keep the (header-declared) class lightweight.
GatedRecurrentLayer::~GatedRecurrentLayer() = default;
118 
GetOutput() const119 rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
120   return rtc::ArrayView<const float>(state_.data(), output_size_);
121 }
122 
Reset()123 void GatedRecurrentLayer::Reset() {
124   state_.fill(0.f);
125 }
126 
ComputeOutput(rtc::ArrayView<const float> input)127 void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
128   // TODO(bugs.chromium.org/9076): Optimize using SSE/AVX fused multiply-add
129   // operations.
130   // Stride and offset used to read parameter arrays.
131   const size_t stride = 3 * output_size_;
132   size_t offset = 0;
133 
134   // Compute update gates.
135   std::array<float, kRecurrentLayersMaxUnits> update;
136   for (size_t o = 0; o < output_size_; ++o) {
137     update[o] = bias_[o];
138     // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
139     // |weights_| and |recurrent_weights_| change the performance across
140     // different platforms.
141     for (size_t i = 0; i < input_size_; ++i) {  // Add input.
142       update[o] += input[i] * weights_[i * stride + o];
143     }
144     for (size_t s = 0; s < output_size_; ++s) {
145       update[o] += state_[s] * recurrent_weights_[s * stride + o];
146     }  // Add state.
147     update[o] = SigmoidApproximated(kWeightsScale * update[o]);
148   }
149 
150   // Compute reset gates.
151   offset += output_size_;
152   std::array<float, kRecurrentLayersMaxUnits> reset;
153   for (size_t o = 0; o < output_size_; ++o) {
154     reset[o] = bias_[offset + o];
155     for (size_t i = 0; i < input_size_; ++i) {  // Add input.
156       reset[o] += input[i] * weights_[offset + i * stride + o];
157     }
158     for (size_t s = 0; s < output_size_; ++s) {  // Add state.
159       reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
160     }
161     reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
162   }
163 
164   // Compute output.
165   offset += output_size_;
166   std::array<float, kRecurrentLayersMaxUnits> output;
167   for (size_t o = 0; o < output_size_; ++o) {
168     output[o] = bias_[offset + o];
169     for (size_t i = 0; i < input_size_; ++i) {  // Add input.
170       output[o] += input[i] * weights_[offset + i * stride + o];
171     }
172     for (size_t s = 0; s < output_size_;
173          ++s) {  // Add state through reset gates.
174       output[o] +=
175           state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
176     }
177     output[o] = (*activation_function_)(kWeightsScale * output[o]);
178     // Update output through the update gates.
179     output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
180   }
181 
182   // Update the state. Not done in the previous loop since that would pollute
183   // the current state and lead to incorrect output values.
184   std::copy(output.begin(), output.end(), state_.begin());
185 }
186 
// Builds the 3-layer RNNoise-based VAD network: a tansig fully-connected
// input layer, a ReLU GRU hidden layer and a sigmoid fully-connected output
// layer (single unit producing the voice probability). All weights come from
// the rnnoise weight tables included above.
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    RectifiedLinearUnit),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}
210 
// Defined out-of-line to keep the (header-declared) class lightweight.
RnnBasedVad::~RnnBasedVad() = default;
212 
// Resets the network. Only the GRU layer is stateful; the fully-connected
// layers have no memory across frames.
void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}
216 
ComputeVadProbability(rtc::ArrayView<const float,kFeatureVectorSize> feature_vector,bool is_silence)217 float RnnBasedVad::ComputeVadProbability(
218     rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
219     bool is_silence) {
220   if (is_silence) {
221     Reset();
222     return 0.f;
223   }
224   input_layer_.ComputeOutput(feature_vector);
225   hidden_layer_.ComputeOutput(input_layer_.GetOutput());
226   output_layer_.ComputeOutput(hidden_layer_.GetOutput());
227   const auto vad_output = output_layer_.GetOutput();
228   return vad_output[0];
229 }
230 
231 }  // namespace rnn_vad
232 }  // namespace webrtc
233