/*
 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

#include <algorithm>
#include <array>
#include <cmath>

#include "rtc_base/checks.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"

namespace webrtc {
namespace rnn_vad {

using rnnoise::kWeightsScale;

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseWeights;
using rnnoise::kInputDenseBias;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputDenseBias;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::RectifiedLinearUnit;
using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;

FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}

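// Computes the output of the fully-connected layer as
//   y[o] = f(kWeightsScale * (b[o] + sum_i x[i] * W[i][o])),
// where f is the activation function. The weights and bias terms come from
// rnnoise and are quantized to int8_t; kWeightsScale maps the accumulated
// dot product back to the float domain before the activation is applied.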
void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.webrtc.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  for (size_t o = 0; o < output_size_; ++o) {
    output_[o] = bias_[o];
    // TODO(bugs.webrtc.org/9076): Benchmark how different layouts for
    // |weights_| change the performance across different platforms.
    for (size_t i = 0; i < input_size_; ++i) {
      output_[o] += input[i] * weights_[i * output_size_ + o];
    }
    output_[o] = (*activation_function_)(kWeightsScale * output_[o]);
  }
}

GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    float (*const activation_function)(float))
    : input_size_(input_size),
      output_size_(output_size),
      bias_(bias),
      weights_(weights),
      recurrent_weights_(recurrent_weights),
      activation_function_(activation_function) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
      << "sufficient.";
  RTC_DCHECK_EQ(3 * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(3 * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(3 * output_size_ * output_size_, recurrent_weights_.size())
      << "Mismatching output size and recurrent weight coefficients array"
      << " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}

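// Computes the GRU output for the current input x given the previous state h.
// With .* denoting the element-wise product, the computation is:
//   update gate: u = sigmoid(kWeightsScale * (W_u x + R_u h + b_u))
//   reset gate:  r = sigmoid(kWeightsScale * (W_r x + R_r h + b_r))
//   candidate:   c = f(kWeightsScale * (W_c x + R_c (r .* h) + b_c))
//   new state:   h' = u .* h + (1 - u) .* c
// where f is the activation function passed to the constructor. The update,
// reset and candidate parameters are stored one after the other within each
// row of |bias_|, |weights_| and |recurrent_weights_|, hence the stride and
// offset used below.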
void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  // TODO(bugs.webrtc.org/9076): Optimize using SSE/AVX fused multiply-add
  // operations.
  // Stride and offset used to read parameter arrays.
  const size_t stride = 3 * output_size_;
  size_t offset = 0;

  // Compute update gates.
  std::array<float, kRecurrentLayersMaxUnits> update;
  for (size_t o = 0; o < output_size_; ++o) {
    update[o] = bias_[o];
    // TODO(bugs.webrtc.org/9076): Benchmark how different layouts for
    // |weights_| and |recurrent_weights_| change the performance across
    // different platforms.
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      update[o] += input[i] * weights_[i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      update[o] += state_[s] * recurrent_weights_[s * stride + o];
    }
    update[o] = SigmoidApproximated(kWeightsScale * update[o]);
  }

  // Compute reset gates.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> reset;
  for (size_t o = 0; o < output_size_; ++o) {
    reset[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      reset[o] += input[i] * weights_[offset + i * stride + o];
    }
    for (size_t s = 0; s < output_size_; ++s) {  // Add state.
      reset[o] += state_[s] * recurrent_weights_[offset + s * stride + o];
    }
    reset[o] = SigmoidApproximated(kWeightsScale * reset[o]);
  }

  // Compute output.
  offset += output_size_;
  std::array<float, kRecurrentLayersMaxUnits> output;
  for (size_t o = 0; o < output_size_; ++o) {
    output[o] = bias_[offset + o];
    for (size_t i = 0; i < input_size_; ++i) {  // Add input.
      output[o] += input[i] * weights_[offset + i * stride + o];
    }
    // Add state through the reset gates.
    for (size_t s = 0; s < output_size_; ++s) {
      output[o] +=
          state_[s] * recurrent_weights_[offset + s * stride + o] * reset[s];
    }
    output[o] = (*activation_function_)(kWeightsScale * output[o]);
    // Update output through the update gates.
    output[o] = update[o] * state_[o] + (1.f - update[o]) * output[o];
  }

  // Update the state. Not done in the previous loop since that would pollute
  // the current state and lead to incorrect output values.
  std::copy(output.begin(), output.end(), state_.begin());
}

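// The network chains the three layers defined by the rnnoise weights: a
// fully-connected input layer with (approximated) tanh activation, a GRU
// hidden layer with ReLU activation and a fully-connected output layer with
// (approximated) sigmoid activation from which the voice probability is read.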
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    RectifiedLinearUnit),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layers sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layers sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}

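// Returns the voice probability for the current feature vector by feeding it
// forward through the three layers. When |is_silence| is true, inference is
// skipped: the recurrent state is reset and 0 is returned.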
float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}

}  // namespace rnn_vad
}  // namespace webrtc