1 /* 2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 // 12 // Specifies core class for intelligbility enhancement. 13 // 14 15 #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 16 #define WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 17 18 #include <complex> 19 #include <vector> 20 21 #include "webrtc/base/scoped_ptr.h" 22 #include "webrtc/common_audio/lapped_transform.h" 23 #include "webrtc/common_audio/channel_buffer.h" 24 #include "webrtc/modules/audio_processing/intelligibility/intelligibility_utils.h" 25 26 namespace webrtc { 27 28 // Speech intelligibility enhancement module. Reads render and capture 29 // audio streams and modifies the render stream with a set of gains per 30 // frequency bin to enhance speech against the noise background. 31 // Note: assumes speech and noise streams are already separated. 32 class IntelligibilityEnhancer { 33 public: 34 struct Config { 35 // |var_*| are parameters for the VarianceArray constructor for the 36 // clear speech stream. 37 // TODO(bercic): the |var_*|, |*_rate| and |gain_limit| parameters should 38 // probably go away once fine tuning is done. ConfigConfig39 Config() 40 : sample_rate_hz(16000), 41 num_capture_channels(1), 42 num_render_channels(1), 43 var_type(intelligibility::VarianceArray::kStepDecaying), 44 var_decay_rate(0.9f), 45 var_window_size(10), 46 analysis_rate(800), 47 gain_change_limit(0.1f), 48 rho(0.02f) {} 49 int sample_rate_hz; 50 int num_capture_channels; 51 int num_render_channels; 52 intelligibility::VarianceArray::StepType var_type; 53 float var_decay_rate; 54 size_t var_window_size; 55 int analysis_rate; 56 float gain_change_limit; 57 float rho; 58 }; 59 60 explicit IntelligibilityEnhancer(const Config& config); 61 IntelligibilityEnhancer(); // Initialize with default config. 62 63 // Reads and processes chunk of noise stream in time domain. 64 void AnalyzeCaptureAudio(float* const* audio, 65 int sample_rate_hz, 66 int num_channels); 67 68 // Reads chunk of speech in time domain and updates with modified signal. 69 void ProcessRenderAudio(float* const* audio, 70 int sample_rate_hz, 71 int num_channels); 72 bool active() const; 73 74 private: 75 enum AudioSource { 76 kRenderStream = 0, // Clear speech stream. 77 kCaptureStream, // Noise stream. 78 }; 79 80 // Provides access point to the frequency domain. 81 class TransformCallback : public LappedTransform::Callback { 82 public: 83 TransformCallback(IntelligibilityEnhancer* parent, AudioSource source); 84 85 // All in frequency domain, receives input |in_block|, applies 86 // intelligibility enhancement, and writes result to |out_block|. 87 void ProcessAudioBlock(const std::complex<float>* const* in_block, 88 int in_channels, 89 size_t frames, 90 int out_channels, 91 std::complex<float>* const* out_block) override; 92 93 private: 94 IntelligibilityEnhancer* parent_; 95 AudioSource source_; 96 }; 97 friend class TransformCallback; 98 #ifndef WEBRTC_AUDIO_PROCESSING_ONLY_BUILD 99 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestErbCreation); 100 FRIEND_TEST_ALL_PREFIXES(IntelligibilityEnhancerTest, TestSolveForGains); 101 #endif 102 103 // Sends streams to ProcessClearBlock or ProcessNoiseBlock based on source. 104 void DispatchAudio(AudioSource source, 105 const std::complex<float>* in_block, 106 std::complex<float>* out_block); 107 108 // Updates variance computation and analysis with |in_block_|, 109 // and writes modified speech to |out_block|. 110 void ProcessClearBlock(const std::complex<float>* in_block, 111 std::complex<float>* out_block); 112 113 // Computes and sets modified gains. 114 void AnalyzeClearBlock(float power_target); 115 116 // Bisection search for optimal |lambda|. 117 void SolveForLambda(float power_target, float power_bot, float power_top); 118 119 // Transforms freq gains to ERB gains. 120 void UpdateErbGains(); 121 122 // Updates variance calculation for noise input with |in_block|. 123 void ProcessNoiseBlock(const std::complex<float>* in_block, 124 std::complex<float>* out_block); 125 126 // Returns number of ERB filters. 127 static size_t GetBankSize(int sample_rate, size_t erb_resolution); 128 129 // Initializes ERB filterbank. 130 void CreateErbBank(); 131 132 // Analytically solves quadratic for optimal gains given |lambda|. 133 // Negative gains are set to 0. Stores the results in |sols|. 134 void SolveForGainsGivenLambda(float lambda, size_t start_freq, float* sols); 135 136 // Computes variance across ERB filters from freq variance |var|. 137 // Stores in |result|. 138 void FilterVariance(const float* var, float* result); 139 140 // Returns dot product of vectors specified by size |length| arrays |a|,|b|. 141 static float DotProduct(const float* a, const float* b, size_t length); 142 143 const size_t freqs_; // Num frequencies in frequency domain. 144 const size_t window_size_; // Window size in samples; also the block size. 145 const size_t chunk_length_; // Chunk size in samples. 146 const size_t bank_size_; // Num ERB filters. 147 const int sample_rate_hz_; 148 const int erb_resolution_; 149 const int num_capture_channels_; 150 const int num_render_channels_; 151 const int analysis_rate_; // Num blocks before gains recalculated. 152 153 const bool active_; // Whether render gains are being updated. 154 // TODO(ekm): Add logic for updating |active_|. 155 156 intelligibility::VarianceArray clear_variance_; 157 intelligibility::VarianceArray noise_variance_; 158 rtc::scoped_ptr<float[]> filtered_clear_var_; 159 rtc::scoped_ptr<float[]> filtered_noise_var_; 160 std::vector<std::vector<float>> filter_bank_; 161 rtc::scoped_ptr<float[]> center_freqs_; 162 size_t start_freq_; 163 rtc::scoped_ptr<float[]> rho_; // Production and interpretation SNR. 164 // for each ERB band. 165 rtc::scoped_ptr<float[]> gains_eq_; // Pre-filter modified gains. 166 intelligibility::GainApplier gain_applier_; 167 168 // Destination buffers used to reassemble blocked chunks before overwriting 169 // the original input array with modifications. 170 ChannelBuffer<float> temp_render_out_buffer_; 171 ChannelBuffer<float> temp_capture_out_buffer_; 172 173 rtc::scoped_ptr<float[]> kbd_window_; 174 TransformCallback render_callback_; 175 TransformCallback capture_callback_; 176 rtc::scoped_ptr<LappedTransform> render_mangler_; 177 rtc::scoped_ptr<LappedTransform> capture_mangler_; 178 int block_count_; 179 int analysis_step_; 180 }; 181 182 } // namespace webrtc 183 184 #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INTELLIGIBILITY_INTELLIGIBILITY_ENHANCER_H_ 185