1 /*
2  *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef MODULES_AUDIO_PROCESSING_BEAMFORMER_NONLINEAR_BEAMFORMER_H_
12 #define MODULES_AUDIO_PROCESSING_BEAMFORMER_NONLINEAR_BEAMFORMER_H_
13 
14 // MSVC++ requires this to be set before any other includes to get M_PI.
15 #define _USE_MATH_DEFINES
16 
17 #include <math.h>
18 
19 #include <memory>
20 #include <vector>
21 
22 #include "common_audio/lapped_transform.h"
23 #include "common_audio/channel_buffer.h"
24 #include "modules/audio_processing/beamformer/array_util.h"
25 #include "modules/audio_processing/beamformer/complex_matrix.h"
26 
27 namespace webrtc {
28 
29 class PostFilterTransform : public LappedTransform::Callback {
30  public:
31   PostFilterTransform(size_t num_channels,
32                       size_t chunk_length,
33                       float* window,
34                       size_t fft_size);
35 
36   void ProcessChunk(float* const* data, float* final_mask);
37 
38  protected:
39   void ProcessAudioBlock(const complex<float>* const* input,
40                          size_t num_input_channels,
41                          size_t num_freq_bins,
42                          size_t num_output_channels,
43                          complex<float>* const* output) override;
44 
45  private:
46   LappedTransform transform_;
47   const size_t num_freq_bins_;
48   float* final_mask_;
49 };
50 
51 // Enhances sound sources coming directly in front of a uniform linear array
52 // and suppresses sound sources coming from all other directions. Operates on
53 // multichannel signals and produces single-channel output.
54 //
// The implemented nonlinear postfilter algorithm is taken from "A Robust
// Nonlinear Beamforming Postprocessor" by Bastiaan Kleijn.
57 class NonlinearBeamformer : public LappedTransform::Callback {
58  public:
59   static const float kHalfBeamWidthRadians;
60 
61   explicit NonlinearBeamformer(
62       const std::vector<Point>& array_geometry,
63       size_t num_postfilter_channels = 1u,
64       SphericalPointf target_direction =
65           SphericalPointf(static_cast<float>(M_PI) / 2.f, 0.f, 1.f));
66   ~NonlinearBeamformer() override;
67 
68   // Sample rate corresponds to the lower band.
69   // Needs to be called before the NonlinearBeamformer can be used.
70   virtual void Initialize(int chunk_size_ms, int sample_rate_hz);
71 
72   // Analyzes one time-domain chunk of audio. The audio is expected to be split
73   // into frequency bands inside the ChannelBuffer. The number of frames and
74   // channels must correspond to the constructor parameters.
75   virtual void AnalyzeChunk(const ChannelBuffer<float>& data);
76 
77   // Applies the postfilter mask to one chunk of audio. The audio is expected to
78   // be split into frequency bands inside the ChannelBuffer. The number of
79   // frames and channels must correspond to the constructor parameters.
80   virtual void PostFilter(ChannelBuffer<float>* data);
81 
82   virtual void AimAt(const SphericalPointf& target_direction);
83 
84   virtual bool IsInBeam(const SphericalPointf& spherical_point);
85 
86   // After processing each block |is_target_present_| is set to true if the
87   // target signal es present and to false otherwise. This methods can be called
88   // to know if the data is target signal or interference and process it
89   // accordingly.
90   virtual bool is_target_present();
91 
92  protected:
93   // Process one frequency-domain block of audio. This is where the fun
94   // happens. Implements LappedTransform::Callback.
95   void ProcessAudioBlock(const complex<float>* const* input,
96                          size_t num_input_channels,
97                          size_t num_freq_bins,
98                          size_t num_output_channels,
99                          complex<float>* const* output) override;
100 
101  private:
102   FRIEND_TEST_ALL_PREFIXES(NonlinearBeamformerTest,
103                            InterfAnglesTakeAmbiguityIntoAccount);
104 
105   typedef Matrix<float> MatrixF;
106   typedef ComplexMatrix<float> ComplexMatrixF;
107   typedef complex<float> complex_f;
108 
109   void InitLowFrequencyCorrectionRanges();
110   void InitHighFrequencyCorrectionRanges();
111   void InitInterfAngles();
112   void InitDelaySumMasks();
113   void InitTargetCovMats();
114   void InitDiffuseCovMats();
115   void InitInterfCovMats();
116   void NormalizeCovMats();
117 
118   // Calculates postfilter masks that minimize the mean squared error of our
119   // estimation of the desired signal.
120   float CalculatePostfilterMask(const ComplexMatrixF& interf_cov_mat,
121                                 float rpsiw,
122                                 float ratio_rxiw_rxim,
123                                 float rmxi_r);
124 
125   // Prevents the postfilter masks from degenerating too quickly (a cause of
126   // musical noise).
127   void ApplyMaskTimeSmoothing();
128   void ApplyMaskFrequencySmoothing();
129 
130   // The postfilter masks are unreliable at low frequencies. Calculates a better
131   // mask by averaging mid-low frequency values.
132   void ApplyLowFrequencyCorrection();
133 
134   // Postfilter masks are also unreliable at high frequencies. Average mid-high
135   // frequency masks to calculate a single mask per block which can be applied
136   // in the time-domain. Further, we average these block-masks over a chunk,
137   // resulting in one postfilter mask per audio chunk. This allows us to skip
138   // both transforming and blocking the high-frequency signal.
139   void ApplyHighFrequencyCorrection();
140 
141   // Compute the means needed for the above frequency correction.
142   float MaskRangeMean(size_t start_bin, size_t end_bin);
143 
144   // Applies post-filter mask to |input| and store in |output|.
145   void ApplyPostFilter(const complex_f* input, complex_f* output);
146 
147   void EstimateTargetPresence();
148 
149   static const size_t kFftSize = 256;
150   static const size_t kNumFreqBins = kFftSize / 2 + 1;
151 
152   // Deals with the fft transform and blocking.
153   size_t chunk_length_;
154   std::unique_ptr<LappedTransform> process_transform_;
155   std::unique_ptr<PostFilterTransform> postfilter_transform_;
156   float window_[kFftSize];
157 
158   // Parameters exposed to the user.
159   const size_t num_input_channels_;
160   const size_t num_postfilter_channels_;
161   int sample_rate_hz_;
162 
163   const std::vector<Point> array_geometry_;
164   // The normal direction of the array if it has one and it is in the xy-plane.
165   const rtc::Optional<Point> array_normal_;
166 
167   // Minimum spacing between microphone pairs.
168   const float min_mic_spacing_;
169 
170   // Calculated based on user-input and constants in the .cc file.
171   size_t low_mean_start_bin_;
172   size_t low_mean_end_bin_;
173   size_t high_mean_start_bin_;
174   size_t high_mean_end_bin_;
175 
176   // Quickly varying mask updated every block.
177   float new_mask_[kNumFreqBins];
178   // Time smoothed mask.
179   float time_smooth_mask_[kNumFreqBins];
180   // Time and frequency smoothed mask.
181   float final_mask_[kNumFreqBins];
182 
183   float target_angle_radians_;
184   // Angles of the interferer scenarios.
185   std::vector<float> interf_angles_radians_;
186   // The angle between the target and the interferer scenarios.
187   const float away_radians_;
188 
189   // Array of length |kNumFreqBins|, Matrix of size |1| x |num_channels_|.
190   ComplexMatrixF delay_sum_masks_[kNumFreqBins];
191 
192   // Arrays of length |kNumFreqBins|, Matrix of size |num_input_channels_| x
193   // |num_input_channels_|.
194   ComplexMatrixF target_cov_mats_[kNumFreqBins];
195   ComplexMatrixF uniform_cov_mat_[kNumFreqBins];
196   // Array of length |kNumFreqBins|, Matrix of size |num_input_channels_| x
197   // |num_input_channels_|. The vector has a size equal to the number of
198   // interferer scenarios.
199   std::vector<std::unique_ptr<ComplexMatrixF>> interf_cov_mats_[kNumFreqBins];
200 
201   // Of length |kNumFreqBins|.
202   float wave_numbers_[kNumFreqBins];
203 
204   // Preallocated for ProcessAudioBlock()
205   // Of length |kNumFreqBins|.
206   float rxiws_[kNumFreqBins];
207   // The vector has a size equal to the number of interferer scenarios.
208   std::vector<float> rpsiws_[kNumFreqBins];
209 
210   // The microphone normalization factor.
211   ComplexMatrixF eig_m_;
212 
213   // For processing the high-frequency input signal.
214   float high_pass_postfilter_mask_;
215   float old_high_pass_mask_;
216 
217   // True when the target signal is present.
218   bool is_target_present_;
219   // Number of blocks after which the data is considered interference if the
220   // mask does not pass |kMaskSignalThreshold|.
221   size_t hold_target_blocks_;
222   // Number of blocks since the last mask that passed |kMaskSignalThreshold|.
223   size_t interference_blocks_count_;
224 };
225 
226 }  // namespace webrtc
227 
228 #endif  // MODULES_AUDIO_PROCESSING_BEAMFORMER_NONLINEAR_BEAMFORMER_H_
229