1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "common_audio/vad/vad_core.h"
12 
13 #include "rtc_base/sanitizer.h"
14 #include "common_audio/signal_processing/include/signal_processing_library.h"
15 #include "common_audio/vad/vad_filterbank.h"
16 #include "common_audio/vad/vad_gmm.h"
17 #include "common_audio/vad/vad_sp.h"
18 #include "typedefs.h"  // NOLINT(build/include)
19 
// Spectrum weighting per frequency sub-band, used when combining the per
// channel log likelihood ratios into the global VAD decision.
static const int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 };
static const int16_t kNoiseUpdateConst = 655; // Q15
static const int16_t kSpeechUpdateConst = 6554; // Q15
static const int16_t kBackEta = 154; // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {
    544, 544, 576, 576, 576, 576 };
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {
    11392, 11392, 11520, 11520, 11520, 11520 };
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = { 640, 768 };
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {
    9216, 9088, 8960, 8832, 8704, 8576 };
// Start values for the Gaussian models, Q7.
// All |kTableSize| tables are laid out as [gaussian][channel] flattened, i.e.
// entry (channel + k * kNumChannels) belongs to Gaussian k of that channel.
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {
    34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 };
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {
    48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 };
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 };
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {
    8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 };
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms).
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2Q[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdQ[3] = { 24, 21, 24 };
static const int16_t kGlobalThresholdQ[3] = { 57, 48, 57 };
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = { 8, 4, 3 };
static const int16_t kOverHangMax2LBR[3] = { 14, 7, 5 };
static const int16_t kLocalThresholdLBR[3] = { 37, 32, 37 };
static const int16_t kGlobalThresholdLBR[3] = { 100, 80, 100 };
// Mode 2, Aggressive.
static const int16_t kOverHangMax1AGG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2AGG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdAGG[3] = { 82, 78, 82 };
static const int16_t kGlobalThresholdAGG[3] = { 285, 260, 285 };
// Mode 3, Very aggressive.
static const int16_t kOverHangMax1VAG[3] = { 6, 3, 2 };
static const int16_t kOverHangMax2VAG[3] = { 9, 5, 3 };
static const int16_t kLocalThresholdVAG[3] = { 94, 94, 94 };
static const int16_t kGlobalThresholdVAG[3] = { 1100, 1050, 1100 };
93 
94 // Calculates the weighted average w.r.t. number of Gaussians. The |data| are
95 // updated with an |offset| before averaging.
96 //
97 // - data     [i/o] : Data to average.
98 // - offset   [i]   : An offset added to |data|.
99 // - weights  [i]   : Weights used for averaging.
100 //
101 // returns          : The weighted average.
WeightedAverage(int16_t * data,int16_t offset,const int16_t * weights)102 static int32_t WeightedAverage(int16_t* data, int16_t offset,
103                                const int16_t* weights) {
104   int k;
105   int32_t weighted_average = 0;
106 
107   for (k = 0; k < kNumGaussians; k++) {
108     data[k * kNumChannels] += offset;
109     weighted_average += data[k * kNumChannels] * weights[k * kNumChannels];
110   }
111   return weighted_average;
112 }
113 
114 // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still
115 // undefined behavior, so not a good idea; this just makes UBSan ignore the
116 // violation, so that our old code can continue to do what it's always been
117 // doing.)
118 static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
OverflowingMulS16ByS32ToS32(int16_t a,int32_t b)119     OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) {
120   return a * b;
121 }
122 
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
// type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length |kNumChannels|
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, non-zero - speech).
//                          Values > 1 occur when the decision is extended by
//                          the hangover logic at the end of this function.
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of current frame is large enough for processing. The
    // processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as |channel|.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // |kNumGaussians|, with different means and standard deviations depending
      // on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update |sum_log_likelihood_ratios| with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      if ((log_likelihood_ratio * 4) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below are applied on the
      // hard coded number of Gaussians set to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for each
        // Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the first
        // Gaussian and 0 to the rest (which is already set at initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for each
        // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {

      // Get minimum value in past which is used for long term correction in Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = |noise_probability[k]| /
          //   (|noise_probability[0]| + |noise_probability[1]|)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // |deltaS| = (x-mu)/sigma^2
          // sgprvec[k] = |speech_probability[k]| /
          //   (|speech_probability[0]| + |speech_probability[1]|)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Control that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
          // Q20  * approx 0.001 (2^-10=0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t) WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;  // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // |noise_global_mean| in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // |speech_global_mean| in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // |diff| = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t) (speech_global_mean >> 9) -
          (int16_t) (noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // |tmp1_s16| = ~0.8 * (kMinimumDifference - diff) in Q7.
        // |tmp2_s16| = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);

        // Move Gaussian means for speech model by |tmp1_s16| and update
        // |speech_global_mean|. Note that |self->speech_means[channel]| is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -|tmp2_s16| and update
        // |noise_global_mean|. Note that |self->noise_means[channel]| is
        // changed after the call.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t) (noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
  if (!vadflag) {
    // No speech detected this frame; keep reporting (a value > 1) while the
    // hangover counter runs down.
    if (self->over_hang > 0) {
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    // Speech detected; after |kMaxSpeechFrames| consecutive speech frames,
    // arm the longer hangover |overhead2| instead of |overhead1|.
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}
490 
491 // Initialize the VAD. Set aggressiveness mode to default value.
WebRtcVad_InitCore(VadInstT * self)492 int WebRtcVad_InitCore(VadInstT* self) {
493   int i;
494 
495   if (self == NULL) {
496     return -1;
497   }
498 
499   // Initialization of general struct variables.
500   self->vad = 1;  // Speech active (=1).
501   self->frame_counter = 0;
502   self->over_hang = 0;
503   self->num_of_speech = 0;
504 
505   // Initialization of downsampling filter state.
506   memset(self->downsampling_filter_states, 0,
507          sizeof(self->downsampling_filter_states));
508 
509   // Initialization of 48 to 8 kHz downsampling.
510   WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
511 
512   // Read initial PDF parameters.
513   for (i = 0; i < kTableSize; i++) {
514     self->noise_means[i] = kNoiseDataMeans[i];
515     self->speech_means[i] = kSpeechDataMeans[i];
516     self->noise_stds[i] = kNoiseDataStds[i];
517     self->speech_stds[i] = kSpeechDataStds[i];
518   }
519 
520   // Initialize Index and Minimum value vectors.
521   for (i = 0; i < 16 * kNumChannels; i++) {
522     self->low_value_vector[i] = 10000;
523     self->index_vector[i] = 0;
524   }
525 
526   // Initialize splitting filter states.
527   memset(self->upper_state, 0, sizeof(self->upper_state));
528   memset(self->lower_state, 0, sizeof(self->lower_state));
529 
530   // Initialize high pass filter states.
531   memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));
532 
533   // Initialize mean value memory, for WebRtcVad_FindMinimum().
534   for (i = 0; i < kNumChannels; i++) {
535     self->mean_value[i] = 1600;
536   }
537 
538   // Set aggressiveness mode to default (=|kDefaultMode|).
539   if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) {
540     return -1;
541   }
542 
543   self->init_flag = kInitCheck;
544 
545   return 0;
546 }
547 
548 // Set aggressiveness mode
WebRtcVad_set_mode_core(VadInstT * self,int mode)549 int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
550   int return_value = 0;
551 
552   switch (mode) {
553     case 0:
554       // Quality mode.
555       memcpy(self->over_hang_max_1, kOverHangMax1Q,
556              sizeof(self->over_hang_max_1));
557       memcpy(self->over_hang_max_2, kOverHangMax2Q,
558              sizeof(self->over_hang_max_2));
559       memcpy(self->individual, kLocalThresholdQ,
560              sizeof(self->individual));
561       memcpy(self->total, kGlobalThresholdQ,
562              sizeof(self->total));
563       break;
564     case 1:
565       // Low bitrate mode.
566       memcpy(self->over_hang_max_1, kOverHangMax1LBR,
567              sizeof(self->over_hang_max_1));
568       memcpy(self->over_hang_max_2, kOverHangMax2LBR,
569              sizeof(self->over_hang_max_2));
570       memcpy(self->individual, kLocalThresholdLBR,
571              sizeof(self->individual));
572       memcpy(self->total, kGlobalThresholdLBR,
573              sizeof(self->total));
574       break;
575     case 2:
576       // Aggressive mode.
577       memcpy(self->over_hang_max_1, kOverHangMax1AGG,
578              sizeof(self->over_hang_max_1));
579       memcpy(self->over_hang_max_2, kOverHangMax2AGG,
580              sizeof(self->over_hang_max_2));
581       memcpy(self->individual, kLocalThresholdAGG,
582              sizeof(self->individual));
583       memcpy(self->total, kGlobalThresholdAGG,
584              sizeof(self->total));
585       break;
586     case 3:
587       // Very aggressive mode.
588       memcpy(self->over_hang_max_1, kOverHangMax1VAG,
589              sizeof(self->over_hang_max_1));
590       memcpy(self->over_hang_max_2, kOverHangMax2VAG,
591              sizeof(self->over_hang_max_2));
592       memcpy(self->individual, kLocalThresholdVAG,
593              sizeof(self->individual));
594       memcpy(self->total, kGlobalThresholdVAG,
595              sizeof(self->total));
596       break;
597     default:
598       return_value = -1;
599       break;
600   }
601 
602   return return_value;
603 }
604 
605 // Calculate VAD decision by first extracting feature values and then calculate
606 // probability for both speech and background noise.
607 
WebRtcVad_CalcVad48khz(VadInstT * inst,const int16_t * speech_frame,size_t frame_length)608 int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
609                            size_t frame_length) {
610   int vad;
611   size_t i;
612   int16_t speech_nb[240];  // 30 ms in 8 kHz.
613   // |tmp_mem| is a temporary memory used by resample function, length is
614   // frame length in 10 ms (480 samples) + 256 extra.
615   int32_t tmp_mem[480 + 256] = { 0 };
616   const size_t kFrameLen10ms48khz = 480;
617   const size_t kFrameLen10ms8khz = 80;
618   size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;
619 
620   for (i = 0; i < num_10ms_frames; i++) {
621     WebRtcSpl_Resample48khzTo8khz(speech_frame,
622                                   &speech_nb[i * kFrameLen10ms8khz],
623                                   &inst->state_48_to_8,
624                                   tmp_mem);
625   }
626 
627   // Do VAD on an 8 kHz signal
628   vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);
629 
630   return vad;
631 }
632 
// Runs VAD on a 32 kHz (super-wideband) frame: downsample 32 -> 16 -> 8 kHz,
// then delegate to WebRtcVad_CalcVad8khz().
int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
    int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB)
    int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)
    size_t length = frame_length;

    // First stage: 32 kHz -> 16 kHz.
    WebRtcVad_Downsampling(speech_frame, speechWB,
                           &(inst->downsampling_filter_states[2]), length);
    length /= 2;

    // Second stage: 16 kHz -> 8 kHz.
    WebRtcVad_Downsampling(speechWB, speechNB,
                           inst->downsampling_filter_states, length);
    length /= 2;

    // Do VAD on the resulting 8 kHz signal.
    return WebRtcVad_CalcVad8khz(inst, speechNB, length);
}
655 
// Runs VAD on a 16 kHz (wideband) frame: downsample to 8 kHz, then delegate
// to WebRtcVad_CalcVad8khz().
int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
    int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB)

    // Downsample 16 kHz -> 8 kHz before doing VAD.
    WebRtcVad_Downsampling(speech_frame, speechNB,
                           inst->downsampling_filter_states, frame_length);

    return WebRtcVad_CalcVad8khz(inst, speechNB, frame_length / 2);
}
672 
// Runs VAD directly on an 8 kHz frame: extract per-band log-energy features
// plus total power, then run the GMM hypothesis test. The decision is also
// cached in |inst->vad|.
int WebRtcVad_CalcVad8khz(VadInstT* inst, const int16_t* speech_frame,
                          size_t frame_length)
{
    int16_t feature_vector[kNumChannels];
    int16_t total_power;

    // Get power in the bands.
    total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length,
                                              feature_vector);

    // Make a VAD decision and store it on the instance.
    inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length);

    return inst->vad;
}
687