1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 #include <math.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 
16 #include <algorithm>
17 
18 #include "gflags/gflags.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20 #include "webrtc/modules/audio_processing/agc/agc.h"
21 #include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"
22 #include "webrtc/modules/audio_processing/agc/common.h"
23 #include "webrtc/modules/audio_processing/agc/histogram.h"
24 #include "webrtc/modules/audio_processing/agc/pitch_based_vad.h"
25 #include "webrtc/modules/audio_processing/agc/standalone_vad.h"
26 #include "webrtc/modules/audio_processing/agc/utility.h"
27 #include "webrtc/modules/interface/module_common_types.h"
28 
// Length (in 10 ms frames) of the sliding analysis window over which the
// audio-content histogram is accumulated.
static const int kAgcAnalWindowSamples = 100;
// Default fraction of the analysis window that must be classified active
// before the combined VAD reports speech; overridable via
// --activity_threshold.
static const double kDefaultActivityThreshold = 0.3;

// Command-line flags controlling the inputs/outputs of this tool.
DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
              " format");
DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
              " probabilities) in double format. One activity per 10ms is"
              " required. If no file is given the video information is not"
              " incorporated. Negative activity is interpreted as video is"
              " not adapted and the statistics are not computed during"
              " the learning phase. Note that the negative video activities"
              " are ONLY allowed at the beginning.");
DEFINE_string(result, "", "name of a file to write the results. The results"
              " will be appended to the end of the file. This is optional.");
DEFINE_string(audio_content, "", "name of a file where audio content is written"
              " to, in double format.");
DEFINE_double(activity_threshold, kDefaultActivityThreshold,
              "Activity threshold");
48 
49 namespace webrtc {
50 
51 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will
52 // notify the caller of "silence" input, instead of bailing out. We would not
53 // need the following function when such a change is made.
54 
55 // Add some dither to quiet frames. This avoids the ExtractFeatures skip a
56 // silence frame. Otherwise true VAD would drift with respect to the audio.
57 // We only consider mono inputs.
DitherSilence(AudioFrame * frame)58 static void DitherSilence(AudioFrame* frame) {
59   ASSERT_EQ(1, frame->num_channels_);
60   const double kRmsSilence = 5;
61   const double sum_squared_silence = kRmsSilence * kRmsSilence *
62       frame->samples_per_channel_;
63   double sum_squared = 0;
64   for (int n = 0; n < frame->samples_per_channel_; n++)
65     sum_squared += frame->data_[n] * frame->data_[n];
66   if (sum_squared <= sum_squared_silence) {
67     for (int n = 0; n < frame->samples_per_channel_; n++)
68       frame->data_[n] = (rand() & 0xF) - 8;
69   }
70 }
71 
// Accumulates audio (and optionally video) activity evidence and produces a
// combined frame-level VAD decision based on the "audio content" histogram.
// Audio is fed in 10 ms mono frames via AddAudio(); decisions become
// available whenever the underlying feature extractor emits a batch of
// frames (features.num_frames > 0).
class AgcStat {
 public:
  AgcStat()
      : video_index_(0),
        activity_threshold_(kDefaultActivityThreshold),
        audio_content_(Histogram::Create(kAgcAnalWindowSamples)),
        audio_processing_(new AgcAudioProc()),
        vad_(new PitchBasedVad()),
        standalone_vad_(StandaloneVad::Create()),
        audio_content_fid_(NULL) {
    // Neutral prior (0.5) for video activity until real values arrive.
    for (int n = 0; n < kMaxNumFrames; n++)
      video_vad_[n] = 0.5;
  }

  // Closes the audio-content dump file if one was handed over via
  // set_audio_content_file(); this class owns that FILE*.
  ~AgcStat() {
    if (audio_content_fid_ != NULL) {
      fclose(audio_content_fid_);
    }
  }

  // Takes ownership of |audio_content_fid|; each audio-content value is
  // appended to it as a raw double.
  void set_audio_content_file(FILE* audio_content_fid) {
    audio_content_fid_ = audio_content_fid;
  }

  // Processes one 10 ms mono frame at kSampleRateHz together with the
  // video-based activity probability |p_video|. On success returns the
  // number of decisions written to |combined_vad| (0 if the feature
  // extractor is still buffering); returns -1 on a format mismatch or an
  // internal VAD failure.
  int AddAudio(const AudioFrame& frame, double p_video,
               int* combined_vad) {
    if (frame.num_channels_ != 1 ||
        frame.samples_per_channel_ !=
            kSampleRateHz / 100 ||
            frame.sample_rate_hz_ != kSampleRateHz)
      return -1;
    // Buffer the video probability; video_index_ is reset whenever a batch
    // of decisions is emitted below.
    video_vad_[video_index_++] = p_video;
    AudioFeatures features;
    audio_processing_->ExtractFeatures(
        frame.data_, frame.samples_per_channel_, &features);
    if (FLAGS_standalone_vad) {
      standalone_vad_->AddAudio(frame.data_,
                                frame.samples_per_channel_);
    }
    if (features.num_frames > 0) {
      // Neutral default probabilities; assumes kMaxNumFrames == 4 given the
      // four initializers — TODO confirm against common.h.
      double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
      if (FLAGS_standalone_vad) {
        standalone_vad_->GetActivity(p, kMaxNumFrames);
      }
      // TODO(turajs) combining and limiting are used in the source files as
      // well they can be moved to utility.
      // Combine Video and stand-alone VAD (Bayesian combination of the two
      // independent activity probabilities).
      for (int n = 0; n < features.num_frames; n++) {
        double p_active = p[n] * video_vad_[n];
        double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
        p[n]  = p_active / (p_active + p_passive);
        // Limit probabilities.
        p[n] = std::min(std::max(p[n], 0.01), 0.99);
      }
      // Refine the combined probabilities with the pitch-based VAD.
      if (vad_->VoicingProbability(features, p) < 0)
        return -1;
      for (int n = 0; n < features.num_frames; n++) {
        audio_content_->Update(features.rms[n], p[n]);
        double ac = audio_content_->AudioContent();
        if (audio_content_fid_ != NULL) {
          fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
        }
        // Final decision: active when the windowed audio content exceeds
        // the configured fraction of the analysis window.
        if (ac > kAgcAnalWindowSamples * activity_threshold_) {
          combined_vad[n] = 1;
        } else {
          combined_vad[n] = 0;
        }
      }
      video_index_ = 0;
    }
    return features.num_frames;
  }

  // Clears the audio-content histogram (used when true VAD transitions from
  // active to passive so stale activity does not leak across segments).
  void Reset() {
    audio_content_->Reset();
  }

  void SetActivityThreshold(double activity_threshold) {
    activity_threshold_ = activity_threshold;
  }

 private:
  int video_index_;                 // Next write position in video_vad_.
  double activity_threshold_;       // Decision threshold (fraction of window).
  double video_vad_[kMaxNumFrames]; // Buffered video probabilities.
  rtc::scoped_ptr<Histogram> audio_content_;
  rtc::scoped_ptr<AgcAudioProc> audio_processing_;
  rtc::scoped_ptr<PitchBasedVad> vad_;
  rtc::scoped_ptr<StandaloneVad> standalone_vad_;

  FILE* audio_content_fid_;         // Owned; may be NULL.
};
164 
165 
void_main(int argc,char * argv[])166 void void_main(int argc, char* argv[]) {
167   webrtc::AgcStat agc_stat;
168 
169   FILE* pcm_fid = fopen(argv[1], "rb");
170   ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];
171 
172   if (argc < 2) {
173     fprintf(stderr, "\nNot Enough arguments\n");
174   }
175 
176   FILE* true_vad_fid = NULL;
177   ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true "
178       "VADs using --true_vad flag.";
179   true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb");
180   ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
181       FLAGS_true_vad;
182 
183   FILE* results_fid = NULL;
184   if (FLAGS_result.size() > 0) {
185     // True if this is the first time writing to this function and we add a
186     // header to the beginning of the file.
187     bool write_header;
188     // Open in the read mode. If it fails, the file doesn't exist and has to
189     // write a header for it. Otherwise no need to write a header.
190     results_fid = fopen(FLAGS_result.c_str(), "r");
191     if (results_fid == NULL) {
192       write_header = true;
193     } else {
194       fclose(results_fid);
195       write_header = false;
196     }
197     // Open in append mode.
198     results_fid = fopen(FLAGS_result.c_str(), "a");
199     ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
200               FLAGS_result << ", to write the results.";
201     // Write the header if required.
202     if (write_header) {
203       fprintf(results_fid, "%% Total Active,  Misdetection,  "
204               "Total inactive,  False Positive,  On-sets,  Missed segments,  "
205               "Average response\n");
206     }
207   }
208 
209   FILE* video_vad_fid = NULL;
210   if (FLAGS_video_vad.size() > 0) {
211     video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb");
212     ASSERT_TRUE(video_vad_fid != NULL) <<  "Cannot open the file, " <<
213               FLAGS_video_vad << " to read video-based VAD decisions.\n";
214   }
215 
216   // AgsStat will be the owner of this file and will close it at its
217   // destructor.
218   FILE* audio_content_fid = NULL;
219   if (FLAGS_audio_content.size() > 0) {
220     audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb");
221     ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
222               FLAGS_audio_content << " to write audio-content.\n";
223     agc_stat.set_audio_content_file(audio_content_fid);
224   }
225 
226   webrtc::AudioFrame frame;
227   frame.num_channels_ = 1;
228   frame.sample_rate_hz_ = 16000;
229   frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
230   const size_t kSamplesToRead = frame.num_channels_ *
231       frame.samples_per_channel_;
232 
233   agc_stat.SetActivityThreshold(FLAGS_activity_threshold);
234 
235   int ret_val = 0;
236   int num_frames = 0;
237   int agc_vad[kMaxNumFrames];
238   uint8_t true_vad[kMaxNumFrames];
239   double p_video = 0.5;
240   int total_active = 0;
241   int total_passive = 0;
242   int total_false_positive = 0;
243   int total_missed_detection = 0;
244   int onset_adaptation = 0;
245   int num_onsets = 0;
246   bool onset = false;
247   uint8_t previous_true_vad = 0;
248   int num_not_adapted = 0;
249   int true_vad_index = 0;
250   bool in_false_positive_region = false;
251   int total_false_positive_duration = 0;
252   bool video_adapted = false;
253   while (kSamplesToRead == fread(frame.data_, sizeof(int16_t),
254                                  kSamplesToRead, pcm_fid)) {
255     assert(true_vad_index < kMaxNumFrames);
256     ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
257                         true_vad_fid))
258         << "Size mismatch between True-VAD and the PCM file.\n";
259     if (video_vad_fid != NULL) {
260       ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
261           "Not enough video-based VAD probabilities.";
262     }
263 
264     // Negative video activity indicates that the video-based VAD is not yet
265     // adapted. Disregards the learning phase in statistics.
266     if (p_video < 0) {
267       if (video_adapted) {
268         fprintf(stderr, "Negative video probabilities ONLY allowed at the "
269             "beginning of the sequence, not in the middle.\n");
270         exit(1);
271       }
272       continue;
273     } else {
274       video_adapted = true;
275     }
276 
277     num_frames++;
278     uint8_t last_true_vad;
279     if (true_vad_index == 0) {
280       last_true_vad = previous_true_vad;
281     } else {
282       last_true_vad = true_vad[true_vad_index - 1];
283     }
284     if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
285       agc_stat.Reset();
286     }
287     true_vad_index++;
288 
289     DitherSilence(&frame);
290 
291     ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
292     ASSERT_GE(ret_val, 0);
293 
294     if (ret_val > 0) {
295       ASSERT_TRUE(ret_val == true_vad_index);
296       for (int n = 0; n < ret_val; n++) {
297         if (true_vad[n] == 1) {
298           total_active++;
299           if (previous_true_vad == 0) {
300             num_onsets++;
301             onset = true;
302           }
303           if (agc_vad[n] == 0) {
304             total_missed_detection++;
305             if (onset)
306               onset_adaptation++;
307           } else {
308             in_false_positive_region = false;
309             onset = false;
310           }
311         } else if (true_vad[n] == 0) {
312           // Check if |on_set| flag is still up. If so it means that we totally
313           // missed an active region
314           if (onset)
315             num_not_adapted++;
316           onset = false;
317 
318           total_passive++;
319           if (agc_vad[n] == 1) {
320             total_false_positive++;
321             in_false_positive_region = true;
322           }
323           if (in_false_positive_region) {
324             total_false_positive_duration++;
325           }
326         } else {
327           ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
328         }
329         previous_true_vad = true_vad[n];
330       }
331       true_vad_index = 0;
332     }
333   }
334 
335   if (results_fid != NULL) {
336     fprintf(results_fid, "%4d  %4d  %4d  %4d  %4d  %4d  %4.0f %4.0f\n",
337             total_active,
338             total_missed_detection,
339             total_passive,
340             total_false_positive,
341             num_onsets,
342             num_not_adapted,
343             static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
344             static_cast<float>(total_false_positive_duration) /
345             (total_passive + 1e-12));
346   }
347   fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
348           total_active,
349           total_missed_detection,
350           total_passive,
351           total_false_positive,
352           num_onsets,
353           num_not_adapted,
354           static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
355           static_cast<float>(total_false_positive_duration) /
356               (total_passive + 1e-12));
357 
358   fclose(true_vad_fid);
359   fclose(pcm_fid);
360   if (video_vad_fid != NULL) {
361     fclose(video_vad_fid);
362   }
363   if (results_fid != NULL) {
364     fclose(results_fid);
365   }
366 }
367 
368 }  // namespace webrtc
369 
main(int argc,char * argv[])370 int main(int argc, char* argv[]) {
371   char kUsage[] =
372       "\nCompute the number of misdetected and false-positive frames. Not\n"
373       " that for each frame of audio (10 ms) there should be one true\n"
374       " activity. If any video-based activity is given, there should also be\n"
375       " one probability per frame.\n"
376       "\nUsage:\n\n"
377       "activity_metric input_pcm [options]\n"
378       "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
379       "format.\n\n";
380   google::SetUsageMessage(kUsage);
381   google::ParseCommandLineFlags(&argc, &argv, true);
382   webrtc::void_main(argc, argv);
383   return 0;
384 }
385