1 /*
2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>

#include "gflags/gflags.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "webrtc/modules/audio_processing/agc/agc.h"
#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"
#include "webrtc/modules/audio_processing/agc/common.h"
#include "webrtc/modules/audio_processing/agc/histogram.h"
#include "webrtc/modules/audio_processing/agc/pitch_based_vad.h"
#include "webrtc/modules/audio_processing/agc/standalone_vad.h"
#include "webrtc/modules/audio_processing/agc/utility.h"
#include "webrtc/modules/interface/module_common_types.h"
28
29 static const int kAgcAnalWindowSamples = 100;
30 static const double kDefaultActivityThreshold = 0.3;
31
32 DEFINE_bool(standalone_vad, true, "enable stand-alone VAD");
33 DEFINE_string(true_vad, "", "name of a file containing true VAD in 'int'"
34 " format");
35 DEFINE_string(video_vad, "", "name of a file containing video VAD (activity"
36 " probabilities) in double format. One activity per 10ms is"
37 " required. If no file is given the video information is not"
38 " incorporated. Negative activity is interpreted as video is"
39 " not adapted and the statistics are not computed during"
40 " the learning phase. Note that the negative video activities"
41 " are ONLY allowed at the beginning.");
42 DEFINE_string(result, "", "name of a file to write the results. The results"
43 " will be appended to the end of the file. This is optional.");
44 DEFINE_string(audio_content, "", "name of a file where audio content is written"
45 " to, in double format.");
46 DEFINE_double(activity_threshold, kDefaultActivityThreshold,
47 "Activity threshold");
48
49 namespace webrtc {
50
51 // TODO(turajs) A new CL will be committed soon where ExtractFeatures will
52 // notify the caller of "silence" input, instead of bailing out. We would not
53 // need the following function when such a change is made.
54
55 // Add some dither to quiet frames. This avoids the ExtractFeatures skip a
56 // silence frame. Otherwise true VAD would drift with respect to the audio.
57 // We only consider mono inputs.
DitherSilence(AudioFrame * frame)58 static void DitherSilence(AudioFrame* frame) {
59 ASSERT_EQ(1, frame->num_channels_);
60 const double kRmsSilence = 5;
61 const double sum_squared_silence = kRmsSilence * kRmsSilence *
62 frame->samples_per_channel_;
63 double sum_squared = 0;
64 for (int n = 0; n < frame->samples_per_channel_; n++)
65 sum_squared += frame->data_[n] * frame->data_[n];
66 if (sum_squared <= sum_squared_silence) {
67 for (int n = 0; n < frame->samples_per_channel_; n++)
68 frame->data_[n] = (rand() & 0xF) - 8;
69 }
70 }
71
72 class AgcStat {
73 public:
AgcStat()74 AgcStat()
75 : video_index_(0),
76 activity_threshold_(kDefaultActivityThreshold),
77 audio_content_(Histogram::Create(kAgcAnalWindowSamples)),
78 audio_processing_(new AgcAudioProc()),
79 vad_(new PitchBasedVad()),
80 standalone_vad_(StandaloneVad::Create()),
81 audio_content_fid_(NULL) {
82 for (int n = 0; n < kMaxNumFrames; n++)
83 video_vad_[n] = 0.5;
84 }
85
~AgcStat()86 ~AgcStat() {
87 if (audio_content_fid_ != NULL) {
88 fclose(audio_content_fid_);
89 }
90 }
91
set_audio_content_file(FILE * audio_content_fid)92 void set_audio_content_file(FILE* audio_content_fid) {
93 audio_content_fid_ = audio_content_fid;
94 }
95
AddAudio(const AudioFrame & frame,double p_video,int * combined_vad)96 int AddAudio(const AudioFrame& frame, double p_video,
97 int* combined_vad) {
98 if (frame.num_channels_ != 1 ||
99 frame.samples_per_channel_ !=
100 kSampleRateHz / 100 ||
101 frame.sample_rate_hz_ != kSampleRateHz)
102 return -1;
103 video_vad_[video_index_++] = p_video;
104 AudioFeatures features;
105 audio_processing_->ExtractFeatures(
106 frame.data_, frame.samples_per_channel_, &features);
107 if (FLAGS_standalone_vad) {
108 standalone_vad_->AddAudio(frame.data_,
109 frame.samples_per_channel_);
110 }
111 if (features.num_frames > 0) {
112 double p[kMaxNumFrames] = {0.5, 0.5, 0.5, 0.5};
113 if (FLAGS_standalone_vad) {
114 standalone_vad_->GetActivity(p, kMaxNumFrames);
115 }
116 // TODO(turajs) combining and limiting are used in the source files as
117 // well they can be moved to utility.
118 // Combine Video and stand-alone VAD.
119 for (int n = 0; n < features.num_frames; n++) {
120 double p_active = p[n] * video_vad_[n];
121 double p_passive = (1 - p[n]) * (1 - video_vad_[n]);
122 p[n] = p_active / (p_active + p_passive);
123 // Limit probabilities.
124 p[n] = std::min(std::max(p[n], 0.01), 0.99);
125 }
126 if (vad_->VoicingProbability(features, p) < 0)
127 return -1;
128 for (int n = 0; n < features.num_frames; n++) {
129 audio_content_->Update(features.rms[n], p[n]);
130 double ac = audio_content_->AudioContent();
131 if (audio_content_fid_ != NULL) {
132 fwrite(&ac, sizeof(ac), 1, audio_content_fid_);
133 }
134 if (ac > kAgcAnalWindowSamples * activity_threshold_) {
135 combined_vad[n] = 1;
136 } else {
137 combined_vad[n] = 0;
138 }
139 }
140 video_index_ = 0;
141 }
142 return features.num_frames;
143 }
144
Reset()145 void Reset() {
146 audio_content_->Reset();
147 }
148
SetActivityThreshold(double activity_threshold)149 void SetActivityThreshold(double activity_threshold) {
150 activity_threshold_ = activity_threshold;
151 }
152
153 private:
154 int video_index_;
155 double activity_threshold_;
156 double video_vad_[kMaxNumFrames];
157 rtc::scoped_ptr<Histogram> audio_content_;
158 rtc::scoped_ptr<AgcAudioProc> audio_processing_;
159 rtc::scoped_ptr<PitchBasedVad> vad_;
160 rtc::scoped_ptr<StandaloneVad> standalone_vad_;
161
162 FILE* audio_content_fid_;
163 };
164
165
void_main(int argc,char * argv[])166 void void_main(int argc, char* argv[]) {
167 webrtc::AgcStat agc_stat;
168
169 FILE* pcm_fid = fopen(argv[1], "rb");
170 ASSERT_TRUE(pcm_fid != NULL) << "Cannot open PCM file " << argv[1];
171
172 if (argc < 2) {
173 fprintf(stderr, "\nNot Enough arguments\n");
174 }
175
176 FILE* true_vad_fid = NULL;
177 ASSERT_GT(FLAGS_true_vad.size(), 0u) << "Specify the file containing true "
178 "VADs using --true_vad flag.";
179 true_vad_fid = fopen(FLAGS_true_vad.c_str(), "rb");
180 ASSERT_TRUE(true_vad_fid != NULL) << "Cannot open the active list " <<
181 FLAGS_true_vad;
182
183 FILE* results_fid = NULL;
184 if (FLAGS_result.size() > 0) {
185 // True if this is the first time writing to this function and we add a
186 // header to the beginning of the file.
187 bool write_header;
188 // Open in the read mode. If it fails, the file doesn't exist and has to
189 // write a header for it. Otherwise no need to write a header.
190 results_fid = fopen(FLAGS_result.c_str(), "r");
191 if (results_fid == NULL) {
192 write_header = true;
193 } else {
194 fclose(results_fid);
195 write_header = false;
196 }
197 // Open in append mode.
198 results_fid = fopen(FLAGS_result.c_str(), "a");
199 ASSERT_TRUE(results_fid != NULL) << "Cannot open the file, " <<
200 FLAGS_result << ", to write the results.";
201 // Write the header if required.
202 if (write_header) {
203 fprintf(results_fid, "%% Total Active, Misdetection, "
204 "Total inactive, False Positive, On-sets, Missed segments, "
205 "Average response\n");
206 }
207 }
208
209 FILE* video_vad_fid = NULL;
210 if (FLAGS_video_vad.size() > 0) {
211 video_vad_fid = fopen(FLAGS_video_vad.c_str(), "rb");
212 ASSERT_TRUE(video_vad_fid != NULL) << "Cannot open the file, " <<
213 FLAGS_video_vad << " to read video-based VAD decisions.\n";
214 }
215
216 // AgsStat will be the owner of this file and will close it at its
217 // destructor.
218 FILE* audio_content_fid = NULL;
219 if (FLAGS_audio_content.size() > 0) {
220 audio_content_fid = fopen(FLAGS_audio_content.c_str(), "wb");
221 ASSERT_TRUE(audio_content_fid != NULL) << "Cannot open file, " <<
222 FLAGS_audio_content << " to write audio-content.\n";
223 agc_stat.set_audio_content_file(audio_content_fid);
224 }
225
226 webrtc::AudioFrame frame;
227 frame.num_channels_ = 1;
228 frame.sample_rate_hz_ = 16000;
229 frame.samples_per_channel_ = frame.sample_rate_hz_ / 100;
230 const size_t kSamplesToRead = frame.num_channels_ *
231 frame.samples_per_channel_;
232
233 agc_stat.SetActivityThreshold(FLAGS_activity_threshold);
234
235 int ret_val = 0;
236 int num_frames = 0;
237 int agc_vad[kMaxNumFrames];
238 uint8_t true_vad[kMaxNumFrames];
239 double p_video = 0.5;
240 int total_active = 0;
241 int total_passive = 0;
242 int total_false_positive = 0;
243 int total_missed_detection = 0;
244 int onset_adaptation = 0;
245 int num_onsets = 0;
246 bool onset = false;
247 uint8_t previous_true_vad = 0;
248 int num_not_adapted = 0;
249 int true_vad_index = 0;
250 bool in_false_positive_region = false;
251 int total_false_positive_duration = 0;
252 bool video_adapted = false;
253 while (kSamplesToRead == fread(frame.data_, sizeof(int16_t),
254 kSamplesToRead, pcm_fid)) {
255 assert(true_vad_index < kMaxNumFrames);
256 ASSERT_EQ(1u, fread(&true_vad[true_vad_index], sizeof(*true_vad), 1,
257 true_vad_fid))
258 << "Size mismatch between True-VAD and the PCM file.\n";
259 if (video_vad_fid != NULL) {
260 ASSERT_EQ(1u, fread(&p_video, sizeof(p_video), 1, video_vad_fid)) <<
261 "Not enough video-based VAD probabilities.";
262 }
263
264 // Negative video activity indicates that the video-based VAD is not yet
265 // adapted. Disregards the learning phase in statistics.
266 if (p_video < 0) {
267 if (video_adapted) {
268 fprintf(stderr, "Negative video probabilities ONLY allowed at the "
269 "beginning of the sequence, not in the middle.\n");
270 exit(1);
271 }
272 continue;
273 } else {
274 video_adapted = true;
275 }
276
277 num_frames++;
278 uint8_t last_true_vad;
279 if (true_vad_index == 0) {
280 last_true_vad = previous_true_vad;
281 } else {
282 last_true_vad = true_vad[true_vad_index - 1];
283 }
284 if (last_true_vad == 1 && true_vad[true_vad_index] == 0) {
285 agc_stat.Reset();
286 }
287 true_vad_index++;
288
289 DitherSilence(&frame);
290
291 ret_val = agc_stat.AddAudio(frame, p_video, agc_vad);
292 ASSERT_GE(ret_val, 0);
293
294 if (ret_val > 0) {
295 ASSERT_TRUE(ret_val == true_vad_index);
296 for (int n = 0; n < ret_val; n++) {
297 if (true_vad[n] == 1) {
298 total_active++;
299 if (previous_true_vad == 0) {
300 num_onsets++;
301 onset = true;
302 }
303 if (agc_vad[n] == 0) {
304 total_missed_detection++;
305 if (onset)
306 onset_adaptation++;
307 } else {
308 in_false_positive_region = false;
309 onset = false;
310 }
311 } else if (true_vad[n] == 0) {
312 // Check if |on_set| flag is still up. If so it means that we totally
313 // missed an active region
314 if (onset)
315 num_not_adapted++;
316 onset = false;
317
318 total_passive++;
319 if (agc_vad[n] == 1) {
320 total_false_positive++;
321 in_false_positive_region = true;
322 }
323 if (in_false_positive_region) {
324 total_false_positive_duration++;
325 }
326 } else {
327 ASSERT_TRUE(false) << "Invalid value for true-VAD.\n";
328 }
329 previous_true_vad = true_vad[n];
330 }
331 true_vad_index = 0;
332 }
333 }
334
335 if (results_fid != NULL) {
336 fprintf(results_fid, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
337 total_active,
338 total_missed_detection,
339 total_passive,
340 total_false_positive,
341 num_onsets,
342 num_not_adapted,
343 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
344 static_cast<float>(total_false_positive_duration) /
345 (total_passive + 1e-12));
346 }
347 fprintf(stdout, "%4d %4d %4d %4d %4d %4d %4.0f %4.0f\n",
348 total_active,
349 total_missed_detection,
350 total_passive,
351 total_false_positive,
352 num_onsets,
353 num_not_adapted,
354 static_cast<float>(onset_adaptation) / (num_onsets + 1e-12),
355 static_cast<float>(total_false_positive_duration) /
356 (total_passive + 1e-12));
357
358 fclose(true_vad_fid);
359 fclose(pcm_fid);
360 if (video_vad_fid != NULL) {
361 fclose(video_vad_fid);
362 }
363 if (results_fid != NULL) {
364 fclose(results_fid);
365 }
366 }
367
368 } // namespace webrtc
369
main(int argc,char * argv[])370 int main(int argc, char* argv[]) {
371 char kUsage[] =
372 "\nCompute the number of misdetected and false-positive frames. Not\n"
373 " that for each frame of audio (10 ms) there should be one true\n"
374 " activity. If any video-based activity is given, there should also be\n"
375 " one probability per frame.\n"
376 "\nUsage:\n\n"
377 "activity_metric input_pcm [options]\n"
378 "where 'input_pcm' is the input audio sampled at 16 kHz in 16 bits "
379 "format.\n\n";
380 google::SetUsageMessage(kUsage);
381 google::ParseCommandLineFlags(&argc, &argv, true);
382 webrtc::void_main(argc, argv);
383 return 0;
384 }
385