1 #include <switch.h>
2
/*
 * Command syntax description and its argument count. Neither macro is
 * referenced by the code visible in this file.
 */
#define AMD_SYNTAX "<uuid> <command>"
#define AMD_PARAMS (2)

/* Forward declarations: the dialplan application and the module hooks. */
SWITCH_STANDARD_APP(amd_start_function);
SWITCH_MODULE_LOAD_FUNCTION(mod_amd_load);
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_amd_shutdown);

/* Module registration: load and shutdown hooks, NULL for the last slot. */
SWITCH_MODULE_DEFINITION(mod_amd, mod_amd_load, mod_amd_shutdown, NULL);
10
/*
 * Module-wide detection settings, populated from amd.conf through the
 * `instructions` table. Durations are compared against counters that
 * accumulate per-frame milliseconds.
 */
static struct {
	/* Silence-related limits. */
	uint32_t initial_silence;          /* leading silence that yields MACHINE / INITIALSILENCE */
	uint32_t after_greeting_silence;   /* silence after speech that yields HUMAN */
	uint32_t between_words_silence;    /* silence needed to leave the "in word" state */
	uint32_t silence_threshold;        /* frame score at or above this is VOICED */

	/* Speech-related limits. */
	uint32_t greeting;                 /* voiced time in greeting that yields MACHINE / LONGGREETING */
	uint32_t minimum_word_length;      /* voiced time required before a word is counted */
	uint32_t maximum_word_length;      /* voiced time that yields MACHINE / MAXWORDLENGTH */
	uint32_t maximum_number_of_words;  /* word count that yields MACHINE / MAXWORDS */

	/* Overall analysis budget; 0 disables the timeout. */
	uint32_t total_analysis_time;
} globals;
22
23 static switch_xml_config_item_t instructions[] = {
24 SWITCH_CONFIG_ITEM(
25 "initial_silence",
26 SWITCH_CONFIG_INT,
27 CONFIG_RELOADABLE,
28 &globals.initial_silence,
29 (void *) 2500,
30 NULL, NULL, NULL),
31
32 SWITCH_CONFIG_ITEM(
33 "greeting",
34 SWITCH_CONFIG_INT,
35 CONFIG_RELOADABLE,
36 &globals.greeting,
37 (void *) 1500,
38 NULL, NULL, NULL),
39
40 SWITCH_CONFIG_ITEM(
41 "after_greeting_silence",
42 SWITCH_CONFIG_INT,
43 CONFIG_RELOADABLE,
44 &globals.after_greeting_silence,
45 (void *) 800,
46 NULL, NULL, NULL),
47
48 SWITCH_CONFIG_ITEM(
49 "total_analysis_time",
50 SWITCH_CONFIG_INT,
51 CONFIG_RELOADABLE,
52 &globals.total_analysis_time,
53 (void *) 5000,
54 NULL, NULL, NULL),
55
56 SWITCH_CONFIG_ITEM(
57 "min_word_length",
58 SWITCH_CONFIG_INT,
59 CONFIG_RELOADABLE,
60 &globals.minimum_word_length,
61 (void *) 100,
62 NULL, NULL, NULL),
63
64 SWITCH_CONFIG_ITEM(
65 "between_words_silence",
66 SWITCH_CONFIG_INT,
67 CONFIG_RELOADABLE,
68 &globals.between_words_silence,
69 (void *) 50,
70 NULL, NULL, NULL),
71
72 SWITCH_CONFIG_ITEM(
73 "maximum_number_of_words",
74 SWITCH_CONFIG_INT,
75 CONFIG_RELOADABLE,
76 &globals.maximum_number_of_words,
77 (void *) 3,
78 NULL, NULL, NULL),
79
80 SWITCH_CONFIG_ITEM(
81 "maximum_word_length",
82 SWITCH_CONFIG_INT,
83 CONFIG_RELOADABLE,
84 &globals.maximum_word_length,
85 (void *)5000,
86 NULL, NULL, NULL),
87
88 SWITCH_CONFIG_ITEM(
89 "silence_threshold",
90 SWITCH_CONFIG_INT,
91 CONFIG_RELOADABLE,
92 &globals.silence_threshold,
93 (void *) 256,
94 NULL, NULL, NULL),
95
96 SWITCH_CONFIG_ITEM_END()
97 };
98
do_config(switch_bool_t reload)99 static switch_status_t do_config(switch_bool_t reload)
100 {
101 memset(&globals, 0, sizeof(globals));
102
103 if (switch_xml_config_parse_module_settings("amd.conf", reload, instructions) != SWITCH_STATUS_SUCCESS) {
104 return SWITCH_STATUS_FALSE;
105 }
106
107 return SWITCH_STATUS_SUCCESS;
108 }
109
SWITCH_MODULE_LOAD_FUNCTION(mod_amd_load)110 SWITCH_MODULE_LOAD_FUNCTION(mod_amd_load)
111 {
112 switch_application_interface_t *app_interface;
113
114 *module_interface = switch_loadable_module_create_module_interface(pool, modname);
115
116 do_config(SWITCH_FALSE);
117
118 SWITCH_ADD_APP(
119 app_interface,
120 "amd",
121 "Voice activity detection (blocking)",
122 "Asterisk's AMD (Blocking)",
123 amd_start_function,
124 NULL,
125 SAF_NONE);
126
127 return SWITCH_STATUS_SUCCESS;
128 }
129
SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_amd_shutdown)130 SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_amd_shutdown)
131 {
132 switch_xml_config_cleanup(instructions);
133
134 return SWITCH_STATUS_SUCCESS;
135 }
136
/* Per-frame verdict produced by classify_frame(). */
typedef enum {
	SILENCE = 0,	/* frame score below the silence threshold */
	VOICED = 1	/* frame score at or above the silence threshold */
} amd_frame_classifier;
141
/* Coarse state of the detector: inside a word or inside a pause. */
typedef enum {
	VAD_STATE_IN_WORD = 0,
	VAD_STATE_IN_SILENCE = 1
} amd_vad_state_t;
146
147 typedef struct {
148 const switch_core_session_t *session;
149 switch_channel_t *channel;
150 amd_vad_state_t state;
151 uint32_t frame_ms;
152
153 uint32_t silence_duration;
154 uint32_t voice_duration;
155 uint32_t words;
156
157 uint32_t in_initial_silence:1;
158 uint32_t in_greeting:1;
159 } amd_vad_t;
160
classify_frame(const switch_frame_t * f,const switch_codec_implementation_t * codec)161 static amd_frame_classifier classify_frame(const switch_frame_t *f, const switch_codec_implementation_t *codec)
162 {
163 int16_t *audio = f->data;
164 uint32_t score, count, j;
165 double energy;
166 int divisor;
167
168 divisor = codec->actual_samples_per_second / 8000;
169
170 for (energy = 0, j = 0, count = 0; count < f->samples; count++) {
171 energy += abs(audio[j++]);
172 j += codec->number_of_channels;
173 }
174
175 score = (uint32_t) (energy / (f->samples / divisor));
176
177 if (score >= globals.silence_threshold) {
178 return VOICED;
179 }
180
181 return SILENCE;
182 }
183
amd_handle_silence_frame(amd_vad_t * vad,const switch_frame_t * f)184 static switch_bool_t amd_handle_silence_frame(amd_vad_t *vad, const switch_frame_t *f)
185 {
186 vad->silence_duration += vad->frame_ms;
187
188 if (vad->silence_duration >= globals.between_words_silence) {
189 if (vad->state != VAD_STATE_IN_SILENCE) {
190 switch_log_printf(
191 SWITCH_CHANNEL_SESSION_LOG(vad->session),
192 SWITCH_LOG_DEBUG,
193 "AMD: Changed state to VAD_STATE_IN_SILENCE\n");
194 }
195
196 vad->state = VAD_STATE_IN_SILENCE;
197 vad->voice_duration = 0;
198 }
199
200 if (vad->in_initial_silence && vad->silence_duration >= globals.initial_silence) {
201 switch_log_printf(
202 SWITCH_CHANNEL_SESSION_LOG(vad->session),
203 SWITCH_LOG_DEBUG,
204 "AMD: MACHINE (silence_duration: %d, initial_silence: %d)\n",
205 vad->silence_duration,
206 globals.initial_silence);
207
208 switch_channel_set_variable(vad->channel, "amd_result", "MACHINE");
209 switch_channel_set_variable(vad->channel, "amd_cause", "INITIALSILENCE");
210 return SWITCH_TRUE;
211 }
212
213 if (vad->silence_duration >= globals.after_greeting_silence && vad->in_greeting) {
214 switch_log_printf(
215 SWITCH_CHANNEL_SESSION_LOG(vad->session),
216 SWITCH_LOG_DEBUG,
217 "AMD: HUMAN (silence_duration: %d, after_greeting_silence: %d)\n",
218 vad->silence_duration,
219 globals.after_greeting_silence);
220
221 switch_channel_set_variable(vad->channel, "amd_result", "HUMAN");
222 switch_channel_set_variable(vad->channel, "amd_cause", "HUMAN");
223 return SWITCH_TRUE;
224 }
225
226 return SWITCH_FALSE;
227 }
228
amd_handle_voiced_frame(amd_vad_t * vad,const switch_frame_t * f)229 static switch_bool_t amd_handle_voiced_frame(amd_vad_t *vad, const switch_frame_t *f)
230 {
231 vad->voice_duration += vad->frame_ms;
232
233 if (vad->voice_duration >= globals.minimum_word_length && vad->state == VAD_STATE_IN_SILENCE) {
234 vad->words++;
235
236 switch_log_printf(
237 SWITCH_CHANNEL_SESSION_LOG(vad->session),
238 SWITCH_LOG_DEBUG,
239 "AMD: Word detected (words: %d)\n",
240 vad->words);
241
242 vad->state = VAD_STATE_IN_WORD;
243 }
244
245 if (vad->voice_duration >= globals.maximum_word_length) {
246 switch_log_printf(
247 SWITCH_CHANNEL_SESSION_LOG(vad->session),
248 SWITCH_LOG_DEBUG,
249 "AMD: MACHINE (voice_duration: %d, maximum_word_length: %d)\n",
250 vad->voice_duration,
251 globals.maximum_word_length);
252
253 switch_channel_set_variable(vad->channel, "amd_result", "MACHINE");
254 switch_channel_set_variable(vad->channel, "amd_cause", "MAXWORDLENGTH");
255 return SWITCH_TRUE;
256 }
257
258 if (vad->words >= globals.maximum_number_of_words) {
259 switch_log_printf(
260 SWITCH_CHANNEL_SESSION_LOG(vad->session),
261 SWITCH_LOG_DEBUG,
262 "AMD: MACHINE (words: %d, maximum_number_of_words: %d)\n",
263 vad->words,
264 globals.maximum_number_of_words);
265
266 switch_channel_set_variable(vad->channel, "amd_result", "MACHINE");
267 switch_channel_set_variable(vad->channel, "amd_cause", "MAXWORDS");
268 return SWITCH_TRUE;
269 }
270
271 if (vad->in_greeting && vad->voice_duration >= globals.greeting) {
272 switch_log_printf(
273 SWITCH_CHANNEL_SESSION_LOG(vad->session),
274 SWITCH_LOG_DEBUG,
275 "AMD: MACHINE (voice_duration: %d, greeting: %d)\n",
276 vad->voice_duration,
277 globals.greeting);
278
279 switch_channel_set_variable(vad->channel, "amd_result", "MACHINE");
280 switch_channel_set_variable(vad->channel, "amd_cause", "LONGGREETING");
281 return SWITCH_TRUE;
282 }
283
284 if (vad->voice_duration >= globals.minimum_word_length) {
285 if (vad->silence_duration) {
286 switch_log_printf(
287 SWITCH_CHANNEL_SESSION_LOG(vad->session),
288 SWITCH_LOG_DEBUG,
289 "AMD: Detected Talk, previous silence duration: %dms\n",
290 vad->silence_duration);
291 }
292
293 vad->silence_duration = 0;
294 }
295
296 if (vad->voice_duration >= globals.minimum_word_length && !vad->in_greeting) {
297 if (vad->silence_duration) {
298 switch_log_printf(
299 SWITCH_CHANNEL_SESSION_LOG(vad->session),
300 SWITCH_LOG_DEBUG,
301 "AMD: Before Greeting Time (silence_duration: %d, voice_duration: %d)\n",
302 vad->silence_duration,
303 vad->voice_duration);
304 }
305
306 vad->in_initial_silence = 0;
307 vad->in_greeting = 1;
308 }
309
310 return SWITCH_FALSE;
311 }
312
SWITCH_STANDARD_APP(amd_start_function)313 SWITCH_STANDARD_APP(amd_start_function)
314 {
315 switch_channel_t *channel = switch_core_session_get_channel(session);
316 switch_codec_t raw_codec = { 0 };
317 switch_codec_implementation_t read_impl = { 0 };
318 switch_frame_t *read_frame;
319 switch_status_t status;
320 uint32_t timeout_ms = globals.total_analysis_time;
321 int32_t sample_count_limit;
322 switch_bool_t complete = SWITCH_FALSE;
323
324 amd_vad_t vad = { 0 };
325
326 if (!session) {
327 return;
328 }
329
330 vad.channel = channel;
331 vad.session = session;
332 vad.state = VAD_STATE_IN_WORD;
333 vad.silence_duration = 0;
334 vad.voice_duration = 0;
335 vad.frame_ms = 0;
336 vad.in_initial_silence = 1;
337 vad.in_greeting = 0;
338 vad.words = 0;
339
340 switch_core_session_get_read_impl(session, &read_impl);
341
342 if (timeout_ms) {
343 sample_count_limit = (read_impl.actual_samples_per_second / 1000) * timeout_ms;
344 }
345
346 /*
347 * We are creating a new L16 (raw 16-bit samples) codec for the read end
348 * of our channel. We'll use this to process the audio coming off of the
349 * channel so that we always know what we are dealing with.
350 */
351 status = switch_core_codec_init(
352 &raw_codec,
353 "L16",
354 NULL,
355 NULL,
356 read_impl.actual_samples_per_second,
357 read_impl.microseconds_per_packet / 1000,
358 1,
359 SWITCH_CODEC_FLAG_ENCODE | SWITCH_CODEC_FLAG_DECODE,
360 NULL,
361 switch_core_session_get_pool(session));
362
363 if (status != SWITCH_STATUS_SUCCESS) {
364 switch_log_printf(
365 SWITCH_CHANNEL_SESSION_LOG(session),
366 SWITCH_LOG_ERROR,
367 "Unable to initialize L16 (raw) codec.\n");
368 return;
369 }
370
371 switch_core_session_set_read_codec(session, &raw_codec);
372
373 while (switch_channel_ready(channel)) {
374 status = switch_core_session_read_frame(session, &read_frame, SWITCH_IO_FLAG_NONE, 0);
375
376 if (!SWITCH_READ_ACCEPTABLE(status)) {
377 break;
378 }
379
380 if (read_frame->samples == 0) {
381 continue;
382 }
383
384 vad.frame_ms = 1000 / (read_impl.actual_samples_per_second / read_frame->samples);
385
386 if (sample_count_limit) {
387 sample_count_limit -= raw_codec.implementation->samples_per_packet;
388 if (sample_count_limit <= 0) {
389 switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "AMD: Timeout\n");
390
391 switch_channel_set_variable(channel, "amd_result", "NOTSURE");
392 switch_channel_set_variable(channel, "amd_cause", "TOOLONG");
393 break;
394 }
395 }
396
397 switch (classify_frame(read_frame, &read_impl)) {
398 case SILENCE:
399 switch_log_printf(
400 SWITCH_CHANNEL_SESSION_LOG(session),
401 SWITCH_LOG_DEBUG,
402 "AMD: Silence\n");
403
404 if (amd_handle_silence_frame(&vad, read_frame)) {
405 complete = SWITCH_TRUE;
406 }
407 break;
408 case VOICED:
409 default:
410 switch_log_printf(
411 SWITCH_CHANNEL_SESSION_LOG(session),
412 SWITCH_LOG_DEBUG,
413 "AMD: Voiced\n");
414
415 if (amd_handle_voiced_frame(&vad, read_frame)) {
416 complete = SWITCH_TRUE;
417 }
418 break;
419 }
420
421 if (complete) {
422 break;
423 }
424 }
425
426 switch_core_session_reset(session, SWITCH_FALSE, SWITCH_TRUE);
427 switch_core_codec_destroy(&raw_codec);
428 }
429