1 /* Silence effect for SoX
2  * by Heikki Leinonen (heilei@iki.fi) 25.03.2001
3  * Major Modifications by Chris Bagwell 06.08.2001
4  * Minor addition by Donnie Smith 13.08.2003
5  *
6  * This effect can delete samples from the start of a sound file
7  * until it sees a specified count of samples exceed a given threshold
8  * (any of the channels).
9  * This effect can also delete samples from the end of a sound file
10  * when it sees a specified count of samples below a given threshold
11  * (all channels).
12  * It may also be used to delete samples anywhere in a sound file.
13  * Thresholds can be given as either a percentage or in decibels.
14  */
15 
16 #include "sox_i.h"
17 
18 #include <string.h>
19 
20 /* Private data for silence effect. */
21 
/* States of the silence state machine; the current one is kept in
 * priv_t.mode and dispatched on by the flow and drain handlers.
 */
enum {
    SILENCE_TRIM       = 0, /* discarding input until enough sound is seen */
    SILENCE_TRIM_FLUSH = 1, /* writing out the start hold-off buffer */
    SILENCE_COPY       = 2, /* copying input while watching for silence */
    SILENCE_COPY_FLUSH = 3, /* writing out the stop hold-off buffer */
    SILENCE_STOP       = 4  /* finished: nothing more will be produced */
};
27 
28 typedef struct {
29     char        start;
30     int         start_periods;
31     char        *start_duration_str;
32     size_t   start_duration;
33     double      start_threshold;
34     char        start_unit; /* "d" for decibels or "%" for percent. */
35     int         restart;
36 
37     sox_sample_t *start_holdoff;
38     size_t   start_holdoff_offset;
39     size_t   start_holdoff_end;
40     int         start_found_periods;
41 
42     char        stop;
43     int         stop_periods;
44     char        *stop_duration_str;
45     size_t   stop_duration;
46     double      stop_threshold;
47     char        stop_unit;
48 
49     sox_sample_t *stop_holdoff;
50     size_t   stop_holdoff_offset;
51     size_t   stop_holdoff_end;
52     int         stop_found_periods;
53 
54     double      *window;
55     double      *window_current;
56     double      *window_end;
57     size_t   window_size;
58     double      rms_sum;
59 
60     char        leave_silence;
61 
62     /* State Machine */
63     char        mode;
64 } priv_t;
65 
66 static void clear_rms(sox_effect_t * effp)
67 
68 {
69     priv_t * silence = (priv_t *) effp->priv;
70 
71     memset(silence->window, 0,
72            silence->window_size * sizeof(double));
73 
74     silence->window_current = silence->window;
75     silence->window_end = silence->window + silence->window_size;
76     silence->rms_sum = 0;
77 }
78 
79 static int sox_silence_getopts(sox_effect_t * effp, int argc, char **argv)
80 {
81     priv_t *   silence = (priv_t *) effp->priv;
82     int parse_count;
83     uint64_t temp;
84     const char *n;
85   --argc, ++argv;
86 
87     /* check for option switches */
88     silence->leave_silence = sox_false;
89     if (argc > 0)
90     {
91         if (!strcmp("-l", *argv)) {
92             argc--; argv++;
93             silence->leave_silence = sox_true;
94         }
95     }
96 
97     if (argc < 1)
98       return lsx_usage(effp);
99 
100     /* Parse data related to trimming front side */
101     silence->start = sox_false;
102     if (sscanf(argv[0], "%d", &silence->start_periods) != 1)
103       return lsx_usage(effp);
104     if (silence->start_periods < 0)
105     {
106         lsx_fail("Periods must not be negative");
107         return(SOX_EOF);
108     }
109     argv++;
110     argc--;
111 
112     if (silence->start_periods > 0)
113     {
114         silence->start = sox_true;
115         if (argc < 2)
116           return lsx_usage(effp);
117 
118         /* We do not know the sample rate so we can not fully
119          * parse the duration info yet.  So save argument off
120          * for future processing.
121          */
122         silence->start_duration_str = lsx_strdup(argv[0]);
123         /* Perform a fake parse to do error checking */
124         n = lsx_parsesamples(0.,silence->start_duration_str,&temp,'s');
125         if (!n || *n)
126           return lsx_usage(effp);
127         silence->start_duration = temp;
128 
129         parse_count = sscanf(argv[1], "%lf%c", &silence->start_threshold,
130                 &silence->start_unit);
131         if (parse_count < 1)
132           return lsx_usage(effp);
133         else if (parse_count < 2)
134             silence->start_unit = '%';
135 
136         argv++; argv++;
137         argc--; argc--;
138     }
139 
140     silence->stop = sox_false;
141     /* Parse data needed for trimming of backside */
142     if (argc > 0)
143     {
144         if (argc < 3)
145           return lsx_usage(effp);
146         if (sscanf(argv[0], "%d", &silence->stop_periods) != 1)
147           return lsx_usage(effp);
148         if (silence->stop_periods < 0)
149         {
150             silence->stop_periods = -silence->stop_periods;
151             silence->restart = 1;
152         }
153         else
154             silence->restart = 0;
155         silence->stop = sox_true;
156         argv++;
157         argc--;
158 
159         /* We do not know the sample rate so we can not fully
160          * parse the duration info yet.  So save argument off
161          * for future processing.
162          */
163         silence->stop_duration_str = lsx_strdup(argv[0]);
164         /* Perform a fake parse to do error checking */
165         n = lsx_parsesamples(0.,silence->stop_duration_str,&temp,'s');
166         if (!n || *n)
167           return lsx_usage(effp);
168         silence->stop_duration = temp;
169 
170         parse_count = sscanf(argv[1], "%lf%c", &silence->stop_threshold,
171                              &silence->stop_unit);
172         if (parse_count < 1)
173           return lsx_usage(effp);
174         else if (parse_count < 2)
175             silence->stop_unit = '%';
176 
177         argv++; argv++;
178         argc--; argc--;
179     }
180 
181     /* Error checking */
182     if (silence->start)
183     {
184         if ((silence->start_unit != '%') && (silence->start_unit != 'd'))
185         {
186             lsx_fail("Invalid unit specified");
187             return lsx_usage(effp);
188         }
189         if ((silence->start_unit == '%') && ((silence->start_threshold < 0.0)
190             || (silence->start_threshold > 100.0)))
191         {
192             lsx_fail("silence threshold should be between 0.0 and 100.0 %%");
193             return (SOX_EOF);
194         }
195         if ((silence->start_unit == 'd') && (silence->start_threshold >= 0.0))
196         {
197             lsx_fail("silence threshold should be less than 0.0 dB");
198             return(SOX_EOF);
199         }
200     }
201 
202     if (silence->stop)
203     {
204         if ((silence->stop_unit != '%') && (silence->stop_unit != 'd'))
205         {
206             lsx_fail("Invalid unit specified");
207             return(SOX_EOF);
208         }
209         if ((silence->stop_unit == '%') && ((silence->stop_threshold < 0.0) ||
210                     (silence->stop_threshold > 100.0)))
211         {
212             lsx_fail("silence threshold should be between 0.0 and 100.0 %%");
213             return (SOX_EOF);
214         }
215         if ((silence->stop_unit == 'd') && (silence->stop_threshold >= 0.0))
216         {
217             lsx_fail("silence threshold should be less than 0.0 dB");
218             return(SOX_EOF);
219         }
220     }
221     return(SOX_SUCCESS);
222 }
223 
224 static int sox_silence_start(sox_effect_t * effp)
225 {
226     priv_t *silence = (priv_t *)effp->priv;
227     uint64_t temp;
228 
229     /* When you want to remove silence, small window sizes are
230      * better or else RMS will look like non-silence at
231      * aburpt changes from load to silence.
232      */
233     silence->window_size = (effp->in_signal.rate / 50) *
234         effp->in_signal.channels;
235     silence->window = lsx_malloc(silence->window_size * sizeof(double));
236 
237     clear_rms(effp);
238 
239     /* Now that we know sample rate, reparse duration. */
240     if (silence->start)
241     {
242         if (lsx_parsesamples(effp->in_signal.rate, silence->start_duration_str,
243                              &temp, 's') == NULL)
244             return lsx_usage(effp);
245         silence->start_duration = temp * effp->in_signal.channels;
246     }
247     if (silence->stop)
248     {
249         if (lsx_parsesamples(effp->in_signal.rate,silence->stop_duration_str,
250                              &temp,'s') == NULL)
251             return lsx_usage(effp);
252         silence->stop_duration = temp * effp->in_signal.channels;
253     }
254 
255     if (silence->start)
256         silence->mode = SILENCE_TRIM;
257     else
258         silence->mode = SILENCE_COPY;
259 
260     silence->start_holdoff = lsx_malloc(sizeof(sox_sample_t)*silence->start_duration);
261     silence->start_holdoff_offset = 0;
262     silence->start_holdoff_end = 0;
263     silence->start_found_periods = 0;
264 
265     silence->stop_holdoff = lsx_malloc(sizeof(sox_sample_t)*silence->stop_duration);
266     silence->stop_holdoff_offset = 0;
267     silence->stop_holdoff_end = 0;
268     silence->stop_found_periods = 0;
269 
270     effp->out_signal.length = SOX_UNKNOWN_LEN; /* depends on input data */
271 
272     return(SOX_SUCCESS);
273 }
274 
275 static sox_bool aboveThreshold(sox_effect_t const * effp,
276     sox_sample_t value /* >= 0 */, double threshold, int unit)
277 {
278   /* When scaling low bit data, noise values got scaled way up */
279   /* Only consider the original bits when looking for silence */
280   sox_sample_t masked_value = value & (-1 << (32 - effp->in_signal.precision));
281 
282   double scaled_value = (double)masked_value / SOX_SAMPLE_MAX;
283 
284   if (unit == '%')
285     scaled_value *= 100;
286   else if (unit == 'd')
287     scaled_value = linear_to_dB(scaled_value);
288 
289   return scaled_value > threshold;
290 }
291 
292 static sox_sample_t compute_rms(sox_effect_t * effp, sox_sample_t sample)
293 {
294     priv_t * silence = (priv_t *) effp->priv;
295     double new_sum;
296     sox_sample_t rms;
297 
298     new_sum = silence->rms_sum;
299     new_sum -= *silence->window_current;
300     new_sum += ((double)sample * (double)sample);
301 
302     rms = sqrt(new_sum / silence->window_size);
303 
304     return (rms);
305 }
306 
307 static void update_rms(sox_effect_t * effp, sox_sample_t sample)
308 {
309     priv_t * silence = (priv_t *) effp->priv;
310 
311     silence->rms_sum -= *silence->window_current;
312     *silence->window_current = ((double)sample * (double)sample);
313     silence->rms_sum += *silence->window_current;
314 
315     silence->window_current++;
316     if (silence->window_current >= silence->window_end)
317         silence->window_current = silence->window;
318 }
319 
320 /* Process signed long samples from ibuf to obuf. */
321 /* Return number of samples processed in isamp and osamp. */
322 static int sox_silence_flow(sox_effect_t * effp, const sox_sample_t *ibuf, sox_sample_t *obuf,
323                     size_t *isamp, size_t *osamp)
324 {
325     priv_t * silence = (priv_t *) effp->priv;
326     int threshold;
327     size_t i, j;
328     size_t nrOfTicks, /* sometimes wide, sometimes non-wide samples */
329       nrOfInSamplesRead, nrOfOutSamplesWritten; /* non-wide samples */
330 
331     nrOfInSamplesRead = 0;
332     nrOfOutSamplesWritten = 0;
333 
334     switch (silence->mode)
335     {
336         case SILENCE_TRIM:
337             /* Reads and discards all input data until it detects a
338              * sample that is above the specified threshold.  Turns on
339              * copy mode when detected.
340              * Need to make sure and copy input in groups of "channels" to
341              * prevent getting buffers out of sync.
342              * nrOfTicks counts wide samples here.
343              */
344 silence_trim:
345             nrOfTicks = min((*isamp-nrOfInSamplesRead),
346                             (*osamp-nrOfOutSamplesWritten)) /
347                            effp->in_signal.channels;
348             for(i = 0; i < nrOfTicks; i++)
349             {
350                 threshold = 0;
351                 for (j = 0; j < effp->in_signal.channels; j++)
352                 {
353                     threshold |= aboveThreshold(effp,
354                                                 compute_rms(effp, ibuf[j]),
355                                                 silence->start_threshold,
356                                                 silence->start_unit);
357                 }
358 
359                 if (threshold)
360                 {
361                     /* Add to holdoff buffer */
362                     for (j = 0; j < effp->in_signal.channels; j++)
363                     {
364                         update_rms(effp, *ibuf);
365                         silence->start_holdoff[
366                             silence->start_holdoff_end++] = *ibuf++;
367                         nrOfInSamplesRead++;
368                     }
369 
370                     if (silence->start_holdoff_end >=
371                             silence->start_duration)
372                     {
373                         if (++silence->start_found_periods >=
374                                 silence->start_periods)
375                         {
376                             silence->mode = SILENCE_TRIM_FLUSH;
377                             goto silence_trim_flush;
378                         }
379                         /* Trash holdoff buffer since its not
380                          * needed.  Start looking again.
381                          */
382                         silence->start_holdoff_offset = 0;
383                         silence->start_holdoff_end = 0;
384                     }
385                 }
386                 else /* !above Threshold */
387                 {
388                     silence->start_holdoff_end = 0;
389                     for (j = 0; j < effp->in_signal.channels; j++)
390                     {
391                         update_rms(effp, ibuf[j]);
392                     }
393                     ibuf += effp->in_signal.channels;
394                     nrOfInSamplesRead += effp->in_signal.channels;
395                 }
396             } /* for nrOfTicks */
397             break;
398 
399         case SILENCE_TRIM_FLUSH:
400              /* nrOfTicks counts non-wide samples here. */
401 silence_trim_flush:
402             nrOfTicks = min((silence->start_holdoff_end -
403                              silence->start_holdoff_offset),
404                              (*osamp-nrOfOutSamplesWritten));
405             nrOfTicks -= nrOfTicks % effp->in_signal.channels;
406             for(i = 0; i < nrOfTicks; i++)
407             {
408                 *obuf++ = silence->start_holdoff[silence->start_holdoff_offset++];
409                 nrOfOutSamplesWritten++;
410             }
411 
412             /* If fully drained holdoff then switch to copy mode */
413             if (silence->start_holdoff_offset == silence->start_holdoff_end)
414             {
415                 silence->start_holdoff_offset = 0;
416                 silence->start_holdoff_end = 0;
417                 silence->mode = SILENCE_COPY;
418                 goto silence_copy;
419             }
420             break;
421 
422         case SILENCE_COPY:
423             /* Attempts to copy samples into output buffer.
424              *
425              * Case B:
426              * If not looking for silence to terminate copy then
427              * blindly copy data into output buffer.
428              *
429              * Case A:
430              *
431              * Case 1a:
432              * If previous silence was detect then see if input sample is
433              * above threshold.  If found then flush out hold off buffer
434              * and copy over to output buffer.
435              *
436              * Case 1b:
437              * If no previous silence detect then see if input sample
438              * is above threshold.  If found then copy directly
439              * to output buffer.
440              *
441              * Case 2:
442              * If not above threshold then silence is detect so
443              * store in hold off buffer and do not write to output
444              * buffer.  Even though it wasn't put in output
445              * buffer, inform user that input was consumed.
446              *
447              * If hold off buffer is full after this then stop
448              * copying data and discard data in hold off buffer.
449              *
450              * Special leave_silence logic:
451              *
452              * During this mode, go ahead and copy input
453              * samples to output buffer instead of holdoff buffer
454              * Then also short ciruit any flushes that would occur
455              * when non-silence is detect since samples were already
456              * copied.  This has the effect of always leaving
457              * holdoff[] amount of silence but deleting any
458              * beyond that amount.
459              *
460              * nrOfTicks counts wide samples here.
461              */
462 silence_copy:
463             nrOfTicks = min((*isamp-nrOfInSamplesRead),
464                             (*osamp-nrOfOutSamplesWritten)) /
465                            effp->in_signal.channels;
466             if (silence->stop)
467             {
468                 /* Case A */
469                 for(i = 0; i < nrOfTicks; i++)
470                 {
471                     threshold = 1;
472                     for (j = 0; j < effp->in_signal.channels; j++)
473                     {
474                         threshold &= aboveThreshold(effp,
475                                                     compute_rms(effp, ibuf[j]),
476                                                     silence->stop_threshold,
477                                                     silence->stop_unit);
478                     }
479 
480                     /* Case 1a
481                      * If above threshold, check to see if we where holding
482                      * off previously.  If so then flush this buffer.
483                      * We haven't incremented any pointers yet so nothing
484                      * is lost.
485                      *
486                      * If user wants to leave_silence, then we
487                      * were already copying the data and so no
488                      * need to flush the old data.  Just resume
489                      * copying as if we were not holding off.
490                      */
491                     if (threshold && silence->stop_holdoff_end
492                         && !silence->leave_silence)
493                     {
494                         silence->mode = SILENCE_COPY_FLUSH;
495                         goto silence_copy_flush;
496                     }
497                     /* Case 1b */
498                     else if (threshold)
499                     {
500                         /* Not holding off so copy into output buffer */
501                         for (j = 0; j < effp->in_signal.channels; j++)
502                         {
503                             update_rms(effp, *ibuf);
504                             *obuf++ = *ibuf++;
505                             nrOfInSamplesRead++;
506                             nrOfOutSamplesWritten++;
507                         }
508                     }
509                     /* Case 2 */
510                     else if (!threshold)
511                     {
512                         /* Add to holdoff buffer */
513                         for (j = 0; j < effp->in_signal.channels; j++)
514                         {
515                             update_rms(effp, *ibuf);
516                             if (silence->leave_silence) {
517                                 *obuf++ = *ibuf;
518                                 nrOfOutSamplesWritten++;
519                             }
520                             silence->stop_holdoff[
521                                 silence->stop_holdoff_end++] = *ibuf++;
522                             nrOfInSamplesRead++;
523                         }
524 
525                         /* Check if holdoff buffer is greater than duration
526                          */
527                         if (silence->stop_holdoff_end >=
528                                 silence->stop_duration)
529                         {
530                             /* Increment found counter and see if this
531                              * is the last period.  If so then exit.
532                              */
533                             if (++silence->stop_found_periods >=
534                                     silence->stop_periods)
535                             {
536                                 silence->stop_holdoff_offset = 0;
537                                 silence->stop_holdoff_end = 0;
538                                 if (!silence->restart)
539                                 {
540                                     *isamp = nrOfInSamplesRead;
541                                     *osamp = nrOfOutSamplesWritten;
542                                     silence->mode = SILENCE_STOP;
543                                     /* Return SOX_EOF since no more processing */
544                                     return (SOX_EOF);
545                                 }
546                                 else
547                                 {
548                                     silence->stop_found_periods = 0;
549                                     silence->start_found_periods = 0;
550                                     silence->start_holdoff_offset = 0;
551                                     silence->start_holdoff_end = 0;
552                                     clear_rms(effp);
553                                     silence->mode = SILENCE_TRIM;
554 
555                                     goto silence_trim;
556                                 }
557                             }
558                             else
559                             {
560                                 /* Flush this buffer and start
561                                  * looking again.
562                                  */
563                                 silence->mode = SILENCE_COPY_FLUSH;
564                                 goto silence_copy_flush;
565                             }
566                             break;
567                         } /* Filled holdoff buffer */
568                     } /* Detected silence */
569                 } /* For # of samples */
570             } /* Trimming off backend */
571             else /* !(silence->stop) */
572             {
573                 /* Case B */
574                 memcpy(obuf, ibuf, sizeof(sox_sample_t)*nrOfTicks*
575                                    effp->in_signal.channels);
576                 nrOfInSamplesRead += (nrOfTicks*effp->in_signal.channels);
577                 nrOfOutSamplesWritten += (nrOfTicks*effp->in_signal.channels);
578             }
579             break;
580 
581         case SILENCE_COPY_FLUSH:
582              /* nrOfTicks counts non-wide samples here. */
583 silence_copy_flush:
584             nrOfTicks = min((silence->stop_holdoff_end -
585                                 silence->stop_holdoff_offset),
586                             (*osamp-nrOfOutSamplesWritten));
587             nrOfTicks -= nrOfTicks % effp->in_signal.channels;
588 
589             for(i = 0; i < nrOfTicks; i++)
590             {
591                 *obuf++ = silence->stop_holdoff[silence->stop_holdoff_offset++];
592                 nrOfOutSamplesWritten++;
593             }
594 
595             /* If fully drained holdoff then return to copy mode */
596             if (silence->stop_holdoff_offset == silence->stop_holdoff_end)
597             {
598                 silence->stop_holdoff_offset = 0;
599                 silence->stop_holdoff_end = 0;
600                 silence->mode = SILENCE_COPY;
601                 goto silence_copy;
602             }
603             break;
604 
605         case SILENCE_STOP:
606             /* This code can't be reached. */
607             nrOfInSamplesRead = *isamp;
608             break;
609         }
610 
611         *isamp = nrOfInSamplesRead;
612         *osamp = nrOfOutSamplesWritten;
613 
614         return (SOX_SUCCESS);
615 }
616 
617 static int sox_silence_drain(sox_effect_t * effp, sox_sample_t *obuf, size_t *osamp)
618 {
619     priv_t * silence = (priv_t *) effp->priv;
620     size_t i;
621     size_t nrOfTicks, nrOfOutSamplesWritten = 0; /* non-wide samples */
622 
623     /* Only if in flush mode will there be possible samples to write
624      * out during drain() call.
625      */
626     if (silence->mode == SILENCE_COPY_FLUSH ||
627         silence->mode == SILENCE_COPY)
628     {
629         nrOfTicks = min((silence->stop_holdoff_end -
630                             silence->stop_holdoff_offset), *osamp);
631         nrOfTicks -= nrOfTicks % effp->in_signal.channels;
632         for(i = 0; i < nrOfTicks; i++)
633         {
634             *obuf++ = silence->stop_holdoff[silence->stop_holdoff_offset++];
635             nrOfOutSamplesWritten++;
636         }
637 
638         /* If fully drained holdoff then stop */
639         if (silence->stop_holdoff_offset == silence->stop_holdoff_end)
640         {
641             silence->stop_holdoff_offset = 0;
642             silence->stop_holdoff_end = 0;
643             silence->mode = SILENCE_STOP;
644         }
645     }
646 
647     *osamp = nrOfOutSamplesWritten;
648     if (silence->mode == SILENCE_STOP || *osamp == 0)
649         return SOX_EOF;
650     else
651         return SOX_SUCCESS;
652 }
653 
654 static int sox_silence_stop(sox_effect_t * effp)
655 {
656   priv_t * silence = (priv_t *) effp->priv;
657 
658   free(silence->window);
659   free(silence->start_holdoff);
660   free(silence->stop_holdoff);
661 
662   return(SOX_SUCCESS);
663 }
664 
665 static int lsx_kill(sox_effect_t * effp)
666 {
667   priv_t * silence = (priv_t *) effp->priv;
668 
669   free(silence->start_duration_str);
670   free(silence->stop_duration_str);
671 
672   return SOX_SUCCESS;
673 }
674 
675 static sox_effect_handler_t sox_silence_effect = {
676   "silence",
677   "[ -l ] above_periods [ duration threshold[d|%] ] [ below_periods duration threshold[d|%] ]",
678   SOX_EFF_MCHAN | SOX_EFF_MODIFY | SOX_EFF_LENGTH,
679   sox_silence_getopts,
680   sox_silence_start,
681   sox_silence_flow,
682   sox_silence_drain,
683   sox_silence_stop,
684   lsx_kill, sizeof(priv_t)
685 };
686 
687 const sox_effect_handler_t *lsx_silence_effect_fn(void)
688 {
689     return &sox_silence_effect;
690 }
691