1 /***********************************************************************
2 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
3 Redistribution and use in source and binary forms, with or without
4 modification, are permitted provided that the following conditions
5 are met:
6 - Redistributions of source code must retain the above copyright notice,
7 this list of conditions and the following disclaimer.
8 - Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
12 names of specific contributors, may be used to endorse or promote
13 products derived from this software without specific prior written
14 permission.
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 POSSIBILITY OF SUCH DAMAGE.
26 ***********************************************************************/
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include "main.h"
33 
34 /* Silk VAD noise level estimation */
35 static inline void silk_VAD_GetNoiseLevels(
36     const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
37     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
38 );
39 
40 /**********************************/
41 /* Initialization of the Silk VAD */
42 /**********************************/
43 opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
44     silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
45 )
46 {
47     opus_int b, ret = 0;
48 
49     /* reset state memory */
50     silk_memset( psSilk_VAD, 0, sizeof( silk_VAD_state ) );
51 
52     /* init noise levels */
53     /* Initialize array with approx pink noise levels (psd proportional to inverse of frequency) */
54     for( b = 0; b < VAD_N_BANDS; b++ ) {
55         psSilk_VAD->NoiseLevelBias[ b ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, b + 1 ), 1 );
56     }
57 
58     /* Initialize state */
59     for( b = 0; b < VAD_N_BANDS; b++ ) {
60         psSilk_VAD->NL[ b ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ b ] );
61         psSilk_VAD->inv_NL[ b ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ b ] );
62     }
63     psSilk_VAD->counter = 15;
64 
65     /* init smoothed energy-to-noise ratio*/
66     for( b = 0; b < VAD_N_BANDS; b++ ) {
67         psSilk_VAD->NrgRatioSmth_Q8[ b ] = 100 * 256;       /* 100 * 256 --> 20 dB SNR */
68     }
69 
70     return( ret );
71 }
72 
73 /* Weighting factors for tilt measure */
74 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
75 
76 /***************************************/
77 /* Get the speech activity level in Q8 */
78 /***************************************/
79 opus_int silk_VAD_GetSA_Q8(                                     /* O    Return value, 0 if success                  */
80     silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
81     const opus_int16            pIn[]                           /* I    PCM input                                   */
82 )
83 {
84     opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
85     opus_int   decimated_framelength, dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
86     opus_int32 sumSquared, smooth_coef_Q16;
87     opus_int16 HPstateTmp;
88     opus_int16 X[ VAD_N_BANDS ][ MAX_FRAME_LENGTH / 2 ];
89     opus_int32 Xnrg[ VAD_N_BANDS ];
90     opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
91     opus_int32 speech_nrg, x_tmp;
92     opus_int   ret = 0;
93     silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
94 
95     /* Safety checks */
96     silk_assert( VAD_N_BANDS == 4 );
97     silk_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
98     silk_assert( psEncC->frame_length <= 512 );
99     silk_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
100 
101     /***********************/
102     /* Filter and Decimate */
103     /***********************/
104     /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
105     silk_ana_filt_bank_1( pIn,          &psSilk_VAD->AnaState[  0 ], &X[ 0 ][ 0 ], &X[ 3 ][ 0 ], psEncC->frame_length );
106 
107     /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
108     silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState1[ 0 ], &X[ 0 ][ 0 ], &X[ 2 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 1 ) );
109 
110     /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
111     silk_ana_filt_bank_1( &X[ 0 ][ 0 ], &psSilk_VAD->AnaState2[ 0 ], &X[ 0 ][ 0 ], &X[ 1 ][ 0 ], silk_RSHIFT( psEncC->frame_length, 2 ) );
112 
113     /*********************************************/
114     /* HP filter on lowest band (differentiator) */
115     /*********************************************/
116     decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
117     X[ 0 ][ decimated_framelength - 1 ] = silk_RSHIFT( X[ 0 ][ decimated_framelength - 1 ], 1 );
118     HPstateTmp = X[ 0 ][ decimated_framelength - 1 ];
119     for( i = decimated_framelength - 1; i > 0; i-- ) {
120         X[ 0 ][ i - 1 ]  = silk_RSHIFT( X[ 0 ][ i - 1 ], 1 );
121         X[ 0 ][ i ]     -= X[ 0 ][ i - 1 ];
122     }
123     X[ 0 ][ 0 ] -= psSilk_VAD->HPstate;
124     psSilk_VAD->HPstate = HPstateTmp;
125 
126     /*************************************/
127     /* Calculate the energy in each band */
128     /*************************************/
129     for( b = 0; b < VAD_N_BANDS; b++ ) {
130         /* Find the decimated framelength in the non-uniformly divided bands */
131         decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
132 
133         /* Split length into subframe lengths */
134         dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
135         dec_subframe_offset = 0;
136 
137         /* Compute energy per sub-frame */
138         /* initialize with summed energy of last subframe */
139         Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
140         for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
141             sumSquared = 0;
142             for( i = 0; i < dec_subframe_length; i++ ) {
143                 /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
144                 /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
145                 x_tmp = silk_RSHIFT( X[ b ][ i + dec_subframe_offset ], 3 );
146                 sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
147 
148                 /* Safety check */
149                 silk_assert( sumSquared >= 0 );
150             }
151 
152             /* Add/saturate summed energy of current subframe */
153             if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
154                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
155             } else {
156                 /* Look-ahead subframe */
157                 Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
158             }
159 
160             dec_subframe_offset += dec_subframe_length;
161         }
162         psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
163     }
164 
165     /********************/
166     /* Noise estimation */
167     /********************/
168     silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
169 
170     /***********************************************/
171     /* Signal-plus-noise to noise ratio estimation */
172     /***********************************************/
173     sumSquared = 0;
174     input_tilt = 0;
175     for( b = 0; b < VAD_N_BANDS; b++ ) {
176         speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
177         if( speech_nrg > 0 ) {
178             /* Divide, with sufficient resolution */
179             if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
180                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
181             } else {
182                 NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
183             }
184 
185             /* Convert to log domain */
186             SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
187 
188             /* Sum-of-squares */
189             sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
190 
191             /* Tilt measure */
192             if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
193                 /* Scale down SNR value for small subband speech energies */
194                 SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
195             }
196             input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
197         } else {
198             NrgToNoiseRatio_Q8[ b ] = 256;
199         }
200     }
201 
202     /* Mean-of-squares */
203     sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
204 
205     /* Root-mean-square approximation, scale to dBs, and write to output pointer */
206     pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
207 
208     /*********************************/
209     /* Speech Probability Estimation */
210     /*********************************/
211     SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
212 
213     /**************************/
214     /* Frequency Tilt Measure */
215     /**************************/
216     psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
217 
218     /**************************************************/
219     /* Scale the sigmoid output based on power levels */
220     /**************************************************/
221     speech_nrg = 0;
222     for( b = 0; b < VAD_N_BANDS; b++ ) {
223         /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
224         speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
225     }
226 
227     /* Power scaling */
228     if( speech_nrg <= 0 ) {
229         SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
230     } else if( speech_nrg < 32768 ) {
231         if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
232             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 16 );
233         } else {
234             speech_nrg = silk_LSHIFT_SAT32( speech_nrg, 15 );
235         }
236 
237         /* square-root */
238         speech_nrg = silk_SQRT_APPROX( speech_nrg );
239         SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
240     }
241 
242     /* Copy the resulting speech activity in Q8 */
243     psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
244 
245     /***********************************/
246     /* Energy Level and SNR estimation */
247     /***********************************/
248     /* Smoothing coefficient */
249     smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
250 
251     if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
252         smooth_coef_Q16 >>= 1;
253     }
254 
255     for( b = 0; b < VAD_N_BANDS; b++ ) {
256         /* compute smoothed energy-to-noise ratio per band */
257         psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
258             NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
259 
260         /* signal to noise ratio in dB per band */
261         SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
262         /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
263         psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
264     }
265 
266     return( ret );
267 }
268 
269 /**************************/
270 /* Noise level estimation */
271 /**************************/
272 static inline void silk_VAD_GetNoiseLevels(
273     const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
274     silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
275 )
276 {
277     opus_int   k;
278     opus_int32 nl, nrg, inv_nrg;
279     opus_int   coef, min_coef;
280 
281     /* Initially faster smoothing */
282     if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
283         min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
284     } else {
285         min_coef = 0;
286     }
287 
288     for( k = 0; k < VAD_N_BANDS; k++ ) {
289         /* Get old noise level estimate for current band */
290         nl = psSilk_VAD->NL[ k ];
291         silk_assert( nl >= 0 );
292 
293         /* Add bias */
294         nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
295         silk_assert( nrg > 0 );
296 
297         /* Invert energies */
298         inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
299         silk_assert( inv_nrg >= 0 );
300 
301         /* Less update when subband energy is high */
302         if( nrg > silk_LSHIFT( nl, 3 ) ) {
303             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
304         } else if( nrg < nl ) {
305             coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
306         } else {
307             coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
308         }
309 
310         /* Initially faster smoothing */
311         coef = silk_max_int( coef, min_coef );
312 
313         /* Smooth inverse energies */
314         psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
315         silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
316 
317         /* Compute noise level by inverting again */
318         nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
319         silk_assert( nl >= 0 );
320 
321         /* Limit noise levels (guarantee 7 bits of head room) */
322         nl = silk_min( nl, 0x00FFFFFF );
323 
324         /* Store as part of state */
325         psSilk_VAD->NL[ k ] = nl;
326     }
327 
328     /* Increment frame counter */
329     psSilk_VAD->counter++;
330 }
331