1 /* Copyright (c) 2011 Xiph.Org Foundation
2 Written by Jean-Marc Valin */
3 /*
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions
6 are met:
7
8 - Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10
11 - Redistributions in binary form must reproduce the above copyright
12 notice, this list of conditions and the following disclaimer in the
13 documentation and/or other materials provided with the distribution.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31
32 #define ANALYSIS_C
33
34 #include <stdio.h>
35
36 #include "mathops.h"
37 #include "kiss_fft.h"
38 #include "celt.h"
39 #include "modes.h"
40 #include "arch.h"
41 #include "quant_bands.h"
42 #include "analysis.h"
43 #include "mlp.h"
44 #include "stack_alloc.h"
45 #include "float_cast.h"
46
47 #ifndef M_PI
48 #define M_PI 3.141592653
49 #endif
50
51 #ifndef DISABLE_FLOAT_API
52
53 #define TRANSITION_PENALTY 10
54
55 static const float dct_table[128] = {
56 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
57 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
58 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
59 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
60 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
61 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
62 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
63 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
64 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
65 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
66 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
67 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
68 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
69 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
70 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
71 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
72 };
73
74 static const float analysis_window[240] = {
75 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
76 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
77 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
78 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
79 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
80 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
81 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
82 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
83 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
84 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
85 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
86 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
87 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
88 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
89 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
90 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
91 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
92 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
93 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
94 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
95 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
96 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
97 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
98 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
99 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
100 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
101 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
102 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
103 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
104 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
105 };
106
107 static const int tbands[NB_TBANDS+1] = {
108 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240
109 };
110
111 #define NB_TONAL_SKIP_BANDS 9
112
silk_resampler_down2_hp(opus_val32 * S,opus_val32 * out,const opus_val32 * in,int inLen)113 static opus_val32 silk_resampler_down2_hp(
114 opus_val32 *S, /* I/O State vector [ 2 ] */
115 opus_val32 *out, /* O Output signal [ floor(len/2) ] */
116 const opus_val32 *in, /* I Input signal [ len ] */
117 int inLen /* I Number of input samples */
118 )
119 {
120 int k, len2 = inLen/2;
121 opus_val32 in32, out32, out32_hp, Y, X;
122 opus_val64 hp_ener = 0;
123 /* Internal variables and state are in Q10 format */
124 for( k = 0; k < len2; k++ ) {
125 /* Convert to Q10 */
126 in32 = in[ 2 * k ];
127
128 /* All-pass section for even input sample */
129 Y = SUB32( in32, S[ 0 ] );
130 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);
131 out32 = ADD32( S[ 0 ], X );
132 S[ 0 ] = ADD32( in32, X );
133 out32_hp = out32;
134 /* Convert to Q10 */
135 in32 = in[ 2 * k + 1 ];
136
137 /* All-pass section for odd input sample, and add to output of previous section */
138 Y = SUB32( in32, S[ 1 ] );
139 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
140 out32 = ADD32( out32, S[ 1 ] );
141 out32 = ADD32( out32, X );
142 S[ 1 ] = ADD32( in32, X );
143
144 Y = SUB32( -in32, S[ 2 ] );
145 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);
146 out32_hp = ADD32( out32_hp, S[ 2 ] );
147 out32_hp = ADD32( out32_hp, X );
148 S[ 2 ] = ADD32( -in32, X );
149
150 hp_ener += out32_hp*(opus_val64)out32_hp;
151 /* Add, convert back to int16 and store to output */
152 out[ k ] = HALF32(out32);
153 }
154 #ifdef FIXED_POINT
155 /* len2 can be up to 480, so we shift by 8 more to make it fit. */
156 hp_ener = hp_ener >> (2*SIG_SHIFT + 8);
157 #endif
158 return (opus_val32)hp_ener;
159 }
160
downmix_and_resample(downmix_func downmix,const void * _x,opus_val32 * y,opus_val32 S[3],int subframe,int offset,int c1,int c2,int C,int Fs)161 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)
162 {
163 VARDECL(opus_val32, tmp);
164 opus_val32 scale;
165 int j;
166 opus_val32 ret = 0;
167 SAVE_STACK;
168
169 if (subframe==0) return 0;
170 if (Fs == 48000)
171 {
172 subframe *= 2;
173 offset *= 2;
174 } else if (Fs == 16000) {
175 subframe = subframe*2/3;
176 offset = offset*2/3;
177 }
178 ALLOC(tmp, subframe, opus_val32);
179
180 downmix(_x, tmp, subframe, offset, c1, c2, C);
181 #ifdef FIXED_POINT
182 scale = (1<<SIG_SHIFT);
183 #else
184 scale = 1.f/32768;
185 #endif
186 if (c2==-2)
187 scale /= C;
188 else if (c2>-1)
189 scale /= 2;
190 for (j=0;j<subframe;j++)
191 tmp[j] *= scale;
192 if (Fs == 48000)
193 {
194 ret = silk_resampler_down2_hp(S, y, tmp, subframe);
195 } else if (Fs == 24000) {
196 OPUS_COPY(y, tmp, subframe);
197 } else if (Fs == 16000) {
198 VARDECL(opus_val32, tmp3x);
199 ALLOC(tmp3x, 3*subframe, opus_val32);
200 /* Don't do this at home! This resampler is horrible and it's only (barely)
201 usable for the purpose of the analysis because we don't care about all
202 the aliasing between 8 kHz and 12 kHz. */
203 for (j=0;j<subframe;j++)
204 {
205 tmp3x[3*j] = tmp[j];
206 tmp3x[3*j+1] = tmp[j];
207 tmp3x[3*j+2] = tmp[j];
208 }
209 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe);
210 }
211 RESTORE_STACK;
212 return ret;
213 }
214
tonality_analysis_init(TonalityAnalysisState * tonal,opus_int32 Fs)215 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)
216 {
217 /* Initialize reusable fields. */
218 tonal->arch = opus_select_arch();
219 tonal->Fs = Fs;
220 /* Clear remaining fields. */
221 tonality_analysis_reset(tonal);
222 }
223
tonality_analysis_reset(TonalityAnalysisState * tonal)224 void tonality_analysis_reset(TonalityAnalysisState *tonal)
225 {
226 /* Clear non-reusable fields. */
227 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
228 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
229 }
230
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)231 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
232 {
233 int pos;
234 int curr_lookahead;
235 float tonality_max;
236 float tonality_avg;
237 int tonality_count;
238 int i;
239 int pos0;
240 float prob_avg;
241 float prob_count;
242 float prob_min, prob_max;
243 float vad_prob;
244 int mpos, vpos;
245 int bandwidth_span;
246
247 pos = tonal->read_pos;
248 curr_lookahead = tonal->write_pos-tonal->read_pos;
249 if (curr_lookahead<0)
250 curr_lookahead += DETECT_SIZE;
251
252 tonal->read_subframe += len/(tonal->Fs/400);
253 while (tonal->read_subframe>=8)
254 {
255 tonal->read_subframe -= 8;
256 tonal->read_pos++;
257 }
258 if (tonal->read_pos>=DETECT_SIZE)
259 tonal->read_pos-=DETECT_SIZE;
260
261 /* On long frames, look at the second analysis window rather than the first. */
262 if (len > tonal->Fs/50 && pos != tonal->write_pos)
263 {
264 pos++;
265 if (pos==DETECT_SIZE)
266 pos=0;
267 }
268 if (pos == tonal->write_pos)
269 pos--;
270 if (pos<0)
271 pos = DETECT_SIZE-1;
272 pos0 = pos;
273 OPUS_COPY(info_out, &tonal->info[pos], 1);
274 if (!info_out->valid)
275 return;
276 tonality_max = tonality_avg = info_out->tonality;
277 tonality_count = 1;
278 /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
279 bandwidth_span = 6;
280 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */
281 for (i=0;i<3;i++)
282 {
283 pos++;
284 if (pos==DETECT_SIZE)
285 pos = 0;
286 if (pos == tonal->write_pos)
287 break;
288 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality);
289 tonality_avg += tonal->info[pos].tonality;
290 tonality_count++;
291 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
292 bandwidth_span--;
293 }
294 pos = pos0;
295 /* Look back in time to see if any has a wider bandwidth than the current frame. */
296 for (i=0;i<bandwidth_span;i++)
297 {
298 pos--;
299 if (pos < 0)
300 pos = DETECT_SIZE-1;
301 if (pos == tonal->write_pos)
302 break;
303 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth);
304 }
305 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f);
306
307 mpos = vpos = pos0;
308 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and
309 ~1 frame delay in the VAD prob. */
310 if (curr_lookahead > 15)
311 {
312 mpos += 5;
313 if (mpos>=DETECT_SIZE)
314 mpos -= DETECT_SIZE;
315 vpos += 1;
316 if (vpos>=DETECT_SIZE)
317 vpos -= DETECT_SIZE;
318 }
319
320 /* The following calculations attempt to minimize a "badness function"
321 for the transition. When switching from speech to music, the badness
322 of switching at frame k is
323 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
324 where
325 v_i is the activity probability (VAD) at frame i,
326 p_i is the music probability at frame i
327 T is the probability threshold for switching
328 S is the penalty for switching during active audio rather than silence
329 the current frame has index i=0
330
331 Rather than apply badness to directly decide when to switch, what we compute
332 instead is the threshold for which the optimal switching point is now. When
333 considering whether to switch now (frame 0) or at frame k, we have:
334 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T)
335 which gives us:
336 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i )
337 We take the min threshold across all positive values of k (up to the maximum
338 amount of lookahead we have) to give us the threshold for which the current
339 frame is the optimal switch point.
340
341 The last step is that we need to consider whether we want to switch at all.
342 For that we use the average of the music probability over the entire window.
343 If the threshold is higher than that average we're not going to
344 switch, so we compute a min with the average as well. The result of all these
345 min operations is music_prob_min, which gives the threshold for switching to music
346 if we're currently encoding for speech.
347
348 We do the exact opposite to compute music_prob_max which is used for switching
349 from music to speech.
350 */
351 prob_min = 1.f;
352 prob_max = 0.f;
353 vad_prob = tonal->info[vpos].activity_probability;
354 prob_count = MAX16(.1f, vad_prob);
355 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob;
356 while (1)
357 {
358 float pos_vad;
359 mpos++;
360 if (mpos==DETECT_SIZE)
361 mpos = 0;
362 if (mpos == tonal->write_pos)
363 break;
364 vpos++;
365 if (vpos==DETECT_SIZE)
366 vpos = 0;
367 if (vpos == tonal->write_pos)
368 break;
369 pos_vad = tonal->info[vpos].activity_probability;
370 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min);
371 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max);
372 prob_count += MAX16(.1f, pos_vad);
373 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob;
374 }
375 info_out->music_prob = prob_avg/prob_count;
376 prob_min = MIN16(prob_avg/prob_count, prob_min);
377 prob_max = MAX16(prob_avg/prob_count, prob_max);
378 prob_min = MAX16(prob_min, 0.f);
379 prob_max = MIN16(prob_max, 1.f);
380
381 /* If we don't have enough look-ahead, do our best to make a decent decision. */
382 if (curr_lookahead < 10)
383 {
384 float pmin, pmax;
385 pmin = prob_min;
386 pmax = prob_max;
387 pos = pos0;
388 /* Look for min/max in the past. */
389 for (i=0;i<IMIN(tonal->count-1, 15);i++)
390 {
391 pos--;
392 if (pos < 0)
393 pos = DETECT_SIZE-1;
394 pmin = MIN16(pmin, tonal->info[pos].music_prob);
395 pmax = MAX16(pmax, tonal->info[pos].music_prob);
396 }
397 /* Bias against switching on active audio. */
398 pmin = MAX16(0.f, pmin - .1f*vad_prob);
399 pmax = MIN16(1.f, pmax + .1f*vad_prob);
400 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min);
401 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max);
402 }
403 info_out->music_prob_min = prob_min;
404 info_out->music_prob_max = prob_max;
405
406 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
407 }
408
409 static const float std_feature_bias[9] = {
410 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f,
411 2.163313f, 1.260756f, 1.116868f, 1.918795f
412 };
413
414 #define LEAKAGE_OFFSET 2.5f
415 #define LEAKAGE_SLOPE 2.f
416
417 #ifdef FIXED_POINT
418 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to
419 compensate for that in the energy. */
420 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT)))
421 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e))
422 #else
423 #define SCALE_ENER(e) (e)
424 #endif
425
426 #ifdef FIXED_POINT
is_digital_silence32(const opus_val32 * pcm,int frame_size,int channels,int lsb_depth)427 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
428 {
429 int silence = 0;
430 opus_val32 sample_max = 0;
431 #ifdef MLP_TRAINING
432 return 0;
433 #endif
434 sample_max = celt_maxabs32(pcm, frame_size*channels);
435
436 silence = (sample_max == 0);
437 (void)lsb_depth;
438 return silence;
439 }
440 #else
441 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
442 #endif
443
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)444 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
445 {
446 int i, b;
447 const kiss_fft_state *kfft;
448 VARDECL(kiss_fft_cpx, in);
449 VARDECL(kiss_fft_cpx, out);
450 int N = 480, N2=240;
451 float * OPUS_RESTRICT A = tonal->angle;
452 float * OPUS_RESTRICT dA = tonal->d_angle;
453 float * OPUS_RESTRICT d2A = tonal->d2_angle;
454 VARDECL(float, tonality);
455 VARDECL(float, noisiness);
456 float band_tonality[NB_TBANDS];
457 float logE[NB_TBANDS];
458 float BFCC[8];
459 float features[25];
460 float frame_tonality;
461 float max_frame_tonality;
462 /*float tw_sum=0;*/
463 float frame_noisiness;
464 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
465 float slope=0;
466 float frame_stationarity;
467 float relativeE;
468 float frame_probs[2];
469 float alpha, alphaE, alphaE2;
470 float frame_loudness;
471 float bandwidth_mask;
472 int is_masked[NB_TBANDS+1];
473 int bandwidth=0;
474 float maxE = 0;
475 float noise_floor;
476 int remaining;
477 AnalysisInfo *info;
478 float hp_ener;
479 float tonality2[240];
480 float midE[8];
481 float spec_variability=0;
482 float band_log2[NB_TBANDS+1];
483 float leakage_from[NB_TBANDS+1];
484 float leakage_to[NB_TBANDS+1];
485 float layer_out[MAX_NEURONS];
486 float below_max_pitch;
487 float above_max_pitch;
488 int is_silence;
489 SAVE_STACK;
490
491 if (!tonal->initialized)
492 {
493 tonal->mem_fill = 240;
494 tonal->initialized = 1;
495 }
496 alpha = 1.f/IMIN(10, 1+tonal->count);
497 alphaE = 1.f/IMIN(25, 1+tonal->count);
498 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */
499 alphaE2 = 1.f/IMIN(100, 1+tonal->count);
500 if (tonal->count <= 1) alphaE2 = 1;
501
502 if (tonal->Fs == 48000)
503 {
504 /* len and offset are now at 24 kHz. */
505 len/= 2;
506 offset /= 2;
507 } else if (tonal->Fs == 16000) {
508 len = 3*len/2;
509 offset = 3*offset/2;
510 }
511
512 kfft = celt_mode->mdct.kfft[0];
513 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x,
514 &tonal->inmem[tonal->mem_fill], tonal->downmix_state,
515 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);
516 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
517 {
518 tonal->mem_fill += len;
519 /* Don't have enough to update the analysis */
520 RESTORE_STACK;
521 return;
522 }
523 hp_ener = tonal->hp_ener_accum;
524 info = &tonal->info[tonal->write_pos++];
525 if (tonal->write_pos>=DETECT_SIZE)
526 tonal->write_pos-=DETECT_SIZE;
527
528 is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
529
530 ALLOC(in, 480, kiss_fft_cpx);
531 ALLOC(out, 480, kiss_fft_cpx);
532 ALLOC(tonality, 240, float);
533 ALLOC(noisiness, 240, float);
534 for (i=0;i<N2;i++)
535 {
536 float w = analysis_window[i];
537 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
538 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
539 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
540 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
541 }
542 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
543 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
544 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x,
545 &tonal->inmem[240], tonal->downmix_state, remaining,
546 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
547 tonal->mem_fill = 240 + remaining;
548 if (is_silence)
549 {
550 /* On silence, copy the previous analysis. */
551 int prev_pos = tonal->write_pos-2;
552 if (prev_pos < 0)
553 prev_pos += DETECT_SIZE;
554 OPUS_COPY(info, &tonal->info[prev_pos], 1);
555 RESTORE_STACK;
556 return;
557 }
558 opus_fft(kfft, in, out, tonal->arch);
559 #ifndef FIXED_POINT
560 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
561 if (celt_isnan(out[0].r))
562 {
563 info->valid = 0;
564 RESTORE_STACK;
565 return;
566 }
567 #endif
568
569 for (i=1;i<N2;i++)
570 {
571 float X1r, X2r, X1i, X2i;
572 float angle, d_angle, d2_angle;
573 float angle2, d_angle2, d2_angle2;
574 float mod1, mod2, avg_mod;
575 X1r = (float)out[i].r+out[N-i].r;
576 X1i = (float)out[i].i-out[N-i].i;
577 X2r = (float)out[i].i+out[N-i].i;
578 X2i = (float)out[N-i].r-out[i].r;
579
580 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
581 d_angle = angle - A[i];
582 d2_angle = d_angle - dA[i];
583
584 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
585 d_angle2 = angle2 - angle;
586 d2_angle2 = d_angle2 - d_angle;
587
588 mod1 = d2_angle - (float)float2int(d2_angle);
589 noisiness[i] = ABS16(mod1);
590 mod1 *= mod1;
591 mod1 *= mod1;
592
593 mod2 = d2_angle2 - (float)float2int(d2_angle2);
594 noisiness[i] += ABS16(mod2);
595 mod2 *= mod2;
596 mod2 *= mod2;
597
598 avg_mod = .25f*(d2A[i]+mod1+2*mod2);
599 /* This introduces an extra delay of 2 frames in the detection. */
600 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
601 /* No delay on this detection, but it's less reliable. */
602 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;
603
604 A[i] = angle2;
605 dA[i] = d_angle2;
606 d2A[i] = mod2;
607 }
608 for (i=2;i<N2-1;i++)
609 {
610 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));
611 tonality[i] = .9f*MAX32(tonality[i], tt-.1f);
612 }
613 frame_tonality = 0;
614 max_frame_tonality = 0;
615 /*tw_sum = 0;*/
616 info->activity = 0;
617 frame_noisiness = 0;
618 frame_stationarity = 0;
619 if (!tonal->count)
620 {
621 for (b=0;b<NB_TBANDS;b++)
622 {
623 tonal->lowE[b] = 1e10;
624 tonal->highE[b] = -1e10;
625 }
626 }
627 relativeE = 0;
628 frame_loudness = 0;
629 /* The energy of the very first band is special because of DC. */
630 {
631 float E = 0;
632 float X1r, X2r;
633 X1r = 2*(float)out[0].r;
634 X2r = 2*(float)out[0].i;
635 E = X1r*X1r + X2r*X2r;
636 for (i=1;i<4;i++)
637 {
638 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
639 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
640 E += binE;
641 }
642 E = SCALE_ENER(E);
643 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f);
644 }
645 for (b=0;b<NB_TBANDS;b++)
646 {
647 float E=0, tE=0, nE=0;
648 float L1, L2;
649 float stationarity;
650 for (i=tbands[b];i<tbands[b+1];i++)
651 {
652 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
653 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
654 binE = SCALE_ENER(binE);
655 E += binE;
656 tE += binE*MAX32(0, tonality[i]);
657 nE += binE*2.f*(.5f-noisiness[i]);
658 }
659 #ifndef FIXED_POINT
660 /* Check for extreme band energies that could cause NaNs later. */
661 if (!(E<1e9f) || celt_isnan(E))
662 {
663 info->valid = 0;
664 RESTORE_STACK;
665 return;
666 }
667 #endif
668
669 tonal->E[tonal->E_count][b] = E;
670 frame_noisiness += nE/(1e-15f+E);
671
672 frame_loudness += (float)sqrt(E+1e-10f);
673 logE[b] = (float)log(E+1e-10f);
674 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f);
675 tonal->logE[tonal->E_count][b] = logE[b];
676 if (tonal->count==0)
677 tonal->highE[b] = tonal->lowE[b] = logE[b];
678 if (tonal->highE[b] > tonal->lowE[b] + 7.5)
679 {
680 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])
681 tonal->highE[b] -= .01f;
682 else
683 tonal->lowE[b] += .01f;
684 }
685 if (logE[b] > tonal->highE[b])
686 {
687 tonal->highE[b] = logE[b];
688 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);
689 } else if (logE[b] < tonal->lowE[b])
690 {
691 tonal->lowE[b] = logE[b];
692 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);
693 }
694 relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b]));
695
696 L1=L2=0;
697 for (i=0;i<NB_FRAMES;i++)
698 {
699 L1 += (float)sqrt(tonal->E[i][b]);
700 L2 += tonal->E[i][b];
701 }
702
703 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
704 stationarity *= stationarity;
705 stationarity *= stationarity;
706 frame_stationarity += stationarity;
707 /*band_tonality[b] = tE/(1e-15+E)*/;
708 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
709 #if 0
710 if (b>=NB_TONAL_SKIP_BANDS)
711 {
712 frame_tonality += tweight[b]*band_tonality[b];
713 tw_sum += tweight[b];
714 }
715 #else
716 frame_tonality += band_tonality[b];
717 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
718 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
719 #endif
720 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
721 slope += band_tonality[b]*(b-8);
722 /*printf("%f %f ", band_tonality[b], stationarity);*/
723 tonal->prev_band_tonality[b] = band_tonality[b];
724 }
725
726 leakage_from[0] = band_log2[0];
727 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET;
728 for (b=1;b<NB_TBANDS+1;b++)
729 {
730 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4;
731 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]);
732 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET);
733 }
734 for (b=NB_TBANDS-2;b>=0;b--)
735 {
736 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4;
737 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]);
738 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]);
739 }
740 celt_assert(NB_TBANDS+1 <= LEAK_BANDS);
741 for (b=0;b<NB_TBANDS+1;b++)
742 {
743 /* leak_boost[] is made up of two terms. The first, based on leakage_to[],
744 represents the boost needed to overcome the amount of analysis leakage
745 cause in a weaker band b by louder neighbouring bands.
746 The second, based on leakage_from[], applies to a loud band b for
747 which the quantization noise causes synthesis leakage to the weaker
748 neighbouring bands. */
749 float boost = MAX16(0, leakage_to[b] - band_log2[b]) +
750 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET));
751 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost));
752 }
753 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0;
754
755 for (i=0;i<NB_FRAMES;i++)
756 {
757 int j;
758 float mindist = 1e15f;
759 for (j=0;j<NB_FRAMES;j++)
760 {
761 int k;
762 float dist=0;
763 for (k=0;k<NB_TBANDS;k++)
764 {
765 float tmp;
766 tmp = tonal->logE[i][k] - tonal->logE[j][k];
767 dist += tmp*tmp;
768 }
769 if (j!=i)
770 mindist = MIN32(mindist, dist);
771 }
772 spec_variability += mindist;
773 }
774 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS);
775 bandwidth_mask = 0;
776 bandwidth = 0;
777 maxE = 0;
778 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
779 noise_floor *= noise_floor;
780 below_max_pitch=0;
781 above_max_pitch=0;
782 for (b=0;b<NB_TBANDS;b++)
783 {
784 float E=0;
785 float Em;
786 int band_start, band_end;
787 /* Keep a margin of 300 Hz for aliasing */
788 band_start = tbands[b];
789 band_end = tbands[b+1];
790 for (i=band_start;i<band_end;i++)
791 {
792 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
793 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
794 E += binE;
795 }
796 E = SCALE_ENER(E);
797 maxE = MAX32(maxE, E);
798 if (band_start < 64)
799 {
800 below_max_pitch += E;
801 } else {
802 above_max_pitch += E;
803 }
804 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
805 Em = MAX32(E, tonal->meanE[b]);
806 /* Consider the band "active" only if all these conditions are met:
807 1) less than 90 dB below the peak band (maximal masking possible considering
808 both the ATH and the loudness-dependent slope of the spreading function)
809 2) above the PCM quantization noise floor
810 We use b+1 because the first CELT band isn't included in tbands[]
811 */
812 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start)))
813 bandwidth = b+1;
814 /* Check if the band is masked (see below). */
815 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask;
816 /* Use a simple follower with 13 dB/Bark slope for spreading function. */
817 bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
818 }
819 /* Special case for the last two bands, for which we don't have spectrum but only
820 the energy above 12 kHz. The difficulty here is that the high-pass we use
821 leaks some LF energy, so we need to increase the threshold without accidentally cutting
822 off the band. */
823 if (tonal->Fs == 48000) {
824 float noise_ratio;
825 float Em;
826 float E = hp_ener*(1.f/(60*60));
827 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f;
828
829 #ifdef FIXED_POINT
830 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */
831 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE);
832 #endif
833 above_max_pitch += E;
834 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
835 Em = MAX32(E, tonal->meanE[b]);
836 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160)
837 bandwidth = 20;
838 /* Check if the band is masked (see below). */
839 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask;
840 }
841 if (above_max_pitch > below_max_pitch)
842 info->max_pitch_ratio = below_max_pitch/above_max_pitch;
843 else
844 info->max_pitch_ratio = 1;
845 /* In some cases, resampling aliasing can create a small amount of energy in the first band
846 being cut. So if the last band is masked, we don't include it. */
847 if (bandwidth == 20 && is_masked[NB_TBANDS])
848 bandwidth-=2;
849 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1])
850 bandwidth--;
851 if (tonal->count<=2)
852 bandwidth = 20;
853 frame_loudness = 20*(float)log10(frame_loudness);
854 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);
855 tonal->lowECount *= (1-alphaE);
856 if (frame_loudness < tonal->Etracker-30)
857 tonal->lowECount += alphaE;
858
859 for (i=0;i<8;i++)
860 {
861 float sum=0;
862 for (b=0;b<16;b++)
863 sum += dct_table[i*16+b]*logE[b];
864 BFCC[i] = sum;
865 }
866 for (i=0;i<8;i++)
867 {
868 float sum=0;
869 for (b=0;b<16;b++)
870 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]);
871 midE[i] = sum;
872 }
873
874 frame_stationarity /= NB_TBANDS;
875 relativeE /= NB_TBANDS;
876 if (tonal->count<10)
877 relativeE = .5f;
878 frame_noisiness /= NB_TBANDS;
879 #if 1
880 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
881 #else
882 info->activity = .5*(1+frame_noisiness-frame_stationarity);
883 #endif
884 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
885 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
886 tonal->prev_tonality = frame_tonality;
887
888 slope /= 8*8;
889 info->tonality_slope = slope;
890
891 tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
892 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);
893 info->tonality = frame_tonality;
894
895 for (i=0;i<4;i++)
896 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
897
898 for (i=0;i<4;i++)
899 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
900
901 for (i=0;i<4;i++)
902 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
903 for (i=0;i<3;i++)
904 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
905
906 if (tonal->count > 5)
907 {
908 for (i=0;i<9;i++)
909 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
910 }
911 for (i=0;i<4;i++)
912 features[i] = BFCC[i]-midE[i];
913
914 for (i=0;i<8;i++)
915 {
916 tonal->mem[i+24] = tonal->mem[i+16];
917 tonal->mem[i+16] = tonal->mem[i+8];
918 tonal->mem[i+8] = tonal->mem[i];
919 tonal->mem[i] = BFCC[i];
920 }
921 for (i=0;i<9;i++)
922 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];
923 features[18] = spec_variability - 0.78f;
924 features[20] = info->tonality - 0.154723f;
925 features[21] = info->activity - 0.724643f;
926 features[22] = frame_stationarity - 0.743717f;
927 features[23] = info->tonality_slope + 0.069216f;
928 features[24] = tonal->lowECount - 0.067930f;
929
930 compute_dense(&layer0, layer_out, features);
931 compute_gru(&layer1, tonal->rnn_state, layer_out);
932 compute_dense(&layer2, frame_probs, tonal->rnn_state);
933
934 /* Probability of speech or music vs noise */
935 info->activity_probability = frame_probs[1];
936 info->music_prob = frame_probs[0];
937
938 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/
939 #ifdef MLP_TRAINING
940 for (i=0;i<25;i++)
941 printf("%f ", features[i]);
942 printf("\n");
943 #endif
944
945 info->bandwidth = bandwidth;
946 tonal->prev_bandwidth = bandwidth;
947 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
948 info->noisiness = frame_noisiness;
949 info->valid = 1;
950 RESTORE_STACK;
951 }
952
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)953 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
954 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
955 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
956 {
957 int offset;
958 int pcm_len;
959
960 analysis_frame_size -= analysis_frame_size&1;
961 if (analysis_pcm != NULL)
962 {
963 /* Avoid overflow/wrap-around of the analysis buffer */
964 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);
965
966 pcm_len = analysis_frame_size - analysis->analysis_offset;
967 offset = analysis->analysis_offset;
968 while (pcm_len>0) {
969 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
970 offset += Fs/50;
971 pcm_len -= Fs/50;
972 }
973 analysis->analysis_offset = analysis_frame_size;
974
975 analysis->analysis_offset -= frame_size;
976 }
977
978 tonality_get_info(analysis, analysis_info, frame_size);
979 }
980
981 #endif /* DISABLE_FLOAT_API */
982