1 /* Copyright (c) 2011 Xiph.Org Foundation
2    Written by Jean-Marc Valin */
3 /*
4    Redistribution and use in source and binary forms, with or without
5    modification, are permitted provided that the following conditions
6    are met:
7 
8    - Redistributions of source code must retain the above copyright
9    notice, this list of conditions and the following disclaimer.
10 
11    - Redistributions in binary form must reproduce the above copyright
12    notice, this list of conditions and the following disclaimer in the
13    documentation and/or other materials provided with the distribution.
14 
15    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
19    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include "kiss_fft.h"
33 #include "celt.h"
34 #include "modes.h"
35 #include "arch.h"
36 #include "quant_bands.h"
37 #include <stdio.h>
38 #include "analysis.h"
39 #include "mlp.h"
40 #include "stack_alloc.h"
41 
42 #ifndef M_PI
43 #define M_PI 3.141592653
44 #endif
45 
46 static const float dct_table[128] = {
47         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
48         0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f,
49         0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f,
50        -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f,
51         0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f,
52        -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f,
53         0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f,
54         0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f,
55         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
56         0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f,
57         0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f,
58        -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f,
59         0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f,
60        -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f,
61         0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f,
62         0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f,
63 };
64 
65 static const float analysis_window[240] = {
66       0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f,
67       0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f,
68       0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f,
69       0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f,
70       0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f,
71       0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f,
72       0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f,
73       0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f,
74       0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f,
75       0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f,
76       0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f,
77       0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f,
78       0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f,
79       0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f,
80       0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f,
81       0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f,
82       0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f,
83       0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f,
84       0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f,
85       0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f,
86       0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f,
87       0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f,
88       0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f,
89       0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f,
90       0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f,
91       0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f,
92       0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f,
93       0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f,
94       0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f,
95       0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f,
96 };
97 
98 static const int tbands[NB_TBANDS+1] = {
99        2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120
100 };
101 
102 static const int extra_bands[NB_TOT_BANDS+1] = {
103       1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200
104 };
105 
106 /*static const float tweight[NB_TBANDS+1] = {
107       .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5
108 };*/
109 
110 #define NB_TONAL_SKIP_BANDS 9
111 
112 #define cA 0.43157974f
113 #define cB 0.67848403f
114 #define cC 0.08595542f
115 #define cE ((float)M_PI/2)
fast_atan2f(float y,float x)116 static OPUS_INLINE float fast_atan2f(float y, float x) {
117    float x2, y2;
118    /* Should avoid underflow on the values we'll get */
119    if (ABS16(x)+ABS16(y)<1e-9f)
120    {
121       x*=1e12f;
122       y*=1e12f;
123    }
124    x2 = x*x;
125    y2 = y*y;
126    if(x2<y2){
127       float den = (y2 + cB*x2) * (y2 + cC*x2);
128       if (den!=0)
129          return -x*y*(y2 + cA*x2) / den + (y<0 ? -cE : cE);
130       else
131          return (y<0 ? -cE : cE);
132    }else{
133       float den = (x2 + cB*y2) * (x2 + cC*y2);
134       if (den!=0)
135          return  x*y*(x2 + cA*y2) / den + (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
136       else
137          return (y<0 ? -cE : cE) - (x*y<0 ? -cE : cE);
138    }
139 }
140 
tonality_analysis_init(TonalityAnalysisState * tonal)141 void tonality_analysis_init(TonalityAnalysisState *tonal)
142 {
143   /* Initialize reusable fields. */
144   tonal->arch = opus_select_arch();
145   /* Clear remaining fields. */
146   tonality_analysis_reset(tonal);
147 }
148 
tonality_analysis_reset(TonalityAnalysisState * tonal)149 void tonality_analysis_reset(TonalityAnalysisState *tonal)
150 {
151   /* Clear non-reusable fields. */
152   char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START;
153   OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal));
154 }
155 
tonality_get_info(TonalityAnalysisState * tonal,AnalysisInfo * info_out,int len)156 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len)
157 {
158    int pos;
159    int curr_lookahead;
160    float psum;
161    int i;
162 
163    pos = tonal->read_pos;
164    curr_lookahead = tonal->write_pos-tonal->read_pos;
165    if (curr_lookahead<0)
166       curr_lookahead += DETECT_SIZE;
167 
168    if (len > 480 && pos != tonal->write_pos)
169    {
170       pos++;
171       if (pos==DETECT_SIZE)
172          pos=0;
173    }
174    if (pos == tonal->write_pos)
175       pos--;
176    if (pos<0)
177       pos = DETECT_SIZE-1;
178    OPUS_COPY(info_out, &tonal->info[pos], 1);
179    tonal->read_subframe += len/120;
180    while (tonal->read_subframe>=4)
181    {
182       tonal->read_subframe -= 4;
183       tonal->read_pos++;
184    }
185    if (tonal->read_pos>=DETECT_SIZE)
186       tonal->read_pos-=DETECT_SIZE;
187 
188    /* Compensate for the delay in the features themselves.
189       FIXME: Need a better estimate the 10 I just made up */
190    curr_lookahead = IMAX(curr_lookahead-10, 0);
191 
192    psum=0;
193    /* Summing the probability of transition patterns that involve music at
194       time (DETECT_SIZE-curr_lookahead-1) */
195    for (i=0;i<DETECT_SIZE-curr_lookahead;i++)
196       psum += tonal->pmusic[i];
197    for (;i<DETECT_SIZE;i++)
198       psum += tonal->pspeech[i];
199    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;
200    /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/
201 
202    info_out->music_prob = psum;
203 }
204 
tonality_analysis(TonalityAnalysisState * tonal,const CELTMode * celt_mode,const void * x,int len,int offset,int c1,int c2,int C,int lsb_depth,downmix_func downmix)205 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
206 {
207     int i, b;
208     const kiss_fft_state *kfft;
209     VARDECL(kiss_fft_cpx, in);
210     VARDECL(kiss_fft_cpx, out);
211     int N = 480, N2=240;
212     float * OPUS_RESTRICT A = tonal->angle;
213     float * OPUS_RESTRICT dA = tonal->d_angle;
214     float * OPUS_RESTRICT d2A = tonal->d2_angle;
215     VARDECL(float, tonality);
216     VARDECL(float, noisiness);
217     float band_tonality[NB_TBANDS];
218     float logE[NB_TBANDS];
219     float BFCC[8];
220     float features[25];
221     float frame_tonality;
222     float max_frame_tonality;
223     /*float tw_sum=0;*/
224     float frame_noisiness;
225     const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI);
226     float slope=0;
227     float frame_stationarity;
228     float relativeE;
229     float frame_probs[2];
230     float alpha, alphaE, alphaE2;
231     float frame_loudness;
232     float bandwidth_mask;
233     int bandwidth=0;
234     float maxE = 0;
235     float noise_floor;
236     int remaining;
237     AnalysisInfo *info;
238     SAVE_STACK;
239 
240     tonal->last_transition++;
241     alpha = 1.f/IMIN(20, 1+tonal->count);
242     alphaE = 1.f/IMIN(50, 1+tonal->count);
243     alphaE2 = 1.f/IMIN(1000, 1+tonal->count);
244 
245     if (tonal->count<4)
246        tonal->music_prob = .5;
247     kfft = celt_mode->mdct.kfft[0];
248     if (tonal->count==0)
249        tonal->mem_fill = 240;
250     downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);
251     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)
252     {
253        tonal->mem_fill += len;
254        /* Don't have enough to update the analysis */
255        RESTORE_STACK;
256        return;
257     }
258     info = &tonal->info[tonal->write_pos++];
259     if (tonal->write_pos>=DETECT_SIZE)
260        tonal->write_pos-=DETECT_SIZE;
261 
262     ALLOC(in, 480, kiss_fft_cpx);
263     ALLOC(out, 480, kiss_fft_cpx);
264     ALLOC(tonality, 240, float);
265     ALLOC(noisiness, 240, float);
266     for (i=0;i<N2;i++)
267     {
268        float w = analysis_window[i];
269        in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]);
270        in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]);
271        in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]);
272        in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]);
273     }
274     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);
275     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
276     downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
277     tonal->mem_fill = 240 + remaining;
278     opus_fft(kfft, in, out, tonal->arch);
279 #ifndef FIXED_POINT
280     /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
281     if (celt_isnan(out[0].r))
282     {
283        info->valid = 0;
284        RESTORE_STACK;
285        return;
286     }
287 #endif
288 
289     for (i=1;i<N2;i++)
290     {
291        float X1r, X2r, X1i, X2i;
292        float angle, d_angle, d2_angle;
293        float angle2, d_angle2, d2_angle2;
294        float mod1, mod2, avg_mod;
295        X1r = (float)out[i].r+out[N-i].r;
296        X1i = (float)out[i].i-out[N-i].i;
297        X2r = (float)out[i].i+out[N-i].i;
298        X2i = (float)out[N-i].r-out[i].r;
299 
300        angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r);
301        d_angle = angle - A[i];
302        d2_angle = d_angle - dA[i];
303 
304        angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r);
305        d_angle2 = angle2 - angle;
306        d2_angle2 = d_angle2 - d_angle;
307 
308        mod1 = d2_angle - (float)floor(.5+d2_angle);
309        noisiness[i] = ABS16(mod1);
310        mod1 *= mod1;
311        mod1 *= mod1;
312 
313        mod2 = d2_angle2 - (float)floor(.5+d2_angle2);
314        noisiness[i] += ABS16(mod2);
315        mod2 *= mod2;
316        mod2 *= mod2;
317 
318        avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);
319        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;
320 
321        A[i] = angle2;
322        dA[i] = d_angle2;
323        d2A[i] = mod2;
324     }
325 
326     frame_tonality = 0;
327     max_frame_tonality = 0;
328     /*tw_sum = 0;*/
329     info->activity = 0;
330     frame_noisiness = 0;
331     frame_stationarity = 0;
332     if (!tonal->count)
333     {
334        for (b=0;b<NB_TBANDS;b++)
335        {
336           tonal->lowE[b] = 1e10;
337           tonal->highE[b] = -1e10;
338        }
339     }
340     relativeE = 0;
341     frame_loudness = 0;
342     for (b=0;b<NB_TBANDS;b++)
343     {
344        float E=0, tE=0, nE=0;
345        float L1, L2;
346        float stationarity;
347        for (i=tbands[b];i<tbands[b+1];i++)
348        {
349           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
350                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
351 #ifdef FIXED_POINT
352           /* FIXME: It's probably best to change the BFCC filter initial state instead */
353           binE *= 5.55e-17f;
354 #endif
355           E += binE;
356           tE += binE*tonality[i];
357           nE += binE*2.f*(.5f-noisiness[i]);
358        }
359 #ifndef FIXED_POINT
360        /* Check for extreme band energies that could cause NaNs later. */
361        if (!(E<1e9f) || celt_isnan(E))
362        {
363           info->valid = 0;
364           RESTORE_STACK;
365           return;
366        }
367 #endif
368 
369        tonal->E[tonal->E_count][b] = E;
370        frame_noisiness += nE/(1e-15f+E);
371 
372        frame_loudness += (float)sqrt(E+1e-10f);
373        logE[b] = (float)log(E+1e-10f);
374        tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);
375        tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);
376        if (tonal->highE[b] < tonal->lowE[b]+1.f)
377        {
378           tonal->highE[b]+=.5f;
379           tonal->lowE[b]-=.5f;
380        }
381        relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);
382 
383        L1=L2=0;
384        for (i=0;i<NB_FRAMES;i++)
385        {
386           L1 += (float)sqrt(tonal->E[i][b]);
387           L2 += tonal->E[i][b];
388        }
389 
390        stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2));
391        stationarity *= stationarity;
392        stationarity *= stationarity;
393        frame_stationarity += stationarity;
394        /*band_tonality[b] = tE/(1e-15+E)*/;
395        band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]);
396 #if 0
397        if (b>=NB_TONAL_SKIP_BANDS)
398        {
399           frame_tonality += tweight[b]*band_tonality[b];
400           tw_sum += tweight[b];
401        }
402 #else
403        frame_tonality += band_tonality[b];
404        if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS)
405           frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS];
406 #endif
407        max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality);
408        slope += band_tonality[b]*(b-8);
409        /*printf("%f %f ", band_tonality[b], stationarity);*/
410        tonal->prev_band_tonality[b] = band_tonality[b];
411     }
412 
413     bandwidth_mask = 0;
414     bandwidth = 0;
415     maxE = 0;
416     noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8)));
417 #ifdef FIXED_POINT
418     noise_floor *= 1<<(15+SIG_SHIFT);
419 #endif
420     noise_floor *= noise_floor;
421     for (b=0;b<NB_TOT_BANDS;b++)
422     {
423        float E=0;
424        int band_start, band_end;
425        /* Keep a margin of 300 Hz for aliasing */
426        band_start = extra_bands[b];
427        band_end = extra_bands[b+1];
428        for (i=band_start;i<band_end;i++)
429        {
430           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r
431                      + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i;
432           E += binE;
433        }
434        maxE = MAX32(maxE, E);
435        tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);
436        E = MAX32(E, tonal->meanE[b]);
437        /* Use a simple follower with 13 dB/Bark slope for spreading function */
438        bandwidth_mask = MAX32(.05f*bandwidth_mask, E);
439        /* Consider the band "active" only if all these conditions are met:
440           1) less than 10 dB below the simple follower
441           2) less than 90 dB below the peak band (maximal masking possible considering
442              both the ATH and the loudness-dependent slope of the spreading function)
443           3) above the PCM quantization noise floor
444        */
445        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))
446           bandwidth = b;
447     }
448     if (tonal->count<=2)
449        bandwidth = 20;
450     frame_loudness = 20*(float)log10(frame_loudness);
451     tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);
452     tonal->lowECount *= (1-alphaE);
453     if (frame_loudness < tonal->Etracker-30)
454        tonal->lowECount += alphaE;
455 
456     for (i=0;i<8;i++)
457     {
458        float sum=0;
459        for (b=0;b<16;b++)
460           sum += dct_table[i*16+b]*logE[b];
461        BFCC[i] = sum;
462     }
463 
464     frame_stationarity /= NB_TBANDS;
465     relativeE /= NB_TBANDS;
466     if (tonal->count<10)
467        relativeE = .5;
468     frame_noisiness /= NB_TBANDS;
469 #if 1
470     info->activity = frame_noisiness + (1-frame_noisiness)*relativeE;
471 #else
472     info->activity = .5*(1+frame_noisiness-frame_stationarity);
473 #endif
474     frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS));
475     frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f);
476     tonal->prev_tonality = frame_tonality;
477 
478     slope /= 8*8;
479     info->tonality_slope = slope;
480 
481     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;
482     tonal->count++;
483     info->tonality = frame_tonality;
484 
485     for (i=0;i<4;i++)
486        features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i];
487 
488     for (i=0;i<4;i++)
489        tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i];
490 
491     for (i=0;i<4;i++)
492         features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]);
493     for (i=0;i<3;i++)
494         features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8];
495 
496     if (tonal->count > 5)
497     {
498        for (i=0;i<9;i++)
499           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];
500     }
501 
502     for (i=0;i<8;i++)
503     {
504        tonal->mem[i+24] = tonal->mem[i+16];
505        tonal->mem[i+16] = tonal->mem[i+8];
506        tonal->mem[i+8] = tonal->mem[i];
507        tonal->mem[i] = BFCC[i];
508     }
509     for (i=0;i<9;i++)
510        features[11+i] = (float)sqrt(tonal->std[i]);
511     features[20] = info->tonality;
512     features[21] = info->activity;
513     features[22] = frame_stationarity;
514     features[23] = info->tonality_slope;
515     features[24] = tonal->lowECount;
516 
517 #ifndef DISABLE_FLOAT_API
518     mlp_process(&net, features, frame_probs);
519     frame_probs[0] = .5f*(frame_probs[0]+1);
520     /* Curve fitting between the MLP probability and the actual probability */
521     frame_probs[0] = .01f + 1.21f*frame_probs[0]*frame_probs[0] - .23f*(float)pow(frame_probs[0], 10);
522     /* Probability of active audio (as opposed to silence) */
523     frame_probs[1] = .5f*frame_probs[1]+.5f;
524     /* Consider that silence has a 50-50 probability. */
525     frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;
526 
527     /*printf("%f %f ", frame_probs[0], frame_probs[1]);*/
528     {
529        /* Probability of state transition */
530        float tau;
531        /* Represents independence of the MLP probabilities, where
532           beta=1 means fully independent. */
533        float beta;
534        /* Denormalized probability of speech (p0) and music (p1) after update */
535        float p0, p1;
536        /* Probabilities for "all speech" and "all music" */
537        float s0, m0;
538        /* Probability sum for renormalisation */
539        float psum;
540        /* Instantaneous probability of speech and music, with beta pre-applied. */
541        float speech0;
542        float music0;
543        float p, q;
544 
545        /* One transition every 3 minutes of active audio */
546        tau = .00005f*frame_probs[1];
547        /* Adapt beta based on how "unexpected" the new prob is */
548        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));
549        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));
550        beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));
551        /* p0 and p1 are the probabilities of speech and music at this frame
552           using only information from previous frame and applying the
553           state transition model */
554        p0 = (1-tonal->music_prob)*(1-tau) +    tonal->music_prob *tau;
555        p1 =    tonal->music_prob *(1-tau) + (1-tonal->music_prob)*tau;
556        /* We apply the current probability with exponent beta to work around
557           the fact that the probability estimates aren't independent. */
558        p0 *= (float)pow(1-frame_probs[0], beta);
559        p1 *= (float)pow(frame_probs[0], beta);
560        /* Normalise the probabilities to get the Marokv probability of music. */
561        tonal->music_prob = p1/(p0+p1);
562        info->music_prob = tonal->music_prob;
563 
564        /* This chunk of code deals with delayed decision. */
565        psum=1e-20f;
566        /* Instantaneous probability of speech and music, with beta pre-applied. */
567        speech0 = (float)pow(1-frame_probs[0], beta);
568        music0  = (float)pow(frame_probs[0], beta);
569        if (tonal->count==1)
570        {
571           tonal->pspeech[0]=.5;
572           tonal->pmusic [0]=.5;
573        }
574        /* Updated probability of having only speech (s0) or only music (m0),
575           before considering the new observation. */
576        s0 = tonal->pspeech[0] + tonal->pspeech[1];
577        m0 = tonal->pmusic [0] + tonal->pmusic [1];
578        /* Updates s0 and m0 with instantaneous probability. */
579        tonal->pspeech[0] = s0*(1-tau)*speech0;
580        tonal->pmusic [0] = m0*(1-tau)*music0;
581        /* Propagate the transition probabilities */
582        for (i=1;i<DETECT_SIZE-1;i++)
583        {
584           tonal->pspeech[i] = tonal->pspeech[i+1]*speech0;
585           tonal->pmusic [i] = tonal->pmusic [i+1]*music0;
586        }
587        /* Probability that the latest frame is speech, when all the previous ones were music. */
588        tonal->pspeech[DETECT_SIZE-1] = m0*tau*speech0;
589        /* Probability that the latest frame is music, when all the previous ones were speech. */
590        tonal->pmusic [DETECT_SIZE-1] = s0*tau*music0;
591 
592        /* Renormalise probabilities to 1 */
593        for (i=0;i<DETECT_SIZE;i++)
594           psum += tonal->pspeech[i] + tonal->pmusic[i];
595        psum = 1.f/psum;
596        for (i=0;i<DETECT_SIZE;i++)
597        {
598           tonal->pspeech[i] *= psum;
599           tonal->pmusic [i] *= psum;
600        }
601        psum = tonal->pmusic[0];
602        for (i=1;i<DETECT_SIZE;i++)
603           psum += tonal->pspeech[i];
604 
605        /* Estimate our confidence in the speech/music decisions */
606        if (frame_probs[1]>.75)
607        {
608           if (tonal->music_prob>.9)
609           {
610              float adapt;
611              adapt = 1.f/(++tonal->music_confidence_count);
612              tonal->music_confidence_count = IMIN(tonal->music_confidence_count, 500);
613              tonal->music_confidence += adapt*MAX16(-.2f,frame_probs[0]-tonal->music_confidence);
614           }
615           if (tonal->music_prob<.1)
616           {
617              float adapt;
618              adapt = 1.f/(++tonal->speech_confidence_count);
619              tonal->speech_confidence_count = IMIN(tonal->speech_confidence_count, 500);
620              tonal->speech_confidence += adapt*MIN16(.2f,frame_probs[0]-tonal->speech_confidence);
621           }
622        } else {
623           if (tonal->music_confidence_count==0)
624              tonal->music_confidence = .9f;
625           if (tonal->speech_confidence_count==0)
626              tonal->speech_confidence = .1f;
627        }
628     }
629     if (tonal->last_music != (tonal->music_prob>.5f))
630        tonal->last_transition=0;
631     tonal->last_music = tonal->music_prob>.5f;
632 #else
633     info->music_prob = 0;
634 #endif
635     /*for (i=0;i<25;i++)
636        printf("%f ", features[i]);
637     printf("\n");*/
638 
639     info->bandwidth = bandwidth;
640     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/
641     info->noisiness = frame_noisiness;
642     info->valid = 1;
643     RESTORE_STACK;
644 }
645 
run_analysis(TonalityAnalysisState * analysis,const CELTMode * celt_mode,const void * analysis_pcm,int analysis_frame_size,int frame_size,int c1,int c2,int C,opus_int32 Fs,int lsb_depth,downmix_func downmix,AnalysisInfo * analysis_info)646 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm,
647                  int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs,
648                  int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info)
649 {
650    int offset;
651    int pcm_len;
652 
653    if (analysis_pcm != NULL)
654    {
655       /* Avoid overflow/wrap-around of the analysis buffer */
656       analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);
657 
658       pcm_len = analysis_frame_size - analysis->analysis_offset;
659       offset = analysis->analysis_offset;
660       do {
661          tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);
662          offset += 480;
663          pcm_len -= 480;
664       } while (pcm_len>0);
665       analysis->analysis_offset = analysis_frame_size;
666 
667       analysis->analysis_offset -= frame_size;
668    }
669 
670    analysis_info->valid = 0;
671    tonality_get_info(analysis, analysis_info, frame_size);
672 }
673