1 /* Copyright (C) 2003 Epic Games
2    Written by Jean-Marc Valin
3 
4    File: preprocess.c
5    Preprocessor with denoising based on the algorithm by Ephraim and Malah
6 
7    Redistribution and use in source and binary forms, with or without
8    modification, are permitted provided that the following conditions are
9    met:
10 
11    1. Redistributions of source code must retain the above copyright notice,
12    this list of conditions and the following disclaimer.
13 
14    2. Redistributions in binary form must reproduce the above copyright
15    notice, this list of conditions and the following disclaimer in the
16    documentation and/or other materials provided with the distribution.
17 
18    3. The name of the author may not be used to endorse or promote products
19    derived from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23    OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24    DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31    POSSIBILITY OF SUCH DAMAGE.
32 */
33 
34 #ifdef HAVE_CONFIG_H
35 #include "config.h"
36 #endif
37 
38 #include <math.h>
39 #include "speex/speex_preprocess.h"
40 #include "misc.h"
41 #include "smallft.h"
42 
43 #define max(a,b) ((a) > (b) ? (a) : (b))
44 #define min(a,b) ((a) < (b) ? (a) : (b))
45 
46 #ifndef M_PI
47 #define M_PI 3.14159263
48 #endif
49 
50 #define SQRT_M_PI_2 0.88623
51 #define LOUDNESS_EXP 2.5
52 
53 #define SPEEX_PROB_START_DEFAULT    0.35f
54 #define SPEEX_PROB_CONTINUE_DEFAULT 0.20f
55 
56 #define NB_BANDS 8
57 
58 #define ZMIN .1
59 #define ZMAX .316
60 #define ZMIN_1 10
61 #define LOG_MIN_MAX_1 0.86859
62 
conj_window(float * w,int len)63 static void conj_window(float *w, int len)
64 {
65    int i;
66    for (i=0;i<len;i++)
67    {
68       float x=4*((float)i)/len;
69       int inv=0;
70       if (x<1)
71       {
72       } else if (x<2)
73       {
74          x=2-x;
75          inv=1;
76       } else if (x<3)
77       {
78          x=x-2;
79          inv=1;
80       } else {
81          x=4-x;
82       }
83       x*=1.9979;
84       w[i]=(.5-.5*cos(x))*(.5-.5*cos(x));
85       if (inv)
86          w[i]=1-w[i];
87       w[i]=sqrt(w[i]);
88    }
89 }
90 
91 /* This function approximates the gain function
92    y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
93    which multiplied by xi/(1+xi) is the optimal gain
94    in the loudness domain ( sqrt[amplitude] )
95 */
hypergeom_gain(float x)96 static float hypergeom_gain(float x)
97 {
98    int ind;
99    float integer, frac;
100    static const float table[21] = {
101       0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
102       1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
103       2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
104 
105    if (x>9.5)
106       return 1+.12/x;
107 
108    integer = floor(2*x);
109    frac = 2*x-integer;
110    ind = (int)integer;
111    /*if (ind > 20 || ind < 0)
112    fprintf (stderr, "error: %d %f\n", ind, x);*/
113    return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
114 }
115 
speex_preprocess_state_init(int frame_size,int sampling_rate)116 SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
117 {
118    int i;
119    int N, N3, N4;
120 
121    SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
122    st->frame_size = frame_size;
123 
124    /* Round ps_size down to the nearest power of two */
125 #if 0
126    i=1;
127    st->ps_size = st->frame_size;
128    while(1)
129    {
130       if (st->ps_size & ~i)
131       {
132          st->ps_size &= ~i;
133          i<<=1;
134       } else {
135          break;
136       }
137    }
138 
139 
140    if (st->ps_size < 3*st->frame_size/4)
141       st->ps_size = st->ps_size * 3 / 2;
142 #else
143    st->ps_size = st->frame_size;
144 #endif
145 
146    N = st->ps_size;
147    N3 = 2*N - st->frame_size;
148    N4 = st->frame_size - N3;
149 
150    st->sampling_rate = sampling_rate;
151    st->denoise_enabled = 1;
152    st->agc_enabled = 0;
153    st->agc_level = 8000;
154    st->vad_enabled = 0;
155    st->dereverb_enabled = 0;
156    st->reverb_decay = .5;
157    st->reverb_level = .2;
158 
159    st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
160    st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
161 
162    st->frame = (float*)speex_alloc(2*N*sizeof(float));
163    st->ps = (float*)speex_alloc(N*sizeof(float));
164    st->gain2 = (float*)speex_alloc(N*sizeof(float));
165    st->window = (float*)speex_alloc(2*N*sizeof(float));
166    st->noise = (float*)speex_alloc(N*sizeof(float));
167    st->reverb_estimate = (float*)speex_alloc(N*sizeof(float));
168    st->old_ps = (float*)speex_alloc(N*sizeof(float));
169    st->gain = (float*)speex_alloc(N*sizeof(float));
170    st->prior = (float*)speex_alloc(N*sizeof(float));
171    st->post = (float*)speex_alloc(N*sizeof(float));
172    st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
173    st->inbuf = (float*)speex_alloc(N3*sizeof(float));
174    st->outbuf = (float*)speex_alloc(N3*sizeof(float));
175    st->echo_noise = (float*)speex_alloc(N*sizeof(float));
176 
177    st->S = (float*)speex_alloc(N*sizeof(float));
178    st->Smin = (float*)speex_alloc(N*sizeof(float));
179    st->Stmp = (float*)speex_alloc(N*sizeof(float));
180    st->update_prob = (float*)speex_alloc(N*sizeof(float));
181 
182    st->zeta = (float*)speex_alloc(N*sizeof(float));
183    st->Zpeak = 0;
184    st->Zlast = 0;
185 
186    st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
187    st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
188    st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
189    st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
190    st->noise_bandsN = st->speech_bandsN = 1;
191 
192    conj_window(st->window, 2*N3);
193    for (i=2*N3;i<2*st->ps_size;i++)
194       st->window[i]=1;
195 
196    if (N4>0)
197    {
198       for (i=N3-1;i>=0;i--)
199       {
200          st->window[i+N3+N4]=st->window[i+N3];
201          st->window[i+N3]=1;
202       }
203    }
204    for (i=0;i<N;i++)
205    {
206       st->noise[i]=1e4;
207       st->reverb_estimate[i]=0.;
208       st->old_ps[i]=1e4;
209       st->gain[i]=1;
210       st->post[i]=1;
211       st->prior[i]=1;
212    }
213 
214    for (i=0;i<N3;i++)
215    {
216       st->inbuf[i]=0;
217       st->outbuf[i]=0;
218    }
219 
220    for (i=0;i<N;i++)
221    {
222       float ff=((float)i)*.5*sampling_rate/((float)N);
223       st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
224       if (st->loudness_weight[i]<.01f)
225          st->loudness_weight[i]=.01f;
226       st->loudness_weight[i] *= st->loudness_weight[i];
227    }
228 
229    st->speech_prob = 0;
230    st->last_speech = 1000;
231    st->loudness = pow(6000,LOUDNESS_EXP);
232    st->loudness2 = 6000;
233    st->nb_loudness_adapt = 0;
234 
235    st->fft_lookup = (struct drft_lookup*)speex_alloc(sizeof(struct drft_lookup));
236    spx_drft_init(st->fft_lookup,2*N);
237 
238    st->nb_adapt=0;
239    st->consec_noise=0;
240    st->nb_preprocess=0;
241    return st;
242 }
243 
speex_preprocess_state_destroy(SpeexPreprocessState * st)244 void speex_preprocess_state_destroy(SpeexPreprocessState *st)
245 {
246    speex_free(st->frame);
247    speex_free(st->ps);
248    speex_free(st->gain2);
249    speex_free(st->window);
250    speex_free(st->noise);
251    speex_free(st->reverb_estimate);
252    speex_free(st->old_ps);
253    speex_free(st->gain);
254    speex_free(st->prior);
255    speex_free(st->post);
256    speex_free(st->loudness_weight);
257    speex_free(st->echo_noise);
258 
259    speex_free(st->S);
260    speex_free(st->Smin);
261    speex_free(st->Stmp);
262    speex_free(st->update_prob);
263    speex_free(st->zeta);
264 
265    speex_free(st->noise_bands);
266    speex_free(st->noise_bands2);
267    speex_free(st->speech_bands);
268    speex_free(st->speech_bands2);
269 
270    speex_free(st->inbuf);
271    speex_free(st->outbuf);
272 
273    spx_drft_clear(st->fft_lookup);
274    speex_free(st->fft_lookup);
275 
276    speex_free(st);
277 }
278 
update_noise(SpeexPreprocessState * st,float * ps,float * echo)279 static void update_noise(SpeexPreprocessState *st, float *ps, float *echo)
280 {
281    int i;
282    float beta;
283    st->nb_adapt++;
284    beta=1.0f/st->nb_adapt;
285    if (beta < .05f)
286       beta=.05f;
287 
288    if (!echo)
289    {
290       for (i=0;i<st->ps_size;i++)
291          st->noise[i] = (1.f-beta)*st->noise[i] + beta*ps[i];
292    } else {
293       for (i=0;i<st->ps_size;i++)
294          st->noise[i] = (1.f-beta)*st->noise[i] + beta*max(1.f,ps[i]-echo[i]);
295 #if 0
296       for (i=0;i<st->ps_size;i++)
297          st->noise[i] = 0;
298 #endif
299    }
300 }
301 
speex_compute_vad(SpeexPreprocessState * st,float * ps,float mean_prior,float mean_post)302 static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post)
303 {
304    int i, is_speech=0;
305    int N = st->ps_size;
306    float scale=.5f/N;
307 
308    /* FIXME: Clean this up a bit */
309    {
310       float bands[NB_BANDS];
311       int j;
312       float p0, p1;
313       float tot_loudness=0;
314       float x = sqrt(mean_post);
315 
316       for (i=5;i<N-10;i++)
317       {
318          tot_loudness += scale*st->ps[i] * st->loudness_weight[i];
319       }
320 
321       for (i=0;i<NB_BANDS;i++)
322       {
323          bands[i]=1e4f;
324          for (j=i*N/NB_BANDS;j<(i+1)*N/NB_BANDS;j++)
325          {
326             bands[i] += ps[j];
327          }
328          bands[i]=log(bands[i]);
329       }
330 
331       /*p1 = .0005+.6*exp(-.5*(x-.4)*(x-.4)*11)+.1*exp(-1.2*x);
332       if (x<1.5)
333          p0=.1*exp(2*(x-1.5));
334       else
335          p0=.02+.1*exp(-.2*(x-1.5));
336       */
337 
338       p0=1.f/(1.f+exp(3.f*(1.5f-x)));
339       p1=1.f-p0;
340 
341       /*fprintf (stderr, "%f %f ", p0, p1);*/
342       /*p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
343       p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
344 
345       st->speech_prob = p0/(p1+p0);
346       */
347 
348       if (st->noise_bandsN < 50 || st->speech_bandsN < 50)
349       {
350          if (mean_post > 5.f)
351          {
352             float adapt = 1./st->speech_bandsN++;
353             if (adapt<.005f)
354                adapt = .005f;
355             for (i=0;i<NB_BANDS;i++)
356             {
357                st->speech_bands[i] = (1.f-adapt)*st->speech_bands[i] + adapt*bands[i];
358                /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
359                st->speech_bands2[i] = (1.f-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
360             }
361          } else {
362             float adapt = 1./st->noise_bandsN++;
363             if (adapt<.005f)
364                adapt = .005f;
365             for (i=0;i<NB_BANDS;i++)
366             {
367                st->noise_bands[i] = (1.f-adapt)*st->noise_bands[i] + adapt*bands[i];
368                /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
369                st->noise_bands2[i] = (1.f-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
370             }
371          }
372       }
373       p0=p1=1;
374       for (i=0;i<NB_BANDS;i++)
375       {
376          float noise_var, speech_var;
377          float noise_mean, speech_mean;
378          float tmp1, tmp2, pr;
379 
380          /*noise_var = 1.01*st->noise_bands2[i] - st->noise_bands[i]*st->noise_bands[i];
381            speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/
382          noise_var = st->noise_bands2[i];
383          speech_var = st->speech_bands2[i];
384          if (noise_var < .1f)
385             noise_var = .1f;
386          if (speech_var < .1f)
387             speech_var = .1f;
388 
389          /*speech_var = sqrt(speech_var*noise_var);
390            noise_var = speech_var;*/
391          if (speech_var < .05f*speech_var)
392             noise_var = .05f*speech_var;
393          if (speech_var < .05f*noise_var)
394             speech_var = .05f*noise_var;
395 
396          if (bands[i] < st->noise_bands[i])
397             speech_var = noise_var;
398          if (bands[i] > st->speech_bands[i])
399             noise_var = speech_var;
400 
401          speech_mean = st->speech_bands[i];
402          noise_mean = st->noise_bands[i];
403          if (noise_mean < speech_mean - 5.f)
404             noise_mean = speech_mean - 5.f;
405 
406          tmp1 = exp(-.5f*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2.f*M_PI*speech_var);
407          tmp2 = exp(-.5f*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2.f*M_PI*noise_var);
408          /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/
409          /*fprintf (stderr, "%f ", (float)(bands[i]));*/
410          pr = tmp1/(1e-25+tmp1+tmp2);
411          /*if (bands[i] < st->noise_bands[i])
412             pr=.01;
413          if (bands[i] > st->speech_bands[i] && pr < .995)
414          pr=.995;*/
415          if (pr>.999f)
416             pr=.999f;
417          if (pr<.001f)
418             pr=.001f;
419          /*fprintf (stderr, "%f ", pr);*/
420          p0 *= pr;
421          p1 *= (1-pr);
422       }
423 
424       p0 = pow(p0,.2);
425       p1 = pow(p1,.2);
426 
427 #if 1
428       p0 *= 2.f;
429       p0=p0/(p1+p0);
430       if (st->last_speech>20)
431       {
432          float tmp = sqrt(tot_loudness)/st->loudness2;
433          tmp = 1.f-exp(-10.f*tmp);
434          if (p0>tmp)
435             p0=tmp;
436       }
437       p1=1-p0;
438 #else
439       if (sqrt(tot_loudness) < .6f*st->loudness2 && p0>15.f*p1)
440          p0=15.f*p1;
441       if (sqrt(tot_loudness) < .45f*st->loudness2 && p0>7.f*p1)
442          p0=7.f*p1;
443       if (sqrt(tot_loudness) < .3f*st->loudness2 && p0>3.f*p1)
444          p0=3.f*p1;
445       if (sqrt(tot_loudness) < .15f*st->loudness2 && p0>p1)
446          p0=p1;
447       /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/
448 #endif
449 
450       p0 *= .99f*st->speech_prob + .01f*(1-st->speech_prob);
451       p1 *= .01f*st->speech_prob + .99f*(1-st->speech_prob);
452 
453       st->speech_prob = p0/(1e-25f+p1+p0);
454       /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/
455 
456       if (st->speech_prob > st->speech_prob_start
457 	  || (st->last_speech < 20 && st->speech_prob > st->speech_prob_continue))
458       {
459          is_speech = 1;
460          st->last_speech = 0;
461       } else {
462          st->last_speech++;
463          if (st->last_speech<20)
464            is_speech = 1;
465       }
466 
467       if (st->noise_bandsN > 50 && st->speech_bandsN > 50)
468       {
469          if (mean_post > 5)
470          {
471             float adapt = 1./st->speech_bandsN++;
472             if (adapt<.005f)
473                adapt = .005f;
474             for (i=0;i<NB_BANDS;i++)
475             {
476                st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
477                /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
478                st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
479             }
480          } else {
481             float adapt = 1./st->noise_bandsN++;
482             if (adapt<.005f)
483                adapt = .005f;
484             for (i=0;i<NB_BANDS;i++)
485             {
486                st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
487                /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
488                st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
489             }
490          }
491       }
492 
493 
494    }
495 
496    return is_speech;
497 }
498 
speex_compute_agc(SpeexPreprocessState * st,float mean_prior)499 static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
500 {
501    int i;
502    int N = st->ps_size;
503    float scale=.5f/N;
504    float agc_gain;
505    int freq_start, freq_end;
506    float active_bands = 0;
507 
508    freq_start = (int)(300.0f*2*N/st->sampling_rate);
509    freq_end   = (int)(2000.0f*2*N/st->sampling_rate);
510    for (i=freq_start;i<freq_end;i++)
511    {
512       if (st->S[i] > 20.f*st->Smin[i]+1000.f)
513          active_bands+=1;
514    }
515    active_bands /= (freq_end-freq_start+1);
516 
517    if (active_bands > .2f)
518    {
519       float loudness=0.f;
520       float rate, rate2=.2f;
521       st->nb_loudness_adapt++;
522       rate=2.0f/(1+st->nb_loudness_adapt);
523       if (rate < .05f)
524          rate = .05f;
525       if (rate < .1f && pow(loudness, LOUDNESS_EXP) > st->loudness)
526          rate = .1f;
527       if (rate < .2f && pow(loudness, LOUDNESS_EXP) > 3.f*st->loudness)
528          rate = .2f;
529       if (rate < .4f && pow(loudness, LOUDNESS_EXP) > 10.f*st->loudness)
530          rate = .4f;
531 
532       for (i=2;i<N;i++)
533       {
534          loudness += scale*st->ps[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
535       }
536       loudness=sqrt(loudness);
537       /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
538         loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
539       st->loudness = (1-rate)*st->loudness + (rate)*pow(loudness, LOUDNESS_EXP);
540 
541       st->loudness2 = (1-rate2)*st->loudness2 + rate2*pow(st->loudness, 1.0f/LOUDNESS_EXP);
542 
543       loudness = pow(st->loudness, 1.0f/LOUDNESS_EXP);
544 
545       /*fprintf (stderr, "%f %f %f\n", loudness, st->loudness2, rate);*/
546    }
547 
548    agc_gain = st->agc_level/st->loudness2;
549    /*fprintf (stderr, "%f %f %f %f\n", active_bands, st->loudness, st->loudness2, agc_gain);*/
550    if (agc_gain>200)
551       agc_gain = 200;
552 
553    for (i=0;i<N;i++)
554       st->gain2[i] *= agc_gain;
555 
556 }
557 
preprocess_analysis(SpeexPreprocessState * st,spx_int16_t * x)558 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
559 {
560    int i;
561    int N = st->ps_size;
562    int N3 = 2*N - st->frame_size;
563    int N4 = st->frame_size - N3;
564    float *ps=st->ps;
565 
566    /* 'Build' input frame */
567    for (i=0;i<N3;i++)
568       st->frame[i]=st->inbuf[i];
569    for (i=0;i<st->frame_size;i++)
570       st->frame[N3+i]=x[i];
571 
572    /* Update inbuf */
573    for (i=0;i<N3;i++)
574       st->inbuf[i]=x[N4+i];
575 
576    /* Windowing */
577    for (i=0;i<2*N;i++)
578       st->frame[i] *= st->window[i];
579 
580    /* Perform FFT */
581    spx_drft_forward(st->fft_lookup, st->frame);
582 
583    /* Power spectrum */
584    ps[0]=1;
585    for (i=1;i<N;i++)
586       ps[i]=1+st->frame[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i];
587 
588 }
589 
update_noise_prob(SpeexPreprocessState * st)590 static void update_noise_prob(SpeexPreprocessState *st)
591 {
592    int i;
593    int N = st->ps_size;
594 
595    for (i=1;i<N-1;i++)
596       st->S[i] = 100.f+ .8f*st->S[i] + .05f*st->ps[i-1]+.1f*st->ps[i]+.05f*st->ps[i+1];
597 
598    if (st->nb_preprocess<1)
599    {
600       for (i=1;i<N-1;i++)
601          st->Smin[i] = st->Stmp[i] = st->S[i]+100.f;
602    }
603 
604    if (st->nb_preprocess%200==0)
605    {
606       for (i=1;i<N-1;i++)
607       {
608          st->Smin[i] = min(st->Stmp[i], st->S[i]);
609          st->Stmp[i] = st->S[i];
610       }
611    } else {
612       for (i=1;i<N-1;i++)
613       {
614          st->Smin[i] = min(st->Smin[i], st->S[i]);
615          st->Stmp[i] = min(st->Stmp[i], st->S[i]);
616       }
617    }
618    for (i=1;i<N-1;i++)
619    {
620       st->update_prob[i] *= .2f;
621       if (st->S[i] > 2.5*st->Smin[i])
622          st->update_prob[i] += .8f;
623       /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
624       /*fprintf (stderr, "%f ", st->update_prob[i]);*/
625    }
626 
627 }
628 
629 #define NOISE_OVERCOMPENS 1.4
630 
speex_preprocess(SpeexPreprocessState * st,spx_int16_t * x,float * echo)631 int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, float *echo)
632 {
633    int i;
634    int is_speech=1;
635    float mean_post=0;
636    float mean_prior=0;
637    int N = st->ps_size;
638    int N3 = 2*N - st->frame_size;
639    int N4 = st->frame_size - N3;
640    float scale=.5f/N;
641    float *ps=st->ps;
642    float Zframe=0, Pframe;
643 
644    preprocess_analysis(st, x);
645 
646    update_noise_prob(st);
647 
648    st->nb_preprocess++;
649 
650    /* Noise estimation always updated for the 20 first times */
651    if (st->nb_adapt<10)
652    {
653       update_noise(st, ps, echo);
654    }
655 
656    /* Deal with residual echo if provided */
657    if (echo)
658       for (i=1;i<N;i++)
659          st->echo_noise[i] = (.3f*st->echo_noise[i] + echo[i]);
660 
661    /* Compute a posteriori SNR */
662    for (i=1;i<N;i++)
663    {
664       st->post[i] = ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]) - 1.f;
665       if (st->post[i]>100.f)
666          st->post[i]=100.f;
667       /*if (st->post[i]<0)
668         st->post[i]=0;*/
669       mean_post+=st->post[i];
670    }
671    mean_post /= N;
672    if (mean_post<0.f)
673       mean_post=0.f;
674 
675    /* Special case for first frame */
676    if (st->nb_adapt==1)
677       for (i=1;i<N;i++)
678          st->old_ps[i] = ps[i];
679 
680    /* Compute a priori SNR */
681    {
682       /* A priori update rate */
683       float gamma;
684       float min_gamma=0.12f;
685       gamma = 1.0f/st->nb_preprocess;
686 
687       /*Make update rate smaller when there's no speech*/
688 #if 0
689       if (mean_post<3.5 && mean_prior < 1)
690          min_gamma *= (mean_post+.5);
691       else
692          min_gamma *= 4.;
693 #else
694       min_gamma = .1f*fabs(mean_prior - mean_post)*fabs(mean_prior - mean_post);
695       if (min_gamma>.15f)
696          min_gamma = .15f;
697       if (min_gamma<.02f)
698          min_gamma = .02f;
699 #endif
700       /*min_gamma = .08;*/
701 
702       /*if (gamma<min_gamma)*/
703          gamma=min_gamma;
704       gamma = .1;
705       for (i=1;i<N;i++)
706       {
707 
708          /* A priori SNR update */
709          st->prior[i] = gamma*max(0.0f,st->post[i]) +
710          (1.f-gamma)*st->gain[i]*st->gain[i]*st->old_ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]);
711 
712          if (st->prior[i]>100.f)
713             st->prior[i]=100.f;
714 
715          mean_prior+=st->prior[i];
716       }
717    }
718    mean_prior /= N;
719 
720 #if 0
721    for (i=0;i<N;i++)
722    {
723       fprintf (stderr, "%f ", st->prior[i]);
724    }
725    fprintf (stderr, "\n");
726 #endif
727    /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/
728 
729    if (st->nb_preprocess>=20)
730    {
731       int do_update = 0;
732       float noise_ener=0, sig_ener=0;
733       /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
734       /*if (mean_prior<.23 && mean_post < .5)*/
735       if (mean_prior<.23f && mean_post < .5f)
736          do_update = 1;
737       for (i=1;i<N;i++)
738       {
739          noise_ener += st->noise[i];
740          sig_ener += ps[i];
741       }
742       if (noise_ener > 3.f*sig_ener)
743          do_update = 1;
744       /*do_update = 0;*/
745       if (do_update)
746       {
747          st->consec_noise++;
748       } else {
749          st->consec_noise=0;
750       }
751    }
752 
753    if (st->vad_enabled)
754       is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);
755 
756 
757    if (st->consec_noise>=3)
758    {
759       update_noise(st, st->old_ps, echo);
760    } else {
761       for (i=1;i<N-1;i++)
762       {
763          if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
764          {
765             if (echo)
766                st->noise[i] = .90f*st->noise[i] + .1f*max(1.0f,st->ps[i]-echo[i]);
767             else
768                st->noise[i] = .90f*st->noise[i] + .1f*st->ps[i];
769          }
770       }
771    }
772 
773    for (i=1;i<N;i++)
774    {
775       st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
776    }
777 
778    {
779       int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
780       int freq_end   = (int)(2000.0f*2.f*N/st->sampling_rate);
781       for (i=freq_start;i<freq_end;i++)
782       {
783          Zframe += st->zeta[i];
784       }
785    }
786 
787    Zframe /= N;
788    if (Zframe<ZMIN)
789    {
790       Pframe = 0;
791    } else {
792       if (Zframe > 1.5f*st->Zlast)
793       {
794          Pframe = 1.f;
795          st->Zpeak = Zframe;
796          if (st->Zpeak > 10.f)
797             st->Zpeak = 10.f;
798          if (st->Zpeak < 1.f)
799             st->Zpeak = 1.f;
800       } else {
801          if (Zframe < st->Zpeak*ZMIN)
802          {
803             Pframe = 0;
804          } else if (Zframe > st->Zpeak*ZMAX)
805          {
806             Pframe = 1;
807          } else {
808             Pframe = log(Zframe/(st->Zpeak*ZMIN)) / log(ZMAX/ZMIN);
809          }
810       }
811    }
812    st->Zlast = Zframe;
813 
814    /*fprintf (stderr, "%f\n", Pframe);*/
815    /* Compute gain according to the Ephraim-Malah algorithm */
816    for (i=1;i<N;i++)
817    {
818       float MM;
819       float theta;
820       float prior_ratio;
821       float p, q;
822       float zeta1;
823       float P1;
824 
825       prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
826       theta = (1.f+st->post[i])*prior_ratio;
827 
828       if (i==1 || i==N-1)
829          zeta1 = st->zeta[i];
830       else
831          zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
832       if (zeta1<ZMIN)
833          P1 = 0.f;
834       else if (zeta1>ZMAX)
835          P1 = 1.f;
836       else
837          P1 = LOG_MIN_MAX_1 * log(ZMIN_1*zeta1);
838 
839       /*P1 = log(zeta1/ZMIN)/log(ZMAX/ZMIN);*/
840 
841       /* FIXME: add global prob (P2) */
842       q = 1-Pframe*P1;
843       q = 1-P1;
844       if (q>.95f)
845          q=.95f;
846       p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
847       /*p=1;*/
848 
849 #if 0
850       /* log-spectral magnitude estimator */
851       if (theta<6)
852          MM = 0.74082*pow(theta+1,.61)/sqrt(.0001+theta);
853       else
854          MM=1;
855 #else
856       /* Optimal estimator for loudness domain */
857       MM = hypergeom_gain(theta);
858 #endif
859 
860       st->gain[i] = prior_ratio * MM;
861       /*Put some (very arbitraty) limit on the gain*/
862       if (st->gain[i]>2.f)
863       {
864          st->gain[i]=2.f;
865       }
866 
867       st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
868       if (st->denoise_enabled)
869       {
870          st->gain2[i]=p*p*st->gain[i];
871       } else {
872          st->gain2[i]=1.f;
873       }
874    }
875    st->gain2[0]=st->gain[0]=0.f;
876    st->gain2[N-1]=st->gain[N-1]=0.f;
877 
878    if (st->agc_enabled)
879       speex_compute_agc(st, mean_prior);
880 
881 #if 0
882    if (!is_speech)
883    {
884       for (i=0;i<N;i++)
885          st->gain2[i] = 0;
886    }
887 #if 0
888  else {
889       for (i=0;i<N;i++)
890          st->gain2[i] = 1;
891    }
892 #endif
893 #endif
894 
895    /* Apply computed gain */
896    for (i=1;i<N;i++)
897    {
898       st->frame[2*i-1] *= st->gain2[i];
899       st->frame[2*i] *= st->gain2[i];
900    }
901 
902    /* Get rid of the DC and very low frequencies */
903    st->frame[0]=0;
904    st->frame[1]=0;
905    st->frame[2]=0;
906    /* Nyquist frequency is mostly useless too */
907    st->frame[2*N-1]=0;
908 
909    /* Inverse FFT with 1/N scaling */
910    spx_drft_backward(st->fft_lookup, st->frame);
911 
912    for (i=0;i<2*N;i++)
913       st->frame[i] *= scale;
914 
915    {
916       float max_sample=0;
917       for (i=0;i<2*N;i++)
918          if (fabs(st->frame[i])>max_sample)
919             max_sample = fabs(st->frame[i]);
920       if (max_sample>28000.f)
921       {
922          float damp = 28000.f/max_sample;
923          for (i=0;i<2*N;i++)
924             st->frame[i] *= damp;
925       }
926    }
927 
928    for (i=0;i<2*N;i++)
929       st->frame[i] *= st->window[i];
930 
931    /* Perform overlap and add */
932    for (i=0;i<N3;i++)
933       x[i] = st->outbuf[i] + st->frame[i];
934    for (i=0;i<N4;i++)
935       x[N3+i] = st->frame[N3+i];
936 
937    /* Update outbuf */
938    for (i=0;i<N3;i++)
939       st->outbuf[i] = st->frame[st->frame_size+i];
940 
941    /* Save old power spectrum */
942    for (i=1;i<N;i++)
943       st->old_ps[i] = ps[i];
944 
945    return is_speech;
946 }
947 
speex_preprocess_estimate_update(SpeexPreprocessState * st,spx_int16_t * x,float * echo)948 void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x, float *echo)
949 {
950    int i;
951    int N = st->ps_size;
952    int N3 = 2*N - st->frame_size;
953 
954    float *ps=st->ps;
955 
956    preprocess_analysis(st, x);
957 
958    update_noise_prob(st);
959 
960    st->nb_preprocess++;
961 
962    for (i=1;i<N-1;i++)
963    {
964       if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
965       {
966          if (echo)
967             st->noise[i] = .90f*st->noise[i] + .1f*max(1.0f,st->ps[i]-echo[i]);
968          else
969             st->noise[i] = .90f*st->noise[i] + .1f*st->ps[i];
970       }
971    }
972 
973    for (i=0;i<N3;i++)
974       st->outbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i];
975 
976    /* Save old power spectrum */
977    for (i=1;i<N;i++)
978       st->old_ps[i] = ps[i];
979 
980    for (i=1;i<N;i++)
981       st->reverb_estimate[i] *= st->reverb_decay;
982 }
983 
984 
speex_preprocess_ctl(SpeexPreprocessState * state,int request,void * ptr)985 int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
986 {
987    int i;
988    SpeexPreprocessState *st;
989    st=(SpeexPreprocessState*)state;
990    switch(request)
991    {
992    case SPEEX_PREPROCESS_SET_DENOISE:
993       st->denoise_enabled = (*(int*)ptr);
994       break;
995    case SPEEX_PREPROCESS_GET_DENOISE:
996       (*(int*)ptr) = st->denoise_enabled;
997       break;
998 
999    case SPEEX_PREPROCESS_SET_AGC:
1000       st->agc_enabled = (*(int*)ptr);
1001       break;
1002    case SPEEX_PREPROCESS_GET_AGC:
1003       (*(int*)ptr) = st->agc_enabled;
1004       break;
1005 
1006    case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1007       st->agc_level = (*(float*)ptr);
1008       if (st->agc_level<1)
1009          st->agc_level=1;
1010       if (st->agc_level>32768)
1011          st->agc_level=32768;
1012       break;
1013    case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1014       (*(float*)ptr) = st->agc_level;
1015       break;
1016 
1017    case SPEEX_PREPROCESS_SET_VAD:
1018       st->vad_enabled = (*(int*)ptr);
1019       break;
1020    case SPEEX_PREPROCESS_GET_VAD:
1021       (*(int*)ptr) = st->vad_enabled;
1022       break;
1023 
1024    case SPEEX_PREPROCESS_SET_DEREVERB:
1025       st->dereverb_enabled = (*(int*)ptr);
1026       for (i=0;i<st->ps_size;i++)
1027          st->reverb_estimate[i]=0;
1028       break;
1029    case SPEEX_PREPROCESS_GET_DEREVERB:
1030       (*(int*)ptr) = st->dereverb_enabled;
1031       break;
1032 
1033    case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1034       st->reverb_level = (*(float*)ptr);
1035       break;
1036    case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1037       (*(float*)ptr) = st->reverb_level;
1038       break;
1039 
1040    case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1041       st->reverb_decay = (*(float*)ptr);
1042       break;
1043    case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1044       (*(float*)ptr) = st->reverb_decay;
1045       break;
1046 
1047    case SPEEX_PREPROCESS_SET_PROB_START:
1048       st->speech_prob_start = (*(float*)ptr);
1049       if ( st->speech_prob_start > 1 )
1050 	  st->speech_prob_start = st->speech_prob_start / 100;
1051       if ( st->speech_prob_start > 1 || st->speech_prob_start < 0 )
1052 	  st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
1053       break;
1054    case SPEEX_PREPROCESS_GET_PROB_START:
1055       (*(float*)ptr) = st->speech_prob_start ;
1056       break;
1057 
1058    case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1059       st->speech_prob_continue = (*(float*)ptr);
1060       if ( st->speech_prob_continue > 1 )
1061 	  st->speech_prob_continue = st->speech_prob_continue / 100;
1062       if ( st->speech_prob_continue > 1 || st->speech_prob_continue < 0 )
1063 	  st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
1064       break;
1065    case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1066       (*(float*)ptr) = st->speech_prob_continue;
1067       break;
1068 
1069    default:
1070       speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
1071       return -1;
1072    }
1073    return 0;
1074 }
1075