1 /* Copyright (C) 2003 Epic Games
2 Written by Jean-Marc Valin
3
4 File: preprocess.c
5 Preprocessor with denoising based on the algorithm by Ephraim and Malah
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
9 met:
10
11 1. Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 3. The name of the author may not be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #ifdef HAVE_CONFIG_H
35 #include "config.h"
36 #endif
37
38 #include <math.h>
39 #include "speex/speex_preprocess.h"
40 #include "misc.h"
41 #include "smallft.h"
42
43 #define max(a,b) ((a) > (b) ? (a) : (b))
44 #define min(a,b) ((a) < (b) ? (a) : (b))
45
46 #ifndef M_PI
47 #define M_PI 3.14159263
48 #endif
49
50 #define SQRT_M_PI_2 0.88623
51 #define LOUDNESS_EXP 2.5
52
53 #define SPEEX_PROB_START_DEFAULT 0.35f
54 #define SPEEX_PROB_CONTINUE_DEFAULT 0.20f
55
56 #define NB_BANDS 8
57
58 #define ZMIN .1
59 #define ZMAX .316
60 #define ZMIN_1 10
61 #define LOG_MIN_MAX_1 0.86859
62
conj_window(float * w,int len)63 static void conj_window(float *w, int len)
64 {
65 int i;
66 for (i=0;i<len;i++)
67 {
68 float x=4*((float)i)/len;
69 int inv=0;
70 if (x<1)
71 {
72 } else if (x<2)
73 {
74 x=2-x;
75 inv=1;
76 } else if (x<3)
77 {
78 x=x-2;
79 inv=1;
80 } else {
81 x=4-x;
82 }
83 x*=1.9979;
84 w[i]=(.5-.5*cos(x))*(.5-.5*cos(x));
85 if (inv)
86 w[i]=1-w[i];
87 w[i]=sqrt(w[i]);
88 }
89 }
90
91 /* This function approximates the gain function
92 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
93 which multiplied by xi/(1+xi) is the optimal gain
94 in the loudness domain ( sqrt[amplitude] )
95 */
hypergeom_gain(float x)96 static float hypergeom_gain(float x)
97 {
98 int ind;
99 float integer, frac;
100 static const float table[21] = {
101 0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
102 1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
103 2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
104
105 if (x>9.5)
106 return 1+.12/x;
107
108 integer = floor(2*x);
109 frac = 2*x-integer;
110 ind = (int)integer;
111 /*if (ind > 20 || ind < 0)
112 fprintf (stderr, "error: %d %f\n", ind, x);*/
113 return ((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
114 }
115
speex_preprocess_state_init(int frame_size,int sampling_rate)116 SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
117 {
118 int i;
119 int N, N3, N4;
120
121 SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
122 st->frame_size = frame_size;
123
124 /* Round ps_size down to the nearest power of two */
125 #if 0
126 i=1;
127 st->ps_size = st->frame_size;
128 while(1)
129 {
130 if (st->ps_size & ~i)
131 {
132 st->ps_size &= ~i;
133 i<<=1;
134 } else {
135 break;
136 }
137 }
138
139
140 if (st->ps_size < 3*st->frame_size/4)
141 st->ps_size = st->ps_size * 3 / 2;
142 #else
143 st->ps_size = st->frame_size;
144 #endif
145
146 N = st->ps_size;
147 N3 = 2*N - st->frame_size;
148 N4 = st->frame_size - N3;
149
150 st->sampling_rate = sampling_rate;
151 st->denoise_enabled = 1;
152 st->agc_enabled = 0;
153 st->agc_level = 8000;
154 st->vad_enabled = 0;
155 st->dereverb_enabled = 0;
156 st->reverb_decay = .5;
157 st->reverb_level = .2;
158
159 st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
160 st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
161
162 st->frame = (float*)speex_alloc(2*N*sizeof(float));
163 st->ps = (float*)speex_alloc(N*sizeof(float));
164 st->gain2 = (float*)speex_alloc(N*sizeof(float));
165 st->window = (float*)speex_alloc(2*N*sizeof(float));
166 st->noise = (float*)speex_alloc(N*sizeof(float));
167 st->reverb_estimate = (float*)speex_alloc(N*sizeof(float));
168 st->old_ps = (float*)speex_alloc(N*sizeof(float));
169 st->gain = (float*)speex_alloc(N*sizeof(float));
170 st->prior = (float*)speex_alloc(N*sizeof(float));
171 st->post = (float*)speex_alloc(N*sizeof(float));
172 st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
173 st->inbuf = (float*)speex_alloc(N3*sizeof(float));
174 st->outbuf = (float*)speex_alloc(N3*sizeof(float));
175 st->echo_noise = (float*)speex_alloc(N*sizeof(float));
176
177 st->S = (float*)speex_alloc(N*sizeof(float));
178 st->Smin = (float*)speex_alloc(N*sizeof(float));
179 st->Stmp = (float*)speex_alloc(N*sizeof(float));
180 st->update_prob = (float*)speex_alloc(N*sizeof(float));
181
182 st->zeta = (float*)speex_alloc(N*sizeof(float));
183 st->Zpeak = 0;
184 st->Zlast = 0;
185
186 st->noise_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
187 st->noise_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
188 st->speech_bands = (float*)speex_alloc(NB_BANDS*sizeof(float));
189 st->speech_bands2 = (float*)speex_alloc(NB_BANDS*sizeof(float));
190 st->noise_bandsN = st->speech_bandsN = 1;
191
192 conj_window(st->window, 2*N3);
193 for (i=2*N3;i<2*st->ps_size;i++)
194 st->window[i]=1;
195
196 if (N4>0)
197 {
198 for (i=N3-1;i>=0;i--)
199 {
200 st->window[i+N3+N4]=st->window[i+N3];
201 st->window[i+N3]=1;
202 }
203 }
204 for (i=0;i<N;i++)
205 {
206 st->noise[i]=1e4;
207 st->reverb_estimate[i]=0.;
208 st->old_ps[i]=1e4;
209 st->gain[i]=1;
210 st->post[i]=1;
211 st->prior[i]=1;
212 }
213
214 for (i=0;i<N3;i++)
215 {
216 st->inbuf[i]=0;
217 st->outbuf[i]=0;
218 }
219
220 for (i=0;i<N;i++)
221 {
222 float ff=((float)i)*.5*sampling_rate/((float)N);
223 st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
224 if (st->loudness_weight[i]<.01f)
225 st->loudness_weight[i]=.01f;
226 st->loudness_weight[i] *= st->loudness_weight[i];
227 }
228
229 st->speech_prob = 0;
230 st->last_speech = 1000;
231 st->loudness = pow(6000,LOUDNESS_EXP);
232 st->loudness2 = 6000;
233 st->nb_loudness_adapt = 0;
234
235 st->fft_lookup = (struct drft_lookup*)speex_alloc(sizeof(struct drft_lookup));
236 spx_drft_init(st->fft_lookup,2*N);
237
238 st->nb_adapt=0;
239 st->consec_noise=0;
240 st->nb_preprocess=0;
241 return st;
242 }
243
speex_preprocess_state_destroy(SpeexPreprocessState * st)244 void speex_preprocess_state_destroy(SpeexPreprocessState *st)
245 {
246 speex_free(st->frame);
247 speex_free(st->ps);
248 speex_free(st->gain2);
249 speex_free(st->window);
250 speex_free(st->noise);
251 speex_free(st->reverb_estimate);
252 speex_free(st->old_ps);
253 speex_free(st->gain);
254 speex_free(st->prior);
255 speex_free(st->post);
256 speex_free(st->loudness_weight);
257 speex_free(st->echo_noise);
258
259 speex_free(st->S);
260 speex_free(st->Smin);
261 speex_free(st->Stmp);
262 speex_free(st->update_prob);
263 speex_free(st->zeta);
264
265 speex_free(st->noise_bands);
266 speex_free(st->noise_bands2);
267 speex_free(st->speech_bands);
268 speex_free(st->speech_bands2);
269
270 speex_free(st->inbuf);
271 speex_free(st->outbuf);
272
273 spx_drft_clear(st->fft_lookup);
274 speex_free(st->fft_lookup);
275
276 speex_free(st);
277 }
278
update_noise(SpeexPreprocessState * st,float * ps,float * echo)279 static void update_noise(SpeexPreprocessState *st, float *ps, float *echo)
280 {
281 int i;
282 float beta;
283 st->nb_adapt++;
284 beta=1.0f/st->nb_adapt;
285 if (beta < .05f)
286 beta=.05f;
287
288 if (!echo)
289 {
290 for (i=0;i<st->ps_size;i++)
291 st->noise[i] = (1.f-beta)*st->noise[i] + beta*ps[i];
292 } else {
293 for (i=0;i<st->ps_size;i++)
294 st->noise[i] = (1.f-beta)*st->noise[i] + beta*max(1.f,ps[i]-echo[i]);
295 #if 0
296 for (i=0;i<st->ps_size;i++)
297 st->noise[i] = 0;
298 #endif
299 }
300 }
301
speex_compute_vad(SpeexPreprocessState * st,float * ps,float mean_prior,float mean_post)302 static int speex_compute_vad(SpeexPreprocessState *st, float *ps, float mean_prior, float mean_post)
303 {
304 int i, is_speech=0;
305 int N = st->ps_size;
306 float scale=.5f/N;
307
308 /* FIXME: Clean this up a bit */
309 {
310 float bands[NB_BANDS];
311 int j;
312 float p0, p1;
313 float tot_loudness=0;
314 float x = sqrt(mean_post);
315
316 for (i=5;i<N-10;i++)
317 {
318 tot_loudness += scale*st->ps[i] * st->loudness_weight[i];
319 }
320
321 for (i=0;i<NB_BANDS;i++)
322 {
323 bands[i]=1e4f;
324 for (j=i*N/NB_BANDS;j<(i+1)*N/NB_BANDS;j++)
325 {
326 bands[i] += ps[j];
327 }
328 bands[i]=log(bands[i]);
329 }
330
331 /*p1 = .0005+.6*exp(-.5*(x-.4)*(x-.4)*11)+.1*exp(-1.2*x);
332 if (x<1.5)
333 p0=.1*exp(2*(x-1.5));
334 else
335 p0=.02+.1*exp(-.2*(x-1.5));
336 */
337
338 p0=1.f/(1.f+exp(3.f*(1.5f-x)));
339 p1=1.f-p0;
340
341 /*fprintf (stderr, "%f %f ", p0, p1);*/
342 /*p0 *= .99*st->speech_prob + .01*(1-st->speech_prob);
343 p1 *= .01*st->speech_prob + .99*(1-st->speech_prob);
344
345 st->speech_prob = p0/(p1+p0);
346 */
347
348 if (st->noise_bandsN < 50 || st->speech_bandsN < 50)
349 {
350 if (mean_post > 5.f)
351 {
352 float adapt = 1./st->speech_bandsN++;
353 if (adapt<.005f)
354 adapt = .005f;
355 for (i=0;i<NB_BANDS;i++)
356 {
357 st->speech_bands[i] = (1.f-adapt)*st->speech_bands[i] + adapt*bands[i];
358 /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
359 st->speech_bands2[i] = (1.f-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
360 }
361 } else {
362 float adapt = 1./st->noise_bandsN++;
363 if (adapt<.005f)
364 adapt = .005f;
365 for (i=0;i<NB_BANDS;i++)
366 {
367 st->noise_bands[i] = (1.f-adapt)*st->noise_bands[i] + adapt*bands[i];
368 /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
369 st->noise_bands2[i] = (1.f-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
370 }
371 }
372 }
373 p0=p1=1;
374 for (i=0;i<NB_BANDS;i++)
375 {
376 float noise_var, speech_var;
377 float noise_mean, speech_mean;
378 float tmp1, tmp2, pr;
379
380 /*noise_var = 1.01*st->noise_bands2[i] - st->noise_bands[i]*st->noise_bands[i];
381 speech_var = 1.01*st->speech_bands2[i] - st->speech_bands[i]*st->speech_bands[i];*/
382 noise_var = st->noise_bands2[i];
383 speech_var = st->speech_bands2[i];
384 if (noise_var < .1f)
385 noise_var = .1f;
386 if (speech_var < .1f)
387 speech_var = .1f;
388
389 /*speech_var = sqrt(speech_var*noise_var);
390 noise_var = speech_var;*/
391 if (speech_var < .05f*speech_var)
392 noise_var = .05f*speech_var;
393 if (speech_var < .05f*noise_var)
394 speech_var = .05f*noise_var;
395
396 if (bands[i] < st->noise_bands[i])
397 speech_var = noise_var;
398 if (bands[i] > st->speech_bands[i])
399 noise_var = speech_var;
400
401 speech_mean = st->speech_bands[i];
402 noise_mean = st->noise_bands[i];
403 if (noise_mean < speech_mean - 5.f)
404 noise_mean = speech_mean - 5.f;
405
406 tmp1 = exp(-.5f*(bands[i]-speech_mean)*(bands[i]-speech_mean)/speech_var)/sqrt(2.f*M_PI*speech_var);
407 tmp2 = exp(-.5f*(bands[i]-noise_mean)*(bands[i]-noise_mean)/noise_var)/sqrt(2.f*M_PI*noise_var);
408 /*fprintf (stderr, "%f ", (float)(p0/(.01+p0+p1)));*/
409 /*fprintf (stderr, "%f ", (float)(bands[i]));*/
410 pr = tmp1/(1e-25+tmp1+tmp2);
411 /*if (bands[i] < st->noise_bands[i])
412 pr=.01;
413 if (bands[i] > st->speech_bands[i] && pr < .995)
414 pr=.995;*/
415 if (pr>.999f)
416 pr=.999f;
417 if (pr<.001f)
418 pr=.001f;
419 /*fprintf (stderr, "%f ", pr);*/
420 p0 *= pr;
421 p1 *= (1-pr);
422 }
423
424 p0 = pow(p0,.2);
425 p1 = pow(p1,.2);
426
427 #if 1
428 p0 *= 2.f;
429 p0=p0/(p1+p0);
430 if (st->last_speech>20)
431 {
432 float tmp = sqrt(tot_loudness)/st->loudness2;
433 tmp = 1.f-exp(-10.f*tmp);
434 if (p0>tmp)
435 p0=tmp;
436 }
437 p1=1-p0;
438 #else
439 if (sqrt(tot_loudness) < .6f*st->loudness2 && p0>15.f*p1)
440 p0=15.f*p1;
441 if (sqrt(tot_loudness) < .45f*st->loudness2 && p0>7.f*p1)
442 p0=7.f*p1;
443 if (sqrt(tot_loudness) < .3f*st->loudness2 && p0>3.f*p1)
444 p0=3.f*p1;
445 if (sqrt(tot_loudness) < .15f*st->loudness2 && p0>p1)
446 p0=p1;
447 /*fprintf (stderr, "%f %f ", (float)(sqrt(tot_loudness) /( .25*st->loudness2)), p0/(p1+p0));*/
448 #endif
449
450 p0 *= .99f*st->speech_prob + .01f*(1-st->speech_prob);
451 p1 *= .01f*st->speech_prob + .99f*(1-st->speech_prob);
452
453 st->speech_prob = p0/(1e-25f+p1+p0);
454 /*fprintf (stderr, "%f %f %f ", tot_loudness, st->loudness2, st->speech_prob);*/
455
456 if (st->speech_prob > st->speech_prob_start
457 || (st->last_speech < 20 && st->speech_prob > st->speech_prob_continue))
458 {
459 is_speech = 1;
460 st->last_speech = 0;
461 } else {
462 st->last_speech++;
463 if (st->last_speech<20)
464 is_speech = 1;
465 }
466
467 if (st->noise_bandsN > 50 && st->speech_bandsN > 50)
468 {
469 if (mean_post > 5)
470 {
471 float adapt = 1./st->speech_bandsN++;
472 if (adapt<.005f)
473 adapt = .005f;
474 for (i=0;i<NB_BANDS;i++)
475 {
476 st->speech_bands[i] = (1-adapt)*st->speech_bands[i] + adapt*bands[i];
477 /*st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*bands[i]*bands[i];*/
478 st->speech_bands2[i] = (1-adapt)*st->speech_bands2[i] + adapt*(bands[i]-st->speech_bands[i])*(bands[i]-st->speech_bands[i]);
479 }
480 } else {
481 float adapt = 1./st->noise_bandsN++;
482 if (adapt<.005f)
483 adapt = .005f;
484 for (i=0;i<NB_BANDS;i++)
485 {
486 st->noise_bands[i] = (1-adapt)*st->noise_bands[i] + adapt*bands[i];
487 /*st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*bands[i]*bands[i];*/
488 st->noise_bands2[i] = (1-adapt)*st->noise_bands2[i] + adapt*(bands[i]-st->noise_bands[i])*(bands[i]-st->noise_bands[i]);
489 }
490 }
491 }
492
493
494 }
495
496 return is_speech;
497 }
498
speex_compute_agc(SpeexPreprocessState * st,float mean_prior)499 static void speex_compute_agc(SpeexPreprocessState *st, float mean_prior)
500 {
501 int i;
502 int N = st->ps_size;
503 float scale=.5f/N;
504 float agc_gain;
505 int freq_start, freq_end;
506 float active_bands = 0;
507
508 freq_start = (int)(300.0f*2*N/st->sampling_rate);
509 freq_end = (int)(2000.0f*2*N/st->sampling_rate);
510 for (i=freq_start;i<freq_end;i++)
511 {
512 if (st->S[i] > 20.f*st->Smin[i]+1000.f)
513 active_bands+=1;
514 }
515 active_bands /= (freq_end-freq_start+1);
516
517 if (active_bands > .2f)
518 {
519 float loudness=0.f;
520 float rate, rate2=.2f;
521 st->nb_loudness_adapt++;
522 rate=2.0f/(1+st->nb_loudness_adapt);
523 if (rate < .05f)
524 rate = .05f;
525 if (rate < .1f && pow(loudness, LOUDNESS_EXP) > st->loudness)
526 rate = .1f;
527 if (rate < .2f && pow(loudness, LOUDNESS_EXP) > 3.f*st->loudness)
528 rate = .2f;
529 if (rate < .4f && pow(loudness, LOUDNESS_EXP) > 10.f*st->loudness)
530 rate = .4f;
531
532 for (i=2;i<N;i++)
533 {
534 loudness += scale*st->ps[i] * st->gain2[i] * st->gain2[i] * st->loudness_weight[i];
535 }
536 loudness=sqrt(loudness);
537 /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
538 loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
539 st->loudness = (1-rate)*st->loudness + (rate)*pow(loudness, LOUDNESS_EXP);
540
541 st->loudness2 = (1-rate2)*st->loudness2 + rate2*pow(st->loudness, 1.0f/LOUDNESS_EXP);
542
543 loudness = pow(st->loudness, 1.0f/LOUDNESS_EXP);
544
545 /*fprintf (stderr, "%f %f %f\n", loudness, st->loudness2, rate);*/
546 }
547
548 agc_gain = st->agc_level/st->loudness2;
549 /*fprintf (stderr, "%f %f %f %f\n", active_bands, st->loudness, st->loudness2, agc_gain);*/
550 if (agc_gain>200)
551 agc_gain = 200;
552
553 for (i=0;i<N;i++)
554 st->gain2[i] *= agc_gain;
555
556 }
557
preprocess_analysis(SpeexPreprocessState * st,spx_int16_t * x)558 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
559 {
560 int i;
561 int N = st->ps_size;
562 int N3 = 2*N - st->frame_size;
563 int N4 = st->frame_size - N3;
564 float *ps=st->ps;
565
566 /* 'Build' input frame */
567 for (i=0;i<N3;i++)
568 st->frame[i]=st->inbuf[i];
569 for (i=0;i<st->frame_size;i++)
570 st->frame[N3+i]=x[i];
571
572 /* Update inbuf */
573 for (i=0;i<N3;i++)
574 st->inbuf[i]=x[N4+i];
575
576 /* Windowing */
577 for (i=0;i<2*N;i++)
578 st->frame[i] *= st->window[i];
579
580 /* Perform FFT */
581 spx_drft_forward(st->fft_lookup, st->frame);
582
583 /* Power spectrum */
584 ps[0]=1;
585 for (i=1;i<N;i++)
586 ps[i]=1+st->frame[2*i-1]*st->frame[2*i-1] + st->frame[2*i]*st->frame[2*i];
587
588 }
589
update_noise_prob(SpeexPreprocessState * st)590 static void update_noise_prob(SpeexPreprocessState *st)
591 {
592 int i;
593 int N = st->ps_size;
594
595 for (i=1;i<N-1;i++)
596 st->S[i] = 100.f+ .8f*st->S[i] + .05f*st->ps[i-1]+.1f*st->ps[i]+.05f*st->ps[i+1];
597
598 if (st->nb_preprocess<1)
599 {
600 for (i=1;i<N-1;i++)
601 st->Smin[i] = st->Stmp[i] = st->S[i]+100.f;
602 }
603
604 if (st->nb_preprocess%200==0)
605 {
606 for (i=1;i<N-1;i++)
607 {
608 st->Smin[i] = min(st->Stmp[i], st->S[i]);
609 st->Stmp[i] = st->S[i];
610 }
611 } else {
612 for (i=1;i<N-1;i++)
613 {
614 st->Smin[i] = min(st->Smin[i], st->S[i]);
615 st->Stmp[i] = min(st->Stmp[i], st->S[i]);
616 }
617 }
618 for (i=1;i<N-1;i++)
619 {
620 st->update_prob[i] *= .2f;
621 if (st->S[i] > 2.5*st->Smin[i])
622 st->update_prob[i] += .8f;
623 /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
624 /*fprintf (stderr, "%f ", st->update_prob[i]);*/
625 }
626
627 }
628
629 #define NOISE_OVERCOMPENS 1.4
630
speex_preprocess(SpeexPreprocessState * st,spx_int16_t * x,float * echo)631 int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, float *echo)
632 {
633 int i;
634 int is_speech=1;
635 float mean_post=0;
636 float mean_prior=0;
637 int N = st->ps_size;
638 int N3 = 2*N - st->frame_size;
639 int N4 = st->frame_size - N3;
640 float scale=.5f/N;
641 float *ps=st->ps;
642 float Zframe=0, Pframe;
643
644 preprocess_analysis(st, x);
645
646 update_noise_prob(st);
647
648 st->nb_preprocess++;
649
650 /* Noise estimation always updated for the 20 first times */
651 if (st->nb_adapt<10)
652 {
653 update_noise(st, ps, echo);
654 }
655
656 /* Deal with residual echo if provided */
657 if (echo)
658 for (i=1;i<N;i++)
659 st->echo_noise[i] = (.3f*st->echo_noise[i] + echo[i]);
660
661 /* Compute a posteriori SNR */
662 for (i=1;i<N;i++)
663 {
664 st->post[i] = ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]) - 1.f;
665 if (st->post[i]>100.f)
666 st->post[i]=100.f;
667 /*if (st->post[i]<0)
668 st->post[i]=0;*/
669 mean_post+=st->post[i];
670 }
671 mean_post /= N;
672 if (mean_post<0.f)
673 mean_post=0.f;
674
675 /* Special case for first frame */
676 if (st->nb_adapt==1)
677 for (i=1;i<N;i++)
678 st->old_ps[i] = ps[i];
679
680 /* Compute a priori SNR */
681 {
682 /* A priori update rate */
683 float gamma;
684 float min_gamma=0.12f;
685 gamma = 1.0f/st->nb_preprocess;
686
687 /*Make update rate smaller when there's no speech*/
688 #if 0
689 if (mean_post<3.5 && mean_prior < 1)
690 min_gamma *= (mean_post+.5);
691 else
692 min_gamma *= 4.;
693 #else
694 min_gamma = .1f*fabs(mean_prior - mean_post)*fabs(mean_prior - mean_post);
695 if (min_gamma>.15f)
696 min_gamma = .15f;
697 if (min_gamma<.02f)
698 min_gamma = .02f;
699 #endif
700 /*min_gamma = .08;*/
701
702 /*if (gamma<min_gamma)*/
703 gamma=min_gamma;
704 gamma = .1;
705 for (i=1;i<N;i++)
706 {
707
708 /* A priori SNR update */
709 st->prior[i] = gamma*max(0.0f,st->post[i]) +
710 (1.f-gamma)*st->gain[i]*st->gain[i]*st->old_ps[i]/(1.f+NOISE_OVERCOMPENS*st->noise[i]+st->echo_noise[i]+st->reverb_estimate[i]);
711
712 if (st->prior[i]>100.f)
713 st->prior[i]=100.f;
714
715 mean_prior+=st->prior[i];
716 }
717 }
718 mean_prior /= N;
719
720 #if 0
721 for (i=0;i<N;i++)
722 {
723 fprintf (stderr, "%f ", st->prior[i]);
724 }
725 fprintf (stderr, "\n");
726 #endif
727 /*fprintf (stderr, "%f %f\n", mean_prior,mean_post);*/
728
729 if (st->nb_preprocess>=20)
730 {
731 int do_update = 0;
732 float noise_ener=0, sig_ener=0;
733 /* If SNR is low (both a priori and a posteriori), update the noise estimate*/
734 /*if (mean_prior<.23 && mean_post < .5)*/
735 if (mean_prior<.23f && mean_post < .5f)
736 do_update = 1;
737 for (i=1;i<N;i++)
738 {
739 noise_ener += st->noise[i];
740 sig_ener += ps[i];
741 }
742 if (noise_ener > 3.f*sig_ener)
743 do_update = 1;
744 /*do_update = 0;*/
745 if (do_update)
746 {
747 st->consec_noise++;
748 } else {
749 st->consec_noise=0;
750 }
751 }
752
753 if (st->vad_enabled)
754 is_speech = speex_compute_vad(st, ps, mean_prior, mean_post);
755
756
757 if (st->consec_noise>=3)
758 {
759 update_noise(st, st->old_ps, echo);
760 } else {
761 for (i=1;i<N-1;i++)
762 {
763 if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
764 {
765 if (echo)
766 st->noise[i] = .90f*st->noise[i] + .1f*max(1.0f,st->ps[i]-echo[i]);
767 else
768 st->noise[i] = .90f*st->noise[i] + .1f*st->ps[i];
769 }
770 }
771 }
772
773 for (i=1;i<N;i++)
774 {
775 st->zeta[i] = .7f*st->zeta[i] + .3f*st->prior[i];
776 }
777
778 {
779 int freq_start = (int)(300.0f*2.f*N/st->sampling_rate);
780 int freq_end = (int)(2000.0f*2.f*N/st->sampling_rate);
781 for (i=freq_start;i<freq_end;i++)
782 {
783 Zframe += st->zeta[i];
784 }
785 }
786
787 Zframe /= N;
788 if (Zframe<ZMIN)
789 {
790 Pframe = 0;
791 } else {
792 if (Zframe > 1.5f*st->Zlast)
793 {
794 Pframe = 1.f;
795 st->Zpeak = Zframe;
796 if (st->Zpeak > 10.f)
797 st->Zpeak = 10.f;
798 if (st->Zpeak < 1.f)
799 st->Zpeak = 1.f;
800 } else {
801 if (Zframe < st->Zpeak*ZMIN)
802 {
803 Pframe = 0;
804 } else if (Zframe > st->Zpeak*ZMAX)
805 {
806 Pframe = 1;
807 } else {
808 Pframe = log(Zframe/(st->Zpeak*ZMIN)) / log(ZMAX/ZMIN);
809 }
810 }
811 }
812 st->Zlast = Zframe;
813
814 /*fprintf (stderr, "%f\n", Pframe);*/
815 /* Compute gain according to the Ephraim-Malah algorithm */
816 for (i=1;i<N;i++)
817 {
818 float MM;
819 float theta;
820 float prior_ratio;
821 float p, q;
822 float zeta1;
823 float P1;
824
825 prior_ratio = st->prior[i]/(1.0001f+st->prior[i]);
826 theta = (1.f+st->post[i])*prior_ratio;
827
828 if (i==1 || i==N-1)
829 zeta1 = st->zeta[i];
830 else
831 zeta1 = .25f*st->zeta[i-1] + .5f*st->zeta[i] + .25f*st->zeta[i+1];
832 if (zeta1<ZMIN)
833 P1 = 0.f;
834 else if (zeta1>ZMAX)
835 P1 = 1.f;
836 else
837 P1 = LOG_MIN_MAX_1 * log(ZMIN_1*zeta1);
838
839 /*P1 = log(zeta1/ZMIN)/log(ZMAX/ZMIN);*/
840
841 /* FIXME: add global prob (P2) */
842 q = 1-Pframe*P1;
843 q = 1-P1;
844 if (q>.95f)
845 q=.95f;
846 p=1.f/(1.f + (q/(1.f-q))*(1.f+st->prior[i])*exp(-theta));
847 /*p=1;*/
848
849 #if 0
850 /* log-spectral magnitude estimator */
851 if (theta<6)
852 MM = 0.74082*pow(theta+1,.61)/sqrt(.0001+theta);
853 else
854 MM=1;
855 #else
856 /* Optimal estimator for loudness domain */
857 MM = hypergeom_gain(theta);
858 #endif
859
860 st->gain[i] = prior_ratio * MM;
861 /*Put some (very arbitraty) limit on the gain*/
862 if (st->gain[i]>2.f)
863 {
864 st->gain[i]=2.f;
865 }
866
867 st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];
868 if (st->denoise_enabled)
869 {
870 st->gain2[i]=p*p*st->gain[i];
871 } else {
872 st->gain2[i]=1.f;
873 }
874 }
875 st->gain2[0]=st->gain[0]=0.f;
876 st->gain2[N-1]=st->gain[N-1]=0.f;
877
878 if (st->agc_enabled)
879 speex_compute_agc(st, mean_prior);
880
881 #if 0
882 if (!is_speech)
883 {
884 for (i=0;i<N;i++)
885 st->gain2[i] = 0;
886 }
887 #if 0
888 else {
889 for (i=0;i<N;i++)
890 st->gain2[i] = 1;
891 }
892 #endif
893 #endif
894
895 /* Apply computed gain */
896 for (i=1;i<N;i++)
897 {
898 st->frame[2*i-1] *= st->gain2[i];
899 st->frame[2*i] *= st->gain2[i];
900 }
901
902 /* Get rid of the DC and very low frequencies */
903 st->frame[0]=0;
904 st->frame[1]=0;
905 st->frame[2]=0;
906 /* Nyquist frequency is mostly useless too */
907 st->frame[2*N-1]=0;
908
909 /* Inverse FFT with 1/N scaling */
910 spx_drft_backward(st->fft_lookup, st->frame);
911
912 for (i=0;i<2*N;i++)
913 st->frame[i] *= scale;
914
915 {
916 float max_sample=0;
917 for (i=0;i<2*N;i++)
918 if (fabs(st->frame[i])>max_sample)
919 max_sample = fabs(st->frame[i]);
920 if (max_sample>28000.f)
921 {
922 float damp = 28000.f/max_sample;
923 for (i=0;i<2*N;i++)
924 st->frame[i] *= damp;
925 }
926 }
927
928 for (i=0;i<2*N;i++)
929 st->frame[i] *= st->window[i];
930
931 /* Perform overlap and add */
932 for (i=0;i<N3;i++)
933 x[i] = st->outbuf[i] + st->frame[i];
934 for (i=0;i<N4;i++)
935 x[N3+i] = st->frame[N3+i];
936
937 /* Update outbuf */
938 for (i=0;i<N3;i++)
939 st->outbuf[i] = st->frame[st->frame_size+i];
940
941 /* Save old power spectrum */
942 for (i=1;i<N;i++)
943 st->old_ps[i] = ps[i];
944
945 return is_speech;
946 }
947
speex_preprocess_estimate_update(SpeexPreprocessState * st,spx_int16_t * x,float * echo)948 void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x, float *echo)
949 {
950 int i;
951 int N = st->ps_size;
952 int N3 = 2*N - st->frame_size;
953
954 float *ps=st->ps;
955
956 preprocess_analysis(st, x);
957
958 update_noise_prob(st);
959
960 st->nb_preprocess++;
961
962 for (i=1;i<N-1;i++)
963 {
964 if (st->update_prob[i]<.5f || st->ps[i] < st->noise[i])
965 {
966 if (echo)
967 st->noise[i] = .90f*st->noise[i] + .1f*max(1.0f,st->ps[i]-echo[i]);
968 else
969 st->noise[i] = .90f*st->noise[i] + .1f*st->ps[i];
970 }
971 }
972
973 for (i=0;i<N3;i++)
974 st->outbuf[i] = x[st->frame_size-N3+i]*st->window[st->frame_size+i];
975
976 /* Save old power spectrum */
977 for (i=1;i<N;i++)
978 st->old_ps[i] = ps[i];
979
980 for (i=1;i<N;i++)
981 st->reverb_estimate[i] *= st->reverb_decay;
982 }
983
984
speex_preprocess_ctl(SpeexPreprocessState * state,int request,void * ptr)985 int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
986 {
987 int i;
988 SpeexPreprocessState *st;
989 st=(SpeexPreprocessState*)state;
990 switch(request)
991 {
992 case SPEEX_PREPROCESS_SET_DENOISE:
993 st->denoise_enabled = (*(int*)ptr);
994 break;
995 case SPEEX_PREPROCESS_GET_DENOISE:
996 (*(int*)ptr) = st->denoise_enabled;
997 break;
998
999 case SPEEX_PREPROCESS_SET_AGC:
1000 st->agc_enabled = (*(int*)ptr);
1001 break;
1002 case SPEEX_PREPROCESS_GET_AGC:
1003 (*(int*)ptr) = st->agc_enabled;
1004 break;
1005
1006 case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1007 st->agc_level = (*(float*)ptr);
1008 if (st->agc_level<1)
1009 st->agc_level=1;
1010 if (st->agc_level>32768)
1011 st->agc_level=32768;
1012 break;
1013 case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1014 (*(float*)ptr) = st->agc_level;
1015 break;
1016
1017 case SPEEX_PREPROCESS_SET_VAD:
1018 st->vad_enabled = (*(int*)ptr);
1019 break;
1020 case SPEEX_PREPROCESS_GET_VAD:
1021 (*(int*)ptr) = st->vad_enabled;
1022 break;
1023
1024 case SPEEX_PREPROCESS_SET_DEREVERB:
1025 st->dereverb_enabled = (*(int*)ptr);
1026 for (i=0;i<st->ps_size;i++)
1027 st->reverb_estimate[i]=0;
1028 break;
1029 case SPEEX_PREPROCESS_GET_DEREVERB:
1030 (*(int*)ptr) = st->dereverb_enabled;
1031 break;
1032
1033 case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1034 st->reverb_level = (*(float*)ptr);
1035 break;
1036 case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1037 (*(float*)ptr) = st->reverb_level;
1038 break;
1039
1040 case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1041 st->reverb_decay = (*(float*)ptr);
1042 break;
1043 case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1044 (*(float*)ptr) = st->reverb_decay;
1045 break;
1046
1047 case SPEEX_PREPROCESS_SET_PROB_START:
1048 st->speech_prob_start = (*(float*)ptr);
1049 if ( st->speech_prob_start > 1 )
1050 st->speech_prob_start = st->speech_prob_start / 100;
1051 if ( st->speech_prob_start > 1 || st->speech_prob_start < 0 )
1052 st->speech_prob_start = SPEEX_PROB_START_DEFAULT;
1053 break;
1054 case SPEEX_PREPROCESS_GET_PROB_START:
1055 (*(float*)ptr) = st->speech_prob_start ;
1056 break;
1057
1058 case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1059 st->speech_prob_continue = (*(float*)ptr);
1060 if ( st->speech_prob_continue > 1 )
1061 st->speech_prob_continue = st->speech_prob_continue / 100;
1062 if ( st->speech_prob_continue > 1 || st->speech_prob_continue < 0 )
1063 st->speech_prob_continue = SPEEX_PROB_CONTINUE_DEFAULT;
1064 break;
1065 case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1066 (*(float*)ptr) = st->speech_prob_continue;
1067 break;
1068
1069 default:
1070 speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
1071 return -1;
1072 }
1073 return 0;
1074 }
1075