1 /* Copyright (c) 2018 Gregor Richards 2 * Copyright (c) 2017 Mozilla */ 3 /* 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include <stdlib.h> 33 #include <string.h> 34 #include <stdio.h> 35 #include "kiss_fft.h" 36 #include "common.h" 37 #include <math.h> 38 #include "rnnoise-nu.h" 39 #include "pitch.h" 40 #include "arch.h" 41 #include "rnn.h" 42 #include "rnn_data.h" 43 44 #define DEFAULT_SAMPLE_RATE 48000 45 46 #define FRAME_SIZE_SHIFT 2 47 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT) 48 #define WINDOW_SIZE (2*FRAME_SIZE) 49 #define FREQ_SIZE (FRAME_SIZE + 1) 50 51 #define PITCH_MIN_PERIOD 60 52 #define PITCH_MAX_PERIOD 768 53 #define PITCH_FRAME_SIZE 960 54 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE) 55 56 #define SQUARE(x) ((x)*(x)) 57 58 #define SMOOTH_BANDS 1 59 60 #define NB_BAND_BOUNDARIES 22 61 62 #if SMOOTH_BANDS 63 #define NB_BANDS 22 64 #else 65 #define NB_BANDS 21 66 #endif 67 68 #define CEPS_MEM 8 69 #define NB_DELTA_CEPS 6 70 71 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2) 72 73 /* We don't allow max attenuation to be more than 60dB */ 74 #define MIN_MAX_ATTENUATION 0.000001f 75 76 77 #ifndef TRAINING 78 #define TRAINING 0 79 #endif 80 81 82 /* cb is the default model */ 83 extern const struct RNNModel model_cb; 84 85 86 static const opus_int16 eband5ms[] = { 87 /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/ 88 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100 89 }; 90 91 92 struct DenoiseState { 93 int init; 94 kiss_fft_state *kfft; 95 float half_window[FRAME_SIZE]; 96 float dct_table[NB_BANDS*NB_BANDS]; 97 98 int sample_rate; 99 100 float analysis_mem[FRAME_SIZE]; 101 float cepstral_mem[CEPS_MEM][NB_BANDS]; 102 int memid; 103 float synthesis_mem[FRAME_SIZE]; 104 float pitch_buf[PITCH_BUF_SIZE]; 105 float pitch_enh_buf[PITCH_BUF_SIZE]; 106 107 /* Bands adjusted for the sample rate */ 108 opus_int16 band_bins[NB_BAND_BOUNDARIES]; 109 110 float last_gain; 111 int last_period; 112 float mem_hp_x[2]; 113 float lastg[NB_BANDS]; 114 RNNState rnn; 115 116 float max_attenuation; 117 }; 118 119 #if SMOOTH_BANDS 120 void compute_band_energy(DenoiseState *st, float *bandE, const kiss_fft_cpx *X) { 121 int i; 122 float sum[NB_BANDS] = {0}; 123 for (i=0;i<NB_BANDS-1;i++) 124 { 125 int j; 126 int band_size; 127 band_size = st->band_bins[i+1] - st->band_bins[i]; 128 for (j=0;j<band_size;j++) { 129 float tmp; 130 float frac = (float)j/band_size; 131 tmp = SQUARE(X[st->band_bins[i] + j].r); 132 tmp += SQUARE(X[st->band_bins[i] + j].i); 133 sum[i] += (1-frac)*tmp; 134 sum[i+1] += frac*tmp; 135 } 136 } 137 sum[0] *= 2; 138 sum[NB_BANDS-1] *= 2; 139 for (i=0;i<NB_BANDS;i++) 140 { 141 bandE[i] = sum[i]; 142 } 143 } 144 145 void compute_band_corr(DenoiseState *st, float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) { 146 int i; 147 float sum[NB_BANDS] = {0}; 148 for (i=0;i<NB_BANDS-1;i++) 149 { 150 int j; 151 int band_size; 152 band_size = st->band_bins[i+1] - st->band_bins[i]; 153 for (j=0;j<band_size;j++) { 154 float tmp; 155 float frac = (float)j/band_size; 156 tmp = X[st->band_bins[i] + j].r * P[st->band_bins[i] + j].r; 157 tmp += X[st->band_bins[i] + j].i * P[st->band_bins[i] + j].i; 158 sum[i] += (1-frac)*tmp; 159 sum[i+1] += frac*tmp; 160 } 161 } 162 sum[0] *= 2; 163 sum[NB_BANDS-1] *= 2; 164 for (i=0;i<NB_BANDS;i++) 165 { 166 bandE[i] = sum[i]; 167 } 168 } 169 170 void interp_band_gain(DenoiseState *st, float *g, const float *bandE) { 171 int i; 172 memset(g, 0, FREQ_SIZE); 173 for (i=0;i<NB_BANDS-1;i++) 174 { 175 int j; 176 int band_size; 177 band_size = st->band_bins[i+1] - st->band_bins[i]; 178 for (j=0;j<band_size;j++) { 179 float frac = (float)j/band_size; 180 g[st->band_bins[i] + j] = (1-frac)*bandE[i] + frac*bandE[i+1]; 181 } 182 } 183 } 184 #else 185 void compute_band_energy(DenoiseState *st, float *bandE, const kiss_fft_cpx *X) { 186 int i; 187 for (i=0;i<NB_BANDS;i++) 188 { 189 int j; 190 opus_val32 sum = 0; 191 for (j=0;j<(st->band_bins[i+1] - st->band_bins[i]);j++) { 192 sum += SQUARE(X[st->band_bins[i] + j].r); 193 sum += SQUARE(X[st->band_bins[i] + j].i); 194 } 195 bandE[i] = sum; 196 } 197 } 198 199 void interp_band_gain(DenoiseState *st, float *g, const float *bandE) { 200 int i; 201 memset(g, 0, FREQ_SIZE); 202 for (i=0;i<NB_BANDS;i++) 203 { 204 int j; 205 for (j=0;j<(st->band_bins[i+1] - st->band_bins[i]);j++) 206 g[st->band_bins[i] + j] = bandE[i]; 207 } 208 } 209 #endif 210 211 212 static void check_init(DenoiseState *st) { 213 int i; 214 if (st->init) return; 215 /* FIXME: Deallocate this! */ 216 st->kfft = opus_fft_alloc_twiddles(2*FRAME_SIZE, NULL, NULL, NULL, 0); 217 218 /* Get the sample rate set up */ 219 if (st->sample_rate <= 0) st->sample_rate = DEFAULT_SAMPLE_RATE; 220 221 /* Adjust the bins for the sample rate */ 222 for (i = 0; i < NB_BAND_BOUNDARIES; i++) 223 st->band_bins[i] = (((long) eband5ms[i]) << FRAME_SIZE_SHIFT) * DEFAULT_SAMPLE_RATE / st->sample_rate; 224 225 /* Make sure nothing's above the Nyquist frequency */ 226 for (i = 0; i < NB_BANDS; i++) 227 if (st->band_bins[i] >= FRAME_SIZE) st->band_bins[i] = FRAME_SIZE - 1; 228 229 for (i=0;i<FRAME_SIZE;i++) 230 st->half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE)); 231 for (i=0;i<NB_BANDS;i++) { 232 int j; 233 for (j=0;j<NB_BANDS;j++) { 234 st->dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS); 235 if (j==0) st->dct_table[i*NB_BANDS + j] *= sqrt(.5); 236 } 237 } 238 st->init = 1; 239 } 240 241 static void dct(DenoiseState *st, float *out, const float *in) { 242 int i; 243 check_init(st); 244 for (i=0;i<NB_BANDS;i++) { 245 int j; 246 float sum = 0; 247 for (j=0;j<NB_BANDS;j++) { 248 sum += in[j] * st->dct_table[j*NB_BANDS + i]; 249 } 250 out[i] = sum*sqrt(2./22); 251 } 252 } 253 254 #if 0 255 static void idct(DenoiseState *st, float *out, const float *in) { 256 int i; 257 check_init(st); 258 for (i=0;i<NB_BANDS;i++) { 259 int j; 260 float sum = 0; 261 for (j=0;j<NB_BANDS;j++) { 262 sum += in[j] * st->dct_table[i*NB_BANDS + j]; 263 } 264 out[i] = sum*sqrt(2./22); 265 } 266 } 267 #endif 268 269 static void forward_transform(DenoiseState *st, kiss_fft_cpx *out, const float *in) { 270 int i; 271 kiss_fft_cpx x[WINDOW_SIZE]; 272 kiss_fft_cpx y[WINDOW_SIZE]; 273 check_init(st); 274 for (i=0;i<WINDOW_SIZE;i++) { 275 x[i].r = in[i]; 276 x[i].i = 0; 277 } 278 opus_fft(st->kfft, x, y, 0); 279 for (i=0;i<FREQ_SIZE;i++) { 280 out[i] = y[i]; 281 } 282 } 283 284 static void inverse_transform(DenoiseState *st, float *out, const kiss_fft_cpx *in) { 285 int i; 286 kiss_fft_cpx x[WINDOW_SIZE]; 287 kiss_fft_cpx y[WINDOW_SIZE]; 288 check_init(st); 289 for (i=0;i<FREQ_SIZE;i++) { 290 x[i] = in[i]; 291 } 292 for (;i<WINDOW_SIZE;i++) { 293 x[i].r = x[WINDOW_SIZE - i].r; 294 x[i].i = -x[WINDOW_SIZE - i].i; 295 } 296 opus_fft(st->kfft, x, y, 0); 297 /* output in reverse order for IFFT. */ 298 out[0] = WINDOW_SIZE*y[0].r; 299 for (i=1;i<WINDOW_SIZE;i++) { 300 out[i] = WINDOW_SIZE*y[WINDOW_SIZE - i].r; 301 } 302 } 303 304 static void apply_window(DenoiseState *st, float *x) { 305 int i; 306 check_init(st); 307 for (i=0;i<FRAME_SIZE;i++) { 308 x[i] *= st->half_window[i]; 309 x[WINDOW_SIZE - 1 - i] *= st->half_window[i]; 310 } 311 } 312 313 int rnnoise_get_size() { 314 return sizeof(DenoiseState); 315 } 316 317 int rnnoise_init(DenoiseState *st, RNNModel *model) { 318 memset(st, 0, sizeof(*st)); 319 if (model) 320 st->rnn.model = model; 321 else 322 st->rnn.model = &model_cb; 323 st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size); 324 st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size); 325 st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size); 326 return 0; 327 } 328 329 DenoiseState *rnnoise_create(RNNModel *model) { 330 DenoiseState *st; 331 st = malloc(rnnoise_get_size()); 332 rnnoise_init(st, model); 333 return st; 334 } 335 336 void rnnoise_destroy(DenoiseState *st) { 337 if (st->init) 338 free(st->kfft); 339 free(st->rnn.vad_gru_state); 340 free(st->rnn.noise_gru_state); 341 free(st->rnn.denoise_gru_state); 342 free(st); 343 } 344 345 #if TRAINING 346 int lowpass = FREQ_SIZE; 347 int band_lp = NB_BANDS; 348 #endif 349 350 static void frame_analysis(DenoiseState *st, kiss_fft_cpx *X, float *Ex, const float *in) { 351 int i; 352 float x[WINDOW_SIZE]; 353 RNN_COPY(x, st->analysis_mem, FRAME_SIZE); 354 for (i=0;i<FRAME_SIZE;i++) x[FRAME_SIZE + i] = in[i]; 355 RNN_COPY(st->analysis_mem, in, FRAME_SIZE); 356 apply_window(st, x); 357 forward_transform(st, X, x); 358 #if TRAINING 359 for (i=lowpass;i<FREQ_SIZE;i++) 360 X[i].r = X[i].i = 0; 361 #endif 362 compute_band_energy(st, Ex, X); 363 } 364 365 static int compute_frame_features(DenoiseState *st, kiss_fft_cpx *X, kiss_fft_cpx *P, 366 float *Ex, float *Ep, float *Exp, float *features, const float *in) { 367 int i; 368 float E = 0; 369 float *ceps_0, *ceps_1, *ceps_2; 370 float spec_variability = 0; 371 float Ly[NB_BANDS]; 372 float p[WINDOW_SIZE]; 373 float pitch_buf[PITCH_BUF_SIZE>>1]; 374 int pitch_index; 375 float gain; 376 float *(pre[1]); 377 float tmp[NB_BANDS]; 378 float follow, logMax; 379 frame_analysis(st, X, Ex, in); 380 RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE); 381 RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE); 382 pre[0] = &st->pitch_buf[0]; 383 pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1); 384 pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE, 385 PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index); 386 pitch_index = PITCH_MAX_PERIOD-pitch_index; 387 388 gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD, 389 PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain); 390 st->last_period = pitch_index; 391 st->last_gain = gain; 392 for (i=0;i<WINDOW_SIZE;i++) 393 p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i]; 394 apply_window(st, p); 395 forward_transform(st, P, p); 396 compute_band_energy(st, Ep, P); 397 #if SMOOTH_BANDS 398 compute_band_corr(st, Exp, X, P); 399 #endif 400 for (i=0;i<NB_BANDS;i++) Exp[i] = Exp[i]/sqrt(.001+Ex[i]*Ep[i]); 401 dct(st, tmp, Exp); 402 for (i=0;i<NB_DELTA_CEPS;i++) features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i]; 403 features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3; 404 features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9; 405 features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300); 406 logMax = -2; 407 follow = -2; 408 for (i=0;i<NB_BANDS;i++) { 409 Ly[i] = log10(1e-2+Ex[i]); 410 Ly[i] = MAX16(logMax-7, MAX16(follow-1.5, Ly[i])); 411 logMax = MAX16(logMax, Ly[i]); 412 follow = MAX16(follow-1.5, Ly[i]); 413 E += Ex[i]; 414 } 415 if (!TRAINING && E < 0.04) { 416 /* If there's no audio, avoid messing up the state. */ 417 RNN_CLEAR(features, NB_FEATURES); 418 return 1; 419 } 420 dct(st, features, Ly); 421 features[0] -= 12; 422 features[1] -= 4; 423 ceps_0 = st->cepstral_mem[st->memid]; 424 ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1]; 425 ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2]; 426 for (i=0;i<NB_BANDS;i++) ceps_0[i] = features[i]; 427 st->memid++; 428 for (i=0;i<NB_DELTA_CEPS;i++) { 429 features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i]; 430 features[NB_BANDS+i] = ceps_0[i] - ceps_2[i]; 431 features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i]; 432 } 433 /* Spectral variability features. */ 434 if (st->memid == CEPS_MEM) st->memid = 0; 435 for (i=0;i<CEPS_MEM;i++) 436 { 437 int j; 438 float mindist = 1e15f; 439 for (j=0;j<CEPS_MEM;j++) 440 { 441 int k; 442 float dist=0; 443 for (k=0;k<NB_BANDS;k++) 444 { 445 float tmp; 446 tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k]; 447 dist += tmp*tmp; 448 } 449 if (j!=i) 450 mindist = MIN32(mindist, dist); 451 } 452 spec_variability += mindist; 453 } 454 features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1; 455 return TRAINING && E < 0.1; 456 } 457 458 static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y) { 459 float x[WINDOW_SIZE]; 460 int i; 461 inverse_transform(st, x, y); 462 apply_window(st, x); 463 for (i=0;i<FRAME_SIZE;i++) out[i] = x[i] + st->synthesis_mem[i]; 464 RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE); 465 } 466 467 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) { 468 int i; 469 for (i=0;i<N;i++) { 470 float xi, yi; 471 xi = x[i]; 472 yi = x[i] + mem[0]; 473 mem[0] = mem[1] + (b[0]*(double)xi - a[0]*(double)yi); 474 mem[1] = (b[1]*(double)xi - a[1]*(double)yi); 475 y[i] = yi; 476 } 477 } 478 479 void pitch_filter(DenoiseState *st, kiss_fft_cpx *X, const kiss_fft_cpx *P, const float *Ex, const float *Ep, 480 const float *Exp, const float *g) { 481 int i; 482 float r[NB_BANDS]; 483 float rf[FREQ_SIZE] = {0}; 484 for (i=0;i<NB_BANDS;i++) { 485 #if 0 486 if (Exp[i]>g[i]) r[i] = 1; 487 else r[i] = Exp[i]*(1-g[i])/(.001 + g[i]*(1-Exp[i])); 488 r[i] = MIN16(1, MAX16(0, r[i])); 489 #else 490 if (Exp[i]>g[i]) r[i] = 1; 491 else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i]))); 492 r[i] = sqrt(MIN16(1, MAX16(0, r[i]))); 493 #endif 494 r[i] *= sqrt(Ex[i]/(1e-8+Ep[i])); 495 } 496 interp_band_gain(st, rf, r); 497 for (i=0;i<FREQ_SIZE;i++) { 498 X[i].r += rf[i]*P[i].r; 499 X[i].i += rf[i]*P[i].i; 500 } 501 float newE[NB_BANDS]; 502 compute_band_energy(st, newE, X); 503 float norm[NB_BANDS]; 504 float normf[FREQ_SIZE]={0}; 505 for (i=0;i<NB_BANDS;i++) { 506 norm[i] = sqrt(Ex[i]/(1e-8+newE[i])); 507 } 508 interp_band_gain(st, normf, norm); 509 for (i=0;i<FREQ_SIZE;i++) { 510 X[i].r *= normf[i]; 511 X[i].i *= normf[i]; 512 } 513 } 514 515 float rnnoise_process_frame(DenoiseState *st, float *out, const float *in) { 516 int i; 517 kiss_fft_cpx X[FREQ_SIZE]; 518 kiss_fft_cpx P[WINDOW_SIZE]; 519 float x[FRAME_SIZE]; 520 float Ex[NB_BANDS], Ep[NB_BANDS]; 521 float Exp[NB_BANDS]; 522 float features[NB_FEATURES]; 523 float g[NB_BANDS]; 524 float gf[FREQ_SIZE]={1}; 525 float vad_prob = 0; 526 int silence; 527 static const float a_hp[2] = {-1.99599, 0.99600}; 528 static const float b_hp[2] = {-2, 1}; 529 biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE); 530 silence = compute_frame_features(st, X, P, Ex, Ep, Exp, features, x); 531 532 if (!silence) { 533 compute_rnn(&st->rnn, g, &vad_prob, features); 534 pitch_filter(st, X, P, Ex, Ep, Exp, g); 535 for (i=0;i<NB_BANDS;i++) { 536 float alpha = .6f; 537 g[i] = MAX16(g[i], alpha*st->lastg[i]); 538 st->lastg[i] = g[i]; 539 } 540 541 /* Apply maximum attenuation (minimum value) */ 542 if (st->max_attenuation) { 543 float min = 1, mult; 544 for (i=0;i<NB_BANDS;i++) { 545 if (g[i] < min) min = g[i]; 546 } 547 if (min < st->max_attenuation) { 548 if (min < MIN_MAX_ATTENUATION) 549 min = MIN_MAX_ATTENUATION; 550 mult = (1.0f-st->max_attenuation) / (1.0f-min); 551 for (i=0;i<NB_BANDS;i++) { 552 if (g[i] < MIN_MAX_ATTENUATION) g[i] = MIN_MAX_ATTENUATION; 553 g[i] = 1.0f-((1.0f-g[i]) * mult); 554 st->lastg[i] = g[i]; 555 } 556 } 557 } 558 559 interp_band_gain(st, gf, g); 560 #if 1 561 for (i=0;i<FREQ_SIZE;i++) { 562 X[i].r *= gf[i]; 563 X[i].i *= gf[i]; 564 } 565 #endif 566 } 567 568 frame_synthesis(st, out, X); 569 return vad_prob; 570 } 571 572 void rnnoise_set_param(DenoiseState *st, int param, float value) 573 { 574 switch (param) { 575 case RNNOISE_PARAM_MAX_ATTENUATION: 576 if ((value > MIN_MAX_ATTENUATION && value <= 1) || value == 0) 577 st->max_attenuation = value; 578 else 579 st->max_attenuation = MIN_MAX_ATTENUATION; 580 break; 581 582 case RNNOISE_PARAM_SAMPLE_RATE: 583 if (value <= 0) 584 st->sample_rate = 0; 585 else 586 st->sample_rate = value; 587 break; 588 } 589 } 590 591 #if TRAINING 592 593 static float uni_rand() { 594 return rand()/(double)RAND_MAX-.5; 595 } 596 597 static void rand_resp(float *a, float *b) { 598 a[0] = .75*uni_rand(); 599 a[1] = .75*uni_rand(); 600 b[0] = .75*uni_rand(); 601 b[1] = .75*uni_rand(); 602 } 603 604 int main(int argc, char **argv) { 605 int i; 606 int count=0; 607 static const float a_hp[2] = {-1.99599, 0.99600}; 608 static const float b_hp[2] = {-2, 1}; 609 float a_noise[2] = {0}; 610 float b_noise[2] = {0}; 611 float a_sig[2] = {0}; 612 float b_sig[2] = {0}; 613 float mem_hp_x[2]={0}; 614 float mem_hp_n[2]={0}; 615 float mem_resp_x[2]={0}; 616 float mem_resp_n[2]={0}; 617 float x[FRAME_SIZE]; 618 float n[FRAME_SIZE]; 619 float xn[FRAME_SIZE]; 620 int vad_cnt=0; 621 int gain_change_count=0; 622 float speech_gain = 1, noise_gain = 1; 623 FILE *f1, *f2; 624 int maxCount; 625 DenoiseState *st; 626 DenoiseState *noise_state; 627 DenoiseState *noisy; 628 st = rnnoise_create(NULL); 629 noise_state = rnnoise_create(NULL); 630 noisy = rnnoise_create(NULL); 631 if (argc!=4) { 632 fprintf(stderr, "usage: %s <speech> <noise> <count>\n", argv[0]); 633 return 1; 634 } 635 f1 = fopen(argv[1], "r"); 636 f2 = fopen(argv[2], "r"); 637 maxCount = atoi(argv[3]); 638 for(i=0;i<150;i++) { 639 short tmp[FRAME_SIZE]; 640 fread(tmp, sizeof(short), FRAME_SIZE, f2); 641 } 642 while (1) { 643 kiss_fft_cpx X[FREQ_SIZE], Y[FREQ_SIZE], N[FREQ_SIZE], P[WINDOW_SIZE]; 644 float Ex[NB_BANDS], Ey[NB_BANDS], En[NB_BANDS], Ep[NB_BANDS]; 645 float Exp[NB_BANDS]; 646 float Ln[NB_BANDS]; 647 float features[NB_FEATURES]; 648 float g[NB_BANDS]; 649 short tmp[FRAME_SIZE]; 650 float vad=0; 651 float E=0; 652 if (count==maxCount) break; 653 if ((count%1000)==0) fprintf(stderr, "%d\r", count); 654 if (++gain_change_count > 2821) { 655 speech_gain = pow(10., (-40+(rand()%60))/20.); 656 noise_gain = pow(10., (-30+(rand()%50))/20.); 657 if (rand()%10==0) noise_gain = 0; 658 noise_gain *= speech_gain; 659 if (rand()%10==0) speech_gain = 0; 660 gain_change_count = 0; 661 rand_resp(a_noise, b_noise); 662 rand_resp(a_sig, b_sig); 663 lowpass = FREQ_SIZE * 3000./24000. * pow(50., rand()/(double)RAND_MAX); 664 for (i=0;i<NB_BANDS;i++) { 665 if (eband5ms[i]<<FRAME_SIZE_SHIFT > lowpass) { 666 band_lp = i; 667 break; 668 } 669 } 670 } 671 if (speech_gain != 0) { 672 fread(tmp, sizeof(short), FRAME_SIZE, f1); 673 if (feof(f1)) { 674 rewind(f1); 675 fread(tmp, sizeof(short), FRAME_SIZE, f1); 676 } 677 for (i=0;i<FRAME_SIZE;i++) x[i] = speech_gain*tmp[i]; 678 for (i=0;i<FRAME_SIZE;i++) E += tmp[i]*(float)tmp[i]; 679 } else { 680 for (i=0;i<FRAME_SIZE;i++) x[i] = 0; 681 E = 0; 682 } 683 if (noise_gain!=0) { 684 fread(tmp, sizeof(short), FRAME_SIZE, f2); 685 if (feof(f2)) { 686 rewind(f2); 687 fread(tmp, sizeof(short), FRAME_SIZE, f2); 688 } 689 for (i=0;i<FRAME_SIZE;i++) n[i] = noise_gain*tmp[i]; 690 } else { 691 for (i=0;i<FRAME_SIZE;i++) n[i] = 0; 692 } 693 biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE); 694 biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE); 695 biquad(n, mem_hp_n, n, b_hp, a_hp, FRAME_SIZE); 696 biquad(n, mem_resp_n, n, b_noise, a_noise, FRAME_SIZE); 697 for (i=0;i<FRAME_SIZE;i++) xn[i] = x[i] + n[i]; 698 if (E > 1e9f) { 699 vad_cnt=0; 700 } else if (E > 1e8f) { 701 vad_cnt -= 5; 702 } else if (E > 1e7f) { 703 vad_cnt++; 704 } else { 705 vad_cnt+=2; 706 } 707 if (vad_cnt < 0) vad_cnt = 0; 708 if (vad_cnt > 15) vad_cnt = 15; 709 710 if (vad_cnt >= 10) vad = 0; 711 else if (vad_cnt > 0) vad = 0.5f; 712 else vad = 1.f; 713 714 frame_analysis(st, Y, Ey, x); 715 frame_analysis(noise_state, N, En, n); 716 for (i=0;i<NB_BANDS;i++) Ln[i] = log10(1e-2+En[i]); 717 int silence = compute_frame_features(noisy, X, P, Ex, Ep, Exp, features, xn); 718 pitch_filter(st, X, P, Ex, Ep, Exp, g); 719 //printf("%f %d\n", noisy->last_gain, noisy->last_period); 720 for (i=0;i<NB_BANDS;i++) { 721 g[i] = sqrt((Ey[i]+1e-3)/(Ex[i]+1e-3)); 722 if (g[i] > 1) g[i] = 1; 723 if (silence || i > band_lp) g[i] = -1; 724 if (Ey[i] < 5e-2 && Ex[i] < 5e-2) g[i] = -1; 725 if (vad==0 && noise_gain==0) g[i] = -1; 726 } 727 count++; 728 #if 1 729 fwrite(features, sizeof(float), NB_FEATURES, stdout); 730 fwrite(g, sizeof(float), NB_BANDS, stdout); 731 fwrite(Ln, sizeof(float), NB_BANDS, stdout); 732 fwrite(&vad, sizeof(float), 1, stdout); 733 #endif 734 } 735 fprintf(stderr, "matrix size: %d x %d\n", count, NB_FEATURES + 2*NB_BANDS + 1); 736 fclose(f1); 737 fclose(f2); 738 return 0; 739 } 740 741 #endif 742