1 // Copyright 2005-2019 The Mumble Developers. All rights reserved.
2 // Use of this source code is governed by a BSD-style license
3 // that can be found in the LICENSE file at the root of the
4 // Mumble source tree or at <https://www.mumble.info/LICENSE>.
5
6 #include "mumble_pch.hpp"
7
8 #include "AudioInput.h"
9
10 #include "AudioOutput.h"
11 #include "CELTCodec.h"
12 #include "OpusCodec.h"
13 #include "ServerHandler.h"
14 #include "MainWindow.h"
15 #include "User.h"
16 #include "PacketDataStream.h"
17 #include "Plugins.h"
18 #include "Message.h"
19 #include "Global.h"
20 #include "NetworkConfig.h"
21 #include "VoiceRecorder.h"
22
23 #ifdef USE_RNNOISE
24 extern "C" {
25 #include "rnnoise.h"
26 }
27 #endif
28
// Remember that we cannot use static member classes that are not pointers, as the constructor
// for AudioInputRegistrar() might be called before they are initialized, as the constructor
// is called from global initialization.
// Hence, we allocate upon first call.

// Registry of available input backends, keyed by backend name. Allocated
// lazily by the first AudioInputRegistrar constructed; intentionally never freed.
QMap<QString, AudioInputRegistrar *> *AudioInputRegistrar::qmNew;
// Name of the backend most recently selected by newFromChoice().
QString AudioInputRegistrar::current = QString();
36
AudioInputRegistrar(const QString & n,int p)37 AudioInputRegistrar::AudioInputRegistrar(const QString &n, int p) : name(n), priority(p) {
38 if (! qmNew)
39 qmNew = new QMap<QString, AudioInputRegistrar *>();
40 qmNew->insert(name,this);
41 }
42
~AudioInputRegistrar()43 AudioInputRegistrar::~AudioInputRegistrar() {
44 qmNew->remove(name);
45 }
46
newFromChoice(QString choice)47 AudioInputPtr AudioInputRegistrar::newFromChoice(QString choice) {
48 if (! qmNew)
49 return AudioInputPtr();
50
51 if (!choice.isEmpty() && qmNew->contains(choice)) {
52 g.s.qsAudioInput = choice;
53 current = choice;
54 return AudioInputPtr(qmNew->value(current)->create());
55 }
56 choice = g.s.qsAudioInput;
57 if (qmNew->contains(choice)) {
58 current = choice;
59 return AudioInputPtr(qmNew->value(choice)->create());
60 }
61
62 AudioInputRegistrar *r = NULL;
63 foreach(AudioInputRegistrar *air, *qmNew)
64 if (!r || (air->priority > r->priority))
65 r = air;
66 if (r) {
67 current = r->name;
68 return AudioInputPtr(r->create());
69 }
70 return AudioInputPtr();
71 }
72
canExclusive() const73 bool AudioInputRegistrar::canExclusive() const {
74 return false;
75 }
76
// Construct the audio-input pipeline: derive quality/frame settings from the
// bandwidth cap, create the Opus encoder (and RNNoise state when enabled),
// and zero all per-frame processing state. The capture thread itself is
// started elsewhere; this only marks bRunning = true.
AudioInput::AudioInput() : opusBuffer(g.s.iFramesPerPacket * (SAMPLE_RATE / 100)) {
	// Size the Opus staging buffer for a full packet's worth of 10 ms frames.
	adjustBandwidth(g.iMaxBandwidth, iAudioQuality, iAudioFrames);

	g.iAudioBandwidth = getNetworkBandwidth(iAudioQuality, iAudioFrames);

	// Placeholder until selectCodec() picks the actual codec per frame.
	umtType = MessageHandler::UDPVoiceCELTAlpha;

	activityState = ActivityStateActive;
	oCodec = NULL;
	opusState = NULL;
	cCodec = NULL;
	ceEncoder = NULL;

	iSampleRate = SAMPLE_RATE;
	iFrameSize = SAMPLE_RATE / 100; // 10 ms worth of samples.

#ifdef USE_OPUS
	oCodec = g.oCodec;
	if (oCodec) {
		if (!g.s.bUseOpusMusicEncoding) {
			opusState = oCodec->opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP, NULL);
			qWarning("AudioInput: Opus encoder set for VOIP");
		} else {
			opusState = oCodec->opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_AUDIO, NULL);
			qWarning("AudioInput: Opus encoder set for Music");
		}

		oCodec->opus_encoder_ctl(opusState, OPUS_SET_VBR(0)); // CBR
	}
#endif

#ifdef USE_RNNOISE
	denoiseState = rnnoise_create();
#endif

	qWarning("AudioInput: %d bits/s, %d hz, %d sample", iAudioQuality, iSampleRate, iFrameSize);
	// Until a backend reports device rates, assume they match the target rate.
	iEchoFreq = iMicFreq = iSampleRate;

	iFrameCounter = 0;
	iSilentFrames = 0;
	iHoldFrames = 0;
	iBufferedFrames = 0;

	bResetProcessor = true;

	bEchoMulti = false;

	sppPreprocess = NULL;
	sesEcho = NULL;
	srsMic = srsEcho = NULL;
	iJitterSeq = 0;
	iMinBuffered = 1000;

	psMic = new short[iFrameSize];
	psClean = new short[iFrameSize];

	psSpeaker = NULL;

	iEchoChannels = iMicChannels = 0;
	iEchoFilled = iMicFilled = 0;
	eMicFormat = eEchoFormat = SampleFloat;
	iMicSampleSize = iEchoSampleSize = 0;

	bPreviousVoice = false;

	bResetEncoder = true;

	pfMicInput = pfEchoInput = pfOutput = NULL;

	iBitrate = 0;
	dPeakSignal = dPeakSpeaker = dPeakMic = dPeakCleanMic = 0.0;

	if (g.uiSession) {
		setMaxBandwidth(g.iMaxBandwidth);
	}

	bRunning = true;

	// Idle actions (deafen/mute) are emitted from the audio thread, so route
	// them to the GUI actions through queued connections.
	connect(this, SIGNAL(doDeaf()), g.mw->qaAudioDeaf, SLOT(trigger()), Qt::QueuedConnection);
	connect(this, SIGNAL(doMute()), g.mw->qaAudioMute, SLOT(trigger()), Qt::QueuedConnection);
}
158
// Stop the capture thread, then free encoder, denoiser, preprocessor,
// resampler and buffer state. The thread is joined first so nothing below is
// freed while still in use.
AudioInput::~AudioInput() {
	bRunning = false;
	wait();

#ifdef USE_OPUS
	if (opusState) {
		oCodec->opus_encoder_destroy(opusState);
	}
#endif

#ifdef USE_RNNOISE
	if (denoiseState) {
		rnnoise_destroy(denoiseState);
	}
#endif

	if (ceEncoder) {
		cCodec->celt_encoder_destroy(ceEncoder);
	}

	// Drain any echo frames still queued for the echo canceller.
	foreach(short *buf, qlEchoFrames)
		delete [] buf;

	if (sppPreprocess)
		speex_preprocess_state_destroy(sppPreprocess);
	if (sesEcho)
		speex_echo_state_destroy(sesEcho);

	if (srsMic)
		speex_resampler_destroy(srsMic);
	if (srsEcho)
		speex_resampler_destroy(srsEcho);

	delete [] psMic;
	delete [] psClean;
	delete [] psSpeaker;

	delete [] pfMicInput;
	delete [] pfEchoInput;
	delete [] pfOutput;
}
200
isTransmitting() const201 bool AudioInput::isTransmitting() const {
202 return bPreviousVoice;
203 };
204
// Generates a fixed-channel-count down-mixer: averages `channels` interleaved
// float samples per output sample into a mono float buffer. The channel count
// is a compile-time constant so the inner loop can be fully unrolled.
#define IN_MIXER_FLOAT(channels) \
static void inMixerFloat##channels ( float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
	const float * RESTRICT input = reinterpret_cast<const float *>(ipt); \
	const float m = 1.0f / static_cast<float>(channels); \
	Q_UNUSED(N); \
	Q_UNUSED(mask); \
	for(unsigned int i=0;i<nsamp;++i) {\
		float v= 0.0f; \
		for(unsigned int j=0;j<channels;++j) \
			v += input[i*channels+j]; \
		buffer[i] = v * m; \
	} \
}
218
// Same as IN_MIXER_FLOAT, but for 16-bit PCM input: averages `channels`
// interleaved short samples and scales by 1/32768 to produce mono float
// samples in [-1, 1).
#define IN_MIXER_SHORT(channels) \
static void inMixerShort##channels ( float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
	const short * RESTRICT input = reinterpret_cast<const short *>(ipt); \
	const float m = 1.0f / (32768.f * static_cast<float>(channels)); \
	Q_UNUSED(N); \
	Q_UNUSED(mask); \
	for(unsigned int i=0;i<nsamp;++i) {\
		float v= 0.0f; \
		for(unsigned int j=0;j<channels;++j) \
			v += static_cast<float>(input[i*channels+j]); \
		buffer[i] = v * m; \
	} \
}
232
inMixerFloatMask(float * RESTRICT buffer,const void * RESTRICT ipt,unsigned int nsamp,unsigned int N,quint64 mask)233 static void inMixerFloatMask(float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
234 const float * RESTRICT input = reinterpret_cast<const float *>(ipt);
235
236 unsigned int chancount = 0;
237 STACKVAR(unsigned int, chanindex, N);
238 for (unsigned int j = 0; j < N; ++j) {
239 if ((mask & (1ULL << j)) == 0) {
240 continue;
241 }
242 chanindex[chancount] = j; // Use chancount as index into chanindex.
243 ++chancount;
244 }
245
246 const float m = 1.0f / static_cast<float>(chancount);
247 for(unsigned int i = 0; i < nsamp; ++i) {
248 float v = 0.0f;
249 for(unsigned int j = 0; j < chancount; ++j) {
250 v += input[i * N + chanindex[j]];
251 }
252 buffer[i] = v * m;
253 }
254 }
255
inMixerShortMask(float * RESTRICT buffer,const void * RESTRICT ipt,unsigned int nsamp,unsigned int N,quint64 mask)256 static void inMixerShortMask(float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) {
257 const short * RESTRICT input = reinterpret_cast<const short *>(ipt);
258
259 unsigned int chancount = 0;
260 STACKVAR(unsigned int, chanindex, N);
261 for (unsigned int j = 0; j < N; ++j) {
262 if ((mask & (1ULL << j)) == 0) {
263 continue;
264 }
265 chanindex[chancount] = j; // Use chancount as index into chanindex.
266 ++chancount;
267 }
268
269 const float m = 1.0f / static_cast<float>(chancount);
270 for(unsigned int i = 0; i < nsamp; ++i) {
271 float v = 0.0f;
272 for(unsigned int j = 0; j < chancount; ++j) {
273 v += static_cast<float>(input[i * N + chanindex[j]]);
274 }
275 buffer[i] = v * m;
276 }
277 }
278
// Instantiate unrolled down-mixers for 1-8 channels, plus a generic
// N-channel fallback, for both float and 16-bit PCM input.
IN_MIXER_FLOAT(1)
IN_MIXER_FLOAT(2)
IN_MIXER_FLOAT(3)
IN_MIXER_FLOAT(4)
IN_MIXER_FLOAT(5)
IN_MIXER_FLOAT(6)
IN_MIXER_FLOAT(7)
IN_MIXER_FLOAT(8)
IN_MIXER_FLOAT(N)

IN_MIXER_SHORT(1)
IN_MIXER_SHORT(2)
IN_MIXER_SHORT(3)
IN_MIXER_SHORT(4)
IN_MIXER_SHORT(5)
IN_MIXER_SHORT(6)
IN_MIXER_SHORT(7)
IN_MIXER_SHORT(8)
IN_MIXER_SHORT(N)
298
299 AudioInput::inMixerFunc AudioInput::chooseMixer(const unsigned int nchan, SampleFormat sf, quint64 chanmask) {
300 inMixerFunc r = NULL;
301
302 if (chanmask != 0xffffffffffffffffULL) {
303 if (sf == SampleFloat) {
304 r = inMixerFloatMask;
305 } else if (sf == SampleShort) {
306 r = inMixerShortMask;
307 }
308 return r;
309 }
310
311 if (sf == SampleFloat) {
312 switch (nchan) {
313 case 1:
314 r = inMixerFloat1;
315 break;
316 case 2:
317 r = inMixerFloat2;
318 break;
319 case 3:
320 r = inMixerFloat3;
321 break;
322 case 4:
323 r = inMixerFloat4;
324 break;
325 case 5:
326 r = inMixerFloat5;
327 break;
328 case 6:
329 r = inMixerFloat6;
330 break;
331 case 7:
332 r = inMixerFloat7;
333 break;
334 case 8:
335 r = inMixerFloat8;
336 break;
337 default:
338 r = inMixerFloatN;
339 break;
340 }
341 } else {
342 switch (nchan) {
343 case 1:
344 r = inMixerShort1;
345 break;
346 case 2:
347 r = inMixerShort2;
348 break;
349 case 3:
350 r = inMixerShort3;
351 break;
352 case 4:
353 r = inMixerShort4;
354 break;
355 case 5:
356 r = inMixerShort5;
357 break;
358 case 6:
359 r = inMixerShort6;
360 break;
361 case 7:
362 r = inMixerShort7;
363 break;
364 case 8:
365 r = inMixerShort8;
366 break;
367 default:
368 r = inMixerShortN;
369 break;
370 }
371 }
372 return r;
373 }
374
initializeMixer()375 void AudioInput::initializeMixer() {
376 int err;
377
378 if (srsMic)
379 speex_resampler_destroy(srsMic);
380 if (srsEcho)
381 speex_resampler_destroy(srsEcho);
382 delete [] pfMicInput;
383 delete [] pfEchoInput;
384 delete [] pfOutput;
385
386 if (iMicFreq != iSampleRate)
387 srsMic = speex_resampler_init(1, iMicFreq, iSampleRate, 3, &err);
388
389 iMicLength = (iFrameSize * iMicFreq) / iSampleRate;
390
391 pfMicInput = new float[iMicLength];
392 pfOutput = new float[iFrameSize * qMax(1U,iEchoChannels)];
393
394 if (iEchoChannels > 0) {
395 bEchoMulti = g.s.bEchoMulti;
396 if (iEchoFreq != iSampleRate)
397 srsEcho = speex_resampler_init(bEchoMulti ? iEchoChannels : 1, iEchoFreq, iSampleRate, 3, &err);
398 iEchoLength = (iFrameSize * iEchoFreq) / iSampleRate;
399 iEchoMCLength = bEchoMulti ? iEchoLength * iEchoChannels : iEchoLength;
400 iEchoFrameSize = bEchoMulti ? iFrameSize * iEchoChannels : iFrameSize;
401 pfEchoInput = new float[iEchoMCLength];
402 } else {
403 srsEcho = NULL;
404 pfEchoInput = NULL;
405 }
406
407 uiMicChannelMask = g.s.uiAudioInputChannelMask;
408
409 // There is no channel mask setting for the echo canceller, so allow all channels.
410 uiEchoChannelMask = 0xffffffffffffffffULL;
411
412 imfMic = chooseMixer(iMicChannels, eMicFormat, uiMicChannelMask);
413 imfEcho = chooseMixer(iEchoChannels, eEchoFormat, uiEchoChannelMask);
414
415 iMicSampleSize = static_cast<int>(iMicChannels * ((eMicFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
416 iEchoSampleSize = static_cast<int>(iEchoChannels * ((eEchoFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
417
418 bResetProcessor = true;
419
420 qWarning("AudioInput: Initialized mixer for %d channel %d hz mic and %d channel %d hz echo", iMicChannels, iMicFreq, iEchoChannels, iEchoFreq);
421 if (uiMicChannelMask != 0xffffffffffffffffULL) {
422 qWarning("AudioInput: using mic channel mask 0x%llx", static_cast<unsigned long long>(uiMicChannelMask));
423 }
424 }
425
// Accumulate raw microphone samples (device rate/format/channel count) into
// 10 ms mono frames. Each completed frame is resampled to the target rate if
// needed, converted to 16-bit PCM, paired with any buffered echo frame, and
// handed to encodeAudioFrame().
void AudioInput::addMic(const void *data, unsigned int nsamp) {
	while (nsamp > 0) {
		// Make sure we don't overrun the frame buffer
		const unsigned int left = qMin(nsamp, iMicLength - iMicFilled);

		// Append mix into pfMicInput frame buffer (converts 16bit pcm->float if necessary)
		imfMic(pfMicInput + iMicFilled, data, left, iMicChannels, uiMicChannelMask);

		iMicFilled += left;
		nsamp -= left;

		// If new samples are left offset data pointer to point at the first one for next iteration
		if (nsamp > 0) {
			if (eMicFormat == SampleFloat)
				data = reinterpret_cast<const float *>(data) + left * iMicChannels;
			else
				data = reinterpret_cast<const short *>(data) + left * iMicChannels;
		}

		if (iMicFilled == iMicLength) {
			// Frame complete
			iMicFilled = 0;

			// If needed resample frame
			float *ptr = srsMic ? pfOutput : pfMicInput;

			if (srsMic) {
				spx_uint32_t inlen = iMicLength;
				spx_uint32_t outlen = iFrameSize;
				speex_resampler_process_float(srsMic, 0, pfMicInput, &inlen, pfOutput, &outlen);
			}

			// Convert float to 16bit PCM
			const float mul = 32768.f;
			for (int j = 0; j < iFrameSize; ++j)
				psMic[j] = static_cast<short>(qBound(-32768.f, (ptr[j] * mul), 32767.f));

			// If we have echo chancellation enabled...
			if (iEchoChannels > 0) {
				short *echo = NULL;

				{
					QMutexLocker l(&qmEcho);

					if (qlEchoFrames.isEmpty()) {
						// No echo buffered; restart jitter tracking.
						iJitterSeq = 0;
						iMinBuffered = 1000;
					} else {
						// Compensate for drift between the microphone and the echo source
						iMinBuffered = qMin(iMinBuffered, qlEchoFrames.count());

						// If the queue has been persistently over-full, drop
						// one frame to pull the echo stream back into sync.
						if ((iJitterSeq > 100) && (iMinBuffered > 1)) {
							iJitterSeq = 0;
							iMinBuffered = 1000;
							delete [] qlEchoFrames.takeFirst();
						}
						echo = qlEchoFrames.takeFirst();
					}
				}

				if (echo) {
					// We have echo data for the current frame, remember that
					delete [] psSpeaker;
					psSpeaker = echo;
				}
			}

			// Encode and send frame
			encodeAudioFrame();
		}
	}
}
498
// Accumulate speaker (echo-source) samples into frames for the echo
// canceller. In multi-channel mode channels are kept interleaved; otherwise
// they are mixed down to mono. Completed frames are resampled, converted to
// 16-bit PCM and queued on qlEchoFrames for addMic() to consume.
void AudioInput::addEcho(const void *data, unsigned int nsamp) {
	while (nsamp > 0) {
		// Make sure we don't overrun the echo frame buffer
		const unsigned int left = qMin(nsamp, iEchoLength - iEchoFilled);

		if (bEchoMulti) {
			const unsigned int samples = left * iEchoChannels;

			if (eEchoFormat == SampleFloat) {
				for (unsigned int i=0;i<samples;++i)
					pfEchoInput[i] = reinterpret_cast<const float *>(data)[i];
			}
			else {
				// 16bit PCM -> float
				for (unsigned int i=0;i<samples;++i)
					pfEchoInput[i] = static_cast<float>(reinterpret_cast<const short *>(data)[i]) * (1.0f / 32768.f);
			}
		} else {
			// Mix echo channels (converts 16bit PCM -> float if needed)
			imfEcho(pfEchoInput + iEchoFilled, data, left, iEchoChannels, uiEchoChannelMask);
		}

		iEchoFilled += left;
		nsamp -= left;

		// If new samples are left offset data pointer to point at the first one for next iteration
		if (nsamp > 0) {
			if (eEchoFormat == SampleFloat)
				data = reinterpret_cast<const float *>(data) + left * iEchoChannels;
			else
				data = reinterpret_cast<const short *>(data) + left * iEchoChannels;
		}

		if (iEchoFilled == iEchoLength) {
			//Frame complete

			iEchoFilled = 0;

			// Resample if necessary
			float *ptr = srsEcho ? pfOutput : pfEchoInput;

			if (srsEcho) {
				spx_uint32_t inlen = iEchoLength;
				spx_uint32_t outlen = iFrameSize;
				speex_resampler_process_interleaved_float(srsEcho, pfEchoInput, &inlen, pfOutput, &outlen);
			}

			// Ownership passes to qlEchoFrames; freed by addMic() or the destructor.
			short *outbuff = new short[iEchoFrameSize];

			// float -> 16bit PCM
			const float mul = 32768.f;
			for (unsigned int j=0;j<iEchoFrameSize;++j)
				outbuff[j] = static_cast<short>(ptr[j] * mul);

			// Push frame into the echo chancellers jitter buffer
			QMutexLocker l(&qmEcho);

			iJitterSeq = qMin(iJitterSeq + 1,10000U);
			qlEchoFrames.append(outbuff);
		}
	}
}
561
adjustBandwidth(int bitspersec,int & bitrate,int & frames)562 void AudioInput::adjustBandwidth(int bitspersec, int &bitrate, int &frames) {
563 frames = g.s.iFramesPerPacket;
564 bitrate = g.s.iQuality;
565
566 if (bitspersec == -1) {
567 // No limit
568 } else {
569 if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
570 if ((frames <= 4) && (bitspersec <= 32000))
571 frames = 4;
572 else if ((frames == 1) && (bitspersec <= 64000))
573 frames = 2;
574 else if ((frames == 2) && (bitspersec <= 48000))
575 frames = 4;
576 if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
577 do {
578 bitrate -= 1000;
579 } while ((bitrate > 8000) && (getNetworkBandwidth(bitrate, frames) > bitspersec));
580 }
581 }
582 }
583 if (bitrate <= 8000)
584 bitrate = 8000;
585 }
586
// Apply a new server-imposed bandwidth cap: recompute quality and frame
// count, tell the user if the settings had to be reduced, and either update
// the running AudioInput in place or restart audio input.
void AudioInput::setMaxBandwidth(int bitspersec) {
	if (bitspersec == g.iMaxBandwidth)
		return;

	int frames;
	int bitrate;
	adjustBandwidth(bitspersec, bitrate, frames);

	g.iMaxBandwidth = bitspersec;

	if (bitspersec != -1) {
		// Only notify if the cap actually forced a change from the settings.
		if ((bitrate != g.s.iQuality) || (frames != g.s.iFramesPerPacket))
			g.mw->msgBox(tr("Server maximum network bandwidth is only %1 kbit/s. Audio quality auto-adjusted to %2 kbit/s (%3 ms)").arg(bitspersec / 1000).arg(bitrate / 1000).arg(frames*10));
	}

	AudioInputPtr ai = g.ai;
	if (ai) {
		// Input is running: push the new values in directly, no restart needed.
		g.iAudioBandwidth = getNetworkBandwidth(bitrate, frames);
		ai->iAudioQuality = bitrate;
		ai->iAudioFrames = frames;
		return;
	}

	ai.reset();

	// No running input; restart so the new settings take effect.
	Audio::stopInput();
	Audio::startInput();
}
615
getNetworkBandwidth(int bitrate,int frames)616 int AudioInput::getNetworkBandwidth(int bitrate, int frames) {
617 int overhead = 20 + 8 + 4 + 1 + 2 + (g.s.bTransmitPosition ? 12 : 0) + (NetworkConfig::TcpModeEnabled() ? 12 : 0) + frames;
618 overhead *= (800 / frames);
619 int bw = overhead + bitrate;
620
621 return bw;
622 }
623
// (Re)build the Speex preprocessor and echo-canceller state. Runs only when
// bResetProcessor is set (constructor, mixer re-initialization). Caller holds
// qmSpeex (see encodeAudioFrame).
void AudioInput::resetAudioProcessor() {
	if (!bResetProcessor)
		return;

	int iArg;

	if (sppPreprocess)
		speex_preprocess_state_destroy(sppPreprocess);
	if (sesEcho)
		speex_echo_state_destroy(sesEcho);

	sppPreprocess = speex_preprocess_state_init(iFrameSize, iSampleRate);

	// Enable voice activity detection, automatic gain control, denoising
	// and dereverb in the preprocessor.
	iArg = 1;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_VAD, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DENOISE, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DEREVERB, &iArg);

	iArg = 30000;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_TARGET, &iArg);

	// Maximum AGC gain (in dB) derived from the configured minimum loudness.
	float v = 30000.0f / static_cast<float>(g.s.iMinLoudness);
	iArg = iroundf(floorf(20.0f * log10f(v)));
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_MAX_GAIN, &iArg);

	iArg = -60;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_DECREMENT, &iArg);

	iArg = g.s.iNoiseSuppress;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

	if (iEchoChannels > 0) {
		// Echo tail of 10 frames (100 ms at the 10 ms frame size).
		sesEcho = speex_echo_state_init_mc(iFrameSize, iFrameSize * 10, 1, bEchoMulti ? iEchoChannels : 1);
		iArg = iSampleRate;
		speex_echo_ctl(sesEcho, SPEEX_ECHO_SET_SAMPLING_RATE, &iArg);
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_ECHO_STATE, sesEcho);

		qWarning("AudioInput: ECHO CANCELLER ACTIVE");
	} else {
		sesEcho = NULL;
	}

	// New preprocessor state means the encoder should also start fresh.
	bResetEncoder = true;

	bResetProcessor = false;
}
671
// Pick the codec (Opus vs. CELT alpha/beta) for the next packet, based on
// server capabilities and current transmission state. While actively talking
// the codec is never switched mid-stream. Returns false if no usable codec
// is available. Clears buffered frames when the codec type changes.
bool AudioInput::selectCodec() {
	bool useOpus = false;

	// Currently talking, use previous Opus status.
	if (bPreviousVoice) {
		useOpus = (umtType == MessageHandler::UDPVoiceOpus);
	} else {
#ifdef USE_OPUS
		if (g.bOpus || (g.s.lmLoopMode == Settings::Local)) {
			useOpus = true;
		}
#endif
	}

	if (!useOpus) {
		CELTCodec *switchto = NULL;
		if ((!g.uiSession || (g.s.lmLoopMode == Settings::Local)) && (!g.qmCodecs.isEmpty())) {
			// Use latest for local loopback
			QMap<int, CELTCodec *>::const_iterator i = g.qmCodecs.constEnd();
			--i;
			switchto = i.value();
		} else {
			// Currently talking, don't switch unless you must.
			if (cCodec && bPreviousVoice) {
				int v = cCodec->bitstreamVersion();
				if ((v == g.iCodecAlpha) || (v == g.iCodecBeta))
					switchto = cCodec;
			}
		}
		if (!switchto) {
			// Pick the server-negotiated version, preferred flavor first.
			switchto = g.qmCodecs.value(g.bPreferAlpha ? g.iCodecAlpha : g.iCodecBeta);
			if (!switchto)
				switchto = g.qmCodecs.value(g.bPreferAlpha ? g.iCodecBeta : g.iCodecAlpha);
		}
		if (switchto != cCodec) {
			// Codec change: tear down the old encoder and build a new one.
			if (cCodec && ceEncoder) {
				cCodec->celt_encoder_destroy(ceEncoder);
				ceEncoder = NULL;
			}
			cCodec = switchto;
			if (cCodec)
				ceEncoder = cCodec->encoderCreate();
		}

		if (!cCodec)
			return false;
	}

	MessageHandler::UDPMessageType previousType = umtType;
	if (useOpus) {
		umtType = MessageHandler::UDPVoiceOpus;
	} else {
		if (!g.uiSession) {
			umtType = MessageHandler::UDPVoiceCELTAlpha;
		} else {
			int v = cCodec->bitstreamVersion();
			if (v == g.iCodecAlpha)
				umtType = MessageHandler::UDPVoiceCELTAlpha;
			else if (v == g.iCodecBeta)
				umtType = MessageHandler::UDPVoiceCELTBeta;
			else {
				qWarning() << "Couldn't find message type for codec version" << v;
			}
		}
	}

	if (umtType != previousType) {
		// Frames buffered for the old codec cannot be sent under the new one.
		iBufferedFrames = 0;
		qlFrames.clear();
		opusBuffer.clear();
	}

	return true;
}
746
encodeOpusFrame(short * source,int size,EncodingOutputBuffer & buffer)747 int AudioInput::encodeOpusFrame(short *source, int size, EncodingOutputBuffer& buffer) {
748 int len;
749 #ifdef USE_OPUS
750 if (!oCodec) {
751 return 0;
752 }
753
754 if (bResetEncoder) {
755 oCodec->opus_encoder_ctl(opusState, OPUS_RESET_STATE, NULL);
756 bResetEncoder = false;
757 }
758
759 oCodec->opus_encoder_ctl(opusState, OPUS_SET_BITRATE(iAudioQuality));
760
761 len = oCodec->opus_encode(opusState, source, size, &buffer[0], static_cast<opus_int32>(buffer.size()));
762 const int tenMsFrameCount = (size / iFrameSize);
763 iBitrate = (len * 100 * 8) / tenMsFrameCount;
764 #endif
765 return len;
766 }
767
encodeCELTFrame(short * psSource,EncodingOutputBuffer & buffer)768 int AudioInput::encodeCELTFrame(short *psSource, EncodingOutputBuffer& buffer) {
769 int len;
770 if (!cCodec)
771 return 0;
772
773 if (bResetEncoder) {
774 cCodec->celt_encoder_ctl(ceEncoder, CELT_RESET_STATE);
775 bResetEncoder = false;
776 }
777
778 cCodec->celt_encoder_ctl(ceEncoder, CELT_SET_PREDICTION(0));
779
780 cCodec->celt_encoder_ctl(ceEncoder, CELT_SET_VBR_RATE(iAudioQuality));
781 len = cCodec->encode(ceEncoder, psSource, &buffer[0], qMin<int>(iAudioQuality / (8 * 100), static_cast<int>(buffer.size())));
782 iBitrate = len * 100 * 8;
783
784 return len;
785 }
786
// Process one completed 10 ms microphone frame: denoise, echo-cancel and
// preprocess it, run voice-activity detection, drive talking state and idle
// actions, then encode and (via flushCheck) transmit the frame.
void AudioInput::encodeAudioFrame() {
	int iArg;
	int i;
	float sum;
	short max;

	short *psSource;

	iFrameCounter++;

	// As g.iTarget is not protected by any locks, we avoid race-conditions by
	// copying it once at this point and stick to whatever value it is here. Thus
	// if the value of g.iTarget changes during the execution of this function,
	// it won't cause any inconsistencies and the change is reflected once this
	// function is called again.
	int voiceTargetID = g.iTarget;

	if (! bRunning)
		return;

	// Raw microphone peak level (dB relative to full scale, floored at -96).
	sum=1.0f;
	max = 1;
	for (i=0;i<iFrameSize;i++) {
		sum += static_cast<float>(psMic[i] * psMic[i]);
		max = std::max(static_cast<short>(abs(psMic[i])), max);
	}
	dPeakMic = qMax(20.0f*log10f(sqrtf(sum / static_cast<float>(iFrameSize)) / 32768.0f), -96.0f);
	dMaxMic = max;

	// Speaker (echo source) peak level, when echo cancellation is active.
	if (psSpeaker && (iEchoChannels > 0)) {
		sum=1.0f;
		for (i=0;i<iFrameSize;i++)
			sum += static_cast<float>(psSpeaker[i] * psSpeaker[i]);
		dPeakSpeaker = qMax(20.0f*log10f(sqrtf(sum / static_cast<float>(iFrameSize)) / 32768.0f), -96.0f);
	} else {
		dPeakSpeaker = 0.0;
	}

	QMutexLocker l(&qmSpeex);
	resetAudioProcessor();

#ifdef USE_RNNOISE
	// At the time of writing this code, RNNoise only supports a sample rate of 48000 Hz.
	if (g.s.bDenoise && denoiseState && (iFrameSize == 480)) {
		float denoiseFrames[480];
		for (int i = 0; i < 480; i++) {
			denoiseFrames[i] = psMic[i];
		}

		rnnoise_process_frame(denoiseState, denoiseFrames, denoiseFrames);

		for (int i = 0; i < 480; i++) {
			psMic[i] = denoiseFrames[i];
		}
	}
#endif

	// Read the current AGC gain, and hand the preprocessor a noise-suppress
	// value adjusted by it.
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_AGC_GAIN, &iArg);
	float gainValue = static_cast<float>(iArg);
	iArg = g.s.iNoiseSuppress - iArg;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

	// Run echo cancellation (when possible) and the Speex preprocessor;
	// psSource points at whichever buffer holds the processed samples.
	if (sesEcho && psSpeaker) {
		speex_echo_cancellation(sesEcho, psMic, psSpeaker, psClean);
		speex_preprocess_run(sppPreprocess, psClean);
		psSource = psClean;
	} else {
		speex_preprocess_run(sppPreprocess, psMic);
		psSource = psMic;
	}

	// Peak level of the processed signal.
	sum=1.0f;
	for (i=0;i<iFrameSize;i++)
		sum += static_cast<float>(psSource[i] * psSource[i]);
	float micLevel = sqrtf(sum / static_cast<float>(iFrameSize));
	dPeakSignal = qMax(20.0f*log10f(micLevel / 32768.0f), -96.0f);

	spx_int32_t prob = 0;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_PROB, &prob);
	fSpeechProb = static_cast<float>(prob) / 100.0f;

	// clean microphone level: peak of filtered signal attenuated by AGC gain
	dPeakCleanMic = qMax(dPeakSignal - gainValue, -96.0f);
	// VAD input: speech probability, or normalized amplitude, per settings.
	float level = (g.s.vsVAD == Settings::SignalToNoise) ? fSpeechProb : (1.0f + dPeakCleanMic / 96.0f);

	bool bIsSpeech = false;

	if (level > g.s.fVADmax) {
		// Voice-activation threshold has been reached
		bIsSpeech = true;
	} else if (level > g.s.fVADmin && bPreviousVoice) {
		// Voice-deactivation threshold has not yet been reached
		bIsSpeech = true;
	}

	// Hold transmission open for a configurable number of frames after
	// speech stops, to avoid chopping off trailing syllables.
	if (! bIsSpeech) {
		iHoldFrames++;
		if (iHoldFrames < g.s.iVoiceHold)
			bIsSpeech = true;
	} else {
		iHoldFrames = 0;
	}

	if (g.s.atTransmit == Settings::Continuous) {
		// Continous transmission is enabled
		bIsSpeech = true;
	} else if (g.s.atTransmit == Settings::PushToTalk) {
		// PTT is enabled, so check if it is currently active
		bIsSpeech = g.s.uiDoublePush && ((g.uiDoublePush < g.s.uiDoublePush) || (g.tDoublePush.elapsed() < g.s.uiDoublePush));
	}

	// If g.iPushToTalk > 0 that means that we are currently in some sort of PTT action. For
	// instance this could mean we're currently whispering
	bIsSpeech = bIsSpeech || (g.iPushToTalk > 0);

	// Mute, suppression, push-to-mute or an invalid voice target all veto
	// transmission regardless of what the VAD decided.
	ClientUser *p = ClientUser::get(g.uiSession);
	if (g.s.bMute || ((g.s.lmLoopMode != Settings::Local) && p && (p->bMute || p->bSuppress)) || g.bPushToMute || (voiceTargetID < 0)) {
		bIsSpeech = false;
	}

	if (bIsSpeech) {
		iSilentFrames = 0;
	} else {
		iSilentFrames++;
		// After ~5 s of silence, restart the sequence numbering.
		if (iSilentFrames > 500)
			iFrameCounter = 0;
	}

	// Reflect our talking state in the UI.
	if (p) {
		if (! bIsSpeech)
			p->setTalking(Settings::Passive);
		else if (voiceTargetID == 0)
			p->setTalking(Settings::Talking);
		else
			p->setTalking(Settings::Shouting);
	}

	// Play the transmit on/off cue sounds at speech edges.
	if (g.s.bTxAudioCue && g.uiSession != 0) {
		AudioOutputPtr ao = g.ao;
		if (bIsSpeech && ! bPreviousVoice && ao)
			ao->playSample(g.s.qsTxAudioCueOn);
		else if (ao && !bIsSpeech && bPreviousVoice)
			ao->playSample(g.s.qsTxAudioCueOff);
	}

	// Fully idle (not speaking now, wasn't speaking last frame): handle idle
	// actions and skip encoding entirely.
	if (! bIsSpeech && ! bPreviousVoice) {
		iBitrate = 0;

		if ((tIdle.elapsed() / 1000000ULL) > g.s.iIdleTime) {
			activityState = ActivityStateIdle;
			tIdle.restart();
			if (g.s.iaeIdleAction == Settings::Deafen && !g.s.bDeaf) {
				emit doDeaf();
			} else if (g.s.iaeIdleAction == Settings::Mute && !g.s.bMute) {
				emit doMute();
			}
		}

		if (activityState == ActivityStateReturnedFromIdle) {
			activityState = ActivityStateActive;
			if (g.s.iaeIdleAction != Settings::Nothing && g.s.bUndoIdleActionUponActivity) {
				if (g.s.iaeIdleAction == Settings::Deafen && g.s.bDeaf) {
					emit doDeaf();
				} else if (g.s.iaeIdleAction == Settings::Mute && g.s.bMute) {
					emit doMute();
				}
			}
		}

		// Freeze the AGC while silent so noise does not pump the gain up.
		spx_int32_t increment = 0;
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
		return;
	} else {
		spx_int32_t increment = 12;
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
	}

	// Speech just started: give the encoder a clean slate.
	if (bIsSpeech && !bPreviousVoice) {
		bResetEncoder = true;
	}

	tIdle.restart();

	EncodingOutputBuffer buffer;
	Q_ASSERT(buffer.size() >= static_cast<size_t>(iAudioQuality / 100 * iAudioFrames / 8));

	int len = 0;

	bool encoded = true;
	if (!selectCodec())
		return;

	if (umtType == MessageHandler::UDPVoiceCELTAlpha || umtType == MessageHandler::UDPVoiceCELTBeta) {
		len = encodeCELTFrame(psSource, buffer);
		if (len <= 0) {
			iBitrate = 0;
			qWarning() << "encodeCELTFrame failed" << iBufferedFrames << iFrameSize << len;
			return;
		}
		++iBufferedFrames;
	} else if (umtType == MessageHandler::UDPVoiceOpus) {
		// Opus encodes a whole packet's worth of frames at once; buffer the
		// samples until we have enough (or speech ends).
		encoded = false;
		opusBuffer.insert(opusBuffer.end(), psSource, psSource + iFrameSize);
		++iBufferedFrames;

		if (!bIsSpeech || iBufferedFrames >= iAudioFrames) {
			if (iBufferedFrames < iAudioFrames) {
				// Stuff frame to framesize if speech ends and we don't have enough audio
				// this way we are guaranteed to have a valid framecount and won't cause
				// a codec configuration switch by suddenly using a wildly different
				// framecount per packet.
				const int missingFrames = iAudioFrames - iBufferedFrames;
				opusBuffer.insert(opusBuffer.end(), iFrameSize * missingFrames, 0);
				iBufferedFrames += missingFrames;
				iFrameCounter += missingFrames;
			}

			Q_ASSERT(iBufferedFrames == iAudioFrames);

			len = encodeOpusFrame(&opusBuffer[0], iBufferedFrames * iFrameSize, buffer);
			opusBuffer.clear();
			if (len <= 0) {
				iBitrate = 0;
				qWarning() << "encodeOpusFrame failed" << iBufferedFrames << iFrameSize << len;
				iBufferedFrames = 0; // These are lost. Make sure not to mess up our sequence counter next flushCheck.
				return;
			}
			encoded = true;
		}
	}

	if (encoded) {
		flushCheck(QByteArray(reinterpret_cast<char *>(&buffer[0]), len), !bIsSpeech, voiceTargetID);
	}

	if (! bIsSpeech)
		iBitrate = 0;

	bPreviousVoice = bIsSpeech;
}
1027
sendAudioFrame(const char * data,PacketDataStream & pds)1028 static void sendAudioFrame(const char *data, PacketDataStream &pds) {
1029 ServerHandlerPtr sh = g.sh;
1030 if (sh) {
1031 VoiceRecorderPtr recorder(sh->recorder);
1032 if (recorder)
1033 recorder->getRecordUser().addFrame(QByteArray(data, pds.size() + 1));
1034 }
1035
1036 if (g.s.lmLoopMode == Settings::Local)
1037 LoopUser::lpLoopy.addFrame(QByteArray(data, pds.size() + 1));
1038 else if (sh)
1039 sh->sendMessage(data, pds.size() + 1);
1040 }
1041
// Queue an encoded frame and, once a full packet's worth is buffered (or
// `terminator` marks the end of speech), build the UDP voice packet — flags
// byte, varint sequence number, codec payload, optional position — and send
// it via sendAudioFrame().
void AudioInput::flushCheck(const QByteArray &frame, bool terminator, int voiceTargetID) {
	qlFrames << frame;

	if (! terminator && iBufferedFrames < iAudioFrames)
		return;

	// Lower 5 bits of the flags byte carry the voice target.
	int flags = 0;
	if (voiceTargetID > 0) {
		flags = voiceTargetID;
	}
	if (terminator && g.iPrevTarget > 0) {
		// If we have been whispering to some target but have just ended, terminator will be true. However
		// in the case of whispering this means that we just released the whisper key so this here is the
		// last audio frame that is sent for whispering. The whisper key being released means that g.iTarget
		// is reset to 0 by now. In order to send the last whisper frame correctly, we have to use
		// g.iPrevTarget which is set to whatever g.iTarget has been before its last change.

		flags = g.iPrevTarget;

		// We reset g.iPrevTarget as it has fulfilled its purpose for this whisper-action. It'll be set
		// accordingly once the client whispers for the next time.
		g.iPrevTarget = 0;
	}

	if (g.s.lmLoopMode == Settings::Server)
		flags = 0x1f; // Server loopback

	// Upper 3 bits carry the codec / message type.
	flags |= (umtType << 5);

	char data[1024];
	data[0] = static_cast<unsigned char>(flags);

	int frames = iBufferedFrames;
	iBufferedFrames = 0;

	PacketDataStream pds(data + 1, 1023);
	// Sequence number
	pds << iFrameCounter - frames;

	if (umtType == MessageHandler::UDPVoiceOpus) {
		// Opus: a single length-prefixed payload; bit 13 of the length marks
		// the end-of-transmission terminator.
		const QByteArray &qba = qlFrames.takeFirst();
		int size = qba.size();
		if (terminator)
			size |= 1 << 13;
		pds << size;
		pds.append(qba.constData(), qba.size());
	} else {
		// CELT: one length byte per frame; the continuation bit (0x80) is set
		// on all but the last frame. A terminator is an empty trailing frame.
		if (terminator) {
			qlFrames << QByteArray();
			++frames;
		}

		for (int i = 0; i < frames; ++i) {
			const QByteArray &qba = qlFrames.takeFirst();
			unsigned char head = static_cast<unsigned char>(qba.size());
			if (i < frames - 1)
				head |= 0x80;
			pds.append(head);
			pds.append(qba.constData(), qba.size());
		}
	}

	// Optional positional audio data.
	if (g.s.bTransmitPosition && g.p && ! g.bCenterPosition && g.p->fetch()) {
		pds << g.p->fPosition[0];
		pds << g.p->fPosition[1];
		pds << g.p->fPosition[2];
	}

	sendAudioFrame(data, pds);

	Q_ASSERT(qlFrames.isEmpty());
}
1114
isAlive() const1115 bool AudioInput::isAlive() const {
1116 return isRunning();
1117 }
1118