1 // Copyright 2005-2019 The Mumble Developers. All rights reserved.
2 // Use of this source code is governed by a BSD-style license
3 // that can be found in the LICENSE file at the root of the
4 // Mumble source tree or at <https://www.mumble.info/LICENSE>.
5 
6 #include "mumble_pch.hpp"
7 
8 #include "AudioInput.h"
9 
10 #include "AudioOutput.h"
11 #include "CELTCodec.h"
12 #include "OpusCodec.h"
13 #include "ServerHandler.h"
14 #include "MainWindow.h"
15 #include "User.h"
16 #include "PacketDataStream.h"
17 #include "Plugins.h"
18 #include "Message.h"
19 #include "Global.h"
20 #include "NetworkConfig.h"
21 #include "VoiceRecorder.h"
22 
23 #ifdef USE_RNNOISE
24 extern "C" {
25 #include "rnnoise.h"
26 }
27 #endif
28 
// Remember that we cannot use static member classes that are not pointers, as the constructor
// for AudioInputRegistrar() might be called before they are initialized, as the constructor
// is called from global initialization.
// Hence, we allocate upon first call.

// Registry of available input backends, keyed by backend name; allocated lazily.
QMap<QString, AudioInputRegistrar *> *AudioInputRegistrar::qmNew;
// Name of the backend currently in use (set by newFromChoice()).
QString AudioInputRegistrar::current = QString();
36 
AudioInputRegistrar(const QString & n,int p)37 AudioInputRegistrar::AudioInputRegistrar(const QString &n, int p) : name(n), priority(p) {
38 	if (! qmNew)
39 		qmNew = new QMap<QString, AudioInputRegistrar *>();
40 	qmNew->insert(name,this);
41 }
42 
// Removes this backend from the registry. The map is assumed to exist,
// which holds because the constructor always allocates it.
AudioInputRegistrar::~AudioInputRegistrar() {
	qmNew->remove(name);
}
46 
newFromChoice(QString choice)47 AudioInputPtr AudioInputRegistrar::newFromChoice(QString choice) {
48 	if (! qmNew)
49 		return AudioInputPtr();
50 
51 	if (!choice.isEmpty() && qmNew->contains(choice)) {
52 		g.s.qsAudioInput = choice;
53 		current = choice;
54 		return AudioInputPtr(qmNew->value(current)->create());
55 	}
56 	choice = g.s.qsAudioInput;
57 	if (qmNew->contains(choice)) {
58 		current = choice;
59 		return AudioInputPtr(qmNew->value(choice)->create());
60 	}
61 
62 	AudioInputRegistrar *r = NULL;
63 	foreach(AudioInputRegistrar *air, *qmNew)
64 		if (!r || (air->priority > r->priority))
65 			r = air;
66 	if (r) {
67 		current = r->name;
68 		return AudioInputPtr(r->create());
69 	}
70 	return AudioInputPtr();
71 }
72 
// Whether this backend can open the capture device in exclusive mode.
// Defaults to no; backends that support it override this.
bool AudioInputRegistrar::canExclusive() const {
	return false;
}
76 
// Constructs the audio input worker: sizes the Opus staging buffer for the
// configured frames-per-packet, applies the current bandwidth limit, creates
// the encoder/denoiser state and zero-initializes all processing state.
// The actual capture device is handled by the platform-specific subclass.
AudioInput::AudioInput() : opusBuffer(g.s.iFramesPerPacket * (SAMPLE_RATE / 100)) {
	// Derive quality (bit/s) and frames-per-packet from the server bandwidth cap.
	adjustBandwidth(g.iMaxBandwidth, iAudioQuality, iAudioFrames);

	g.iAudioBandwidth = getNetworkBandwidth(iAudioQuality, iAudioFrames);

	// Default message type; selectCodec() revises this before each packet.
	umtType = MessageHandler::UDPVoiceCELTAlpha;

	activityState = ActivityStateActive;
	oCodec = NULL;
	opusState = NULL;
	cCodec = NULL;
	ceEncoder = NULL;

	// Internal processing always runs at SAMPLE_RATE with 10 ms frames.
	iSampleRate = SAMPLE_RATE;
	iFrameSize = SAMPLE_RATE / 100;

#ifdef USE_OPUS
	oCodec = g.oCodec;
	if (oCodec) {
		// Application type trades speech clarity (VOIP) against music fidelity.
		if (!g.s.bUseOpusMusicEncoding) {
			opusState = oCodec->opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP, NULL);
			qWarning("AudioInput: Opus encoder set for VOIP");
		} else {
			opusState = oCodec->opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_AUDIO, NULL);
			qWarning("AudioInput: Opus encoder set for Music");
		}

		oCodec->opus_encoder_ctl(opusState, OPUS_SET_VBR(0)); // CBR
	}
#endif

#ifdef USE_RNNOISE
	denoiseState = rnnoise_create();
#endif

	qWarning("AudioInput: %d bits/s, %d hz, %d sample", iAudioQuality, iSampleRate, iFrameSize);
	// Until initializeMixer() learns the device rates, assume no resampling.
	iEchoFreq = iMicFreq = iSampleRate;

	iFrameCounter = 0;
	iSilentFrames = 0;
	iHoldFrames = 0;
	iBufferedFrames = 0;

	bResetProcessor = true;

	bEchoMulti = false;

	sppPreprocess = NULL;
	sesEcho = NULL;
	srsMic = srsEcho = NULL;
	iJitterSeq = 0;
	iMinBuffered = 1000;

	// 10 ms PCM staging buffers: raw mic and echo-cancelled mic.
	psMic = new short[iFrameSize];
	psClean = new short[iFrameSize];

	psSpeaker = NULL;

	iEchoChannels = iMicChannels = 0;
	iEchoFilled = iMicFilled = 0;
	eMicFormat = eEchoFormat = SampleFloat;
	iMicSampleSize = iEchoSampleSize = 0;

	bPreviousVoice = false;

	bResetEncoder = true;

	pfMicInput = pfEchoInput = pfOutput = NULL;

	iBitrate = 0;
	dPeakSignal = dPeakSpeaker = dPeakMic = dPeakCleanMic = 0.0;

	if (g.uiSession) {
		setMaxBandwidth(g.iMaxBandwidth);
	}

	bRunning = true;

	// Queued connections so the deafen/mute menu actions are triggered
	// asynchronously on their own thread rather than from the audio thread.
	connect(this, SIGNAL(doDeaf()), g.mw->qaAudioDeaf, SLOT(trigger()), Qt::QueuedConnection);
	connect(this, SIGNAL(doMute()), g.mw->qaAudioMute, SLOT(trigger()), Qt::QueuedConnection);
}
158 
// Stops the capture thread and releases every resource acquired by the
// constructor, initializeMixer() and resetAudioProcessor().
AudioInput::~AudioInput() {
	// Signal the worker loop to exit and wait for the thread to finish
	// before tearing down state it may still be using.
	bRunning = false;
	wait();

#ifdef USE_OPUS
	if (opusState) {
		oCodec->opus_encoder_destroy(opusState);
	}
#endif

#ifdef USE_RNNOISE
	if (denoiseState) {
		rnnoise_destroy(denoiseState);
	}
#endif

	if (ceEncoder) {
		cCodec->celt_encoder_destroy(ceEncoder);
	}

	// Free any echo frames still queued for the echo canceller.
	foreach(short *buf, qlEchoFrames)
		delete [] buf;

	if (sppPreprocess)
		speex_preprocess_state_destroy(sppPreprocess);
	if (sesEcho)
		speex_echo_state_destroy(sesEcho);

	if (srsMic)
		speex_resampler_destroy(srsMic);
	if (srsEcho)
		speex_resampler_destroy(srsEcho);

	delete [] psMic;
	delete [] psClean;
	delete [] psSpeaker;

	delete [] pfMicInput;
	delete [] pfEchoInput;
	delete [] pfOutput;
}
200 
isTransmitting() const201 bool AudioInput::isTransmitting() const {
202 	return bPreviousVoice;
203 };
204 
// Generates a static mixer function that downmixes an interleaved float
// buffer with a fixed, compile-time channel count to mono by averaging the
// channels. The N and mask parameters are unused by these fixed variants;
// they exist so all mixers share the inMixerFunc signature.
#define IN_MIXER_FLOAT(channels) \
static void inMixerFloat##channels ( float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
  const float * RESTRICT input = reinterpret_cast<const float *>(ipt); \
  const float m = 1.0f / static_cast<float>(channels); \
  Q_UNUSED(N); \
  Q_UNUSED(mask); \
  for(unsigned int i=0;i<nsamp;++i) {\
	  float v= 0.0f; \
	  for(unsigned int j=0;j<channels;++j) \
	  	v += input[i*channels+j]; \
	  buffer[i] = v * m; \
  } \
}
218 
// Same as IN_MIXER_FLOAT, but for interleaved 16 bit PCM input: the divisor
// includes 32768 so the averaged output is normalized float in [-1, 1].
#define IN_MIXER_SHORT(channels) \
static void inMixerShort##channels ( float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
  const short * RESTRICT input = reinterpret_cast<const short *>(ipt); \
  const float m = 1.0f / (32768.f * static_cast<float>(channels)); \
  Q_UNUSED(N); \
  Q_UNUSED(mask); \
  for(unsigned int i=0;i<nsamp;++i) {\
	  float v= 0.0f; \
	  for(unsigned int j=0;j<channels;++j) \
	  	v += static_cast<float>(input[i*channels+j]); \
	  buffer[i] = v * m; \
  } \
}
232 
inMixerFloatMask(float * RESTRICT buffer,const void * RESTRICT ipt,unsigned int nsamp,unsigned int N,quint64 mask)233 static void inMixerFloatMask(float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) { \
234 	const float * RESTRICT input = reinterpret_cast<const float *>(ipt);
235 
236 	unsigned int chancount = 0;
237 	STACKVAR(unsigned int, chanindex, N);
238 	for (unsigned int j = 0; j < N; ++j) {
239 		if ((mask & (1ULL << j)) == 0) {
240 			continue;
241 		}
242 		chanindex[chancount] = j; // Use chancount as index into chanindex.
243 		++chancount;
244 	}
245 
246 	const float m = 1.0f / static_cast<float>(chancount);
247 	for(unsigned int i = 0; i < nsamp; ++i) {
248 		float v = 0.0f;
249 		for(unsigned int j = 0; j < chancount; ++j) {
250 			v += input[i * N + chanindex[j]];
251 		}
252 		buffer[i] = v * m;
253 	}
254 }
255 
inMixerShortMask(float * RESTRICT buffer,const void * RESTRICT ipt,unsigned int nsamp,unsigned int N,quint64 mask)256 static void inMixerShortMask(float * RESTRICT buffer, const void * RESTRICT ipt, unsigned int nsamp, unsigned int N, quint64 mask) {
257 	const short * RESTRICT input = reinterpret_cast<const short *>(ipt);
258 
259 	unsigned int chancount = 0;
260 	STACKVAR(unsigned int, chanindex, N);
261 	for (unsigned int j = 0; j < N; ++j) {
262 		if ((mask & (1ULL << j)) == 0) {
263 			continue;
264 		}
265 		chanindex[chancount] = j; // Use chancount as index into chanindex.
266 		++chancount;
267 	}
268 
269 	const float m = 1.0f / static_cast<float>(chancount);
270 	for(unsigned int i = 0; i < nsamp; ++i) {
271 		float v = 0.0f;
272 		for(unsigned int j = 0; j < chancount; ++j) {
273 			v += static_cast<float>(input[i * N + chanindex[j]]);
274 		}
275 		buffer[i] = v * m;
276 	}
277 }
278 
// Instantiate the fixed-channel-count mixer variants (1 to 8 channels) plus
// the generic N-channel fallback used for any other channel count.
IN_MIXER_FLOAT(1)
IN_MIXER_FLOAT(2)
IN_MIXER_FLOAT(3)
IN_MIXER_FLOAT(4)
IN_MIXER_FLOAT(5)
IN_MIXER_FLOAT(6)
IN_MIXER_FLOAT(7)
IN_MIXER_FLOAT(8)
IN_MIXER_FLOAT(N)

IN_MIXER_SHORT(1)
IN_MIXER_SHORT(2)
IN_MIXER_SHORT(3)
IN_MIXER_SHORT(4)
IN_MIXER_SHORT(5)
IN_MIXER_SHORT(6)
IN_MIXER_SHORT(7)
IN_MIXER_SHORT(8)
IN_MIXER_SHORT(N)
298 
299 AudioInput::inMixerFunc AudioInput::chooseMixer(const unsigned int nchan, SampleFormat sf, quint64 chanmask) {
300 	inMixerFunc r = NULL;
301 
302 	if (chanmask != 0xffffffffffffffffULL) {
303 		if (sf == SampleFloat) {
304 			r = inMixerFloatMask;
305 		} else if (sf == SampleShort) {
306 			r = inMixerShortMask;
307 		}
308 		return r;
309 	}
310 
311 	if (sf == SampleFloat) {
312 		switch (nchan) {
313 			case 1:
314 				r = inMixerFloat1;
315 				break;
316 			case 2:
317 				r = inMixerFloat2;
318 				break;
319 			case 3:
320 				r = inMixerFloat3;
321 				break;
322 			case 4:
323 				r = inMixerFloat4;
324 				break;
325 			case 5:
326 				r = inMixerFloat5;
327 				break;
328 			case 6:
329 				r = inMixerFloat6;
330 				break;
331 			case 7:
332 				r = inMixerFloat7;
333 				break;
334 			case 8:
335 				r = inMixerFloat8;
336 				break;
337 			default:
338 				r = inMixerFloatN;
339 				break;
340 		}
341 	} else {
342 		switch (nchan) {
343 			case 1:
344 				r = inMixerShort1;
345 				break;
346 			case 2:
347 				r = inMixerShort2;
348 				break;
349 			case 3:
350 				r = inMixerShort3;
351 				break;
352 			case 4:
353 				r = inMixerShort4;
354 				break;
355 			case 5:
356 				r = inMixerShort5;
357 				break;
358 			case 6:
359 				r = inMixerShort6;
360 				break;
361 			case 7:
362 				r = inMixerShort7;
363 				break;
364 			case 8:
365 				r = inMixerShort8;
366 				break;
367 			default:
368 				r = inMixerShortN;
369 				break;
370 		}
371 	}
372 	return r;
373 }
374 
initializeMixer()375 void AudioInput::initializeMixer() {
376 	int err;
377 
378 	if (srsMic)
379 		speex_resampler_destroy(srsMic);
380 	if (srsEcho)
381 		speex_resampler_destroy(srsEcho);
382 	delete [] pfMicInput;
383 	delete [] pfEchoInput;
384 	delete [] pfOutput;
385 
386 	if (iMicFreq != iSampleRate)
387 		srsMic = speex_resampler_init(1, iMicFreq, iSampleRate, 3, &err);
388 
389 	iMicLength = (iFrameSize * iMicFreq) / iSampleRate;
390 
391 	pfMicInput = new float[iMicLength];
392 	pfOutput = new float[iFrameSize * qMax(1U,iEchoChannels)];
393 
394 	if (iEchoChannels > 0) {
395 		bEchoMulti = g.s.bEchoMulti;
396 		if (iEchoFreq != iSampleRate)
397 			srsEcho = speex_resampler_init(bEchoMulti ? iEchoChannels : 1, iEchoFreq, iSampleRate, 3, &err);
398 		iEchoLength = (iFrameSize * iEchoFreq) / iSampleRate;
399 		iEchoMCLength = bEchoMulti ? iEchoLength * iEchoChannels : iEchoLength;
400 		iEchoFrameSize = bEchoMulti ? iFrameSize * iEchoChannels : iFrameSize;
401 		pfEchoInput = new float[iEchoMCLength];
402 	} else {
403 		srsEcho = NULL;
404 		pfEchoInput = NULL;
405 	}
406 
407 	uiMicChannelMask = g.s.uiAudioInputChannelMask;
408 
409 	// There is no channel mask setting for the echo canceller, so allow all channels.
410 	uiEchoChannelMask = 0xffffffffffffffffULL;
411 
412 	imfMic = chooseMixer(iMicChannels, eMicFormat, uiMicChannelMask);
413 	imfEcho = chooseMixer(iEchoChannels, eEchoFormat, uiEchoChannelMask);
414 
415 	iMicSampleSize = static_cast<int>(iMicChannels * ((eMicFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
416 	iEchoSampleSize = static_cast<int>(iEchoChannels * ((eEchoFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
417 
418 	bResetProcessor = true;
419 
420 	qWarning("AudioInput: Initialized mixer for %d channel %d hz mic and %d channel %d hz echo", iMicChannels, iMicFreq, iEchoChannels, iEchoFreq);
421 	if (uiMicChannelMask != 0xffffffffffffffffULL) {
422 		qWarning("AudioInput: using mic channel mask 0x%llx", static_cast<unsigned long long>(uiMicChannelMask));
423 	}
424 }
425 
// Accepts raw microphone samples from the device, in the device's native
// format/rate/channel layout. Accumulates them into 10 ms frames (downmixed
// to mono float), resamples to iSampleRate if needed, converts to 16 bit
// PCM, pairs each frame with a buffered echo frame when echo cancellation is
// active, and hands every completed frame to encodeAudioFrame().
void AudioInput::addMic(const void *data, unsigned int nsamp) {
	while (nsamp > 0) {
		// Make sure we don't overrun the frame buffer
		const unsigned int left = qMin(nsamp, iMicLength - iMicFilled);

		// Append mix into pfMicInput frame buffer (converts 16bit pcm->float if necessary)
		imfMic(pfMicInput + iMicFilled, data, left, iMicChannels, uiMicChannelMask);

		iMicFilled += left;
		nsamp -= left;

		// If new samples are left offset data pointer to point at the first one for next iteration
		if (nsamp > 0) {
			if (eMicFormat == SampleFloat)
				data = reinterpret_cast<const float *>(data) + left * iMicChannels;
			else
				data = reinterpret_cast<const short *>(data) + left * iMicChannels;
		}

		if (iMicFilled == iMicLength) {
			// Frame complete
			iMicFilled = 0;

			// If needed resample frame
			float *ptr = srsMic ? pfOutput : pfMicInput;

			if (srsMic) {
				spx_uint32_t inlen = iMicLength;
				spx_uint32_t outlen = iFrameSize;
				speex_resampler_process_float(srsMic, 0, pfMicInput, &inlen, pfOutput, &outlen);
			}

			// Convert float to 16bit PCM
			const float mul = 32768.f;
			for (int j = 0; j < iFrameSize; ++j)
				psMic[j] = static_cast<short>(qBound(-32768.f, (ptr[j] * mul), 32767.f));

			// If we have echo chancellation enabled...
			if (iEchoChannels > 0) {
				short *echo = NULL;

				{
					QMutexLocker l(&qmEcho);

					if (qlEchoFrames.isEmpty()) {
						// No echo data buffered; restart drift tracking.
						iJitterSeq = 0;
						iMinBuffered = 1000;
					} else {
						// Compensate for drift between the microphone and the echo source
						iMinBuffered = qMin(iMinBuffered, qlEchoFrames.count());

						if ((iJitterSeq > 100) && (iMinBuffered > 1)) {
							// The echo queue has stayed consistently ahead of the
							// mic stream; drop one frame to re-align the two.
							iJitterSeq = 0;
							iMinBuffered = 1000;
							delete [] qlEchoFrames.takeFirst();
						}
						echo = qlEchoFrames.takeFirst();
					}
				}

				if (echo) {
					// We have echo data for the current frame, remember that
					delete [] psSpeaker;
					psSpeaker = echo;
				}
			}

			// Encode and send frame
			encodeAudioFrame();
		}
	}
}
498 
addEcho(const void * data,unsigned int nsamp)499 void AudioInput::addEcho(const void *data, unsigned int nsamp) {
500 	while (nsamp > 0) {
501 		// Make sure we don't overrun the echo frame buffer
502 		const unsigned int left = qMin(nsamp, iEchoLength - iEchoFilled);
503 
504 		if (bEchoMulti) {
505 			const unsigned int samples = left * iEchoChannels;
506 
507 			if (eEchoFormat == SampleFloat) {
508 				for (unsigned int i=0;i<samples;++i)
509 					pfEchoInput[i] = reinterpret_cast<const float *>(data)[i];
510 			}
511 			else {
512 				// 16bit PCM -> float
513 				for (unsigned int i=0;i<samples;++i)
514 					pfEchoInput[i] = static_cast<float>(reinterpret_cast<const short *>(data)[i]) * (1.0f / 32768.f);
515 			}
516 		} else {
517 			// Mix echo channels (converts 16bit PCM -> float if needed)
518 			imfEcho(pfEchoInput + iEchoFilled, data, left, iEchoChannels, uiEchoChannelMask);
519 		}
520 
521 		iEchoFilled += left;
522 		nsamp -= left;
523 
524 		// If new samples are left offset data pointer to point at the first one for next iteration
525 		if (nsamp > 0) {
526 			if (eEchoFormat == SampleFloat)
527 				data = reinterpret_cast<const float *>(data) + left * iEchoChannels;
528 			else
529 				data = reinterpret_cast<const short *>(data) + left * iEchoChannels;
530 		}
531 
532 		if (iEchoFilled == iEchoLength) {
533 			//Frame complete
534 
535 			iEchoFilled = 0;
536 
537 			// Resample if necessary
538 			float *ptr = srsEcho ? pfOutput : pfEchoInput;
539 
540 			if (srsEcho) {
541 				spx_uint32_t inlen = iEchoLength;
542 				spx_uint32_t outlen = iFrameSize;
543 				speex_resampler_process_interleaved_float(srsEcho, pfEchoInput, &inlen, pfOutput, &outlen);
544 			}
545 
546 			short *outbuff = new short[iEchoFrameSize];
547 
548 			// float -> 16bit PCM
549 			const float mul = 32768.f;
550 			for (unsigned int j=0;j<iEchoFrameSize;++j)
551 				outbuff[j] = static_cast<short>(ptr[j] * mul);
552 
553 			// Push frame into the echo chancellers jitter buffer
554 			QMutexLocker l(&qmEcho);
555 
556 			iJitterSeq = qMin(iJitterSeq + 1,10000U);
557 			qlEchoFrames.append(outbuff);
558 		}
559 	}
560 }
561 
adjustBandwidth(int bitspersec,int & bitrate,int & frames)562 void AudioInput::adjustBandwidth(int bitspersec, int &bitrate, int &frames) {
563 	frames = g.s.iFramesPerPacket;
564 	bitrate = g.s.iQuality;
565 
566 	if (bitspersec == -1) {
567 		// No limit
568 	} else {
569 		if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
570 			if ((frames <= 4) && (bitspersec <= 32000))
571 				frames = 4;
572 			else if ((frames == 1) && (bitspersec <= 64000))
573 				frames = 2;
574 			else if ((frames == 2) && (bitspersec <= 48000))
575 				frames = 4;
576 			if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
577 				do {
578 					bitrate -= 1000;
579 				} while ((bitrate > 8000) && (getNetworkBandwidth(bitrate, frames) > bitspersec));
580 			}
581 		}
582 	}
583 	if (bitrate <= 8000)
584 		bitrate = 8000;
585 }
586 
setMaxBandwidth(int bitspersec)587 void AudioInput::setMaxBandwidth(int bitspersec) {
588 	if (bitspersec == g.iMaxBandwidth)
589 		return;
590 
591 	int frames;
592 	int bitrate;
593 	adjustBandwidth(bitspersec, bitrate, frames);
594 
595 	g.iMaxBandwidth = bitspersec;
596 
597 	if (bitspersec != -1) {
598 		if ((bitrate != g.s.iQuality) || (frames != g.s.iFramesPerPacket))
599 			g.mw->msgBox(tr("Server maximum network bandwidth is only %1 kbit/s. Audio quality auto-adjusted to %2 kbit/s (%3 ms)").arg(bitspersec / 1000).arg(bitrate / 1000).arg(frames*10));
600 	}
601 
602 	AudioInputPtr ai = g.ai;
603 	if (ai) {
604 		g.iAudioBandwidth = getNetworkBandwidth(bitrate, frames);
605 		ai->iAudioQuality = bitrate;
606 		ai->iAudioFrames = frames;
607 		return;
608 	}
609 
610 	ai.reset();
611 
612 	Audio::stopInput();
613 	Audio::startInput();
614 }
615 
getNetworkBandwidth(int bitrate,int frames)616 int AudioInput::getNetworkBandwidth(int bitrate, int frames) {
617 	int overhead = 20 + 8 + 4 + 1 + 2 + (g.s.bTransmitPosition ? 12 : 0) + (NetworkConfig::TcpModeEnabled() ? 12 : 0) + frames;
618 	overhead *= (800 / frames);
619 	int bw = overhead + bitrate;
620 
621 	return bw;
622 }
623 
// Rebuilds the speex preprocessor (VAD, AGC, denoise, dereverb) and, when
// echo channels are available, the echo canceller. Runs only when a rebuild
// has been requested via bResetProcessor (e.g. after initializeMixer()).
// Caller must hold qmSpeex (see encodeAudioFrame()).
void AudioInput::resetAudioProcessor() {
	if (!bResetProcessor)
		return;

	int iArg;

	// Drop the previous states before creating fresh ones.
	if (sppPreprocess)
		speex_preprocess_state_destroy(sppPreprocess);
	if (sesEcho)
		speex_echo_state_destroy(sesEcho);

	sppPreprocess = speex_preprocess_state_init(iFrameSize, iSampleRate);

	// Enable voice activity detection, automatic gain control, denoising
	// and dereverberation.
	iArg = 1;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_VAD, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DENOISE, &iArg);
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DEREVERB, &iArg);

	iArg = 30000;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_TARGET, &iArg);

	// Cap the AGC gain (in dB), derived from the user's minimum-loudness setting.
	float v = 30000.0f / static_cast<float>(g.s.iMinLoudness);
	iArg = iroundf(floorf(20.0f * log10f(v)));
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_MAX_GAIN, &iArg);

	iArg = -60;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_DECREMENT, &iArg);

	iArg = g.s.iNoiseSuppress;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

	if (iEchoChannels > 0) {
		// Echo filter length is 10 frames (100 ms); multi-channel when configured.
		sesEcho = speex_echo_state_init_mc(iFrameSize, iFrameSize * 10, 1, bEchoMulti ? iEchoChannels : 1);
		iArg = iSampleRate;
		speex_echo_ctl(sesEcho, SPEEX_ECHO_SET_SAMPLING_RATE, &iArg);
		// Attach the echo state so the preprocessor also suppresses residual echo.
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_ECHO_STATE, sesEcho);

		qWarning("AudioInput: ECHO CANCELLER ACTIVE");
	} else {
		sesEcho = NULL;
	}

	// The encoder must restart cleanly alongside the new processor state.
	bResetEncoder = true;

	bResetProcessor = false;
}
671 
// Chooses the codec (Opus, or a CELT bitstream version) for the next
// outgoing packet, creating a CELT encoder when a switch is required.
// Avoids changing codec mid-transmission, and flushes any buffered frames
// when the message type does change. Returns false if no usable codec is
// available.
bool AudioInput::selectCodec() {
	bool useOpus = false;

	// Currently talking, use previous Opus status.
	if (bPreviousVoice) {
		useOpus = (umtType == MessageHandler::UDPVoiceOpus);
	} else {
#ifdef USE_OPUS
		if (g.bOpus || (g.s.lmLoopMode == Settings::Local)) {
			useOpus = true;
		}
#endif
	}

	if (!useOpus) {
		CELTCodec *switchto = NULL;
		if ((!g.uiSession || (g.s.lmLoopMode == Settings::Local)) && (!g.qmCodecs.isEmpty())) {
			// Use latest for local loopback
			QMap<int, CELTCodec *>::const_iterator i = g.qmCodecs.constEnd();
			--i;
			switchto = i.value();
		} else {
			// Currently talking, don't switch unless you must.
			if (cCodec && bPreviousVoice) {
				int v = cCodec->bitstreamVersion();
				if ((v == g.iCodecAlpha) || (v == g.iCodecBeta))
					switchto = cCodec;
			}
		}
		if (!switchto) {
			// Pick the preferred bitstream version, falling back to the other.
			switchto = g.qmCodecs.value(g.bPreferAlpha ? g.iCodecAlpha : g.iCodecBeta);
			if (!switchto)
				switchto = g.qmCodecs.value(g.bPreferAlpha ? g.iCodecBeta : g.iCodecAlpha);
		}
		if (switchto != cCodec) {
			// Destroy the old encoder before switching codec libraries.
			if (cCodec && ceEncoder) {
				cCodec->celt_encoder_destroy(ceEncoder);
				ceEncoder = NULL;
			}
			cCodec = switchto;
			if (cCodec)
				ceEncoder = cCodec->encoderCreate();
		}

		if (!cCodec)
			return false;
	}

	// Map the selected codec to its UDP voice message type.
	MessageHandler::UDPMessageType previousType = umtType;
	if (useOpus) {
		umtType = MessageHandler::UDPVoiceOpus;
	} else {
		if (!g.uiSession) {
			umtType = MessageHandler::UDPVoiceCELTAlpha;
		} else {
			int v = cCodec->bitstreamVersion();
			if (v == g.iCodecAlpha)
				umtType = MessageHandler::UDPVoiceCELTAlpha;
			else if (v == g.iCodecBeta)
				umtType = MessageHandler::UDPVoiceCELTBeta;
			else {
				qWarning() << "Couldn't find message type for codec version" << v;
			}
		}
	}

	if (umtType != previousType) {
		// Codec changed: discard partially accumulated data so a single
		// packet never mixes frames from two different codecs.
		iBufferedFrames = 0;
		qlFrames.clear();
		opusBuffer.clear();
	}

	return true;
}
746 
encodeOpusFrame(short * source,int size,EncodingOutputBuffer & buffer)747 int AudioInput::encodeOpusFrame(short *source, int size, EncodingOutputBuffer& buffer) {
748 	int len;
749 #ifdef USE_OPUS
750 	if (!oCodec) {
751 		return 0;
752 	}
753 
754 	if (bResetEncoder) {
755 		oCodec->opus_encoder_ctl(opusState, OPUS_RESET_STATE, NULL);
756 		bResetEncoder = false;
757 	}
758 
759 	oCodec->opus_encoder_ctl(opusState, OPUS_SET_BITRATE(iAudioQuality));
760 
761 	len = oCodec->opus_encode(opusState, source, size, &buffer[0], static_cast<opus_int32>(buffer.size()));
762 	const int tenMsFrameCount = (size / iFrameSize);
763 	iBitrate = (len * 100 * 8) / tenMsFrameCount;
764 #endif
765 	return len;
766 }
767 
encodeCELTFrame(short * psSource,EncodingOutputBuffer & buffer)768 int AudioInput::encodeCELTFrame(short *psSource, EncodingOutputBuffer& buffer) {
769 	int len;
770 	if (!cCodec)
771 		return 0;
772 
773 	if (bResetEncoder) {
774 		cCodec->celt_encoder_ctl(ceEncoder, CELT_RESET_STATE);
775 		bResetEncoder = false;
776 	}
777 
778 	cCodec->celt_encoder_ctl(ceEncoder, CELT_SET_PREDICTION(0));
779 
780 	cCodec->celt_encoder_ctl(ceEncoder, CELT_SET_VBR_RATE(iAudioQuality));
781 	len = cCodec->encode(ceEncoder, psSource, &buffer[0], qMin<int>(iAudioQuality / (8 * 100), static_cast<int>(buffer.size())));
782 	iBitrate = len * 100 * 8;
783 
784 	return len;
785 }
786 
// Runs the full per-frame input pipeline on the 10 ms frame in psMic:
// level metering, optional RNNoise denoising, speex preprocessing (AGC,
// denoise, VAD) with echo cancellation, the voice-activity decision
// (VAD / continuous / push-to-talk), talk-state UI updates, idle-action
// handling, and finally encoding with the selected codec plus
// packetization via flushCheck().
void AudioInput::encodeAudioFrame() {
	int iArg;
	int i;
	float sum;
	short max;

	short *psSource;

	iFrameCounter++;

	// As g.iTarget is not protected by any locks, we avoid race-conditions by
	// copying it once at this point and stick to whatever value it is here. Thus
	// if the value of g.iTarget changes during the execution of this function,
	// it won't cause any inconsistencies and the change is reflected once this
	// function is called again.
	int voiceTargetID = g.iTarget;

	if (! bRunning)
		return;

	// Measure raw mic energy and peak. sum starts at 1.0f so the log10f
	// below never sees zero.
	sum=1.0f;
	max = 1;
	for (i=0;i<iFrameSize;i++) {
		sum += static_cast<float>(psMic[i] * psMic[i]);
		max = std::max(static_cast<short>(abs(psMic[i])), max);
	}
	dPeakMic = qMax(20.0f*log10f(sqrtf(sum / static_cast<float>(iFrameSize)) / 32768.0f), -96.0f);
	dMaxMic = max;

	// Same RMS level metering for the speaker (echo) signal, if present.
	if (psSpeaker && (iEchoChannels > 0)) {
		sum=1.0f;
		for (i=0;i<iFrameSize;i++)
			sum += static_cast<float>(psSpeaker[i] * psSpeaker[i]);
		dPeakSpeaker = qMax(20.0f*log10f(sqrtf(sum / static_cast<float>(iFrameSize)) / 32768.0f), -96.0f);
	} else {
		dPeakSpeaker = 0.0;
	}

	// Serialize access to the speex states; rebuild them if requested.
	QMutexLocker l(&qmSpeex);
	resetAudioProcessor();

#ifdef USE_RNNOISE
	// At the time of writing this code, RNNoise only supports a sample rate of 48000 Hz.
	if (g.s.bDenoise && denoiseState && (iFrameSize == 480)) {
		float denoiseFrames[480];
		for (int i = 0; i < 480; i++) {
			denoiseFrames[i] = psMic[i];
		}

		rnnoise_process_frame(denoiseState, denoiseFrames, denoiseFrames);

		for (int i = 0; i < 480; i++) {
			psMic[i] = denoiseFrames[i];
		}
	}
#endif

	// Read the current AGC gain, and set noise suppression relative to it so
	// the overall suppression tracks the user's configured level.
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_AGC_GAIN, &iArg);
	float gainValue = static_cast<float>(iArg);
	iArg = g.s.iNoiseSuppress - iArg;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);

	// Run echo cancellation (when available) and the preprocessor; psSource
	// points at whichever buffer holds the processed signal afterwards.
	if (sesEcho && psSpeaker) {
		speex_echo_cancellation(sesEcho, psMic, psSpeaker, psClean);
		speex_preprocess_run(sppPreprocess, psClean);
		psSource = psClean;
	} else {
		speex_preprocess_run(sppPreprocess, psMic);
		psSource = psMic;
	}

	// RMS level of the processed signal, in dB relative to full scale.
	sum=1.0f;
	for (i=0;i<iFrameSize;i++)
		sum += static_cast<float>(psSource[i] * psSource[i]);
	float micLevel = sqrtf(sum / static_cast<float>(iFrameSize));
	dPeakSignal = qMax(20.0f*log10f(micLevel / 32768.0f), -96.0f);

	// Speech probability reported by the speex VAD, as a fraction [0, 1].
	spx_int32_t prob = 0;
	speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_PROB, &prob);
	fSpeechProb = static_cast<float>(prob) / 100.0f;

	// clean microphone level: peak of filtered signal attenuated by AGC gain
	dPeakCleanMic = qMax(dPeakSignal - gainValue, -96.0f);
	// The VAD level is either the speech probability or the normalized
	// amplitude, depending on the user's chosen VAD source.
	float level = (g.s.vsVAD == Settings::SignalToNoise) ? fSpeechProb : (1.0f + dPeakCleanMic / 96.0f);

	bool bIsSpeech = false;

	if (level > g.s.fVADmax) {
		// Voice-activation threshold has been reached
		bIsSpeech = true;
	} else if (level > g.s.fVADmin && bPreviousVoice) {
		// Voice-deactivation threshold has not yet been reached
		bIsSpeech = true;
	}

	// Hold transmission open for a configurable number of frames after the
	// signal drops, to avoid chopping off word endings.
	if (! bIsSpeech) {
		iHoldFrames++;
		if (iHoldFrames < g.s.iVoiceHold)
			bIsSpeech = true;
	} else {
		iHoldFrames = 0;
	}

	if (g.s.atTransmit == Settings::Continuous) {
		// Continous transmission is enabled
		bIsSpeech = true;
	} else if (g.s.atTransmit == Settings::PushToTalk) {
		// PTT is enabled, so check if it is currently active
		bIsSpeech = g.s.uiDoublePush && ((g.uiDoublePush < g.s.uiDoublePush) || (g.tDoublePush.elapsed() < g.s.uiDoublePush));
	}

	// If g.iPushToTalk > 0 that means that we are currently in some sort of PTT action. For
	// instance this could mean we're currently whispering
	bIsSpeech = bIsSpeech || (g.iPushToTalk > 0);

	// Muting (self-mute, server mute/suppress, push-to-mute, invalid target)
	// overrides any speech decision.
	ClientUser *p = ClientUser::get(g.uiSession);
	if (g.s.bMute || ((g.s.lmLoopMode != Settings::Local) && p && (p->bMute || p->bSuppress)) || g.bPushToMute || (voiceTargetID < 0)) {
		bIsSpeech = false;
	}

	if (bIsSpeech) {
		iSilentFrames = 0;
	} else {
		iSilentFrames++;
		// After ~5 s of silence, restart the sequence counter.
		if (iSilentFrames > 500)
			iFrameCounter = 0;
	}

	// Update our own talking indicator in the UI.
	if (p) {
		if (! bIsSpeech)
			p->setTalking(Settings::Passive);
		else if (voiceTargetID == 0)
			p->setTalking(Settings::Talking);
		else
			p->setTalking(Settings::Shouting);
	}

	// Play the transmit on/off cue sounds on speech state transitions.
	if (g.s.bTxAudioCue && g.uiSession != 0) {
		AudioOutputPtr ao = g.ao;
		if (bIsSpeech && ! bPreviousVoice && ao)
			ao->playSample(g.s.qsTxAudioCueOn);
		else if (ao && !bIsSpeech && bPreviousVoice)
			ao->playSample(g.s.qsTxAudioCueOff);
	}

	if (! bIsSpeech && ! bPreviousVoice) {
		// Fully idle (not talking now, wasn't talking before).
		iBitrate = 0;

		// Trigger the configured idle action after the idle timeout expires.
		if ((tIdle.elapsed() / 1000000ULL) > g.s.iIdleTime) {
			activityState = ActivityStateIdle;
			tIdle.restart();
			if (g.s.iaeIdleAction == Settings::Deafen && !g.s.bDeaf) {
				emit doDeaf();
			} else if (g.s.iaeIdleAction == Settings::Mute && !g.s.bMute) {
				emit doMute();
			}
		}

		// Undo the idle action once activity resumes, when configured to.
		if (activityState == ActivityStateReturnedFromIdle) {
			activityState = ActivityStateActive;
			if (g.s.iaeIdleAction != Settings::Nothing && g.s.bUndoIdleActionUponActivity) {
				if (g.s.iaeIdleAction == Settings::Deafen && g.s.bDeaf) {
					emit doDeaf();
				} else if (g.s.iaeIdleAction == Settings::Mute && g.s.bMute) {
					emit doMute();
				}
			}
		}

		// Freeze the AGC while silent so it doesn't ramp up on noise.
		spx_int32_t increment = 0;
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
		return;
	} else {
		spx_int32_t increment = 12;
		speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
	}

	// New transmission starting: make the encoder start from a clean state.
	if (bIsSpeech && !bPreviousVoice) {
		bResetEncoder = true;
	}

	tIdle.restart();

	EncodingOutputBuffer buffer;
	Q_ASSERT(buffer.size() >= static_cast<size_t>(iAudioQuality / 100 * iAudioFrames / 8));

	int len = 0;

	bool encoded = true;
	if (!selectCodec())
		return;

	if (umtType == MessageHandler::UDPVoiceCELTAlpha || umtType == MessageHandler::UDPVoiceCELTBeta) {
		// CELT encodes one 10 ms frame at a time.
		len = encodeCELTFrame(psSource, buffer);
		if (len <= 0) {
			iBitrate = 0;
			qWarning() << "encodeCELTFrame failed" << iBufferedFrames << iFrameSize << len;
			return;
		}
		++iBufferedFrames;
	} else if (umtType == MessageHandler::UDPVoiceOpus) {
		// Opus encodes a whole packet's worth of frames at once, so collect
		// PCM until a packet is full (or speech ends).
		encoded = false;
		opusBuffer.insert(opusBuffer.end(), psSource, psSource + iFrameSize);
		++iBufferedFrames;

		if (!bIsSpeech || iBufferedFrames >= iAudioFrames) {
			if (iBufferedFrames < iAudioFrames) {
				// Stuff frame to framesize if speech ends and we don't have enough audio
				// this way we are guaranteed to have a valid framecount and won't cause
				// a codec configuration switch by suddenly using a wildly different
				// framecount per packet.
				const int missingFrames = iAudioFrames - iBufferedFrames;
				opusBuffer.insert(opusBuffer.end(), iFrameSize * missingFrames, 0);
				iBufferedFrames += missingFrames;
				iFrameCounter += missingFrames;
			}

			Q_ASSERT(iBufferedFrames == iAudioFrames);

			len = encodeOpusFrame(&opusBuffer[0], iBufferedFrames * iFrameSize, buffer);
			opusBuffer.clear();
			if (len <= 0) {
				iBitrate = 0;
				qWarning() << "encodeOpusFrame failed" << iBufferedFrames << iFrameSize << len;
				iBufferedFrames = 0; // These are lost. Make sure not to mess up our sequence counter next flushCheck.
				return;
			}
			encoded = true;
		}
	}

	if (encoded) {
		flushCheck(QByteArray(reinterpret_cast<char *>(&buffer[0]), len), !bIsSpeech, voiceTargetID);
	}

	if (! bIsSpeech)
		iBitrate = 0;

	bPreviousVoice = bIsSpeech;
}
1027 
sendAudioFrame(const char * data,PacketDataStream & pds)1028 static void sendAudioFrame(const char *data, PacketDataStream &pds) {
1029 	ServerHandlerPtr sh = g.sh;
1030 	if (sh) {
1031 		VoiceRecorderPtr recorder(sh->recorder);
1032 		if (recorder)
1033 			recorder->getRecordUser().addFrame(QByteArray(data, pds.size() + 1));
1034 	}
1035 
1036 	if (g.s.lmLoopMode == Settings::Local)
1037 		LoopUser::lpLoopy.addFrame(QByteArray(data, pds.size() + 1));
1038 	else if (sh)
1039 		sh->sendMessage(data, pds.size() + 1);
1040 }
1041 
// Queues one encoded frame and, once a packet's worth is buffered (or the
// transmission is ending), assembles the UDP voice datagram: a flags byte
// (message type in the top 3 bits, target in the bottom 5), the sequence
// number, the payload frame(s), and optional positional audio data — then
// hands it to sendAudioFrame().
void AudioInput::flushCheck(const QByteArray &frame, bool terminator, int voiceTargetID) {
	qlFrames << frame;

	// Wait until a full packet is buffered, unless this is the final frame.
	if (! terminator && iBufferedFrames < iAudioFrames)
		return;

	int flags = 0;
	if (voiceTargetID > 0) {
		flags = voiceTargetID;
	}
	if (terminator && g.iPrevTarget > 0) {
		// If we have been whispering to some target but have just ended, terminator will be true. However
		// in the case of whispering this means that we just released the whisper key so this here is the
		// last audio frame that is sent for whispering. The whisper key being released means that g.iTarget
		// is reset to 0 by now. In order to send the last whisper frame correctly, we have to use
		// g.iPrevTarget which is set to whatever g.iTarget has been before its last change.

		flags = g.iPrevTarget;

		// We reset g.iPrevTarget as it has fulfilled its purpose for this whisper-action. It'll be set
		// accordingly once the client whispers for the next time.
		g.iPrevTarget = 0;
	}

	if (g.s.lmLoopMode == Settings::Server)
		flags = 0x1f; // Server loopback

	// Message type occupies the top three bits of the flags byte.
	flags |= (umtType << 5);

	char data[1024];
	data[0] = static_cast<unsigned char>(flags);

	int frames = iBufferedFrames;
	iBufferedFrames = 0;

	// Payload stream starts right after the flags byte.
	PacketDataStream pds(data + 1, 1023);
	// Sequence number
	pds << iFrameCounter - frames;

	if (umtType == MessageHandler::UDPVoiceOpus) {
		// Opus packets carry a single variable-length frame; bit 13 of the
		// size field marks the end of a transmission.
		const QByteArray &qba = qlFrames.takeFirst();
		int size = qba.size();
		if (terminator)
			size |= 1 << 13;
		pds << size;
		pds.append(qba.constData(), qba.size());
	} else {
		// CELT packets carry several frames, each prefixed by a length byte
		// whose high bit signals that more frames follow. An empty frame
		// acts as the transmission terminator.
		if (terminator) {
			qlFrames << QByteArray();
			++frames;
		}

		for (int i = 0; i < frames; ++i) {
			const QByteArray &qba = qlFrames.takeFirst();
			unsigned char head = static_cast<unsigned char>(qba.size());
			if (i < frames - 1)
				head |= 0x80;
			pds.append(head);
			pds.append(qba.constData(), qba.size());
		}
	}

	// Append positional audio coordinates when enabled and available.
	if (g.s.bTransmitPosition && g.p && ! g.bCenterPosition && g.p->fetch()) {
		pds << g.p->fPosition[0];
		pds << g.p->fPosition[1];
		pds << g.p->fPosition[2];
	}

	sendAudioFrame(data, pds);

	Q_ASSERT(qlFrames.isEmpty());
}
1114 
// Liveness check: true while the audio input worker thread is running.
bool AudioInput::isAlive() const {
	return isRunning();
}
1118