1 /*
2  *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
12 #define MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
13 
14 // MSVC++ requires this to be set before any other includes to get M_PI.
15 #ifndef _USE_MATH_DEFINES
16 #define _USE_MATH_DEFINES
17 #endif
18 
19 #include <math.h>
20 #include <stddef.h>  // size_t
21 #include <stdio.h>   // FILE
22 #include <string.h>
23 
24 #include <vector>
25 
26 #include "absl/types/optional.h"
27 #include "api/array_view.h"
28 #include "api/audio/echo_canceller3_config.h"
29 #include "api/audio/echo_control.h"
30 #include "api/scoped_refptr.h"
31 #include "modules/audio_processing/include/audio_processing_statistics.h"
32 #include "modules/audio_processing/include/config.h"
33 #include "rtc_base/arraysize.h"
34 #include "rtc_base/constructor_magic.h"
35 #include "rtc_base/deprecation.h"
36 #include "rtc_base/ref_count.h"
37 #include "rtc_base/system/file_wrapper.h"
38 #include "rtc_base/system/rtc_export.h"
39 
40 namespace rtc {
41 class TaskQueue;
42 }  // namespace rtc
43 
44 namespace webrtc {
45 
46 class AecDump;
47 class AudioBuffer;
48 
49 class StreamConfig;
50 class ProcessingConfig;
51 
52 class EchoDetector;
53 class CustomAudioAnalyzer;
54 class CustomProcessing;
55 
// Use to enable experimental gain control (AGC). At startup the experimental
// AGC moves the microphone volume up to |startup_min_volume| if the current
// microphone volume is set too low. The value is clamped to its operating range
// [12, 255]. Here, 255 maps to 100%.
//
// Must be provided through AudioProcessingBuilder().Create(config).
#if defined(WEBRTC_CHROMIUM_BUILD)
static constexpr int kAgcStartupMinVolume = 85;
#else
static constexpr int kAgcStartupMinVolume = 0;
#endif  // defined(WEBRTC_CHROMIUM_BUILD)
// Lowest microphone level applied in response to clipping (see
// Config::GainController1::AnalogGainController::clipped_level_min).
static constexpr int kClippedLevelMin = 70;
68 
69 // To be deprecated: Please instead use the flag in the
70 // AudioProcessing::Config::AnalogGainController.
71 // TODO(webrtc:5298): Remove.
72 struct ExperimentalAgc {
73   ExperimentalAgc() = default;
ExperimentalAgcExperimentalAgc74   explicit ExperimentalAgc(bool enabled) : enabled(enabled) {}
ExperimentalAgcExperimentalAgc75   ExperimentalAgc(bool enabled,
76                   bool enabled_agc2_level_estimator,
77                   bool digital_adaptive_disabled)
78       : enabled(enabled),
79         enabled_agc2_level_estimator(enabled_agc2_level_estimator),
80         digital_adaptive_disabled(digital_adaptive_disabled) {}
81   // Deprecated constructor: will be removed.
ExperimentalAgcExperimentalAgc82   ExperimentalAgc(bool enabled,
83                   bool enabled_agc2_level_estimator,
84                   bool digital_adaptive_disabled,
85                   bool analyze_before_aec)
86       : enabled(enabled),
87         enabled_agc2_level_estimator(enabled_agc2_level_estimator),
88         digital_adaptive_disabled(digital_adaptive_disabled) {}
ExperimentalAgcExperimentalAgc89   ExperimentalAgc(bool enabled, int startup_min_volume)
90       : enabled(enabled), startup_min_volume(startup_min_volume) {}
ExperimentalAgcExperimentalAgc91   ExperimentalAgc(bool enabled, int startup_min_volume, int clipped_level_min)
92       : enabled(enabled),
93         startup_min_volume(startup_min_volume),
94         clipped_level_min(clipped_level_min) {}
95   static const ConfigOptionID identifier = ConfigOptionID::kExperimentalAgc;
96   bool enabled = true;
97   int startup_min_volume = kAgcStartupMinVolume;
98   // Lowest microphone level that will be applied in response to clipping.
99   int clipped_level_min = kClippedLevelMin;
100   bool enabled_agc2_level_estimator = false;
101   bool digital_adaptive_disabled = false;
102 };
103 
104 // To be deprecated: Please instead use the flag in the
105 // AudioProcessing::Config::TransientSuppression.
106 //
107 // Use to enable experimental noise suppression. It can be set in the
108 // constructor.
109 // TODO(webrtc:5298): Remove.
110 struct ExperimentalNs {
ExperimentalNsExperimentalNs111   ExperimentalNs() : enabled(false) {}
ExperimentalNsExperimentalNs112   explicit ExperimentalNs(bool enabled) : enabled(enabled) {}
113   static const ConfigOptionID identifier = ConfigOptionID::kExperimentalNs;
114   bool enabled;
115 };
116 
117 // The Audio Processing Module (APM) provides a collection of voice processing
118 // components designed for real-time communications software.
119 //
120 // APM operates on two audio streams on a frame-by-frame basis. Frames of the
121 // primary stream, on which all processing is applied, are passed to
122 // |ProcessStream()|. Frames of the reverse direction stream are passed to
123 // |ProcessReverseStream()|. On the client-side, this will typically be the
124 // near-end (capture) and far-end (render) streams, respectively. APM should be
125 // placed in the signal chain as close to the audio hardware abstraction layer
126 // (HAL) as possible.
127 //
128 // On the server-side, the reverse stream will normally not be used, with
129 // processing occurring on each incoming stream.
130 //
131 // Component interfaces follow a similar pattern and are accessed through
132 // corresponding getters in APM. All components are disabled at create-time,
133 // with default settings that are recommended for most situations. New settings
134 // can be applied without enabling a component. Enabling a component triggers
135 // memory allocation and initialization to allow it to start processing the
136 // streams.
137 //
138 // Thread safety is provided with the following assumptions to reduce locking
139 // overhead:
140 //   1. The stream getters and setters are called from the same thread as
141 //      ProcessStream(). More precisely, stream functions are never called
142 //      concurrently with ProcessStream().
143 //   2. Parameter getters are never called concurrently with the corresponding
144 //      setter.
145 //
146 // APM accepts only linear PCM audio data in chunks of 10 ms. The int16
147 // interfaces use interleaved data, while the float interfaces use deinterleaved
148 // data.
149 //
150 // Usage example, omitting error checking:
151 // AudioProcessing* apm = AudioProcessingBuilder().Create();
152 //
153 // AudioProcessing::Config config;
154 // config.echo_canceller.enabled = true;
155 // config.echo_canceller.mobile_mode = false;
156 //
157 // config.gain_controller1.enabled = true;
158 // config.gain_controller1.mode =
159 // AudioProcessing::Config::GainController1::kAdaptiveAnalog;
160 // config.gain_controller1.analog_level_minimum = 0;
161 // config.gain_controller1.analog_level_maximum = 255;
162 //
163 // config.gain_controller2.enabled = true;
164 //
165 // config.high_pass_filter.enabled = true;
166 //
167 // config.voice_detection.enabled = true;
168 //
169 // apm->ApplyConfig(config)
170 //
171 // apm->noise_reduction()->set_level(kHighSuppression);
172 // apm->noise_reduction()->Enable(true);
173 //
174 // // Start a voice call...
175 //
176 // // ... Render frame arrives bound for the audio HAL ...
177 // apm->ProcessReverseStream(render_frame);
178 //
179 // // ... Capture frame arrives from the audio HAL ...
180 // // Call required set_stream_ functions.
181 // apm->set_stream_delay_ms(delay_ms);
182 // apm->set_stream_analog_level(analog_level);
183 //
184 // apm->ProcessStream(capture_frame);
185 //
186 // // Call required stream_ functions.
187 // analog_level = apm->recommended_stream_analog_level();
188 // has_voice = apm->stream_has_voice();
189 //
// // Repeat render and capture processing for the duration of the call...
191 // // Start a new call...
192 // apm->Initialize();
193 //
194 // // Close the application...
195 // delete apm;
196 //
class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
 public:
  // The struct below constitutes the new parameter scheme for the audio
  // processing. It is being introduced gradually and until it is fully
  // introduced, it is prone to change.
  // TODO(peah): Remove this comment once the new config scheme is fully rolled
  // out.
  //
  // The parameters and behavior of the audio processing module are controlled
  // by changing the default values in the AudioProcessing::Config struct.
  // The config is applied by passing the struct to the ApplyConfig method.
  //
  // This config is intended to be used during setup, and to enable/disable
  // top-level processing effects. Use during processing may cause undesired
  // submodule resets, affecting the audio quality. Use the RuntimeSetting
  // construct for runtime configuration.
  struct RTC_EXPORT Config {

    // Sets the properties of the audio processing pipeline.
    struct RTC_EXPORT Pipeline {
      Pipeline();

      // Maximum allowed processing rate used internally. May only be set to
      // 32000 or 48000 and any differing values will be treated as 48000. The
      // default rate is currently selected based on the CPU architecture, but
      // that logic may change.
      int maximum_internal_processing_rate;
      // Allow multi-channel processing of render audio.
      bool multi_channel_render = false;
      // Allow multi-channel processing of capture audio when AEC3 is active
      // or a custom AEC is injected.
      bool multi_channel_capture = false;
    } pipeline;

    // Enables the pre-amplifier. It amplifies the capture signal
    // before any other processing is done.
    struct PreAmplifier {
      bool enabled = false;
      float fixed_gain_factor = 1.f;
    } pre_amplifier;

    // High-pass filtering of the capture signal.
    struct HighPassFilter {
      bool enabled = false;
      bool apply_in_full_band = true;
    } high_pass_filter;

    // Acoustic echo cancellation settings.
    struct EchoCanceller {
      bool enabled = false;
      bool mobile_mode = false;
      bool export_linear_aec_output = false;
      // Enforce the highpass filter to be on (has no effect for the mobile
      // mode).
      bool enforce_high_pass_filtering = true;
    } echo_canceller;

    // Enables background noise suppression.
    struct NoiseSuppression {
      bool enabled = false;
      enum Level { kLow, kModerate, kHigh, kVeryHigh };
      Level level = kModerate;
      bool analyze_linear_aec_output_when_available = false;
    } noise_suppression;

    // Enables transient suppression.
    struct TransientSuppression {
      bool enabled = false;
    } transient_suppression;

    // Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
    struct VoiceDetection {
      bool enabled = false;
    } voice_detection;

    // Enables automatic gain control (AGC) functionality.
    // The automatic gain control (AGC) component brings the signal to an
    // appropriate range. This is done by applying a digital gain directly and,
    // in the analog mode, prescribing an analog gain to be applied at the audio
    // HAL.
    // Recommended to be enabled on the client-side.
    struct GainController1 {
      bool enabled = false;
      enum Mode {
        // Adaptive mode intended for use if an analog volume control is
        // available on the capture device. It will require the user to provide
        // coupling between the OS mixer controls and AGC through the
        // stream_analog_level() functions.
        // It consists of an analog gain prescription for the audio device and a
        // digital compression stage.
        kAdaptiveAnalog,
        // Adaptive mode intended for situations in which an analog volume
        // control is unavailable. It operates in a similar fashion to the
        // adaptive analog mode, but with scaling instead applied in the digital
        // domain. As with the analog mode, it additionally uses a digital
        // compression stage.
        kAdaptiveDigital,
        // Fixed mode which enables only the digital compression stage also used
        // by the two adaptive modes.
        // It is distinguished from the adaptive modes by considering only a
        // short time-window of the input signal. It applies a fixed gain
        // through most of the input level range, and compresses (gradually
        // reduces gain with increasing level) the input signal at higher
        // levels. This mode is preferred on embedded devices where the capture
        // signal level is predictable, so that a known gain can be applied.
        kFixedDigital
      };
      Mode mode = kAdaptiveAnalog;
      // Sets the target peak level (or envelope) of the AGC in dBFs (decibels
      // from digital full-scale). The convention is to use positive values. For
      // instance, passing in a value of 3 corresponds to -3 dBFs, or a target
      // level 3 dB below full-scale. Limited to [0, 31].
      int target_level_dbfs = 3;
      // Sets the maximum gain the digital compression stage may apply, in dB. A
      // higher number corresponds to greater compression, while a value of 0
      // will leave the signal uncompressed. Limited to [0, 90].
      // For updates after APM setup, use a RuntimeSetting instead.
      int compression_gain_db = 9;
      // When enabled, the compression stage will hard limit the signal to the
      // target level. Otherwise, the signal will be compressed but not limited
      // above the target level.
      bool enable_limiter = true;
      // Sets the minimum and maximum analog levels of the audio capture device.
      // Must be set if an analog mode is used. Limited to [0, 65535].
      int analog_level_minimum = 0;
      int analog_level_maximum = 255;

      // Enables the analog gain controller functionality.
      struct AnalogGainController {
        bool enabled = true;
        int startup_min_volume = kAgcStartupMinVolume;
        // Lowest analog microphone level that will be applied in response to
        // clipping.
        int clipped_level_min = kClippedLevelMin;
        bool enable_agc2_level_estimator = false;
        bool enable_digital_adaptive = true;
      } analog_gain_controller;
    } gain_controller1;

    // Enables the next generation AGC functionality. This feature replaces the
    // standard methods of gain control in the previous AGC. Enabling this
    // submodule enables an adaptive digital AGC followed by a limiter. By
    // setting |fixed_gain_db|, the limiter can be turned into a compressor that
    // first applies a fixed gain. The adaptive digital AGC can be turned off by
    // setting |adaptive_digital.enabled| to false.
    struct GainController2 {
      enum LevelEstimator { kRms, kPeak };
      bool enabled = false;
      struct {
        float gain_db = 0.f;
      } fixed_digital;
      struct {
        bool enabled = false;
        float vad_probability_attack = 1.f;
        LevelEstimator level_estimator = kRms;
        int level_estimator_adjacent_speech_frames_threshold = 1;
        // TODO(crbug.com/webrtc/7494): Remove `use_saturation_protector`.
        bool use_saturation_protector = true;
        float initial_saturation_margin_db = 20.f;
        float extra_saturation_margin_db = 2.f;
        int gain_applier_adjacent_speech_frames_threshold = 1;
        float max_gain_change_db_per_second = 3.f;
        float max_output_noise_level_dbfs = -50.f;
      } adaptive_digital;
    } gain_controller2;

    // Enabled by default; reports echo likelihood statistics.
    struct ResidualEchoDetector {
      bool enabled = true;
    } residual_echo_detector;

    // Enables reporting of |output_rms_dbfs| in webrtc::AudioProcessingStats.
    struct LevelEstimation {
      bool enabled = false;
    } level_estimation;

    // Returns a human-readable representation of the configuration values.
    std::string ToString() const;
  };

  // TODO(mgraczyk): Remove once all methods that use ChannelLayout are gone.
  enum ChannelLayout {
    kMono,
    // Left, right.
    kStereo,
    // Mono, keyboard, and mic.
    kMonoAndKeyboard,
    // Left, right, keyboard, and mic.
    kStereoAndKeyboard
  };

  // Specifies the properties of a setting to be passed to AudioProcessing at
  // runtime.
  class RuntimeSetting {
   public:
    enum class Type {
      kNotSpecified,
      kCapturePreGain,
      kCaptureCompressionGain,
      kCaptureFixedPostGain,
      kPlayoutVolumeChange,
      kCustomRenderProcessingRuntimeSetting,
      kPlayoutAudioDeviceChange,
      kCaptureOutputUsed
    };

    // Play-out audio device properties.
    struct PlayoutAudioDeviceInfo {
      int id;          // Identifies the audio device.
      int max_volume;  // Maximum play-out volume.
    };

    RuntimeSetting() : type_(Type::kNotSpecified), value_(0.f) {}
    ~RuntimeSetting() = default;

    // Creates a pre-gain setting; |gain| must be >= 1 (attenuation is not
    // allowed).
    static RuntimeSetting CreateCapturePreGain(float gain) {
      RTC_DCHECK_GE(gain, 1.f) << "Attenuation is not allowed.";
      return {Type::kCapturePreGain, gain};
    }

    // Corresponds to Config::GainController1::compression_gain_db, but for
    // runtime configuration.
    static RuntimeSetting CreateCompressionGainDb(int gain_db) {
      RTC_DCHECK_GE(gain_db, 0);
      RTC_DCHECK_LE(gain_db, 90);
      return {Type::kCaptureCompressionGain, static_cast<float>(gain_db)};
    }

    // Corresponds to Config::GainController2::fixed_digital::gain_db, but for
    // runtime configuration.
    static RuntimeSetting CreateCaptureFixedPostGain(float gain_db) {
      RTC_DCHECK_GE(gain_db, 0.f);
      RTC_DCHECK_LE(gain_db, 90.f);
      return {Type::kCaptureFixedPostGain, gain_db};
    }

    // Creates a runtime setting to notify play-out (aka render) audio device
    // changes.
    static RuntimeSetting CreatePlayoutAudioDeviceChange(
        PlayoutAudioDeviceInfo audio_device) {
      return {Type::kPlayoutAudioDeviceChange, audio_device};
    }

    // Creates a runtime setting to notify play-out (aka render) volume changes.
    // |volume| is the unnormalized volume; its maximum value is presumably
    // device/platform dependent. NOTE(review): the original comment was
    // truncated here — confirm the intended range documentation.
    static RuntimeSetting CreatePlayoutVolumeChange(int volume) {
      return {Type::kPlayoutVolumeChange, volume};
    }

    static RuntimeSetting CreateCustomRenderSetting(float payload) {
      return {Type::kCustomRenderProcessingRuntimeSetting, payload};
    }

    static RuntimeSetting CreateCaptureOutputUsedSetting(bool payload) {
      return {Type::kCaptureOutputUsed, payload};
    }

    Type type() const { return type_; }
    // Getters do not return a value but instead modify the argument to protect
    // from implicit casting.
    void GetFloat(float* value) const {
      RTC_DCHECK(value);
      *value = value_.float_value;
    }
    void GetInt(int* value) const {
      RTC_DCHECK(value);
      *value = value_.int_value;
    }
    void GetBool(bool* value) const {
      RTC_DCHECK(value);
      *value = value_.bool_value;
    }
    void GetPlayoutAudioDeviceInfo(PlayoutAudioDeviceInfo* value) const {
      RTC_DCHECK(value);
      *value = value_.playout_audio_device_info;
    }

   private:
    RuntimeSetting(Type id, float value) : type_(id), value_(value) {}
    RuntimeSetting(Type id, int value) : type_(id), value_(value) {}
    RuntimeSetting(Type id, PlayoutAudioDeviceInfo value)
        : type_(id), value_(value) {}
    Type type_;
    // Tagged-union payload; |type_| indicates which member is valid.
    union U {
      U() {}
      U(int value) : int_value(value) {}
      U(float value) : float_value(value) {}
      U(PlayoutAudioDeviceInfo value) : playout_audio_device_info(value) {}
      float float_value;
      int int_value;
      bool bool_value;
      PlayoutAudioDeviceInfo playout_audio_device_info;
    } value_;
  };

  ~AudioProcessing() override {}

  // Initializes internal states, while retaining all user settings. This
  // should be called before beginning to process a new audio stream. However,
  // it is not necessary to call before processing the first stream after
  // creation.
  //
  // It is also not necessary to call if the audio parameters (sample
  // rate and number of channels) have changed. Passing updated parameters
  // directly to |ProcessStream()| and |ProcessReverseStream()| is permissible.
  // If the parameters are known at init-time though, they may be provided.
  // TODO(webrtc:5298): Change to return void.
  virtual int Initialize() = 0;

  // The int16 interfaces require:
  //   - only |NativeRate|s be used
  //   - that the input, output and reverse rates must match
  //   - that |processing_config.output_stream()| matches
  //     |processing_config.input_stream()|.
  //
  // The float interfaces accept arbitrary rates and support differing input and
  // output layouts, but the output must have either one channel or the same
  // number of channels as the input.
  virtual int Initialize(const ProcessingConfig& processing_config) = 0;

  // Initialize with unpacked parameters. See Initialize() above for details.
  //
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int Initialize(int capture_input_sample_rate_hz,
                         int capture_output_sample_rate_hz,
                         int render_sample_rate_hz,
                         ChannelLayout capture_input_layout,
                         ChannelLayout capture_output_layout,
                         ChannelLayout render_input_layout) = 0;

  // TODO(peah): This method is a temporary solution used to take control
  // over the parameters in the audio processing module and is likely to change.
  virtual void ApplyConfig(const Config& config) = 0;

  // TODO(ajm): Only intended for internal use. Make private and friend the
  // necessary classes?
  virtual int proc_sample_rate_hz() const = 0;
  virtual int proc_split_sample_rate_hz() const = 0;
  virtual size_t num_input_channels() const = 0;
  virtual size_t num_proc_channels() const = 0;
  virtual size_t num_output_channels() const = 0;
  virtual size_t num_reverse_channels() const = 0;

  // Set to true when the output of AudioProcessing will be muted or in some
  // other way not used. Ideally, the captured audio would still be processed,
  // but some components may change behavior based on this information.
  // Default false.
  virtual void set_output_will_be_muted(bool muted) = 0;

  // Enqueue a runtime setting.
  virtual void SetRuntimeSetting(RuntimeSetting setting) = 0;

  // Accepts and produces a 10 ms frame interleaved 16 bit integer audio as
  // specified in |input_config| and |output_config|. |src| and |dest| may use
  // the same memory, if desired.
  virtual int ProcessStream(const int16_t* const src,
                            const StreamConfig& input_config,
                            const StreamConfig& output_config,
                            int16_t* const dest) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |src| points to a channel buffer, arranged according to |input_stream|. At
  // output, the channels will be arranged according to |output_stream| in
  // |dest|.
  //
  // The output must have one channel or as many channels as the input. |src|
  // and |dest| may use the same memory, if desired.
  virtual int ProcessStream(const float* const* src,
                            const StreamConfig& input_config,
                            const StreamConfig& output_config,
                            float* const* dest) = 0;

  // Accepts and produces a 10 ms frame of interleaved 16 bit integer audio for
  // the reverse direction audio stream as specified in |input_config| and
  // |output_config|. |src| and |dest| may use the same memory, if desired.
  virtual int ProcessReverseStream(const int16_t* const src,
                                   const StreamConfig& input_config,
                                   const StreamConfig& output_config,
                                   int16_t* const dest) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |data| points to a channel buffer, arranged according to |reverse_config|.
  virtual int ProcessReverseStream(const float* const* src,
                                   const StreamConfig& input_config,
                                   const StreamConfig& output_config,
                                   float* const* dest) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element
  // of |data| points to a channel buffer, arranged according to
  // |reverse_config|.
  virtual int AnalyzeReverseStream(const float* const* data,
                                   const StreamConfig& reverse_config) = 0;

  // Returns the most recently produced 10 ms of the linear AEC output at a rate
  // of 16 kHz. If there is more than one capture channel, a mono representation
  // of the input is returned. Returns true/false to indicate whether an output
  // returned.
  virtual bool GetLinearAecOutput(
      rtc::ArrayView<std::array<float, 160>> linear_output) const = 0;

  // This must be called prior to ProcessStream() if and only if adaptive analog
  // gain control is enabled, to pass the current analog level from the audio
  // HAL. Must be within the range provided in Config::GainController1.
  virtual void set_stream_analog_level(int level) = 0;

  // When an analog mode is set, this should be called after ProcessStream()
  // to obtain the recommended new analog level for the audio HAL. It is the
  // user's responsibility to apply this level.
  virtual int recommended_stream_analog_level() const = 0;

  // This must be called if and only if echo processing is enabled.
  //
  // Sets the |delay| in ms between ProcessReverseStream() receiving a far-end
  // frame and ProcessStream() receiving a near-end frame containing the
  // corresponding echo. On the client-side this can be expressed as
  //   delay = (t_render - t_analyze) + (t_process - t_capture)
  // where,
  //   - t_analyze is the time a frame is passed to ProcessReverseStream() and
  //     t_render is the time the first sample of the same frame is rendered by
  //     the audio hardware.
  //   - t_capture is the time the first sample of a frame is captured by the
  //     audio hardware and t_process is the time the same frame is passed to
  //     ProcessStream().
  virtual int set_stream_delay_ms(int delay) = 0;
  virtual int stream_delay_ms() const = 0;

  // Call to signal that a key press occurred (true) or did not occur (false)
  // with this chunk of audio.
  virtual void set_stream_key_pressed(bool key_pressed) = 0;

  // Creates and attaches an webrtc::AecDump for recording debugging
  // information.
  // The |worker_queue| may not be null and must outlive the created
  // AecDump instance. |max_log_size_bytes == -1| means the log size
  // will be unlimited. |handle| may not be null. The AecDump takes
  // responsibility for |handle| and closes it in the destructor. A
  // return value of true indicates that the file has been
  // successfully opened, while a value of false indicates that
  // opening the file failed.
  virtual bool CreateAndAttachAecDump(const std::string& file_name,
                                      int64_t max_log_size_bytes,
                                      rtc::TaskQueue* worker_queue) = 0;
  virtual bool CreateAndAttachAecDump(FILE* handle,
                                      int64_t max_log_size_bytes,
                                      rtc::TaskQueue* worker_queue) = 0;

  // TODO(webrtc:5298) Deprecated variant.
  // Attaches provided webrtc::AecDump for recording debugging
  // information. Log file and maximum file size logic is supposed to
  // be handled by implementing instance of AecDump. Calling this
  // method when another AecDump is attached resets the active AecDump
  // with a new one. This causes the d-tor of the earlier AecDump to
  // be called. The d-tor call may block until all pending logging
  // tasks are completed.
  virtual void AttachAecDump(std::unique_ptr<AecDump> aec_dump) = 0;

  // If no AecDump is attached, this has no effect. If an AecDump is
  // attached, its destructor is called. The d-tor may block until
  // all pending logging tasks are completed.
  virtual void DetachAecDump() = 0;

  // Get audio processing statistics.
  virtual AudioProcessingStats GetStatistics() = 0;
  // TODO(webrtc:5298) Deprecated variant. The |has_remote_tracks| argument
  // should be set if there are active remote tracks (this would usually be true
  // during a call). If there are no remote tracks some of the stats will not be
  // set by AudioProcessing, because they only make sense if there is at least
  // one remote track.
  virtual AudioProcessingStats GetStatistics(bool has_remote_tracks) = 0;

  // Returns the last applied configuration.
  virtual AudioProcessing::Config GetConfig() const = 0;

  enum Error {
    // Success value (not an error); the negative codes below are fatal errors.
    kNoError = 0,
    kUnspecifiedError = -1,
    kCreationFailedError = -2,
    kUnsupportedComponentError = -3,
    kUnsupportedFunctionError = -4,
    kNullPointerError = -5,
    kBadParameterError = -6,
    kBadSampleRateError = -7,
    kBadDataLengthError = -8,
    kBadNumberChannelsError = -9,
    kFileError = -10,
    kStreamParameterNotSetError = -11,
    kNotEnabledError = -12,

    // Warnings are non-fatal.
    // This results when a set_stream_ parameter is out of range. Processing
    // will continue, but the parameter may have been truncated.
    kBadStreamParameterWarning = -13
  };

  // Native rates supported by the integer interfaces.
  enum NativeRate {
    kSampleRate8kHz = 8000,
    kSampleRate16kHz = 16000,
    kSampleRate32kHz = 32000,
    kSampleRate48kHz = 48000
  };

  // TODO(kwiberg): We currently need to support a compiler (Visual C++) that
  // complains if we don't explicitly state the size of the array here. Remove
  // the size when that's no longer the case.
  static constexpr int kNativeSampleRatesHz[4] = {
      kSampleRate8kHz, kSampleRate16kHz, kSampleRate32kHz, kSampleRate48kHz};
  static constexpr size_t kNumNativeSampleRates =
      arraysize(kNativeSampleRatesHz);
  static constexpr int kMaxNativeSampleRateHz =
      kNativeSampleRatesHz[kNumNativeSampleRates - 1];

  // APM processes audio in fixed 10 ms chunks.
  static const int kChunkSizeMs = 10;
};
708 
709 class RTC_EXPORT AudioProcessingBuilder {
710  public:
711   AudioProcessingBuilder();
712   ~AudioProcessingBuilder();
713   // The AudioProcessingBuilder takes ownership of the echo_control_factory.
SetEchoControlFactory(std::unique_ptr<EchoControlFactory> echo_control_factory)714   AudioProcessingBuilder& SetEchoControlFactory(
715       std::unique_ptr<EchoControlFactory> echo_control_factory) {
716     echo_control_factory_ = std::move(echo_control_factory);
717     return *this;
718   }
719   // The AudioProcessingBuilder takes ownership of the capture_post_processing.
SetCapturePostProcessing(std::unique_ptr<CustomProcessing> capture_post_processing)720   AudioProcessingBuilder& SetCapturePostProcessing(
721       std::unique_ptr<CustomProcessing> capture_post_processing) {
722     capture_post_processing_ = std::move(capture_post_processing);
723     return *this;
724   }
725   // The AudioProcessingBuilder takes ownership of the render_pre_processing.
SetRenderPreProcessing(std::unique_ptr<CustomProcessing> render_pre_processing)726   AudioProcessingBuilder& SetRenderPreProcessing(
727       std::unique_ptr<CustomProcessing> render_pre_processing) {
728     render_pre_processing_ = std::move(render_pre_processing);
729     return *this;
730   }
731   // The AudioProcessingBuilder takes ownership of the echo_detector.
SetEchoDetector(rtc::scoped_refptr<EchoDetector> echo_detector)732   AudioProcessingBuilder& SetEchoDetector(
733       rtc::scoped_refptr<EchoDetector> echo_detector) {
734     echo_detector_ = std::move(echo_detector);
735     return *this;
736   }
737   // The AudioProcessingBuilder takes ownership of the capture_analyzer.
SetCaptureAnalyzer(std::unique_ptr<CustomAudioAnalyzer> capture_analyzer)738   AudioProcessingBuilder& SetCaptureAnalyzer(
739       std::unique_ptr<CustomAudioAnalyzer> capture_analyzer) {
740     capture_analyzer_ = std::move(capture_analyzer);
741     return *this;
742   }
743   // This creates an APM instance using the previously set components. Calling
744   // the Create function resets the AudioProcessingBuilder to its initial state.
745   AudioProcessing* Create();
746   AudioProcessing* Create(const webrtc::Config& config);
747 
748  private:
749   std::unique_ptr<EchoControlFactory> echo_control_factory_;
750   std::unique_ptr<CustomProcessing> capture_post_processing_;
751   std::unique_ptr<CustomProcessing> render_pre_processing_;
752   rtc::scoped_refptr<EchoDetector> echo_detector_;
753   std::unique_ptr<CustomAudioAnalyzer> capture_analyzer_;
754   RTC_DISALLOW_COPY_AND_ASSIGN(AudioProcessingBuilder);
755 };
756 
757 class StreamConfig {
758  public:
759   // sample_rate_hz: The sampling rate of the stream.
760   //
761   // num_channels: The number of audio channels in the stream, excluding the
762   //               keyboard channel if it is present. When passing a
763   //               StreamConfig with an array of arrays T*[N],
764   //
765   //                N == {num_channels + 1  if  has_keyboard
766   //                     {num_channels      if  !has_keyboard
767   //
768   // has_keyboard: True if the stream has a keyboard channel. When has_keyboard
769   //               is true, the last channel in any corresponding list of
770   //               channels is the keyboard channel.
771   StreamConfig(int sample_rate_hz = 0,
772                size_t num_channels = 0,
773                bool has_keyboard = false)
sample_rate_hz_(sample_rate_hz)774       : sample_rate_hz_(sample_rate_hz),
775         num_channels_(num_channels),
776         has_keyboard_(has_keyboard),
777         num_frames_(calculate_frames(sample_rate_hz)) {}
778 
set_sample_rate_hz(int value)779   void set_sample_rate_hz(int value) {
780     sample_rate_hz_ = value;
781     num_frames_ = calculate_frames(value);
782   }
set_num_channels(size_t value)783   void set_num_channels(size_t value) { num_channels_ = value; }
set_has_keyboard(bool value)784   void set_has_keyboard(bool value) { has_keyboard_ = value; }
785 
sample_rate_hz()786   int sample_rate_hz() const { return sample_rate_hz_; }
787 
788   // The number of channels in the stream, not including the keyboard channel if
789   // present.
num_channels()790   size_t num_channels() const { return num_channels_; }
791 
has_keyboard()792   bool has_keyboard() const { return has_keyboard_; }
num_frames()793   size_t num_frames() const { return num_frames_; }
num_samples()794   size_t num_samples() const { return num_channels_ * num_frames_; }
795 
796   bool operator==(const StreamConfig& other) const {
797     return sample_rate_hz_ == other.sample_rate_hz_ &&
798            num_channels_ == other.num_channels_ &&
799            has_keyboard_ == other.has_keyboard_;
800   }
801 
802   bool operator!=(const StreamConfig& other) const { return !(*this == other); }
803 
804  private:
calculate_frames(int sample_rate_hz)805   static size_t calculate_frames(int sample_rate_hz) {
806     return static_cast<size_t>(AudioProcessing::kChunkSizeMs * sample_rate_hz /
807                                1000);
808   }
809 
810   int sample_rate_hz_;
811   size_t num_channels_;
812   bool has_keyboard_;
813   size_t num_frames_;
814 };
815 
816 class ProcessingConfig {
817  public:
818   enum StreamName {
819     kInputStream,
820     kOutputStream,
821     kReverseInputStream,
822     kReverseOutputStream,
823     kNumStreamNames,
824   };
825 
input_stream()826   const StreamConfig& input_stream() const {
827     return streams[StreamName::kInputStream];
828   }
output_stream()829   const StreamConfig& output_stream() const {
830     return streams[StreamName::kOutputStream];
831   }
reverse_input_stream()832   const StreamConfig& reverse_input_stream() const {
833     return streams[StreamName::kReverseInputStream];
834   }
reverse_output_stream()835   const StreamConfig& reverse_output_stream() const {
836     return streams[StreamName::kReverseOutputStream];
837   }
838 
input_stream()839   StreamConfig& input_stream() { return streams[StreamName::kInputStream]; }
output_stream()840   StreamConfig& output_stream() { return streams[StreamName::kOutputStream]; }
reverse_input_stream()841   StreamConfig& reverse_input_stream() {
842     return streams[StreamName::kReverseInputStream];
843   }
reverse_output_stream()844   StreamConfig& reverse_output_stream() {
845     return streams[StreamName::kReverseOutputStream];
846   }
847 
848   bool operator==(const ProcessingConfig& other) const {
849     for (int i = 0; i < StreamName::kNumStreamNames; ++i) {
850       if (this->streams[i] != other.streams[i]) {
851         return false;
852       }
853     }
854     return true;
855   }
856 
857   bool operator!=(const ProcessingConfig& other) const {
858     return !(*this == other);
859   }
860 
861   StreamConfig streams[StreamName::kNumStreamNames];
862 };
863 
864 // Experimental interface for a custom analysis submodule.
865 class CustomAudioAnalyzer {
866  public:
867   // (Re-) Initializes the submodule.
868   virtual void Initialize(int sample_rate_hz, int num_channels) = 0;
869   // Analyzes the given capture or render signal.
870   virtual void Analyze(const AudioBuffer* audio) = 0;
871   // Returns a string representation of the module state.
872   virtual std::string ToString() const = 0;
873 
~CustomAudioAnalyzer()874   virtual ~CustomAudioAnalyzer() {}
875 };
876 
877 // Interface for a custom processing submodule.
878 class CustomProcessing {
879  public:
880   // (Re-)Initializes the submodule.
881   virtual void Initialize(int sample_rate_hz, int num_channels) = 0;
882   // Processes the given capture or render signal.
883   virtual void Process(AudioBuffer* audio) = 0;
884   // Returns a string representation of the module state.
885   virtual std::string ToString() const = 0;
886   // Handles RuntimeSettings. TODO(webrtc:9262): make pure virtual
887   // after updating dependencies.
888   virtual void SetRuntimeSetting(AudioProcessing::RuntimeSetting setting);
889 
~CustomProcessing()890   virtual ~CustomProcessing() {}
891 };
892 
// Interface for an echo detector submodule. Derives from
// rtc::RefCountInterface so instances can be shared via rtc::scoped_refptr
// (e.g. AudioProcessingBuilder::SetEchoDetector takes one).
class EchoDetector : public rtc::RefCountInterface {
 public:
  // (Re-)Initializes the submodule for the given capture and render stream
  // formats.
  virtual void Initialize(int capture_sample_rate_hz,
                          int num_capture_channels,
                          int render_sample_rate_hz,
                          int num_render_channels) = 0;

  // Analysis (not changing) of the render signal.
  virtual void AnalyzeRenderAudio(rtc::ArrayView<const float> render_audio) = 0;

  // Analysis (not changing) of the capture signal.
  virtual void AnalyzeCaptureAudio(
      rtc::ArrayView<const float> capture_audio) = 0;

  // Pack an AudioBuffer into a vector<float>.
  static void PackRenderAudioBuffer(AudioBuffer* audio,
                                    std::vector<float>* packed_buffer);

  // Detector metrics. Fields are absl::optional; an empty optional means the
  // corresponding value is not available.
  struct Metrics {
    absl::optional<double> echo_likelihood;
    absl::optional<double> echo_likelihood_recent_max;
  };

  // Collect current metrics from the echo detector.
  virtual Metrics GetMetrics() const = 0;
};
921 
922 }  // namespace webrtc
923 
924 #endif  // MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
925