1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 //
5 // Implementation of AudioInputStream for Windows using Windows Core Audio
6 // WASAPI for low latency capturing.
7 //
8 // Overview of operation:
9 //
10 // - An object of WASAPIAudioInputStream is created by the AudioManager
11 //   factory.
12 // - Next some thread will call Open(), at that point the underlying
13 //   Core Audio APIs are utilized to create two WASAPI interfaces called
14 //   IAudioClient and IAudioCaptureClient.
15 // - Then some thread will call Start(sink).
16 //   A thread called "wasapi_capture_thread" is started and this thread listens
17 //   on an event signal which is set periodically by the audio engine for
18 //   each recorded data packet. As a result, data samples will be provided
19 //   to the registered sink.
20 // - At some point, a thread will call Stop(), which stops and joins the
21 //   capture thread and at the same time stops audio streaming.
22 // - The same thread that called stop will call Close() where we cleanup
23 //   and notify the audio manager, which likely will destroy this object.
24 //
25 // Implementation notes:
26 //
27 // - The minimum supported client is Windows Vista.
28 // - This implementation is single-threaded, hence:
29 //    o Construction and destruction must take place from the same thread.
30 //    o It is recommended to call all APIs from the same thread as well.
31 // - It is recommended to first acquire the native sample rate of the default
32 //   input device and then use the same rate when creating this object. Use
33 //   WASAPIAudioInputStream::HardwareSampleRate() to retrieve the sample rate.
34 // - Calling Close() also leads to self destruction.
35 //
36 // Core Audio API details:
37 //
38 // - Utilized MMDevice interfaces:
39 //     o IMMDeviceEnumerator
40 //     o IMMDevice
41 // - Utilized WASAPI interfaces:
42 //     o IAudioClient
43 //     o IAudioCaptureClient
44 // - The stream is initialized in shared mode and the processing of the
45 //   audio buffer is event driven.
46 // - The Multimedia Class Scheduler service (MMCSS) is utilized to boost
47 //   the priority of the capture thread.
// - Audio applications that use the MMDevice API and WASAPI typically use
//   the ISimpleAudioVolume interface to manage stream volume levels on a
//   per-session basis. It is also possible to use the IAudioEndpointVolume
//   interface to control the master volume level of an audio endpoint device.
//   This implementation is using the ISimpleAudioVolume interface.
//   MSDN states that "In rare cases, a specialized audio application might
//   require the use of the IAudioEndpointVolume".
55 //
56 #ifndef MEDIA_AUDIO_WIN_AUDIO_LOW_LATENCY_INPUT_WIN_H_
57 #define MEDIA_AUDIO_WIN_AUDIO_LOW_LATENCY_INPUT_WIN_H_
58 
59 #include <Audioclient.h>
60 #include <MMDeviceAPI.h>
61 #include <endpointvolume.h>
62 #include <stddef.h>
63 #include <stdint.h>
64 #include <windows.media.effects.h>
65 #include <wrl/client.h>
66 
67 #include <memory>
68 #include <string>
69 #include <vector>
70 
71 #include "base/compiler_specific.h"
72 #include "base/macros.h"
73 #include "base/sequence_checker.h"
74 #include "base/threading/platform_thread.h"
75 #include "base/threading/simple_thread.h"
76 #include "base/win/scoped_co_mem.h"
77 #include "base/win/scoped_com_initializer.h"
78 #include "base/win/scoped_handle.h"
79 #include "media/audio/agc_audio_stream.h"
80 #include "media/audio/win/audio_manager_win.h"
81 #include "media/base/audio_converter.h"
82 #include "media/base/audio_parameters.h"
83 #include "media/base/media_export.h"
84 
85 namespace media {
86 
87 class AudioBlockFifo;
88 class AudioBus;
89 
90 // AudioInputStream implementation using Windows Core Audio APIs.
91 class MEDIA_EXPORT WASAPIAudioInputStream
92     : public AgcAudioStream<AudioInputStream>,
93       public base::DelegateSimpleThread::Delegate,
94       public AudioConverter::InputCallback {
95  public:
96   // Used to track down where we fail during initialization which at the
97   // moment seems to be happening frequently and we're not sure why.
98   // The reason might be expected (e.g. trying to open "default" on a machine
99   // that has no audio devices).
100   // Note: This enum is used to record a histogram value and should not be
101   // re-ordered.
102   enum StreamOpenResult {
103     OPEN_RESULT_OK = 0,
104     OPEN_RESULT_CREATE_INSTANCE = 1,
105     OPEN_RESULT_NO_ENDPOINT = 2,
106     OPEN_RESULT_NO_STATE = 3,
107     OPEN_RESULT_DEVICE_NOT_ACTIVE = 4,
108     OPEN_RESULT_ACTIVATION_FAILED = 5,
109     OPEN_RESULT_FORMAT_NOT_SUPPORTED = 6,
110     OPEN_RESULT_AUDIO_CLIENT_INIT_FAILED = 7,
111     OPEN_RESULT_GET_BUFFER_SIZE_FAILED = 8,
112     OPEN_RESULT_LOOPBACK_ACTIVATE_FAILED = 9,
113     OPEN_RESULT_LOOPBACK_INIT_FAILED = 10,
114     OPEN_RESULT_SET_EVENT_HANDLE = 11,
115     OPEN_RESULT_NO_CAPTURE_CLIENT = 12,
116     OPEN_RESULT_NO_AUDIO_VOLUME = 13,
117     OPEN_RESULT_OK_WITH_RESAMPLING = 14,
118     OPEN_RESULT_MAX = OPEN_RESULT_OK_WITH_RESAMPLING
119   };
120 
121   // The ctor takes all the usual parameters, plus |manager| which is the
122   // the audio manager who is creating this object.
123   WASAPIAudioInputStream(AudioManagerWin* manager,
124                          const AudioParameters& params,
125                          const std::string& device_id,
126                          AudioManager::LogCallback log_callback);
127 
128   // The dtor is typically called by the AudioManager only and it is usually
129   // triggered by calling AudioInputStream::Close().
130   ~WASAPIAudioInputStream() override;
131 
132   // Implementation of AudioInputStream.
133   bool Open() override;
134   void Start(AudioInputCallback* callback) override;
135   void Stop() override;
136   void Close() override;
137   double GetMaxVolume() override;
138   void SetVolume(double volume) override;
139   double GetVolume() override;
140   bool IsMuted() override;
141   void SetOutputDeviceForAec(const std::string& output_device_id) override;
142 
started()143   bool started() const { return started_; }
144 
145  private:
146   void SendLogMessage(const char* format, ...) PRINTF_FORMAT(2, 3);
147 
148   // DelegateSimpleThread::Delegate implementation.
149   void Run() override;
150 
151   // Pulls capture data from the endpoint device and pushes it to the sink.
152   void PullCaptureDataAndPushToSink();
153 
154   // Issues the OnError() callback to the |sink_|.
155   void HandleError(HRESULT err);
156 
157   // The Open() method is divided into these sub methods.
158   HRESULT SetCaptureDevice();
159   // Returns whether raw audio processing is supported or not for the selected
160   // capture device.
161   bool RawProcessingSupported();
162   // The Windows.Media.Effects.AudioEffectsManager UWP API contains a method
163   // called CreateAudioCaptureEffectsManagerWithMode() which is needed to
164   // enumerate active audio effects on the capture stream. This UWP method
165   // needs a device ID which differs from what can be derived from the default
166   // Win32 API in CoreAudio. The GetUWPDeviceId() method builds up the required
167   // device ID that the audio effects manager needs. Note that it is also
168   // possible to get the ID directly from the Windows.Devices.Enumeration UWP
169   // API but that is rather complex and requires use of asynchronous methods.
170   std::string GetUWPDeviceId();
171   // For the selected |uwp_device_id|, generate two lists of enabled audio
172   // effects and store them in |default_effect_types_| and |raw_effect_types_|.
173   HRESULT GetAudioCaptureEffects(const std::string& uwp_device_id);
174   HRESULT SetCommunicationsCategoryAndRawCaptureMode();
175   HRESULT GetAudioEngineStreamFormat();
176   // Returns whether the desired format is supported or not and writes the
177   // result of a failing system call to |*hr|, or S_OK if successful. If this
178   // function returns false with |*hr| == S_FALSE, the OS supports a closest
179   // match but we don't support conversion to it.
180   bool DesiredFormatIsSupported(HRESULT* hr);
181   void SetupConverterAndStoreFormatInfo();
182   HRESULT InitializeAudioEngine();
183   void ReportOpenResult(HRESULT hr);
184   // Reports stats for format related audio client initialization
185   // (IAudioClient::Initialize) errors, that is if |hr| is an error related to
186   // the format.
187   void MaybeReportFormatRelatedInitError(HRESULT hr) const;
188 
189   // AudioConverter::InputCallback implementation.
190   double ProvideInput(AudioBus* audio_bus, uint32_t frames_delayed) override;
191 
192   // Detects and counts glitches based on |device_position|.
193   void UpdateGlitchCount(UINT64 device_position);
194 
195   // Reports glitch stats and resets associated variables.
196   void ReportAndResetGlitchStats();
197 
198   // Our creator, the audio manager needs to be notified when we close.
199   AudioManagerWin* const manager_;
200 
201   // Capturing is driven by this thread (which has no message loop).
202   // All OnData() callbacks will be called from this thread.
203   std::unique_ptr<base::DelegateSimpleThread> capture_thread_;
204 
205   // Contains the desired output audio format which is set up at construction
206   // and then never modified. It is the audio format this class will output
207   // data to the sink in, or equivalently, the format after the converter if
208   // such is needed. Does not need the extended version since we only support
209   // max stereo at this stage.
210   WAVEFORMATEX output_format_;
211 
212   // Contains the audio format we get data from the audio engine in. Initially
213   // set to |output_format_| at construction but it might be changed to a close
214   // match if the audio engine doesn't support the originally set format. Note
215   // that, this is also the format after the FIFO, i.e. the input format to the
216   // converter if any.
217   WAVEFORMATEXTENSIBLE input_format_;
218 
219   bool opened_ = false;
220   bool started_ = false;
221   StreamOpenResult open_result_ = OPEN_RESULT_OK;
222 
223   // Size in bytes of each audio frame before the converter (4 bytes for 16-bit
224   // stereo PCM). Note that this is the same before and after the fifo.
225   size_t frame_size_bytes_ = 0;
226 
227   // Size in audio frames of each audio packet (buffer) after the fifo but
228   // before the converter.
229   size_t packet_size_frames_ = 0;
230 
231   // Size in bytes of each audio packet (buffer) after the fifo but before the
232   // converter.
233   size_t packet_size_bytes_ = 0;
234 
235   // Length of the audio endpoint buffer, i.e. the buffer size before the fifo.
236   uint32_t endpoint_buffer_size_frames_ = 0;
237 
238   // Contains the unique name of the selected endpoint device.
239   // Note that AudioDeviceDescription::kDefaultDeviceId represents the default
240   // device role and is not a valid ID as such.
241   std::string device_id_;
242 
243   // Pointer to the object that will receive the recorded audio samples.
244   AudioInputCallback* sink_ = nullptr;
245 
246   // Windows Multimedia Device (MMDevice) API interfaces.
247 
248   // An IMMDevice interface which represents an audio endpoint device.
249   Microsoft::WRL::ComPtr<IMMDevice> endpoint_device_;
250 
251   // Windows Audio Session API (WASAPI) interfaces.
252 
253   // An IAudioClient interface which enables a client to create and initialize
254   // an audio stream between an audio application and the audio engine.
255   Microsoft::WRL::ComPtr<IAudioClient> audio_client_;
256 
257   // Loopback IAudioClient doesn't support event-driven mode, so a separate
258   // IAudioClient is needed to receive notifications when data is available in
259   // the buffer. For loopback input |audio_client_| is used to receive data,
260   // while |audio_render_client_for_loopback_| is used to get notifications
261   // when a new buffer is ready. See comment in InitializeAudioEngine() for
262   // details.
263   Microsoft::WRL::ComPtr<IAudioClient> audio_render_client_for_loopback_;
264 
265   // The IAudioCaptureClient interface enables a client to read input data
266   // from a capture endpoint buffer.
267   Microsoft::WRL::ComPtr<IAudioCaptureClient> audio_capture_client_;
268 
269   // The IAudioClock interface is used to get the current timestamp, as the
270   // timestamp from IAudioCaptureClient::GetBuffer can be unreliable with some
271   // devices.
272   Microsoft::WRL::ComPtr<IAudioClock> audio_clock_;
273 
274   // The ISimpleAudioVolume interface enables a client to control the
275   // master volume level of an audio session.
276   // The volume-level is a value in the range 0.0 to 1.0.
277   // This interface does only work with shared-mode streams.
278   Microsoft::WRL::ComPtr<ISimpleAudioVolume> simple_audio_volume_;
279 
280   // The IAudioEndpointVolume allows a client to control the volume level of
281   // the whole system.
282   Microsoft::WRL::ComPtr<IAudioEndpointVolume> system_audio_volume_;
283 
284   // The audio engine will signal this event each time a buffer has been
285   // recorded.
286   base::win::ScopedHandle audio_samples_ready_event_;
287 
288   // This event will be signaled when capturing shall stop.
289   base::win::ScopedHandle stop_capture_event_;
290 
291   // Never set it through external API. Only used when |device_id_| ==
292   // kLoopbackWithMuteDeviceId.
293   // True, if we have muted the system audio for the stream capturing, and
294   // indicates that we need to unmute the system audio when stopping capturing.
295   bool mute_done_ = false;
296 
297   // Used for the captured audio on the callback thread.
298   std::unique_ptr<AudioBlockFifo> fifo_;
299 
300   // If the caller requires resampling (should only be in exceptional cases and
301   // ideally, never), we support using an AudioConverter.
302   std::unique_ptr<AudioConverter> converter_;
303   std::unique_ptr<AudioBus> convert_bus_;
304   bool imperfect_buffer_size_conversion_ = false;
305 
306   // Callback to send log messages to registered clients.
307   AudioManager::LogCallback log_callback_;
308 
309   // For detecting and reporting glitches.
310   UINT64 expected_next_device_position_ = 0;
311   int total_glitches_ = 0;
312   UINT64 total_lost_frames_ = 0;
313   UINT64 largest_glitch_frames_ = 0;
314 
315   // Enabled if the volume level of the audio session is set to zero when the
316   // session starts. Utilized in UMA histogram.
317   bool audio_session_starts_at_zero_volume_ = false;
318 
319   // Set to true if the selected audio device supports raw audio capture.
320   // Also added to a UMS histogram.
321   bool raw_processing_supported_ = false;
322 
323   // List of supported and active capture effects for the selected device in
324   // default (normal) audio processing mode.
325   std::vector<ABI::Windows::Media::Effects::AudioEffectType>
326       default_effect_types_;
327   // List of supported and active capture effects for the selected device in
328   // raw (minimal) audio processing mode. Will be empty in most cases.
329   std::vector<ABI::Windows::Media::Effects::AudioEffectType> raw_effect_types_;
330 
331   SEQUENCE_CHECKER(sequence_checker_);
332 
333   DISALLOW_COPY_AND_ASSIGN(WASAPIAudioInputStream);
334 };
335 
336 }  // namespace media
337 
338 #endif  // MEDIA_AUDIO_WIN_AUDIO_LOW_LATENCY_INPUT_WIN_H_
339