// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/renderer/media/chrome_speech_recognition_client.h"

#include <utility>

#include "base/metrics/field_trial_params.h"
#include "base/metrics/histogram_functions.h"
#include "base/strings/string_split.h"
#include "content/public/renderer/render_frame.h"
#include "media/base/audio_bus.h"
#include "media/base/audio_parameters.h"
#include "media/base/bind_to_current_loop.h"
#include "media/base/channel_mixer.h"
#include "media/base/media_switches.h"
#include "media/mojo/mojom/media_types.mojom.h"
#include "third_party/blink/public/common/browser_interface_broker_proxy.h"
#include "third_party/blink/public/platform/web_string.h"
#include "third_party/blink/public/web/web_frame.h"
#include "third_party/blink/public/web/web_local_frame.h"

// Get the list of blocked URLs defined by the Finch experiment parameter.
// These websites provide captions by default and thus do not require the
// live caption feature.
std::vector<std::string> GetBlockedURLs() {
  return base::SplitString(base::GetFieldTrialParamValueByFeature(
                               media::kLiveCaption, "blocked_websites"),
                           ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
}

ChromeSpeechRecognitionClient::ChromeSpeechRecognitionClient(
    content::RenderFrame* render_frame,
    media::SpeechRecognitionClient::OnReadyCallback callback)
    : render_frame_(render_frame),
      on_ready_callback_(std::move(callback)),
      blocked_urls_(GetBlockedURLs()) {
  initialize_callback_ = media::BindToCurrentLoop(base::BindRepeating(
      &ChromeSpeechRecognitionClient::Initialize, weak_factory_.GetWeakPtr()));

  send_audio_callback_ = media::BindToCurrentLoop(base::BindRepeating(
      &ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService,
      weak_factory_.GetWeakPtr()));

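  // Connect to the browser-side interface and register as an observer of
  // speech recognition availability changes.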
  mojo::PendingReceiver<media::mojom::SpeechRecognitionClientBrowserInterface>
      speech_recognition_client_browser_interface_receiver =
          speech_recognition_client_browser_interface_
              .BindNewPipeAndPassReceiver();
  speech_recognition_client_browser_interface_
      ->BindSpeechRecognitionAvailabilityObserver(
          speech_recognition_availability_observer_.BindNewPipeAndPassRemote());

  render_frame_->GetBrowserInterfaceBroker()->GetInterface(
      std::move(speech_recognition_client_browser_interface_receiver));
}

void ChromeSpeechRecognitionClient::OnRecognizerBound(
    bool is_multichannel_supported) {
  is_multichannel_supported_ = is_multichannel_supported;
  is_recognizer_bound_ = true;

  if (on_ready_callback_)
    std::move(on_ready_callback_).Run();
}

void ChromeSpeechRecognitionClient::OnRecognizerDisconnected() {
  is_recognizer_bound_ = false;
  caption_host_->OnError();
}

void ChromeSpeechRecognitionClient::OnCaptionHostDisconnected() {
  is_browser_requesting_transcription_ = false;
}

ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default;

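// The audio is converted on the calling sequence; |send_audio_callback_| was
// wrapped with media::BindToCurrentLoop() in the constructor, so the converted
// audio is forwarded on the sequence that constructed |this|.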
void ChromeSpeechRecognitionClient::AddAudio(
    scoped_refptr<media::AudioBuffer> buffer) {
  DCHECK(buffer);
  send_audio_callback_.Run(ConvertToAudioDataS16(std::move(buffer)));
}

void ChromeSpeechRecognitionClient::AddAudio(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK(audio_bus);
  send_audio_callback_.Run(
      ConvertToAudioDataS16(std::move(audio_bus), sample_rate, channel_layout));
}

bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
  // TODO(evliu): Check if SODA is available.
  return !is_website_blocked_ && is_browser_requesting_transcription_ &&
         is_recognizer_bound_;
}

// The OnReadyCallback is set by the owner of |this| and is executed when
// speech recognition becomes available. Setting the callback will override
// any existing callback.
void ChromeSpeechRecognitionClient::SetOnReadyCallback(
    SpeechRecognitionClient::OnReadyCallback callback) {
  on_ready_callback_ = std::move(callback);

  // Immediately run the callback if speech recognition is already available.
  if (IsSpeechRecognitionAvailable() && on_ready_callback_)
    std::move(on_ready_callback_).Run();
}

void ChromeSpeechRecognitionClient::OnSpeechRecognitionRecognitionEvent(
    media::mojom::SpeechRecognitionResultPtr result) {
  caption_host_->OnTranscription(
      chrome::mojom::TranscriptionResult::New(result->transcription,
                                              result->is_final),
      base::BindOnce(&ChromeSpeechRecognitionClient::OnTranscriptionCallback,
                     base::Unretained(this)));
}

void ChromeSpeechRecognitionClient::SpeechRecognitionAvailabilityChanged(
    bool is_speech_recognition_available) {
  if (is_speech_recognition_available) {
    initialize_callback_.Run();
  } else {
    Reset();
  }
}

void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
  is_browser_requesting_transcription_ = success;
}

void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
    const media::AudioBuffer& buffer) {
  if (!temp_audio_bus_ ||
      buffer.channel_count() != temp_audio_bus_->channels() ||
      buffer.frame_count() != temp_audio_bus_->frames()) {
    temp_audio_bus_ =
        media::AudioBus::Create(buffer.channel_count(), buffer.frame_count());
  }

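  // ReadFrames() copies the buffer into |temp_audio_bus_|, converting the
  // samples to the bus's planar float format.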
  buffer.ReadFrames(buffer.frame_count(),
                    /* source_frame_offset */ 0, /* dest_frame_offset */ 0,
                    temp_audio_bus_.get());
}

void ChromeSpeechRecognitionClient::ResetChannelMixer(
    int frame_count,
    media::ChannelLayout channel_layout) {
  if (!monaural_audio_bus_ || frame_count != monaural_audio_bus_->frames()) {
    monaural_audio_bus_ =
        media::AudioBus::Create(1 /* channels */, frame_count);
  }

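  // Recreate the channel mixer only when the input channel layout changes.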
  if (channel_layout != channel_layout_) {
    channel_layout_ = channel_layout;
    channel_mixer_ = std::make_unique<media::ChannelMixer>(
        channel_layout, media::CHANNEL_LAYOUT_MONO);
  }
}

void ChromeSpeechRecognitionClient::Initialize() {
  if (speech_recognition_context_.is_bound())
    return;

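  // Bind the recognizer and pass a remote back to |this| so that recognition
  // results are delivered to OnSpeechRecognitionRecognitionEvent().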
  mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
      speech_recognition_context_receiver =
          speech_recognition_context_.BindNewPipeAndPassReceiver();
  speech_recognition_context_->BindRecognizer(
      speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
      speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
      media::BindToCurrentLoop(
          base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerBound,
                         weak_factory_.GetWeakPtr())));

  render_frame_->GetBrowserInterfaceBroker()->GetInterface(
      std::move(speech_recognition_context_receiver));
  render_frame_->GetBrowserInterfaceBroker()->GetInterface(
      caption_host_.BindNewPipeAndPassReceiver());

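  // The Finch blocklist is only consulted when SODA is not used for Live
  // Caption.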
  if (base::FeatureList::IsEnabled(media::kUseSodaForLiveCaption)) {
    is_website_blocked_ = false;
  } else {
    is_website_blocked_ = IsUrlBlocked(
        render_frame_->GetWebFrame()->GetSecurityOrigin().ToString().Utf8());
    base::UmaHistogramBoolean("Accessibility.LiveCaption.WebsiteBlocked",
                              is_website_blocked_);
  }

  speech_recognition_context_.set_disconnect_handler(media::BindToCurrentLoop(
      base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerDisconnected,
                     weak_factory_.GetWeakPtr())));

  // Unretained is safe because |this| owns the mojo::Remote.
  caption_host_.set_disconnect_handler(
      base::BindOnce(&ChromeSpeechRecognitionClient::OnCaptionHostDisconnected,
                     base::Unretained(this)));
}

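// Called when speech recognition becomes unavailable. Initialize() rebinds the
// Mojo connections the next time availability is restored.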
void ChromeSpeechRecognitionClient::Reset() {
  is_recognizer_bound_ = false;
  speech_recognition_context_.reset();
  speech_recognition_recognizer_.reset();
  speech_recognition_client_receiver_.reset();
  caption_host_.reset();
}

void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService(
    media::mojom::AudioDataS16Ptr audio_data) {
  DCHECK(audio_data);
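  // Audio is silently dropped while speech recognition is unavailable.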
  if (IsSpeechRecognitionAvailable()) {
    speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
        std::move(audio_data));
  }
}

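// Converts |buffer| to the interleaved signed 16-bit format expected by the
// speech recognition service, downmixing to mono first if the service does
// not support multichannel audio.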
media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    scoped_refptr<media::AudioBuffer> buffer) {
  DCHECK_GT(buffer->frame_count(), 0);
  DCHECK_GT(buffer->channel_count(), 0);
  DCHECK_GT(buffer->sample_rate(), 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = buffer->channel_count();
  signed_buffer->frame_count = buffer->frame_count();
  signed_buffer->sample_rate = buffer->sample_rate();

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
    signed_buffer->channel_count = 1;
    CopyBufferToTempAudioBus(*buffer);
    ResetChannelMixer(buffer->frame_count(), buffer->channel_layout());
    signed_buffer->data.resize(buffer->frame_count());
    channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &signed_buffer->data[0]);
    return signed_buffer;
  }

  // If the audio is already in the interleaved signed int 16 format, directly
  // assign it to the buffer.
  if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) {
    int16_t* audio_data = reinterpret_cast<int16_t*>(buffer->channel_data()[0]);
    signed_buffer->data.assign(
        audio_data,
        audio_data + buffer->frame_count() * buffer->channel_count());
    return signed_buffer;
  }

  // Convert the raw audio to the interleaved signed int 16 sample type.
  CopyBufferToTempAudioBus(*buffer);
  signed_buffer->data.resize(buffer->frame_count() * buffer->channel_count());
  temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      temp_audio_bus_->frames(), &signed_buffer->data[0]);

  return signed_buffer;
}

media::mojom::AudioDataS16Ptr
ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
    std::unique_ptr<media::AudioBus> audio_bus,
    int sample_rate,
    media::ChannelLayout channel_layout) {
  DCHECK_GT(audio_bus->frames(), 0);
  DCHECK_GT(audio_bus->channels(), 0);

  auto signed_buffer = media::mojom::AudioDataS16::New();
  signed_buffer->channel_count = audio_bus->channels();
  signed_buffer->frame_count = audio_bus->frames();
  signed_buffer->sample_rate = sample_rate;

  // If multichannel audio is not supported by the speech recognition service,
  // mix the channels into a monaural channel before converting it.
  if (audio_bus->channels() > 1 && !is_multichannel_supported_) {
    signed_buffer->channel_count = 1;
    ResetChannelMixer(audio_bus->frames(), channel_layout);
    signed_buffer->data.resize(audio_bus->frames());

    channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
    monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
        monaural_audio_bus_->frames(), &signed_buffer->data[0]);

    return signed_buffer;
  }

  signed_buffer->data.resize(audio_bus->frames() * audio_bus->channels());
  audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
      audio_bus->frames(), &signed_buffer->data[0]);

  return signed_buffer;
}

bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
  return blocked_urls_.find(url) != blocked_urls_.end();
}