1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/media/chrome_speech_recognition_client.h"
6
7 #include <utility>
8
9 #include "base/metrics/field_trial_params.h"
10 #include "base/metrics/histogram_functions.h"
11 #include "content/public/renderer/render_frame.h"
12 #include "media/base/audio_bus.h"
13 #include "media/base/audio_parameters.h"
14 #include "media/base/bind_to_current_loop.h"
15 #include "media/base/channel_mixer.h"
16 #include "media/base/media_switches.h"
17 #include "media/mojo/mojom/media_types.mojom.h"
18 #include "third_party/blink/public/common/browser_interface_broker_proxy.h"
19 #include "third_party/blink/public/platform/web_string.h"
20 #include "third_party/blink/public/web/web_frame.h"
21 #include "third_party/blink/public/web/web_local_frame.h"
22
23 // Get the list of blocked URLs defined by the Finch experiment parameter. These
24 // websites provide captions by default and thus do not require the live caption
25 // feature.
GetBlockedURLs()26 std::vector<std::string> GetBlockedURLs() {
27 return base::SplitString(base::GetFieldTrialParamValueByFeature(
28 media::kLiveCaption, "blocked_websites"),
29 ",", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
30 }
31
ChromeSpeechRecognitionClient(content::RenderFrame * render_frame,media::SpeechRecognitionClient::OnReadyCallback callback)32 ChromeSpeechRecognitionClient::ChromeSpeechRecognitionClient(
33 content::RenderFrame* render_frame,
34 media::SpeechRecognitionClient::OnReadyCallback callback)
35 : render_frame_(render_frame),
36 on_ready_callback_(std::move(callback)),
37 blocked_urls_(GetBlockedURLs()) {
38 initialize_callback_ = media::BindToCurrentLoop(base::BindRepeating(
39 &ChromeSpeechRecognitionClient::Initialize, weak_factory_.GetWeakPtr()));
40
41 send_audio_callback_ = media::BindToCurrentLoop(base::BindRepeating(
42 &ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService,
43 weak_factory_.GetWeakPtr()));
44
45 mojo::PendingReceiver<media::mojom::SpeechRecognitionClientBrowserInterface>
46 speech_recognition_client_browser_interface_receiver =
47 speech_recognition_client_browser_interface_
48 .BindNewPipeAndPassReceiver();
49 speech_recognition_client_browser_interface_
50 ->BindSpeechRecognitionAvailabilityObserver(
51 speech_recognition_availability_observer_.BindNewPipeAndPassRemote());
52
53 render_frame_->GetBrowserInterfaceBroker()->GetInterface(
54 std::move(speech_recognition_client_browser_interface_receiver));
55 }
56
OnRecognizerBound(bool is_multichannel_supported)57 void ChromeSpeechRecognitionClient::OnRecognizerBound(
58 bool is_multichannel_supported) {
59 is_multichannel_supported_ = is_multichannel_supported;
60 is_recognizer_bound_ = true;
61
62 if (on_ready_callback_)
63 std::move(on_ready_callback_).Run();
64 }
65
OnRecognizerDisconnected()66 void ChromeSpeechRecognitionClient::OnRecognizerDisconnected() {
67 is_recognizer_bound_ = false;
68 caption_host_->OnError();
69 }
70
OnCaptionHostDisconnected()71 void ChromeSpeechRecognitionClient::OnCaptionHostDisconnected() {
72 is_browser_requesting_transcription_ = false;
73 }
74
75 ChromeSpeechRecognitionClient::~ChromeSpeechRecognitionClient() = default;
76
AddAudio(scoped_refptr<media::AudioBuffer> buffer)77 void ChromeSpeechRecognitionClient::AddAudio(
78 scoped_refptr<media::AudioBuffer> buffer) {
79 DCHECK(buffer);
80 send_audio_callback_.Run(ConvertToAudioDataS16(std::move(buffer)));
81 }
82
AddAudio(std::unique_ptr<media::AudioBus> audio_bus,int sample_rate,media::ChannelLayout channel_layout)83 void ChromeSpeechRecognitionClient::AddAudio(
84 std::unique_ptr<media::AudioBus> audio_bus,
85 int sample_rate,
86 media::ChannelLayout channel_layout) {
87 DCHECK(audio_bus);
88 send_audio_callback_.Run(
89 ConvertToAudioDataS16(std::move(audio_bus), sample_rate, channel_layout));
90 }
91
IsSpeechRecognitionAvailable()92 bool ChromeSpeechRecognitionClient::IsSpeechRecognitionAvailable() {
93 // TODO(evliu): Check if SODA is available.
94 return !is_website_blocked_ && is_browser_requesting_transcription_ &&
95 is_recognizer_bound_;
96 }
97
98 // The OnReadyCallback is set by the owner of |this| and is executed when speech
99 // recognition becomes available. Setting the callback will override any
100 // existing callback.
SetOnReadyCallback(SpeechRecognitionClient::OnReadyCallback callback)101 void ChromeSpeechRecognitionClient::SetOnReadyCallback(
102 SpeechRecognitionClient::OnReadyCallback callback) {
103 on_ready_callback_ = std::move(callback);
104
105 // Immediately run the callback if speech recognition is already available.
106 if (IsSpeechRecognitionAvailable() && on_ready_callback_)
107 std::move(on_ready_callback_).Run();
108 }
109
OnSpeechRecognitionRecognitionEvent(media::mojom::SpeechRecognitionResultPtr result)110 void ChromeSpeechRecognitionClient::OnSpeechRecognitionRecognitionEvent(
111 media::mojom::SpeechRecognitionResultPtr result) {
112 caption_host_->OnTranscription(
113 chrome::mojom::TranscriptionResult::New(result->transcription,
114 result->is_final),
115 base::BindOnce(&ChromeSpeechRecognitionClient::OnTranscriptionCallback,
116 base::Unretained(this)));
117 }
118
SpeechRecognitionAvailabilityChanged(bool is_speech_recognition_available)119 void ChromeSpeechRecognitionClient::SpeechRecognitionAvailabilityChanged(
120 bool is_speech_recognition_available) {
121 if (is_speech_recognition_available) {
122 initialize_callback_.Run();
123 } else {
124 Reset();
125 }
126 }
127
OnTranscriptionCallback(bool success)128 void ChromeSpeechRecognitionClient::OnTranscriptionCallback(bool success) {
129 is_browser_requesting_transcription_ = success;
130 }
131
CopyBufferToTempAudioBus(const media::AudioBuffer & buffer)132 void ChromeSpeechRecognitionClient::CopyBufferToTempAudioBus(
133 const media::AudioBuffer& buffer) {
134 if (!temp_audio_bus_ ||
135 buffer.channel_count() != temp_audio_bus_->channels() ||
136 buffer.frame_count() != temp_audio_bus_->frames()) {
137 temp_audio_bus_ =
138 media::AudioBus::Create(buffer.channel_count(), buffer.frame_count());
139 }
140
141 buffer.ReadFrames(buffer.frame_count(),
142 /* source_frame_offset */ 0, /* dest_frame_offset */ 0,
143 temp_audio_bus_.get());
144 }
145
ResetChannelMixer(int frame_count,media::ChannelLayout channel_layout)146 void ChromeSpeechRecognitionClient::ResetChannelMixer(
147 int frame_count,
148 media::ChannelLayout channel_layout) {
149 if (!monaural_audio_bus_ || frame_count != monaural_audio_bus_->frames()) {
150 monaural_audio_bus_ =
151 media::AudioBus::Create(1 /* channels */, frame_count);
152 }
153
154 if (channel_layout != channel_layout_) {
155 channel_layout_ = channel_layout;
156 channel_mixer_ = std::make_unique<media::ChannelMixer>(
157 channel_layout, media::CHANNEL_LAYOUT_MONO);
158 }
159 }
160
Initialize()161 void ChromeSpeechRecognitionClient::Initialize() {
162 if (speech_recognition_context_.is_bound())
163 return;
164
165 mojo::PendingReceiver<media::mojom::SpeechRecognitionContext>
166 speech_recognition_context_receiver =
167 speech_recognition_context_.BindNewPipeAndPassReceiver();
168 speech_recognition_context_->BindRecognizer(
169 speech_recognition_recognizer_.BindNewPipeAndPassReceiver(),
170 speech_recognition_client_receiver_.BindNewPipeAndPassRemote(),
171 media::BindToCurrentLoop(
172 base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerBound,
173 weak_factory_.GetWeakPtr())));
174
175 render_frame_->GetBrowserInterfaceBroker()->GetInterface(
176 std::move(speech_recognition_context_receiver));
177 render_frame_->GetBrowserInterfaceBroker()->GetInterface(
178 caption_host_.BindNewPipeAndPassReceiver());
179
180 if (base::FeatureList::IsEnabled(media::kUseSodaForLiveCaption)) {
181 is_website_blocked_ = false;
182 } else {
183 is_website_blocked_ = IsUrlBlocked(
184 render_frame_->GetWebFrame()->GetSecurityOrigin().ToString().Utf8());
185 base::UmaHistogramBoolean("Accessibility.LiveCaption.WebsiteBlocked",
186 is_website_blocked_);
187 }
188
189 speech_recognition_context_.set_disconnect_handler(media::BindToCurrentLoop(
190 base::BindOnce(&ChromeSpeechRecognitionClient::OnRecognizerDisconnected,
191 weak_factory_.GetWeakPtr())));
192
193 // Unretained is safe because |this| owns the mojo::Remote.
194 caption_host_.set_disconnect_handler(
195 base::BindOnce(&ChromeSpeechRecognitionClient::OnCaptionHostDisconnected,
196 base::Unretained(this)));
197 }
198
Reset()199 void ChromeSpeechRecognitionClient::Reset() {
200 is_recognizer_bound_ = false;
201 speech_recognition_context_.reset();
202 speech_recognition_recognizer_.reset();
203 speech_recognition_client_receiver_.reset();
204 caption_host_.reset();
205 }
206
SendAudioToSpeechRecognitionService(media::mojom::AudioDataS16Ptr audio_data)207 void ChromeSpeechRecognitionClient::SendAudioToSpeechRecognitionService(
208 media::mojom::AudioDataS16Ptr audio_data) {
209 DCHECK(audio_data);
210 if (IsSpeechRecognitionAvailable()) {
211 speech_recognition_recognizer_->SendAudioToSpeechRecognitionService(
212 std::move(audio_data));
213 }
214 }
215
216 media::mojom::AudioDataS16Ptr
ConvertToAudioDataS16(scoped_refptr<media::AudioBuffer> buffer)217 ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
218 scoped_refptr<media::AudioBuffer> buffer) {
219 DCHECK_GT(buffer->frame_count(), 0);
220 DCHECK_GT(buffer->channel_count(), 0);
221 DCHECK_GT(buffer->sample_rate(), 0);
222
223 auto signed_buffer = media::mojom::AudioDataS16::New();
224 signed_buffer->channel_count = buffer->channel_count();
225 signed_buffer->frame_count = buffer->frame_count();
226 signed_buffer->sample_rate = buffer->sample_rate();
227
228 // If multichannel audio is not supported by the speech recognition service,
229 // mix the channels into a monaural channel before converting it.
230 if (buffer->channel_count() > 1 && !is_multichannel_supported_) {
231 signed_buffer->channel_count = 1;
232 CopyBufferToTempAudioBus(*buffer);
233 ResetChannelMixer(buffer->frame_count(), buffer->channel_layout());
234 signed_buffer->data.resize(buffer->frame_count());
235 channel_mixer_->Transform(temp_audio_bus_.get(), monaural_audio_bus_.get());
236 monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
237 monaural_audio_bus_->frames(), &signed_buffer->data[0]);
238 return signed_buffer;
239 }
240
241 // If the audio is already in the interleaved signed int 16 format, directly
242 // assign it to the buffer.
243 if (buffer->sample_format() == media::SampleFormat::kSampleFormatS16) {
244 int16_t* audio_data = reinterpret_cast<int16_t*>(buffer->channel_data()[0]);
245 signed_buffer->data.assign(
246 audio_data,
247 audio_data + buffer->frame_count() * buffer->channel_count());
248 return signed_buffer;
249 }
250
251 // Convert the raw audio to the interleaved signed int 16 sample type.
252 CopyBufferToTempAudioBus(*buffer);
253 signed_buffer->data.resize(buffer->frame_count() * buffer->channel_count());
254 temp_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
255 temp_audio_bus_->frames(), &signed_buffer->data[0]);
256
257 return signed_buffer;
258 }
259
260 media::mojom::AudioDataS16Ptr
ConvertToAudioDataS16(std::unique_ptr<media::AudioBus> audio_bus,int sample_rate,media::ChannelLayout channel_layout)261 ChromeSpeechRecognitionClient::ConvertToAudioDataS16(
262 std::unique_ptr<media::AudioBus> audio_bus,
263 int sample_rate,
264 media::ChannelLayout channel_layout) {
265 DCHECK_GT(audio_bus->frames(), 0);
266 DCHECK_GT(audio_bus->channels(), 0);
267
268 auto signed_buffer = media::mojom::AudioDataS16::New();
269 signed_buffer->channel_count = audio_bus->channels();
270 signed_buffer->frame_count = audio_bus->frames();
271 signed_buffer->sample_rate = sample_rate;
272
273 // If multichannel audio is not supported by the speech recognition service,
274 // mix the channels into a monaural channel before converting it.
275 if (audio_bus->channels() > 1 && !is_multichannel_supported_) {
276 signed_buffer->channel_count = 1;
277 ResetChannelMixer(audio_bus->frames(), channel_layout);
278 signed_buffer->data.resize(audio_bus->frames());
279
280 channel_mixer_->Transform(audio_bus.get(), monaural_audio_bus_.get());
281 monaural_audio_bus_->ToInterleaved<media::SignedInt16SampleTypeTraits>(
282 monaural_audio_bus_->frames(), &signed_buffer->data[0]);
283
284 return signed_buffer;
285 }
286
287 signed_buffer->data.resize(audio_bus->frames() * audio_bus->channels());
288 audio_bus->ToInterleaved<media::SignedInt16SampleTypeTraits>(
289 audio_bus->frames(), &signed_buffer->data[0]);
290
291 return signed_buffer;
292 }
293
IsUrlBlocked(const std::string & url) const294 bool ChromeSpeechRecognitionClient::IsUrlBlocked(const std::string& url) const {
295 return blocked_urls_.find(url) != blocked_urls_.end();
296 }
297