1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/dom_distiller/core/distiller.h"
6
7 #include <map>
8 #include <memory>
9 #include <utility>
10 #include <vector>
11
12 #include "base/auto_reset.h"
13 #include "base/bind.h"
14 #include "base/callback.h"
15 #include "base/location.h"
16 #include "base/memory/ptr_util.h"
17 #include "base/metrics/histogram_macros.h"
18 #include "base/single_thread_task_runner.h"
19 #include "base/strings/string_number_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/threading/thread_task_runner_handle.h"
22 #include "base/values.h"
23 #include "build/build_config.h"
24 #include "components/dom_distiller/core/distiller_page.h"
25 #include "components/dom_distiller/core/distiller_url_fetcher.h"
26 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
27 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
28
namespace {

// Default upper bound on the number of distilled pages a single article may
// contain; DistillerImpl starts with this limit.
constexpr size_t kMaxPagesInArticle = 32;

}  // namespace
33
34 namespace dom_distiller {
35
DistillerFactoryImpl(std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,const dom_distiller::proto::DomDistillerOptions & dom_distiller_options)36 DistillerFactoryImpl::DistillerFactoryImpl(
37 std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
38 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
39 : distiller_url_fetcher_factory_(std::move(distiller_url_fetcher_factory)),
40 dom_distiller_options_(dom_distiller_options) {}
41
~DistillerFactoryImpl()42 DistillerFactoryImpl::~DistillerFactoryImpl() {}
43
CreateDistillerForUrl(const GURL & unused)44 std::unique_ptr<Distiller> DistillerFactoryImpl::CreateDistillerForUrl(
45 const GURL& unused) {
46 // This default implementation has the same behavior for all URLs.
47 std::unique_ptr<DistillerImpl> distiller(new DistillerImpl(
48 *distiller_url_fetcher_factory_, dom_distiller_options_));
49 return std::move(distiller);
50 }
51
DistilledPageData()52 DistillerImpl::DistilledPageData::DistilledPageData() {}
53
~DistilledPageData()54 DistillerImpl::DistilledPageData::~DistilledPageData() {}
55
DistillerImpl(const DistillerURLFetcherFactory & distiller_url_fetcher_factory,const dom_distiller::proto::DomDistillerOptions & dom_distiller_options)56 DistillerImpl::DistillerImpl(
57 const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
58 const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
59 : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
60 dom_distiller_options_(dom_distiller_options),
61 max_pages_in_article_(kMaxPagesInArticle),
62 destruction_allowed_(true) {}
63
~DistillerImpl()64 DistillerImpl::~DistillerImpl() {
65 DCHECK(destruction_allowed_);
66 }
67
DoesFetchImages()68 bool DistillerImpl::DoesFetchImages() {
69 // Only iOS makes use of the fetched image data.
70 #if defined(OS_IOS)
71 return true;
72 #else
73 return false;
74 #endif
75 }
76
SetMaxNumPagesInArticle(size_t max_num_pages)77 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
78 max_pages_in_article_ = max_num_pages;
79 }
80
AreAllPagesFinished() const81 bool DistillerImpl::AreAllPagesFinished() const {
82 return started_pages_index_.empty() && waiting_pages_.empty();
83 }
84
TotalPageCount() const85 size_t DistillerImpl::TotalPageCount() const {
86 return waiting_pages_.size() + started_pages_index_.size() +
87 finished_pages_index_.size();
88 }
89
AddToDistillationQueue(int page_num,const GURL & url)90 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
91 if (!IsPageNumberInUse(page_num) && url.is_valid() &&
92 TotalPageCount() < max_pages_in_article_ &&
93 seen_urls_.find(url.spec()) == seen_urls_.end()) {
94 waiting_pages_[page_num] = url;
95 }
96 }
97
IsPageNumberInUse(int page_num) const98 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
99 return waiting_pages_.find(page_num) != waiting_pages_.end() ||
100 started_pages_index_.find(page_num) != started_pages_index_.end() ||
101 finished_pages_index_.find(page_num) != finished_pages_index_.end();
102 }
103
GetPageAtIndex(size_t index) const104 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(
105 size_t index) const {
106 DCHECK_LT(index, pages_.size());
107 DistilledPageData* page_data = pages_[index].get();
108 DCHECK(page_data);
109 return page_data;
110 }
111
DistillPage(const GURL & url,std::unique_ptr<DistillerPage> distiller_page,DistillationFinishedCallback finished_cb,const DistillationUpdateCallback & update_cb)112 void DistillerImpl::DistillPage(const GURL& url,
113 std::unique_ptr<DistillerPage> distiller_page,
114 DistillationFinishedCallback finished_cb,
115 const DistillationUpdateCallback& update_cb) {
116 DCHECK(AreAllPagesFinished());
117 distiller_page_ = std::move(distiller_page);
118 finished_cb_ = std::move(finished_cb);
119 update_cb_ = update_cb;
120
121 AddToDistillationQueue(0, url);
122 DistillNextPage();
123 }
124
DistillNextPage()125 void DistillerImpl::DistillNextPage() {
126 if (!waiting_pages_.empty()) {
127 auto front = waiting_pages_.begin();
128 int page_num = front->first;
129 const GURL url = front->second;
130
131 waiting_pages_.erase(front);
132 DCHECK(url.is_valid());
133 DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
134 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
135 seen_urls_.insert(url.spec());
136 pages_.push_back(std::make_unique<DistilledPageData>());
137 started_pages_index_[page_num] = pages_.size() - 1;
138
139 // TODO(gilmanmh): Investigate whether this needs to be
140 // base::BindRepeating() or if base::BindOnce() can be used instead.
141 distiller_page_->DistillPage(
142 url, dom_distiller_options_,
143 base::BindRepeating(&DistillerImpl::OnPageDistillationFinished,
144 weak_factory_.GetWeakPtr(), page_num, url));
145 }
146 }
147
OnPageDistillationFinished(int page_num,const GURL & page_url,std::unique_ptr<proto::DomDistillerResult> distiller_result,bool distillation_successful)148 void DistillerImpl::OnPageDistillationFinished(
149 int page_num,
150 const GURL& page_url,
151 std::unique_ptr<proto::DomDistillerResult> distiller_result,
152 bool distillation_successful) {
153 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
154 if (!distillation_successful) {
155 started_pages_index_.erase(page_num);
156 RunDistillerCallbackIfDone();
157 return;
158 }
159
160 if (distiller_result->has_statistics_info() && page_num == 0) {
161 if (distiller_result->statistics_info().has_word_count()) {
162 UMA_HISTOGRAM_CUSTOM_COUNTS(
163 "DomDistiller.Statistics.FirstPageWordCount",
164 distiller_result->statistics_info().word_count(), 1, 4000, 50);
165 }
166 }
167
168 DCHECK(distiller_result);
169 CHECK_LT(started_pages_index_[page_num], pages_.size())
170 << "started_pages_index_[" << page_num
171 << "] (=" << started_pages_index_[page_num] << ") is out of range.";
172 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
173 CHECK(page_data) << "GetPageAtIndex(started_pages_index_[" << page_num
174 << "] (=" << started_pages_index_[page_num]
175 << ")) returns nullptr. pages_.size() = " << pages_.size()
176 << ".";
177 page_data->distilled_page_proto =
178 new base::RefCountedData<DistilledPageProto>();
179 page_data->page_num = page_num;
180 if (distiller_result->has_title()) {
181 page_data->distilled_page_proto->data.set_title(distiller_result->title());
182 }
183 page_data->distilled_page_proto->data.set_url(page_url.spec());
184 bool content_empty = true;
185 if (distiller_result->has_distilled_content() &&
186 distiller_result->distilled_content().has_html()) {
187 page_data->distilled_page_proto->data.set_html(
188 distiller_result->distilled_content().html());
189 if (!distiller_result->distilled_content().html().empty()) {
190 content_empty = false;
191 }
192 }
193
194 if (distiller_result->has_timing_info()) {
195 const proto::TimingInfo& distiller_timing_info =
196 distiller_result->timing_info();
197 DistilledPageProto::TimingInfo timing_info;
198 if (distiller_timing_info.has_markup_parsing_time()) {
199 timing_info.set_name("markup_parsing");
200 timing_info.set_time(distiller_timing_info.markup_parsing_time());
201 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
202 }
203
204 if (distiller_timing_info.has_document_construction_time()) {
205 timing_info.set_name("document_construction");
206 timing_info.set_time(distiller_timing_info.document_construction_time());
207 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
208 }
209
210 if (distiller_timing_info.has_article_processing_time()) {
211 timing_info.set_name("article_processing");
212 timing_info.set_time(distiller_timing_info.article_processing_time());
213 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
214 }
215
216 if (distiller_timing_info.has_formatting_time()) {
217 timing_info.set_name("formatting");
218 timing_info.set_time(distiller_timing_info.formatting_time());
219 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
220 }
221
222 if (distiller_timing_info.has_total_time()) {
223 timing_info.set_name("total");
224 timing_info.set_time(distiller_timing_info.total_time());
225 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
226 }
227
228 for (int i = 0; i < distiller_timing_info.other_times_size(); i++) {
229 timing_info.set_name(distiller_timing_info.other_times(i).name());
230 timing_info.set_time(distiller_timing_info.other_times(i).time());
231 *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
232 }
233 }
234
235 if (distiller_result->has_debug_info() &&
236 distiller_result->debug_info().has_log()) {
237 page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
238 distiller_result->debug_info().log());
239 }
240
241 if (distiller_result->has_text_direction()) {
242 page_data->distilled_page_proto->data.set_text_direction(
243 distiller_result->text_direction());
244 } else {
245 page_data->distilled_page_proto->data.set_text_direction("auto");
246 }
247
248 if (distiller_result->has_pagination_info()) {
249 const proto::PaginationInfo& pagination_info =
250 distiller_result->pagination_info();
251 // Skip the next page if the first page is empty.
252 if (pagination_info.has_next_page() && (page_num != 0 || !content_empty)) {
253 GURL next_page_url(pagination_info.next_page());
254 if (next_page_url.is_valid()) {
255 // The pages should be in same origin.
256 DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
257 AddToDistillationQueue(page_num + 1, next_page_url);
258 page_data->distilled_page_proto->data.mutable_pagination_info()
259 ->set_next_page(next_page_url.spec());
260 }
261 }
262
263 if (pagination_info.has_prev_page()) {
264 GURL prev_page_url(pagination_info.prev_page());
265 if (prev_page_url.is_valid()) {
266 DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
267 AddToDistillationQueue(page_num - 1, prev_page_url);
268 page_data->distilled_page_proto->data.mutable_pagination_info()
269 ->set_prev_page(prev_page_url.spec());
270 }
271 }
272
273 if (pagination_info.has_canonical_page()) {
274 GURL canonical_page_url(pagination_info.canonical_page());
275 if (canonical_page_url.is_valid()) {
276 page_data->distilled_page_proto->data.mutable_pagination_info()
277 ->set_canonical_page(canonical_page_url.spec());
278 }
279 }
280 }
281
282 for (int img_num = 0; img_num < distiller_result->content_images_size();
283 ++img_num) {
284 std::string image_id = base::NumberToString(page_num + 1) + "_" +
285 base::NumberToString(img_num);
286 MaybeFetchImage(page_num, image_id,
287 distiller_result->content_images(img_num).url());
288 }
289
290 AddPageIfDone(page_num);
291 DistillNextPage();
292 }
293
MaybeFetchImage(int page_num,const std::string & image_id,const std::string & image_url)294 void DistillerImpl::MaybeFetchImage(int page_num,
295 const std::string& image_id,
296 const std::string& image_url) {
297 if (!GURL(image_url).is_valid())
298 return;
299 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
300 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
301
302 if (!DoesFetchImages()) {
303 DistilledPageProto_Image* image =
304 page_data->distilled_page_proto->data.add_image();
305 image->set_name(image_id);
306 image->set_url(image_url);
307 return;
308 }
309
310 DistillerURLFetcher* fetcher =
311 distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
312 page_data->image_fetchers_.push_back(base::WrapUnique(fetcher));
313
314 // TODO(gilmanmh): Investigate whether this needs to be base::BindRepeating()
315 // or if base::BindOnce() can be used instead.
316 fetcher->FetchURL(
317 image_url,
318 base::BindRepeating(&DistillerImpl::OnFetchImageDone,
319 weak_factory_.GetWeakPtr(), page_num,
320 base::Unretained(fetcher), image_id, image_url));
321 }
322
OnFetchImageDone(int page_num,DistillerURLFetcher * url_fetcher,const std::string & id,const std::string & original_url,const std::string & response)323 void DistillerImpl::OnFetchImageDone(int page_num,
324 DistillerURLFetcher* url_fetcher,
325 const std::string& id,
326 const std::string& original_url,
327 const std::string& response) {
328 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
329 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
330 DCHECK(page_data->distilled_page_proto);
331 DCHECK(url_fetcher);
332 auto fetcher_it = std::find_if(
333 page_data->image_fetchers_.begin(), page_data->image_fetchers_.end(),
334 [url_fetcher](const std::unique_ptr<DistillerURLFetcher>& f) {
335 return url_fetcher == f.get();
336 });
337
338 DCHECK(fetcher_it != page_data->image_fetchers_.end());
339 // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
340 // callback is invoked by the |url_fetcher|.
341 fetcher_it->release();
342 page_data->image_fetchers_.erase(fetcher_it);
343 base::ThreadTaskRunnerHandle::Get()->DeleteSoon(FROM_HERE, url_fetcher);
344
345 DistilledPageProto_Image* image =
346 page_data->distilled_page_proto->data.add_image();
347 image->set_name(id);
348 image->set_data(response);
349 image->set_url(original_url);
350
351 AddPageIfDone(page_num);
352 }
353
AddPageIfDone(int page_num)354 void DistillerImpl::AddPageIfDone(int page_num) {
355 DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
356 DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
357 DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
358 if (page_data->image_fetchers_.empty()) {
359 finished_pages_index_[page_num] = started_pages_index_[page_num];
360 started_pages_index_.erase(page_num);
361 const ArticleDistillationUpdate& article_update =
362 CreateDistillationUpdate();
363 DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
364 update_cb_.Run(article_update);
365 RunDistillerCallbackIfDone();
366 }
367 }
368
CreateDistillationUpdate() const369 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
370 const {
371 bool has_prev_page = false;
372 bool has_next_page = false;
373 if (!finished_pages_index_.empty()) {
374 int prev_page_num = finished_pages_index_.begin()->first - 1;
375 int next_page_num = finished_pages_index_.rbegin()->first + 1;
376 has_prev_page = IsPageNumberInUse(prev_page_num);
377 has_next_page = IsPageNumberInUse(next_page_num);
378 }
379
380 std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto>>
381 update_pages;
382 for (auto it = finished_pages_index_.begin();
383 it != finished_pages_index_.end(); ++it) {
384 update_pages.push_back(pages_[it->second]->distilled_page_proto);
385 }
386 return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
387 }
388
RunDistillerCallbackIfDone()389 void DistillerImpl::RunDistillerCallbackIfDone() {
390 DCHECK(!finished_cb_.is_null());
391 if (AreAllPagesFinished()) {
392 bool first_page = true;
393 std::unique_ptr<DistilledArticleProto> article_proto(
394 new DistilledArticleProto());
395 // Stitch the pages back into the article.
396 for (auto it = finished_pages_index_.begin();
397 it != finished_pages_index_.end();) {
398 DistilledPageData* page_data = GetPageAtIndex(it->second);
399 *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
400
401 if (first_page) {
402 article_proto->set_title(page_data->distilled_page_proto->data.title());
403 first_page = false;
404 }
405
406 finished_pages_index_.erase(it++);
407 }
408
409 pages_.clear();
410 DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
411 max_pages_in_article_);
412
413 DCHECK(pages_.empty());
414 DCHECK(finished_pages_index_.empty());
415
416 base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
417 false);
418 std::move(finished_cb_).Run(std::move(article_proto));
419 }
420 }
421
422 } // namespace dom_distiller
423