1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/dom_distiller/core/distiller.h"
6 
7 #include <map>
8 #include <memory>
9 #include <utility>
10 #include <vector>
11 
12 #include "base/auto_reset.h"
13 #include "base/bind.h"
14 #include "base/callback.h"
15 #include "base/location.h"
16 #include "base/memory/ptr_util.h"
17 #include "base/metrics/histogram_macros.h"
18 #include "base/single_thread_task_runner.h"
19 #include "base/strings/string_number_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "base/threading/thread_task_runner_handle.h"
22 #include "base/values.h"
23 #include "build/build_config.h"
24 #include "components/dom_distiller/core/distiller_page.h"
25 #include "components/dom_distiller/core/distiller_url_fetcher.h"
26 #include "components/dom_distiller/core/proto/distilled_article.pb.h"
27 #include "components/dom_distiller/core/proto/distilled_page.pb.h"
28 
namespace {
// Upper bound on the number of distilled pages a single article may contain.
// DistillerImpl starts out with this value as its page limit.
constexpr size_t kMaxPagesInArticle = 32;
}  // namespace
33 
34 namespace dom_distiller {
35 
DistillerFactoryImpl(std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,const dom_distiller::proto::DomDistillerOptions & dom_distiller_options)36 DistillerFactoryImpl::DistillerFactoryImpl(
37     std::unique_ptr<DistillerURLFetcherFactory> distiller_url_fetcher_factory,
38     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
39     : distiller_url_fetcher_factory_(std::move(distiller_url_fetcher_factory)),
40       dom_distiller_options_(dom_distiller_options) {}
41 
~DistillerFactoryImpl()42 DistillerFactoryImpl::~DistillerFactoryImpl() {}
43 
CreateDistillerForUrl(const GURL & unused)44 std::unique_ptr<Distiller> DistillerFactoryImpl::CreateDistillerForUrl(
45     const GURL& unused) {
46   // This default implementation has the same behavior for all URLs.
47   std::unique_ptr<DistillerImpl> distiller(new DistillerImpl(
48       *distiller_url_fetcher_factory_, dom_distiller_options_));
49   return std::move(distiller);
50 }
51 
DistilledPageData()52 DistillerImpl::DistilledPageData::DistilledPageData() {}
53 
~DistilledPageData()54 DistillerImpl::DistilledPageData::~DistilledPageData() {}
55 
DistillerImpl(const DistillerURLFetcherFactory & distiller_url_fetcher_factory,const dom_distiller::proto::DomDistillerOptions & dom_distiller_options)56 DistillerImpl::DistillerImpl(
57     const DistillerURLFetcherFactory& distiller_url_fetcher_factory,
58     const dom_distiller::proto::DomDistillerOptions& dom_distiller_options)
59     : distiller_url_fetcher_factory_(distiller_url_fetcher_factory),
60       dom_distiller_options_(dom_distiller_options),
61       max_pages_in_article_(kMaxPagesInArticle),
62       destruction_allowed_(true) {}
63 
~DistillerImpl()64 DistillerImpl::~DistillerImpl() {
65   DCHECK(destruction_allowed_);
66 }
67 
DoesFetchImages()68 bool DistillerImpl::DoesFetchImages() {
69 // Only iOS makes use of the fetched image data.
70 #if defined(OS_IOS)
71   return true;
72 #else
73   return false;
74 #endif
75 }
76 
SetMaxNumPagesInArticle(size_t max_num_pages)77 void DistillerImpl::SetMaxNumPagesInArticle(size_t max_num_pages) {
78   max_pages_in_article_ = max_num_pages;
79 }
80 
AreAllPagesFinished() const81 bool DistillerImpl::AreAllPagesFinished() const {
82   return started_pages_index_.empty() && waiting_pages_.empty();
83 }
84 
TotalPageCount() const85 size_t DistillerImpl::TotalPageCount() const {
86   return waiting_pages_.size() + started_pages_index_.size() +
87          finished_pages_index_.size();
88 }
89 
AddToDistillationQueue(int page_num,const GURL & url)90 void DistillerImpl::AddToDistillationQueue(int page_num, const GURL& url) {
91   if (!IsPageNumberInUse(page_num) && url.is_valid() &&
92       TotalPageCount() < max_pages_in_article_ &&
93       seen_urls_.find(url.spec()) == seen_urls_.end()) {
94     waiting_pages_[page_num] = url;
95   }
96 }
97 
IsPageNumberInUse(int page_num) const98 bool DistillerImpl::IsPageNumberInUse(int page_num) const {
99   return waiting_pages_.find(page_num) != waiting_pages_.end() ||
100          started_pages_index_.find(page_num) != started_pages_index_.end() ||
101          finished_pages_index_.find(page_num) != finished_pages_index_.end();
102 }
103 
GetPageAtIndex(size_t index) const104 DistillerImpl::DistilledPageData* DistillerImpl::GetPageAtIndex(
105     size_t index) const {
106   DCHECK_LT(index, pages_.size());
107   DistilledPageData* page_data = pages_[index].get();
108   DCHECK(page_data);
109   return page_data;
110 }
111 
DistillPage(const GURL & url,std::unique_ptr<DistillerPage> distiller_page,DistillationFinishedCallback finished_cb,const DistillationUpdateCallback & update_cb)112 void DistillerImpl::DistillPage(const GURL& url,
113                                 std::unique_ptr<DistillerPage> distiller_page,
114                                 DistillationFinishedCallback finished_cb,
115                                 const DistillationUpdateCallback& update_cb) {
116   DCHECK(AreAllPagesFinished());
117   distiller_page_ = std::move(distiller_page);
118   finished_cb_ = std::move(finished_cb);
119   update_cb_ = update_cb;
120 
121   AddToDistillationQueue(0, url);
122   DistillNextPage();
123 }
124 
DistillNextPage()125 void DistillerImpl::DistillNextPage() {
126   if (!waiting_pages_.empty()) {
127     auto front = waiting_pages_.begin();
128     int page_num = front->first;
129     const GURL url = front->second;
130 
131     waiting_pages_.erase(front);
132     DCHECK(url.is_valid());
133     DCHECK(started_pages_index_.find(page_num) == started_pages_index_.end());
134     DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
135     seen_urls_.insert(url.spec());
136     pages_.push_back(std::make_unique<DistilledPageData>());
137     started_pages_index_[page_num] = pages_.size() - 1;
138 
139     // TODO(gilmanmh): Investigate whether this needs to be
140     // base::BindRepeating() or if base::BindOnce() can be used instead.
141     distiller_page_->DistillPage(
142         url, dom_distiller_options_,
143         base::BindRepeating(&DistillerImpl::OnPageDistillationFinished,
144                             weak_factory_.GetWeakPtr(), page_num, url));
145   }
146 }
147 
OnPageDistillationFinished(int page_num,const GURL & page_url,std::unique_ptr<proto::DomDistillerResult> distiller_result,bool distillation_successful)148 void DistillerImpl::OnPageDistillationFinished(
149     int page_num,
150     const GURL& page_url,
151     std::unique_ptr<proto::DomDistillerResult> distiller_result,
152     bool distillation_successful) {
153   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
154   if (!distillation_successful) {
155     started_pages_index_.erase(page_num);
156     RunDistillerCallbackIfDone();
157     return;
158   }
159 
160   if (distiller_result->has_statistics_info() && page_num == 0) {
161     if (distiller_result->statistics_info().has_word_count()) {
162       UMA_HISTOGRAM_CUSTOM_COUNTS(
163           "DomDistiller.Statistics.FirstPageWordCount",
164           distiller_result->statistics_info().word_count(), 1, 4000, 50);
165     }
166   }
167 
168   DCHECK(distiller_result);
169   CHECK_LT(started_pages_index_[page_num], pages_.size())
170       << "started_pages_index_[" << page_num
171       << "] (=" << started_pages_index_[page_num] << ") is out of range.";
172   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
173   CHECK(page_data) << "GetPageAtIndex(started_pages_index_[" << page_num
174                    << "] (=" << started_pages_index_[page_num]
175                    << ")) returns nullptr. pages_.size() = " << pages_.size()
176                    << ".";
177   page_data->distilled_page_proto =
178       new base::RefCountedData<DistilledPageProto>();
179   page_data->page_num = page_num;
180   if (distiller_result->has_title()) {
181     page_data->distilled_page_proto->data.set_title(distiller_result->title());
182   }
183   page_data->distilled_page_proto->data.set_url(page_url.spec());
184   bool content_empty = true;
185   if (distiller_result->has_distilled_content() &&
186       distiller_result->distilled_content().has_html()) {
187     page_data->distilled_page_proto->data.set_html(
188         distiller_result->distilled_content().html());
189     if (!distiller_result->distilled_content().html().empty()) {
190       content_empty = false;
191     }
192   }
193 
194   if (distiller_result->has_timing_info()) {
195     const proto::TimingInfo& distiller_timing_info =
196         distiller_result->timing_info();
197     DistilledPageProto::TimingInfo timing_info;
198     if (distiller_timing_info.has_markup_parsing_time()) {
199       timing_info.set_name("markup_parsing");
200       timing_info.set_time(distiller_timing_info.markup_parsing_time());
201       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
202     }
203 
204     if (distiller_timing_info.has_document_construction_time()) {
205       timing_info.set_name("document_construction");
206       timing_info.set_time(distiller_timing_info.document_construction_time());
207       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
208     }
209 
210     if (distiller_timing_info.has_article_processing_time()) {
211       timing_info.set_name("article_processing");
212       timing_info.set_time(distiller_timing_info.article_processing_time());
213       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
214     }
215 
216     if (distiller_timing_info.has_formatting_time()) {
217       timing_info.set_name("formatting");
218       timing_info.set_time(distiller_timing_info.formatting_time());
219       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
220     }
221 
222     if (distiller_timing_info.has_total_time()) {
223       timing_info.set_name("total");
224       timing_info.set_time(distiller_timing_info.total_time());
225       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
226     }
227 
228     for (int i = 0; i < distiller_timing_info.other_times_size(); i++) {
229       timing_info.set_name(distiller_timing_info.other_times(i).name());
230       timing_info.set_time(distiller_timing_info.other_times(i).time());
231       *page_data->distilled_page_proto->data.add_timing_info() = timing_info;
232     }
233   }
234 
235   if (distiller_result->has_debug_info() &&
236       distiller_result->debug_info().has_log()) {
237     page_data->distilled_page_proto->data.mutable_debug_info()->set_log(
238         distiller_result->debug_info().log());
239   }
240 
241   if (distiller_result->has_text_direction()) {
242     page_data->distilled_page_proto->data.set_text_direction(
243         distiller_result->text_direction());
244   } else {
245     page_data->distilled_page_proto->data.set_text_direction("auto");
246   }
247 
248   if (distiller_result->has_pagination_info()) {
249     const proto::PaginationInfo& pagination_info =
250         distiller_result->pagination_info();
251     // Skip the next page if the first page is empty.
252     if (pagination_info.has_next_page() && (page_num != 0 || !content_empty)) {
253       GURL next_page_url(pagination_info.next_page());
254       if (next_page_url.is_valid()) {
255         // The pages should be in same origin.
256         DCHECK_EQ(next_page_url.GetOrigin(), page_url.GetOrigin());
257         AddToDistillationQueue(page_num + 1, next_page_url);
258         page_data->distilled_page_proto->data.mutable_pagination_info()
259             ->set_next_page(next_page_url.spec());
260       }
261     }
262 
263     if (pagination_info.has_prev_page()) {
264       GURL prev_page_url(pagination_info.prev_page());
265       if (prev_page_url.is_valid()) {
266         DCHECK_EQ(prev_page_url.GetOrigin(), page_url.GetOrigin());
267         AddToDistillationQueue(page_num - 1, prev_page_url);
268         page_data->distilled_page_proto->data.mutable_pagination_info()
269             ->set_prev_page(prev_page_url.spec());
270       }
271     }
272 
273     if (pagination_info.has_canonical_page()) {
274       GURL canonical_page_url(pagination_info.canonical_page());
275       if (canonical_page_url.is_valid()) {
276         page_data->distilled_page_proto->data.mutable_pagination_info()
277             ->set_canonical_page(canonical_page_url.spec());
278       }
279     }
280   }
281 
282   for (int img_num = 0; img_num < distiller_result->content_images_size();
283        ++img_num) {
284     std::string image_id = base::NumberToString(page_num + 1) + "_" +
285                            base::NumberToString(img_num);
286     MaybeFetchImage(page_num, image_id,
287                     distiller_result->content_images(img_num).url());
288   }
289 
290   AddPageIfDone(page_num);
291   DistillNextPage();
292 }
293 
MaybeFetchImage(int page_num,const std::string & image_id,const std::string & image_url)294 void DistillerImpl::MaybeFetchImage(int page_num,
295                                     const std::string& image_id,
296                                     const std::string& image_url) {
297   if (!GURL(image_url).is_valid())
298     return;
299   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
300   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
301 
302   if (!DoesFetchImages()) {
303     DistilledPageProto_Image* image =
304         page_data->distilled_page_proto->data.add_image();
305     image->set_name(image_id);
306     image->set_url(image_url);
307     return;
308   }
309 
310   DistillerURLFetcher* fetcher =
311       distiller_url_fetcher_factory_.CreateDistillerURLFetcher();
312   page_data->image_fetchers_.push_back(base::WrapUnique(fetcher));
313 
314   // TODO(gilmanmh): Investigate whether this needs to be base::BindRepeating()
315   // or if base::BindOnce() can be used instead.
316   fetcher->FetchURL(
317       image_url,
318       base::BindRepeating(&DistillerImpl::OnFetchImageDone,
319                           weak_factory_.GetWeakPtr(), page_num,
320                           base::Unretained(fetcher), image_id, image_url));
321 }
322 
OnFetchImageDone(int page_num,DistillerURLFetcher * url_fetcher,const std::string & id,const std::string & original_url,const std::string & response)323 void DistillerImpl::OnFetchImageDone(int page_num,
324                                      DistillerURLFetcher* url_fetcher,
325                                      const std::string& id,
326                                      const std::string& original_url,
327                                      const std::string& response) {
328   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
329   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
330   DCHECK(page_data->distilled_page_proto);
331   DCHECK(url_fetcher);
332   auto fetcher_it = std::find_if(
333       page_data->image_fetchers_.begin(), page_data->image_fetchers_.end(),
334       [url_fetcher](const std::unique_ptr<DistillerURLFetcher>& f) {
335         return url_fetcher == f.get();
336       });
337 
338   DCHECK(fetcher_it != page_data->image_fetchers_.end());
339   // Delete the |url_fetcher| by DeleteSoon since the OnFetchImageDone
340   // callback is invoked by the |url_fetcher|.
341   fetcher_it->release();
342   page_data->image_fetchers_.erase(fetcher_it);
343   base::ThreadTaskRunnerHandle::Get()->DeleteSoon(FROM_HERE, url_fetcher);
344 
345   DistilledPageProto_Image* image =
346       page_data->distilled_page_proto->data.add_image();
347   image->set_name(id);
348   image->set_data(response);
349   image->set_url(original_url);
350 
351   AddPageIfDone(page_num);
352 }
353 
AddPageIfDone(int page_num)354 void DistillerImpl::AddPageIfDone(int page_num) {
355   DCHECK(started_pages_index_.find(page_num) != started_pages_index_.end());
356   DCHECK(finished_pages_index_.find(page_num) == finished_pages_index_.end());
357   DistilledPageData* page_data = GetPageAtIndex(started_pages_index_[page_num]);
358   if (page_data->image_fetchers_.empty()) {
359     finished_pages_index_[page_num] = started_pages_index_[page_num];
360     started_pages_index_.erase(page_num);
361     const ArticleDistillationUpdate& article_update =
362         CreateDistillationUpdate();
363     DCHECK_EQ(article_update.GetPagesSize(), finished_pages_index_.size());
364     update_cb_.Run(article_update);
365     RunDistillerCallbackIfDone();
366   }
367 }
368 
CreateDistillationUpdate() const369 const ArticleDistillationUpdate DistillerImpl::CreateDistillationUpdate()
370     const {
371   bool has_prev_page = false;
372   bool has_next_page = false;
373   if (!finished_pages_index_.empty()) {
374     int prev_page_num = finished_pages_index_.begin()->first - 1;
375     int next_page_num = finished_pages_index_.rbegin()->first + 1;
376     has_prev_page = IsPageNumberInUse(prev_page_num);
377     has_next_page = IsPageNumberInUse(next_page_num);
378   }
379 
380   std::vector<scoped_refptr<ArticleDistillationUpdate::RefCountedPageProto>>
381       update_pages;
382   for (auto it = finished_pages_index_.begin();
383        it != finished_pages_index_.end(); ++it) {
384     update_pages.push_back(pages_[it->second]->distilled_page_proto);
385   }
386   return ArticleDistillationUpdate(update_pages, has_next_page, has_prev_page);
387 }
388 
RunDistillerCallbackIfDone()389 void DistillerImpl::RunDistillerCallbackIfDone() {
390   DCHECK(!finished_cb_.is_null());
391   if (AreAllPagesFinished()) {
392     bool first_page = true;
393     std::unique_ptr<DistilledArticleProto> article_proto(
394         new DistilledArticleProto());
395     // Stitch the pages back into the article.
396     for (auto it = finished_pages_index_.begin();
397          it != finished_pages_index_.end();) {
398       DistilledPageData* page_data = GetPageAtIndex(it->second);
399       *(article_proto->add_pages()) = page_data->distilled_page_proto->data;
400 
401       if (first_page) {
402         article_proto->set_title(page_data->distilled_page_proto->data.title());
403         first_page = false;
404       }
405 
406       finished_pages_index_.erase(it++);
407     }
408 
409     pages_.clear();
410     DCHECK_LE(static_cast<size_t>(article_proto->pages_size()),
411               max_pages_in_article_);
412 
413     DCHECK(pages_.empty());
414     DCHECK(finished_pages_index_.empty());
415 
416     base::AutoReset<bool> dont_delete_this_in_callback(&destruction_allowed_,
417                                                        false);
418     std::move(finished_cb_).Run(std::move(article_proto));
419   }
420 }
421 
422 }  // namespace dom_distiller
423