1 // Copyright 2019 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/page_image_annotation/content/renderer/content_page_annotator_driver.h"
6
7 #include "base/base64.h"
8 #include "base/bind.h"
9 #include "base/optional.h"
10 #include "content/public/renderer/render_frame.h"
11 #include "crypto/sha2.h"
12 #include "services/image_annotation/public/mojom/image_annotation.mojom.h"
13 #include "third_party/blink/public/common/browser_interface_broker_proxy.h"
14 #include "third_party/blink/public/platform/web_string.h"
15 #include "third_party/blink/public/web/web_document.h"
16 #include "third_party/blink/public/web/web_local_frame.h"
17 #include "third_party/blink/public/web/web_node.h"
18
19 namespace page_image_annotation {
20
21 namespace {
22
23 namespace ia_mojom = image_annotation::mojom;
24
// Delay, in milliseconds, between a document finishing loading and the DOM
// traversal that looks for image elements (i.e. 3 seconds).
constexpr int kDomCrawlDelayMs = 3 * 1000;
28
29 // Attempts to produce image metadata for the given element. Will produce a null
30 // value if the element has a missing or malformed src attribute.
ProduceMetadata(const GURL & page_url,const blink::WebElement element,const uint64_t node_id)31 base::Optional<PageAnnotator::ImageMetadata> ProduceMetadata(
32 const GURL& page_url,
33 const blink::WebElement element,
34 const uint64_t node_id) {
35 const std::string source_id = ContentPageAnnotatorDriver::GenerateSourceId(
36 page_url, element.GetAttribute("src").Utf8());
37 if (source_id.empty())
38 return base::nullopt;
39
40 return PageAnnotator::ImageMetadata{node_id, source_id};
41 }
42
RequestAnnotator(content::RenderFrame * const render_frame)43 mojo::PendingRemote<ia_mojom::Annotator> RequestAnnotator(
44 content::RenderFrame* const render_frame) {
45 mojo::PendingRemote<ia_mojom::Annotator> annotator;
46 render_frame->GetBrowserInterfaceBroker()->GetInterface(
47 annotator.InitWithNewPipeAndPassReceiver());
48 return annotator;
49 }
50
51 } // namespace
52
ContentPageAnnotatorDriver(content::RenderFrame * const render_frame)53 ContentPageAnnotatorDriver::ContentPageAnnotatorDriver(
54 content::RenderFrame* const render_frame)
55 : RenderFrameObserver(render_frame),
56 RenderFrameObserverTracker<ContentPageAnnotatorDriver>(render_frame),
57 next_node_id_(1),
58 page_annotator_(RequestAnnotator(render_frame)) {}
59
~ContentPageAnnotatorDriver()60 ContentPageAnnotatorDriver::~ContentPageAnnotatorDriver() {}
61
62 // static
GetOrCreate(content::RenderFrame * const render_frame)63 ContentPageAnnotatorDriver* ContentPageAnnotatorDriver::GetOrCreate(
64 content::RenderFrame* const render_frame) {
65 ContentPageAnnotatorDriver* const existing = Get(render_frame);
66 if (existing)
67 return existing;
68
69 return new ContentPageAnnotatorDriver(render_frame);
70 }
71
GetPageAnnotator()72 PageAnnotator& ContentPageAnnotatorDriver::GetPageAnnotator() {
73 return page_annotator_;
74 }
75
GetElement(const uint64_t node_id)76 blink::WebElement ContentPageAnnotatorDriver::GetElement(
77 const uint64_t node_id) {
78 const auto lookup = tracked_elements_.find(node_id);
79 if (lookup == tracked_elements_.end())
80 return blink::WebElement();
81
82 return lookup->second.second;
83 }
84
85 // static
GenerateSourceId(const GURL & page_url,const std::string & uri_fragment)86 std::string ContentPageAnnotatorDriver::GenerateSourceId(
87 const GURL& page_url,
88 const std::string& uri_fragment) {
89 if (uri_fragment.empty())
90 return std::string();
91
92 const GURL src_url = page_url.Resolve(uri_fragment);
93 if (!src_url.is_valid())
94 return std::string();
95
96 // Assign a source ID: either the URL of this image (if it can be resolved) or
97 // a hash of its data URI.
98 if (src_url.SchemeIs("data")) {
99 const std::string& content = src_url.GetContent();
100
101 if (!content.empty()) {
102 // We use SHA256 since it has comparable (<2x) speed to e.g. crc32, but
103 // has no known collisions (which could lead to cached results for another
104 // image being returned for this one).
105 std::string source_id;
106 base::Base64Encode(crypto::SHA256HashString(content), &source_id);
107 return source_id;
108 }
109 } else if (src_url.SchemeIs("http") || src_url.SchemeIs("https")) {
110 return src_url.spec();
111 }
112
113 return std::string();
114 }
115
DidFinishDocumentLoad()116 void ContentPageAnnotatorDriver::DidFinishDocumentLoad() {
117 if (!render_frame()->IsMainFrame())
118 return;
119
120 // Cancel any pending DOM crawl.
121 weak_ptr_factory_.InvalidateWeakPtrs();
122
123 // Stop tracking old elements. After a page refresh, we're the only thing
124 // keeping these elements alive (i.e. they are not still being displayed to
125 // the user); we need to let Blink garbage collect them.
126 for (const auto& entry : tracked_elements_) {
127 page_annotator_.ImageRemoved(entry.first);
128 }
129 tracked_elements_.clear();
130
131 // Schedule new DOM crawl after page has likely reached a stable state.
132 //
133 // TODO(crbug.com/916363): this approach is ad-hoc (e.g. uses a heuristic
134 // delay to wait for a stable DOM) and can cause jank;
135 // reinvestigate it once we are done prototyping the
136 // feature.
137 base::ThreadTaskRunnerHandle::Get()->PostDelayedTask(
138 FROM_HERE,
139 base::BindOnce(&ContentPageAnnotatorDriver::FindAndTrackImages,
140 weak_ptr_factory_.GetWeakPtr()),
141 base::TimeDelta::FromMilliseconds(kDomCrawlDelayMs));
142 }
143
OnDestruct()144 void ContentPageAnnotatorDriver::OnDestruct() {
145 delete this;
146 }
147
FindImages(const GURL & page_url,blink::WebElement element)148 void ContentPageAnnotatorDriver::FindImages(const GURL& page_url,
149 blink::WebElement element) {
150 if (element.ImageContents().isNull()) {
151 // This element is not an image but it could have children that are.
152 for (blink::WebNode child = element.FirstChild(); !child.IsNull();
153 child = child.NextSibling()) {
154 if (child.IsElementNode())
155 FindImages(page_url, child.To<blink::WebElement>());
156 }
157 } else {
158 // This element is an image; attempt to produce metadata for it and begin
159 // tracking.
160 const base::Optional<PageAnnotator::ImageMetadata> metadata =
161 ProduceMetadata(page_url, element, next_node_id_);
162
163 if (metadata.has_value())
164 tracked_elements_.insert({next_node_id_++, {*metadata, element}});
165 }
166 }
167
FindAndTrackImages()168 void ContentPageAnnotatorDriver::FindAndTrackImages() {
169 const blink::WebDocument doc = render_frame()->GetWebFrame()->GetDocument();
170 if (doc.IsNull() || doc.Body().IsNull())
171 return;
172
173 const GURL page_url(doc.Url().GetString().Utf8(), doc.Url().GetParsed(),
174 doc.Url().IsValid());
175 FindImages(page_url, doc.Body());
176
177 // Inform the PageAnnotator of the new images.
178 for (const auto& entry : tracked_elements_) {
179 page_annotator_.ImageAddedOrPossiblyModified(
180 entry.second.first,
181 base::BindRepeating(&ContentPageAnnotatorDriver::GetBitmapForId,
182 base::Unretained(this), entry.first));
183 }
184 }
185
GetBitmapForId(const uint64_t node_id)186 SkBitmap ContentPageAnnotatorDriver::GetBitmapForId(const uint64_t node_id) {
187 return GetElement(node_id).ImageContents();
188 }
189
190 } // namespace page_image_annotation
191