1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "third_party/blink/renderer/modules/document_metadata/document_metadata_extractor.h"
6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
10
11 #include "base/metrics/histogram_functions.h"
12 #include "components/schema_org/common/metadata.mojom-blink.h"
13 #include "third_party/blink/public/mojom/document_metadata/document_metadata.mojom-blink.h"
14 #include "third_party/blink/renderer/core/dom/document.h"
15 #include "third_party/blink/renderer/core/dom/element_traversal.h"
16 #include "third_party/blink/renderer/core/frame/local_frame.h"
17 #include "third_party/blink/renderer/core/html/html_element.h"
18 #include "third_party/blink/renderer/core/html_names.h"
19 #include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h"
20 #include "third_party/blink/renderer/platform/json/json_parser.h"
21 #include "third_party/blink/renderer/platform/json/json_values.h"
22 #include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
23 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
24 #include "third_party/blink/renderer/platform/wtf/vector.h"
25
26 namespace blink {
27
28 namespace {
29
30 using mojom::blink::WebPage;
31 using mojom::blink::WebPagePtr;
32 using schema_org::mojom::blink::Entity;
33 using schema_org::mojom::blink::EntityPtr;
34 using schema_org::mojom::blink::Property;
35 using schema_org::mojom::blink::PropertyPtr;
36 using schema_org::mojom::blink::Values;
37 using schema_org::mojom::blink::ValuesPtr;
38
39 // App Indexing enforces a max nesting depth of 5. Our top level message
40 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
41 // entities up to this depth, and ignore any further nesting. If an object at the
42 // max nesting depth has a property corresponding to an entity, that property
43 // will be dropped. Note that we will still parse json-ld blocks deeper than
44 // this, but it won't be passed to App Indexing.
45 constexpr int kMaxDepth = 4;
46 // Some strings are very long, and we don't currently use those, so limit string
47 // length to something reasonable to avoid undue pressure on Icing. Note that
48 // App Indexing supports strings up to length 20k.
49 constexpr wtf_size_t kMaxStringLength = 200;
50 // Max object entries examined per entity. Enforced by App Indexing, so stop early.
51 constexpr wtf_size_t kMaxNumFields = 20;
52 // Max array elements converted per property. Enforced by App Indexing, so stop early.
53 constexpr wtf_size_t kMaxRepeatedSize = 100;
54
55 constexpr char kJSONLDKeyType[] = "@type";
56 constexpr char kJSONLDKeyGraph[] = "@graph";
IsSupportedType(AtomicString type)57 bool IsSupportedType(AtomicString type) {
58 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
59 ({// Common types that include addresses.
60 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
61 "Person", "Place", "PostalAddress", "Product",
62 "Residence", "Restaurant", "SingleFamilyResidence",
63 // Common types including phone numbers
64 "Store", "ContactPoint", "LodgingBusiness"}));
65 return type && elements.Contains(type);
66 }
67
68 void ExtractEntity(const JSONObject&, Entity&, int recursionLevel);
69
ParseRepeatedValue(const JSONArray & arr,Values & values,int recursionLevel)70 bool ParseRepeatedValue(const JSONArray& arr,
71 Values& values,
72 int recursionLevel) {
73 if (arr.size() < 1) {
74 return false;
75 }
76
77 const JSONValue::ValueType type = arr.at(0)->GetType();
78 switch (type) {
79 case JSONValue::ValueType::kTypeBoolean:
80 values.set_bool_values(Vector<bool>());
81 break;
82 case JSONValue::ValueType::kTypeInteger:
83 values.set_long_values(Vector<int64_t>());
84 break;
85 case JSONValue::ValueType::kTypeDouble:
86 // App Indexing doesn't support double type, so just encode its decimal
87 // value as a string instead.
88 values.set_string_values(Vector<String>());
89 break;
90 case JSONValue::ValueType::kTypeString:
91 values.set_string_values(Vector<String>());
92 break;
93 case JSONValue::ValueType::kTypeObject:
94 if (recursionLevel + 1 >= kMaxDepth) {
95 return false;
96 }
97 values.set_entity_values(Vector<EntityPtr>());
98 break;
99 case JSONArray::ValueType::kTypeArray:
100 // App Indexing doesn't support nested arrays.
101 return false;
102 default:
103 break;
104 }
105 for (wtf_size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) {
106 const JSONValue* innerVal = arr.at(j);
107 if (innerVal->GetType() != type) {
108 // App Indexing doesn't support mixed types. If there are mixed
109 // types in the parsed object, we will drop the property.
110 return false;
111 }
112 switch (innerVal->GetType()) {
113 case JSONValue::ValueType::kTypeBoolean: {
114 bool v;
115 innerVal->AsBoolean(&v);
116 values.get_bool_values().push_back(v);
117 } break;
118 case JSONValue::ValueType::kTypeInteger: {
119 int v;
120 innerVal->AsInteger(&v);
121 values.get_long_values().push_back(v);
122 } break;
123 case JSONValue::ValueType::kTypeDouble: {
124 // App Indexing doesn't support double type, so just encode its decimal
125 // value as a string instead.
126 double v;
127 innerVal->AsDouble(&v);
128 String s = String::Number(v);
129 s.Truncate(kMaxStringLength);
130 values.get_string_values().push_back(s);
131 } break;
132 case JSONValue::ValueType::kTypeString: {
133 String v;
134 innerVal->AsString(&v);
135 v.Truncate(kMaxStringLength);
136 values.get_string_values().push_back(v);
137 } break;
138 case JSONValue::ValueType::kTypeObject:
139 values.get_entity_values().push_back(Entity::New());
140 ExtractEntity(*(JSONObject::Cast(innerVal)),
141 *(values.get_entity_values().at(j)), recursionLevel + 1);
142 break;
143 default:
144 break;
145 }
146 }
147 return true;
148 }
149
ExtractEntity(const JSONObject & val,Entity & entity,int recursionLevel)150 void ExtractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
151 if (recursionLevel >= kMaxDepth) {
152 return;
153 }
154
155 String type;
156 val.GetString(kJSONLDKeyType, &type);
157 if (!type) {
158 type = "Thing";
159 }
160 entity.type = type;
161 for (wtf_size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
162 PropertyPtr property = Property::New();
163 const JSONObject::Entry& entry = val.at(i);
164 property->name = entry.first;
165 if (property->name == kJSONLDKeyType) {
166 continue;
167 }
168 property->values = Values::New();
169
170 bool addProperty = true;
171
172 switch (entry.second->GetType()) {
173 case JSONValue::ValueType::kTypeBoolean: {
174 bool v;
175 val.GetBoolean(entry.first, &v);
176 property->values->set_bool_values({v});
177 } break;
178 case JSONValue::ValueType::kTypeInteger: {
179 int v;
180 val.GetInteger(entry.first, &v);
181 property->values->set_long_values({v});
182 } break;
183 case JSONValue::ValueType::kTypeDouble: {
184 double v;
185 val.GetDouble(entry.first, &v);
186 String s = String::Number(v);
187 s.Truncate(kMaxStringLength);
188 property->values->set_string_values({s});
189 } break;
190 case JSONValue::ValueType::kTypeString: {
191 String v;
192 val.GetString(entry.first, &v);
193 v.Truncate(kMaxStringLength);
194 property->values->set_string_values({v});
195 } break;
196 case JSONValue::ValueType::kTypeObject: {
197 if (recursionLevel + 1 >= kMaxDepth) {
198 addProperty = false;
199 break;
200 }
201 property->values->set_entity_values(Vector<EntityPtr>());
202 property->values->get_entity_values().push_back(Entity::New());
203
204 ExtractEntity(*(val.GetJSONObject(entry.first)),
205 *(property->values->get_entity_values().at(0)),
206 recursionLevel + 1);
207 } break;
208 case JSONValue::ValueType::kTypeArray:
209 addProperty = ParseRepeatedValue(*(val.GetArray(entry.first)),
210 *(property->values), recursionLevel);
211 break;
212 default:
213 break;
214 }
215 if (addProperty)
216 entity.properties.push_back(std::move(property));
217 }
218 }
219
ExtractTopLevelEntity(const JSONObject & val,Vector<EntityPtr> & entities)220 void ExtractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
221 // Now we have a JSONObject which corresponds to a single (possibly nested)
222 // entity.
223 EntityPtr entity = Entity::New();
224 String type;
225 val.GetString(kJSONLDKeyType, &type);
226 if (!IsSupportedType(AtomicString(type))) {
227 return;
228 }
229 ExtractEntity(val, *entity, 0);
230 entities.push_back(std::move(entity));
231 }
232
ExtractEntitiesFromArray(const JSONArray & arr,Vector<EntityPtr> & entities)233 void ExtractEntitiesFromArray(const JSONArray& arr,
234 Vector<EntityPtr>& entities) {
235 for (wtf_size_t i = 0; i < arr.size(); ++i) {
236 const JSONValue* val = arr.at(i);
237 if (val->GetType() == JSONValue::ValueType::kTypeObject) {
238 ExtractTopLevelEntity(*(JSONObject::Cast(val)), entities);
239 }
240 }
241 }
242
ExtractEntityFromTopLevelObject(const JSONObject & val,Vector<EntityPtr> & entities)243 void ExtractEntityFromTopLevelObject(const JSONObject& val,
244 Vector<EntityPtr>& entities) {
245 const JSONArray* graph = val.GetArray(kJSONLDKeyGraph);
246 if (graph) {
247 ExtractEntitiesFromArray(*graph, entities);
248 }
249 ExtractTopLevelEntity(val, entities);
250 }
251
// Outcome of a metadata extraction pass.
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused.
enum ExtractionStatus {
  kOK = 0,
  kEmpty = 1,
  kParseFailure = 2,
  kWrongType = 3,
  // Must always alias the last real entry.
  kMaxValue = kWrongType,
};
261
ExtractMetadata(const Element & root,Vector<EntityPtr> & entities)262 ExtractionStatus ExtractMetadata(const Element& root,
263 Vector<EntityPtr>& entities) {
264 for (Element& element : ElementTraversal::DescendantsOf(root)) {
265 if (element.HasTagName(html_names::kScriptTag) &&
266 element.FastGetAttribute(html_names::kTypeAttr) ==
267 "application/ld+json") {
268 std::unique_ptr<JSONValue> json = ParseJSON(element.textContent());
269 if (!json) {
270 LOG(ERROR) << "Failed to parse json.";
271 return ExtractionStatus::kParseFailure;
272 }
273 switch (json->GetType()) {
274 case JSONValue::ValueType::kTypeArray:
275 ExtractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities);
276 break;
277 case JSONValue::ValueType::kTypeObject:
278 ExtractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())),
279 entities);
280 break;
281 default:
282 return ExtractionStatus::kWrongType;
283 }
284 }
285 }
286 if (entities.IsEmpty()) {
287 return ExtractionStatus::kEmpty;
288 }
289 return ExtractionStatus::kOK;
290 }
291
292 } // namespace
293
Extract(const Document & document)294 WebPagePtr DocumentMetadataExtractor::Extract(const Document& document) {
295 TRACE_EVENT0("blink", "DocumentMetadataExtractor::Extract");
296
297 if (!document.GetFrame() || !document.GetFrame()->IsMainFrame())
298 return nullptr;
299
300 Element* html = document.documentElement();
301 if (!html)
302 return nullptr;
303
304 WebPagePtr page = WebPage::New();
305
306 // Traverse the DOM tree and extract the metadata.
307 base::TimeTicks start_time = base::TimeTicks::Now();
308 ExtractionStatus status = ExtractMetadata(*html, page->entities);
309 base::TimeDelta elapsed_time = base::TimeTicks::Now() - start_time;
310
311 base::UmaHistogramEnumeration("CopylessPaste.ExtractionStatus", status);
312
313 if (status != ExtractionStatus::kOK) {
314 base::UmaHistogramCustomMicrosecondsTimes(
315 "CopylessPaste.ExtractionFailedUs", elapsed_time,
316 base::TimeDelta::FromMicroseconds(1), base::TimeDelta::FromSeconds(1),
317 50);
318 return nullptr;
319 }
320 base::UmaHistogramCustomMicrosecondsTimes(
321 "CopylessPaste.ExtractionUs", elapsed_time,
322 base::TimeDelta::FromMicroseconds(1), base::TimeDelta::FromSeconds(1),
323 50);
324
325 page->url = document.Url();
326 page->title = document.title();
327 return page;
328 }
329
330 } // namespace blink
331