1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "third_party/blink/renderer/modules/document_metadata/document_metadata_extractor.h"
6 
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
10 
11 #include "base/metrics/histogram_functions.h"
12 #include "components/schema_org/common/metadata.mojom-blink.h"
13 #include "third_party/blink/public/mojom/document_metadata/document_metadata.mojom-blink.h"
14 #include "third_party/blink/renderer/core/dom/document.h"
15 #include "third_party/blink/renderer/core/dom/element_traversal.h"
16 #include "third_party/blink/renderer/core/frame/local_frame.h"
17 #include "third_party/blink/renderer/core/html/html_element.h"
18 #include "third_party/blink/renderer/core/html_names.h"
19 #include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h"
20 #include "third_party/blink/renderer/platform/json/json_parser.h"
21 #include "third_party/blink/renderer/platform/json/json_values.h"
22 #include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
23 #include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
24 #include "third_party/blink/renderer/platform/wtf/vector.h"
25 
26 namespace blink {
27 
28 namespace {
29 
30 using mojom::blink::WebPage;
31 using mojom::blink::WebPagePtr;
32 using schema_org::mojom::blink::Entity;
33 using schema_org::mojom::blink::EntityPtr;
34 using schema_org::mojom::blink::Property;
35 using schema_org::mojom::blink::PropertyPtr;
36 using schema_org::mojom::blink::Values;
37 using schema_org::mojom::blink::ValuesPtr;
38 
39 // App Indexing enforces a max nesting depth of 5. Our top level message
40 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
41 // entities up to this depth, and ignore any further nesting. If an object at
42 // the max nesting depth has a property corresponding to an entity, that
43 // property will be dropped. Note that we will still parse json-ld blocks
44 // deeper than this, but it won't be passed to App Indexing.
45 constexpr int kMaxDepth = 4;
46 // Some strings are very long, and we don't currently use those, so limit string
47 // length to something reasonable to avoid undue pressure on Icing. Note that
48 // App Indexing supports strings up to length 20k.
49 constexpr wtf_size_t kMaxStringLength = 200;
50 // Enforced by App Indexing, so stop processing early if possible.
51 constexpr wtf_size_t kMaxNumFields = 20;
52 // Enforced by App Indexing, so stop processing early if possible.
53 constexpr wtf_size_t kMaxRepeatedSize = 100;
54 
55 constexpr char kJSONLDKeyType[] = "@type";
56 constexpr char kJSONLDKeyGraph[] = "@graph";
IsSupportedType(AtomicString type)57 bool IsSupportedType(AtomicString type) {
58   DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
59                       ({// Common types that include addresses.
60                         "AutoDealer", "Hotel", "LocalBusiness", "Organization",
61                         "Person", "Place", "PostalAddress", "Product",
62                         "Residence", "Restaurant", "SingleFamilyResidence",
63                         // Common types including phone numbers
64                         "Store", "ContactPoint", "LodgingBusiness"}));
65   return type && elements.Contains(type);
66 }
67 
68 void ExtractEntity(const JSONObject&, Entity&, int recursionLevel);
69 
ParseRepeatedValue(const JSONArray & arr,Values & values,int recursionLevel)70 bool ParseRepeatedValue(const JSONArray& arr,
71                         Values& values,
72                         int recursionLevel) {
73   if (arr.size() < 1) {
74     return false;
75   }
76 
77   const JSONValue::ValueType type = arr.at(0)->GetType();
78   switch (type) {
79     case JSONValue::ValueType::kTypeBoolean:
80       values.set_bool_values(Vector<bool>());
81       break;
82     case JSONValue::ValueType::kTypeInteger:
83       values.set_long_values(Vector<int64_t>());
84       break;
85     case JSONValue::ValueType::kTypeDouble:
86       // App Indexing doesn't support double type, so just encode its decimal
87       // value as a string instead.
88       values.set_string_values(Vector<String>());
89       break;
90     case JSONValue::ValueType::kTypeString:
91       values.set_string_values(Vector<String>());
92       break;
93     case JSONValue::ValueType::kTypeObject:
94       if (recursionLevel + 1 >= kMaxDepth) {
95         return false;
96       }
97       values.set_entity_values(Vector<EntityPtr>());
98       break;
99     case JSONArray::ValueType::kTypeArray:
100       // App Indexing doesn't support nested arrays.
101       return false;
102     default:
103       break;
104   }
105   for (wtf_size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) {
106     const JSONValue* innerVal = arr.at(j);
107     if (innerVal->GetType() != type) {
108       // App Indexing doesn't support mixed types. If there are mixed
109       // types in the parsed object, we will drop the property.
110       return false;
111     }
112     switch (innerVal->GetType()) {
113       case JSONValue::ValueType::kTypeBoolean: {
114         bool v;
115         innerVal->AsBoolean(&v);
116         values.get_bool_values().push_back(v);
117       } break;
118       case JSONValue::ValueType::kTypeInteger: {
119         int v;
120         innerVal->AsInteger(&v);
121         values.get_long_values().push_back(v);
122       } break;
123       case JSONValue::ValueType::kTypeDouble: {
124         // App Indexing doesn't support double type, so just encode its decimal
125         // value as a string instead.
126         double v;
127         innerVal->AsDouble(&v);
128         String s = String::Number(v);
129         s.Truncate(kMaxStringLength);
130         values.get_string_values().push_back(s);
131       } break;
132       case JSONValue::ValueType::kTypeString: {
133         String v;
134         innerVal->AsString(&v);
135         v.Truncate(kMaxStringLength);
136         values.get_string_values().push_back(v);
137       } break;
138       case JSONValue::ValueType::kTypeObject:
139         values.get_entity_values().push_back(Entity::New());
140         ExtractEntity(*(JSONObject::Cast(innerVal)),
141                       *(values.get_entity_values().at(j)), recursionLevel + 1);
142         break;
143       default:
144         break;
145     }
146   }
147   return true;
148 }
149 
ExtractEntity(const JSONObject & val,Entity & entity,int recursionLevel)150 void ExtractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
151   if (recursionLevel >= kMaxDepth) {
152     return;
153   }
154 
155   String type;
156   val.GetString(kJSONLDKeyType, &type);
157   if (!type) {
158     type = "Thing";
159   }
160   entity.type = type;
161   for (wtf_size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
162     PropertyPtr property = Property::New();
163     const JSONObject::Entry& entry = val.at(i);
164     property->name = entry.first;
165     if (property->name == kJSONLDKeyType) {
166       continue;
167     }
168     property->values = Values::New();
169 
170     bool addProperty = true;
171 
172     switch (entry.second->GetType()) {
173       case JSONValue::ValueType::kTypeBoolean: {
174         bool v;
175         val.GetBoolean(entry.first, &v);
176         property->values->set_bool_values({v});
177       } break;
178       case JSONValue::ValueType::kTypeInteger: {
179         int v;
180         val.GetInteger(entry.first, &v);
181         property->values->set_long_values({v});
182       } break;
183       case JSONValue::ValueType::kTypeDouble: {
184         double v;
185         val.GetDouble(entry.first, &v);
186         String s = String::Number(v);
187         s.Truncate(kMaxStringLength);
188         property->values->set_string_values({s});
189       } break;
190       case JSONValue::ValueType::kTypeString: {
191         String v;
192         val.GetString(entry.first, &v);
193         v.Truncate(kMaxStringLength);
194         property->values->set_string_values({v});
195       } break;
196       case JSONValue::ValueType::kTypeObject: {
197         if (recursionLevel + 1 >= kMaxDepth) {
198           addProperty = false;
199           break;
200         }
201         property->values->set_entity_values(Vector<EntityPtr>());
202         property->values->get_entity_values().push_back(Entity::New());
203 
204         ExtractEntity(*(val.GetJSONObject(entry.first)),
205                       *(property->values->get_entity_values().at(0)),
206                       recursionLevel + 1);
207       } break;
208       case JSONValue::ValueType::kTypeArray:
209         addProperty = ParseRepeatedValue(*(val.GetArray(entry.first)),
210                                          *(property->values), recursionLevel);
211         break;
212       default:
213         break;
214     }
215     if (addProperty)
216       entity.properties.push_back(std::move(property));
217   }
218 }
219 
ExtractTopLevelEntity(const JSONObject & val,Vector<EntityPtr> & entities)220 void ExtractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
221   // Now we have a JSONObject which corresponds to a single (possibly nested)
222   // entity.
223   EntityPtr entity = Entity::New();
224   String type;
225   val.GetString(kJSONLDKeyType, &type);
226   if (!IsSupportedType(AtomicString(type))) {
227     return;
228   }
229   ExtractEntity(val, *entity, 0);
230   entities.push_back(std::move(entity));
231 }
232 
ExtractEntitiesFromArray(const JSONArray & arr,Vector<EntityPtr> & entities)233 void ExtractEntitiesFromArray(const JSONArray& arr,
234                               Vector<EntityPtr>& entities) {
235   for (wtf_size_t i = 0; i < arr.size(); ++i) {
236     const JSONValue* val = arr.at(i);
237     if (val->GetType() == JSONValue::ValueType::kTypeObject) {
238       ExtractTopLevelEntity(*(JSONObject::Cast(val)), entities);
239     }
240   }
241 }
242 
ExtractEntityFromTopLevelObject(const JSONObject & val,Vector<EntityPtr> & entities)243 void ExtractEntityFromTopLevelObject(const JSONObject& val,
244                                      Vector<EntityPtr>& entities) {
245   const JSONArray* graph = val.GetArray(kJSONLDKeyGraph);
246   if (graph) {
247     ExtractEntitiesFromArray(*graph, entities);
248   }
249   ExtractTopLevelEntity(val, entities);
250 }
251 
// Outcome of one metadata extraction pass, reported to the
// "CopylessPaste.ExtractionStatus" histogram.
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused. The numeric values are spelled out
// explicitly so that inserting or reordering an entry cannot silently change
// the value of an existing one.
enum class ExtractionStatus {
  // At least one entity was extracted.
  kOK = 0,
  // Every JSON-LD block parsed, but no entity was extracted.
  kEmpty = 1,
  // A JSON-LD block could not be parsed as JSON.
  kParseFailure = 2,
  // A JSON-LD block parsed to something other than an array or an object.
  kWrongType = 3,
  kMaxValue = kWrongType,
};
261 
ExtractMetadata(const Element & root,Vector<EntityPtr> & entities)262 ExtractionStatus ExtractMetadata(const Element& root,
263                                  Vector<EntityPtr>& entities) {
264   for (Element& element : ElementTraversal::DescendantsOf(root)) {
265     if (element.HasTagName(html_names::kScriptTag) &&
266         element.FastGetAttribute(html_names::kTypeAttr) ==
267             "application/ld+json") {
268       std::unique_ptr<JSONValue> json = ParseJSON(element.textContent());
269       if (!json) {
270         LOG(ERROR) << "Failed to parse json.";
271         return ExtractionStatus::kParseFailure;
272       }
273       switch (json->GetType()) {
274         case JSONValue::ValueType::kTypeArray:
275           ExtractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities);
276           break;
277         case JSONValue::ValueType::kTypeObject:
278           ExtractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())),
279                                           entities);
280           break;
281         default:
282           return ExtractionStatus::kWrongType;
283       }
284     }
285   }
286   if (entities.IsEmpty()) {
287     return ExtractionStatus::kEmpty;
288   }
289   return ExtractionStatus::kOK;
290 }
291 
292 }  // namespace
293 
Extract(const Document & document)294 WebPagePtr DocumentMetadataExtractor::Extract(const Document& document) {
295   TRACE_EVENT0("blink", "DocumentMetadataExtractor::Extract");
296 
297   if (!document.GetFrame() || !document.GetFrame()->IsMainFrame())
298     return nullptr;
299 
300   Element* html = document.documentElement();
301   if (!html)
302     return nullptr;
303 
304   WebPagePtr page = WebPage::New();
305 
306   // Traverse the DOM tree and extract the metadata.
307   base::TimeTicks start_time = base::TimeTicks::Now();
308   ExtractionStatus status = ExtractMetadata(*html, page->entities);
309   base::TimeDelta elapsed_time = base::TimeTicks::Now() - start_time;
310 
311   base::UmaHistogramEnumeration("CopylessPaste.ExtractionStatus", status);
312 
313   if (status != ExtractionStatus::kOK) {
314     base::UmaHistogramCustomMicrosecondsTimes(
315         "CopylessPaste.ExtractionFailedUs", elapsed_time,
316         base::TimeDelta::FromMicroseconds(1), base::TimeDelta::FromSeconds(1),
317         50);
318     return nullptr;
319   }
320   base::UmaHistogramCustomMicrosecondsTimes(
321       "CopylessPaste.ExtractionUs", elapsed_time,
322       base::TimeDelta::FromMicroseconds(1), base::TimeDelta::FromSeconds(1),
323       50);
324 
325   page->url = document.Url();
326   page->title = document.title();
327   return page;
328 }
329 
330 }  // namespace blink
331