1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/dom_distiller/core/page_features.h"
6 
7 #include <stddef.h>
8 
9 #include <memory>
10 #include <string>
11 
12 #include "base/json/json_reader.h"
13 #include "third_party/re2/src/re2/re2.h"
14 #include "url/gurl.h"
15 
16 namespace dom_distiller {
17 /* This code needs to derive features in the same way and order in which they
18  * are derived when training the model. Parts of that code are reproduced in the
19  * comments below.
20  */
21 
22 namespace {
23 
// Returns the final segment of |path|, keeping a terminating '/' attached to
// it. This reproduces the training pipeline's
//   re.search('[^/]*\/?$', path).group(0)
// for every input, e.g. "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/",
// "" -> "", and "abc" -> "abc".
//
// Paths without a slash before their last character are returned whole, as
// the regex does; they used to yield "" (or trip a DCHECK when one character
// long). Results for paths that begin with '/' are unchanged.
std::string GetLastSegment(const std::string& path) {
  // Anything shorter than two characters ("", "/", or one other character) is
  // its own last segment.
  if (path.size() < 2)
    return path;

  // Begin the backwards search at the second-to-last character so that a
  // terminating '/' is never taken as the separator.
  const size_t separator = path.rfind('/', path.size() - 2);

  // With no earlier separator the regex matches from the very beginning, so
  // the whole string is the segment.
  if (separator == std::string::npos)
    return path;

  return path.substr(separator + 1);
}
35 
CountMatches(const std::string & s,const std::string & p)36 int CountMatches(const std::string& s, const std::string& p) {
37   // return len(re.findall(p, s))
38   re2::StringPiece sp(s);
39   re2::RE2 regexp(p);
40   int count = 0;
41   while (re2::RE2::FindAndConsume(&sp, regexp))
42     count++;
43   return count;
44 }
45 
GetWordCount(const std::string & s)46 int GetWordCount(const std::string& s) {
47   return CountMatches(s, "\\w+");
48 }
49 
// Returns true when |n| (the needle) occurs somewhere inside |h| (the
// haystack). Note the argument order: needle first, haystack second.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  return !(position == std::string::npos);
}
53 
// Returns true when |s| finishes with the characters of |t|. Note the argument
// order: the suffix comes first, the string being tested second.
bool EndsWith(const std::string& t, const std::string& s) {
  const size_t suffix_length = t.size();
  // A suffix longer than the string itself can never match.
  if (s.size() < suffix_length)
    return false;

  const size_t tail_start = s.size() - suffix_length;
  return s.compare(tail_start, suffix_length, t) == 0;
}
58 
59 }  // namespace
60 
61 int kDerivedFeaturesCount = 29;
62 
CalculateDerivedFeatures(bool isOGArticle,const GURL & url,double numElements,double numAnchors,double numForms,const std::string & innerText,const std::string & textContent,const std::string & innerHTML)63 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
64                                              const GURL& url,
65                                              double numElements,
66                                              double numAnchors,
67                                              double numForms,
68                                              const std::string& innerText,
69                                              const std::string& textContent,
70                                              const std::string& innerHTML) {
71   // In the training pipeline, the strings are explicitly encoded in utf-8 (as
72   // they are here).
73   const std::string& path = url.path();
74   int innerTextWords = GetWordCount(innerText);
75   int textContentWords = GetWordCount(textContent);
76   int innerHTMLWords = GetWordCount(innerHTML);
77   std::vector<double> features;
78   // 'opengraph', opengraph,
79   features.push_back(isOGArticle);
80   // 'forum', 'forum' in path,
81   features.push_back(Contains("forum", path));
82   // 'index', 'index' in path,
83   features.push_back(Contains("index", path));
84   // 'view', 'view' in path,
85   features.push_back(Contains("view", path));
86   // 'asp', '.asp' in path,
87   features.push_back(Contains(".asp", path));
88   // 'phpbb', 'phpbb' in path,
89   features.push_back(Contains("phpbb", path));
90   // 'php', path.endswith('.php'),
91   features.push_back(EndsWith(".php", path));
92   // 'pathlength', len(path),
93   features.push_back(path.size());
94   // 'domain', len(path) < 2,
95   features.push_back(path.size() < 2);
96   // 'pathcomponents', CountMatches(path, r'\/.'),
97   features.push_back(CountMatches(path, "\\/."));
98   // 'slugdetector', CountMatches(path, r'[^\w/]'),
99   features.push_back(CountMatches(path, "[^\\w/]"));
100   // 'pathnumbers', CountMatches(path, r'\d+'),
101   features.push_back(CountMatches(path, "\\d+"));
102   // 'lastSegmentLength', len(GetLastSegment(path)),
103   features.push_back(GetLastSegment(path).size());
104   // 'formcount', numForms,
105   features.push_back(numForms);
106   // 'anchorcount', numAnchors,
107   features.push_back(numAnchors);
108   // 'elementcount', numElements,
109   features.push_back(numElements);
110   // 'anchorratio', float(numAnchors) / max(1, numElements),
111   features.push_back(double(numAnchors) / std::max<double>(1, numElements));
112   // 'innertextlength', len(innerText),
113   features.push_back(innerText.size());
114   // 'textcontentlength', len(textContent),
115   features.push_back(textContent.size());
116   // 'innerhtmllength', len(innerHTML),
117   features.push_back(innerHTML.size());
118   // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
119   features.push_back(double(innerText.size()) /
120                      std::max<double>(1.0, innerHTML.size()));
121   // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
122   features.push_back(double(textContent.size()) /
123                      std::max<double>(1.0, innerHTML.size()));
124   // 'innertexttextcontentlengthratio',
125   // float(len(innerText)) / max(1, len(textContent)),
126   features.push_back(double(innerText.size()) /
127                      std::max<double>(1.0, textContent.size()));
128   // 'innertextwordcount', innerTextWords,
129   features.push_back(innerTextWords);
130   // 'textcontentwordcount', textContentWords,
131   features.push_back(textContentWords);
132   // 'innerhtmlwordcount', innerHTMLWords,
133   features.push_back(innerHTMLWords);
134   // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
135   features.push_back(double(innerTextWords) /
136                      std::max<int>(1.0, innerHTMLWords));
137   // 'textcontentwordcountratio',
138   // float(textContentWords) / max(1, innerHTMLWords),
139   features.push_back(double(textContentWords) /
140                      std::max<int>(1.0, innerHTMLWords));
141   // 'innertexttextcontentwordcountratio',
142   // float(innerTextWords) / max(1, textContentWords),
143   features.push_back(double(innerTextWords) /
144                      std::max<int>(1.0, textContentWords));
145   return features;
146 }
147 
CalculateDerivedFeaturesFromJSON(const base::Value * stringified_json)148 std::vector<double> CalculateDerivedFeaturesFromJSON(
149     const base::Value* stringified_json) {
150   std::string stringified;
151   if (!stringified_json->GetAsString(&stringified)) {
152     return std::vector<double>();
153   }
154 
155   std::unique_ptr<base::Value> json =
156       base::JSONReader::ReadDeprecated(stringified);
157   if (!json) {
158     return std::vector<double>();
159   }
160 
161   const base::DictionaryValue* dict;
162   if (!json->GetAsDictionary(&dict)) {
163     return std::vector<double>();
164   }
165 
166   bool isOGArticle = false;
167   std::string url, innerText, textContent, innerHTML;
168   double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
169 
170   if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
171         dict->GetString("url", &url) &&
172         dict->GetDouble("numElements", &numElements) &&
173         dict->GetDouble("numAnchors", &numAnchors) &&
174         dict->GetDouble("numForms", &numForms) &&
175         dict->GetString("innerText", &innerText) &&
176         dict->GetString("textContent", &textContent) &&
177         dict->GetString("innerHTML", &innerHTML))) {
178     return std::vector<double>();
179   }
180 
181   GURL parsed_url(url);
182   if (!parsed_url.is_valid()) {
183     return std::vector<double>();
184   }
185 
186   return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
187                                   numAnchors, numForms, innerText, textContent,
188                                   innerHTML);
189 }
190 
CalculateDerivedFeatures(bool openGraph,const GURL & url,unsigned elementCount,unsigned anchorCount,unsigned formCount,double mozScore,double mozScoreAllSqrt,double mozScoreAllLinear)191 std::vector<double> CalculateDerivedFeatures(bool openGraph,
192                                              const GURL& url,
193                                              unsigned elementCount,
194                                              unsigned anchorCount,
195                                              unsigned formCount,
196                                              double mozScore,
197                                              double mozScoreAllSqrt,
198                                              double mozScoreAllLinear) {
199   const std::string& path = url.path();
200   std::vector<double> features;
201   // 'opengraph', opengraph,
202   features.push_back(openGraph);
203   // 'forum', 'forum' in path,
204   features.push_back(Contains("forum", path));
205   // 'index', 'index' in path,
206   features.push_back(Contains("index", path));
207   // 'search', 'search' in path,
208   features.push_back(Contains("search", path));
209   // 'view', 'view' in path,
210   features.push_back(Contains("view", path));
211   // 'archive', 'archive' in path,
212   features.push_back(Contains("archive", path));
213   // 'asp', '.asp' in path,
214   features.push_back(Contains(".asp", path));
215   // 'phpbb', 'phpbb' in path,
216   features.push_back(Contains("phpbb", path));
217   // 'php', path.endswith('.php'),
218   features.push_back(EndsWith(".php", path));
219   // 'pathLength', len(path),
220   features.push_back(path.size());
221   // 'domain', len(path) < 2,
222   features.push_back(path.size() < 2);
223   // 'pathComponents', CountMatches(path, r'\/.'),
224   features.push_back(CountMatches(path, "\\/."));
225   // 'slugDetector', CountMatches(path, r'[^\w/]'),
226   features.push_back(CountMatches(path, "[^\\w/]"));
227   // 'pathNumbers', CountMatches(path, r'\d+'),
228   features.push_back(CountMatches(path, "\\d+"));
229   // 'lastSegmentLength', len(GetLastSegment(path)),
230   features.push_back(GetLastSegment(path).size());
231   // 'formCount', numForms,
232   features.push_back(formCount);
233   // 'anchorCount', numAnchors,
234   features.push_back(anchorCount);
235   // 'elementCount', numElements,
236   features.push_back(elementCount);
237   // 'anchorRatio', float(numAnchors) / max(1, numElements),
238   features.push_back(double(anchorCount) / std::max<double>(1, elementCount));
239   // 'mozScore'
240   features.push_back(mozScore);
241   // 'mozScoreAllSqrt'
242   features.push_back(mozScoreAllSqrt);
243   // 'mozScoreAllLinear'
244   features.push_back(mozScoreAllLinear);
245 
246   return features;
247 }
248 
249 }  // namespace dom_distiller
250