1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/dom_distiller/core/page_features.h"
6
7 #include <stddef.h>
8
9 #include <memory>
10 #include <string>
11
12 #include "base/json/json_reader.h"
13 #include "third_party/re2/src/re2/re2.h"
14 #include "url/gurl.h"
15
16 namespace dom_distiller {
17 /* This code needs to derive features in the same way and order in which they
18 * are derived when training the model. Parts of that code are reproduced in the
19 * comments below.
20 */
21
22 namespace {
23
// Returns the last segment of |path|, keeping a trailing '/' attached to it.
// Mirrors the training pipeline's
//   return re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "",
// "abc" -> "abc".
std::string GetLastSegment(const std::string& path) {
  // An empty or single-character path is its own last segment.
  if (path.size() <= 1)
    return path;
  // Look for the last '/' that is not the final character, so that a trailing
  // slash stays part of the returned segment.
  const std::string::size_type separator = path.rfind('/', path.size() - 2);
  // With no such separator the reference regex matches the entire path.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
35
CountMatches(const std::string & s,const std::string & p)36 int CountMatches(const std::string& s, const std::string& p) {
37 // return len(re.findall(p, s))
38 re2::StringPiece sp(s);
39 re2::RE2 regexp(p);
40 int count = 0;
41 while (re2::RE2::FindAndConsume(&sp, regexp))
42 count++;
43 return count;
44 }
45
GetWordCount(const std::string & s)46 int GetWordCount(const std::string& s) {
47 return CountMatches(s, "\\w+");
48 }
49
// Returns true if the needle |n| occurs anywhere in the haystack |h|. An
// empty needle is found in every haystack.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  return position != std::string::npos;
}
53
// Returns true if the string |s| ends with the suffix |t|. An empty suffix
// matches every string.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, t.size(), t) == 0;
}
58
59 } // namespace
60
// Number of entries in the vector produced by the CalculateDerivedFeatures()
// overload that takes innerText/textContent/innerHTML (it appends 29 values).
int kDerivedFeaturesCount{29};
62
CalculateDerivedFeatures(bool isOGArticle,const GURL & url,double numElements,double numAnchors,double numForms,const std::string & innerText,const std::string & textContent,const std::string & innerHTML)63 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
64 const GURL& url,
65 double numElements,
66 double numAnchors,
67 double numForms,
68 const std::string& innerText,
69 const std::string& textContent,
70 const std::string& innerHTML) {
71 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
72 // they are here).
73 const std::string& path = url.path();
74 int innerTextWords = GetWordCount(innerText);
75 int textContentWords = GetWordCount(textContent);
76 int innerHTMLWords = GetWordCount(innerHTML);
77 std::vector<double> features;
78 // 'opengraph', opengraph,
79 features.push_back(isOGArticle);
80 // 'forum', 'forum' in path,
81 features.push_back(Contains("forum", path));
82 // 'index', 'index' in path,
83 features.push_back(Contains("index", path));
84 // 'view', 'view' in path,
85 features.push_back(Contains("view", path));
86 // 'asp', '.asp' in path,
87 features.push_back(Contains(".asp", path));
88 // 'phpbb', 'phpbb' in path,
89 features.push_back(Contains("phpbb", path));
90 // 'php', path.endswith('.php'),
91 features.push_back(EndsWith(".php", path));
92 // 'pathlength', len(path),
93 features.push_back(path.size());
94 // 'domain', len(path) < 2,
95 features.push_back(path.size() < 2);
96 // 'pathcomponents', CountMatches(path, r'\/.'),
97 features.push_back(CountMatches(path, "\\/."));
98 // 'slugdetector', CountMatches(path, r'[^\w/]'),
99 features.push_back(CountMatches(path, "[^\\w/]"));
100 // 'pathnumbers', CountMatches(path, r'\d+'),
101 features.push_back(CountMatches(path, "\\d+"));
102 // 'lastSegmentLength', len(GetLastSegment(path)),
103 features.push_back(GetLastSegment(path).size());
104 // 'formcount', numForms,
105 features.push_back(numForms);
106 // 'anchorcount', numAnchors,
107 features.push_back(numAnchors);
108 // 'elementcount', numElements,
109 features.push_back(numElements);
110 // 'anchorratio', float(numAnchors) / max(1, numElements),
111 features.push_back(double(numAnchors) / std::max<double>(1, numElements));
112 // 'innertextlength', len(innerText),
113 features.push_back(innerText.size());
114 // 'textcontentlength', len(textContent),
115 features.push_back(textContent.size());
116 // 'innerhtmllength', len(innerHTML),
117 features.push_back(innerHTML.size());
118 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
119 features.push_back(double(innerText.size()) /
120 std::max<double>(1.0, innerHTML.size()));
121 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
122 features.push_back(double(textContent.size()) /
123 std::max<double>(1.0, innerHTML.size()));
124 // 'innertexttextcontentlengthratio',
125 // float(len(innerText)) / max(1, len(textContent)),
126 features.push_back(double(innerText.size()) /
127 std::max<double>(1.0, textContent.size()));
128 // 'innertextwordcount', innerTextWords,
129 features.push_back(innerTextWords);
130 // 'textcontentwordcount', textContentWords,
131 features.push_back(textContentWords);
132 // 'innerhtmlwordcount', innerHTMLWords,
133 features.push_back(innerHTMLWords);
134 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
135 features.push_back(double(innerTextWords) /
136 std::max<int>(1.0, innerHTMLWords));
137 // 'textcontentwordcountratio',
138 // float(textContentWords) / max(1, innerHTMLWords),
139 features.push_back(double(textContentWords) /
140 std::max<int>(1.0, innerHTMLWords));
141 // 'innertexttextcontentwordcountratio',
142 // float(innerTextWords) / max(1, textContentWords),
143 features.push_back(double(innerTextWords) /
144 std::max<int>(1.0, textContentWords));
145 return features;
146 }
147
CalculateDerivedFeaturesFromJSON(const base::Value * stringified_json)148 std::vector<double> CalculateDerivedFeaturesFromJSON(
149 const base::Value* stringified_json) {
150 std::string stringified;
151 if (!stringified_json->GetAsString(&stringified)) {
152 return std::vector<double>();
153 }
154
155 std::unique_ptr<base::Value> json =
156 base::JSONReader::ReadDeprecated(stringified);
157 if (!json) {
158 return std::vector<double>();
159 }
160
161 const base::DictionaryValue* dict;
162 if (!json->GetAsDictionary(&dict)) {
163 return std::vector<double>();
164 }
165
166 bool isOGArticle = false;
167 std::string url, innerText, textContent, innerHTML;
168 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
169
170 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
171 dict->GetString("url", &url) &&
172 dict->GetDouble("numElements", &numElements) &&
173 dict->GetDouble("numAnchors", &numAnchors) &&
174 dict->GetDouble("numForms", &numForms) &&
175 dict->GetString("innerText", &innerText) &&
176 dict->GetString("textContent", &textContent) &&
177 dict->GetString("innerHTML", &innerHTML))) {
178 return std::vector<double>();
179 }
180
181 GURL parsed_url(url);
182 if (!parsed_url.is_valid()) {
183 return std::vector<double>();
184 }
185
186 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
187 numAnchors, numForms, innerText, textContent,
188 innerHTML);
189 }
190
CalculateDerivedFeatures(bool openGraph,const GURL & url,unsigned elementCount,unsigned anchorCount,unsigned formCount,double mozScore,double mozScoreAllSqrt,double mozScoreAllLinear)191 std::vector<double> CalculateDerivedFeatures(bool openGraph,
192 const GURL& url,
193 unsigned elementCount,
194 unsigned anchorCount,
195 unsigned formCount,
196 double mozScore,
197 double mozScoreAllSqrt,
198 double mozScoreAllLinear) {
199 const std::string& path = url.path();
200 std::vector<double> features;
201 // 'opengraph', opengraph,
202 features.push_back(openGraph);
203 // 'forum', 'forum' in path,
204 features.push_back(Contains("forum", path));
205 // 'index', 'index' in path,
206 features.push_back(Contains("index", path));
207 // 'search', 'search' in path,
208 features.push_back(Contains("search", path));
209 // 'view', 'view' in path,
210 features.push_back(Contains("view", path));
211 // 'archive', 'archive' in path,
212 features.push_back(Contains("archive", path));
213 // 'asp', '.asp' in path,
214 features.push_back(Contains(".asp", path));
215 // 'phpbb', 'phpbb' in path,
216 features.push_back(Contains("phpbb", path));
217 // 'php', path.endswith('.php'),
218 features.push_back(EndsWith(".php", path));
219 // 'pathLength', len(path),
220 features.push_back(path.size());
221 // 'domain', len(path) < 2,
222 features.push_back(path.size() < 2);
223 // 'pathComponents', CountMatches(path, r'\/.'),
224 features.push_back(CountMatches(path, "\\/."));
225 // 'slugDetector', CountMatches(path, r'[^\w/]'),
226 features.push_back(CountMatches(path, "[^\\w/]"));
227 // 'pathNumbers', CountMatches(path, r'\d+'),
228 features.push_back(CountMatches(path, "\\d+"));
229 // 'lastSegmentLength', len(GetLastSegment(path)),
230 features.push_back(GetLastSegment(path).size());
231 // 'formCount', numForms,
232 features.push_back(formCount);
233 // 'anchorCount', numAnchors,
234 features.push_back(anchorCount);
235 // 'elementCount', numElements,
236 features.push_back(elementCount);
237 // 'anchorRatio', float(numAnchors) / max(1, numElements),
238 features.push_back(double(anchorCount) / std::max<double>(1, elementCount));
239 // 'mozScore'
240 features.push_back(mozScore);
241 // 'mozScoreAllSqrt'
242 features.push_back(mozScoreAllSqrt);
243 // 'mozScoreAllLinear'
244 features.push_back(mozScoreAllLinear);
245
246 return features;
247 }
248
249 } // namespace dom_distiller
250