1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/autofill/content/renderer/html_based_username_detector.h"
6 
7 #include <algorithm>
8 #include <string>
9 #include <tuple>
10 #include <utility>
11 
12 #include "base/containers/flat_set.h"
13 #include "base/i18n/case_conversion.h"
14 #include "base/macros.h"
15 #include "base/stl_util.h"
16 #include "base/strings/string_split.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "components/autofill/content/renderer/form_autofill_util.h"
19 #include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
20 #include "components/autofill/core/common/form_data.h"
21 #include "third_party/blink/public/web/web_form_element.h"
22 
23 using blink::WebFormControlElement;
24 using blink::WebFormElement;
25 using blink::WebInputElement;
26 
27 namespace autofill {
28 
29 namespace {
30 
// Characters that separate tokens inside HTML attribute values. Besides
// punctuation, the list contains the space character and the decimal digits,
// so digits never end up inside a token.
constexpr char kDelimiters[] = "$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";
33 
// Words shorter than this many characters are considered "short". A short
// word is never searched for as a substring of an attribute value (in
// particular not after the delimiters have been removed), because it could
// easily be a part of an unrelated longer word. Instead, a short word only
// counts when it appears as a whole token, i.e. enclosed between delimiters.
// The type is |size_t| because the constant is only compared against string
// lengths.
constexpr size_t kMinimumWordLength = 4;
39 
// For each input element that can be a username, developer and user group
// values are computed. The user group value includes what a user sees: label,
// placeholder, aria-label (all are stored in FormFieldData.label). The
// developer group value consists of name and id attribute values.
// For each group the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is computed as well.
struct UsernameFieldData {
  // The input element the values below were computed for.
  WebInputElement input_element;
  // Lowercased name and id attribute values with delimiters removed; the two
  // parts are separated by a '$' sentinel.
  base::string16 developer_value;
  // Tokens of the name and id attribute values that are shorter than
  // |kMinimumWordLength|.
  base::flat_set<base::string16> developer_short_tokens;
  // Lowercased FormFieldData.label with delimiters removed.
  base::string16 user_value;
  // Tokens of FormFieldData.label that are shorter than |kMinimumWordLength|.
  base::flat_set<base::string16> user_short_tokens;
};
53 
// Words that the algorithm looks for are split into multiple categories based
// on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. It is
// mandatory that it has a latin one, but a non-latin might be missing.
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (translation of
// the word only using ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country specific characters.
// Instances are brace-initialized positionally, so the member order below
// must not change.
struct CategoryOfWords {
  // Array of UTF-8 encoded words; always present.
  const char* const* const latin_dictionary;
  // Number of entries in |latin_dictionary|.
  const size_t latin_dictionary_size;
  // Array of UTF-8 encoded words; nullptr when the category has no non-latin
  // dictionary.
  const char* const* const non_latin_dictionary;
  // Number of entries in |non_latin_dictionary|; 0 when it is missing.
  const size_t non_latin_dictionary_size;
};
69 
70 // Used only inside DCHECK.
AllElementsBelongsToSameForm(const std::vector<WebFormControlElement> & all_control_elements)71 bool AllElementsBelongsToSameForm(
72     const std::vector<WebFormControlElement>& all_control_elements) {
73   return std::adjacent_find(all_control_elements.begin(),
74                             all_control_elements.end(),
75                             [](const WebFormControlElement& a,
76                                const WebFormControlElement& b) {
77                               return a.Form() != b.Form();
78                             }) == all_control_elements.end();
79 }
80 
81 // 1. Removes delimiters from |raw_value| and appends the remainder to
82 // |*field_data_value|. A sentinel symbol is added first if |*field_data_value|
83 // is not empty.
84 // 2. Tokenizes and appends short tokens (shorter than |kMinimumWordLength|)
85 // from |raw_value| to |*field_data_short_tokens|, if any.
AppendValueAndShortTokens(const base::string16 & raw_value,base::string16 * field_data_value,base::flat_set<base::string16> * field_data_short_tokens)86 void AppendValueAndShortTokens(
87     const base::string16& raw_value,
88     base::string16* field_data_value,
89     base::flat_set<base::string16>* field_data_short_tokens) {
90   const base::string16 lowercase_value = base::i18n::ToLower(raw_value);
91   const base::string16 delimiters = base::ASCIIToUTF16(kDelimiters);
92   std::vector<base::StringPiece16> tokens =
93       base::SplitStringPiece(lowercase_value, delimiters, base::TRIM_WHITESPACE,
94                              base::SPLIT_WANT_NONEMPTY);
95 
96   // When computing the developer value, '$' safety guard is being added
97   // between field name and id, so that forming of accidental words is
98   // prevented.
99   if (!field_data_value->empty())
100     field_data_value->push_back('$');
101 
102   field_data_value->reserve(field_data_value->size() + lowercase_value.size());
103   std::vector<base::string16> short_tokens;
104   for (const base::StringPiece16& token : tokens) {
105     if (token.size() < kMinimumWordLength)
106       short_tokens.push_back(token.as_string());
107     field_data_value->append(token.data(), token.size());
108   }
109   // It is better to insert elements to a |base::flat_set| in one operation.
110   field_data_short_tokens->insert(short_tokens.begin(), short_tokens.end());
111 }
112 
113 // For the given |input_element|, compute developer and user value, along with
114 // sets of short tokens, and returns it.
ComputeUsernameFieldData(const blink::WebInputElement & input_element,const FormFieldData & field)115 UsernameFieldData ComputeUsernameFieldData(
116     const blink::WebInputElement& input_element,
117     const FormFieldData& field) {
118   UsernameFieldData field_data;
119   field_data.input_element = input_element;
120 
121   AppendValueAndShortTokens(field.name, &field_data.developer_value,
122                             &field_data.developer_short_tokens);
123   AppendValueAndShortTokens(field.id_attribute, &field_data.developer_value,
124                             &field_data.developer_short_tokens);
125   AppendValueAndShortTokens(field.label, &field_data.user_value,
126                             &field_data.user_short_tokens);
127   return field_data;
128 }
129 
130 // For the fields of the given form (all_control_elements), computes
131 // |UsernameFieldData| needed by the detector.
InferUsernameFieldData(const std::vector<blink::WebFormControlElement> & all_control_elements,const FormData & form_data,std::vector<UsernameFieldData> * possible_usernames_data)132 void InferUsernameFieldData(
133     const std::vector<blink::WebFormControlElement>& all_control_elements,
134     const FormData& form_data,
135     std::vector<UsernameFieldData>* possible_usernames_data) {
136   // |all_control_elements| and |form_data.fields| may have different set of
137   // fields. Match them based on |WebInputElement.NameForAutofill| and
138   // |FormFieldData.name|.
139   size_t next_element_range_begin = 0;
140 
141   for (const blink::WebFormControlElement& control_element :
142        all_control_elements) {
143     const blink::WebInputElement* input_element =
144         ToWebInputElement(&control_element);
145     if (!input_element || input_element->IsPasswordFieldForAutofill())
146       continue;
147     const base::string16 element_name =
148         input_element->NameForAutofill().Utf16();
149     for (size_t i = next_element_range_begin; i < form_data.fields.size();
150          ++i) {
151       const FormFieldData& field_data = form_data.fields[i];
152       if (input_element->NameForAutofill().IsEmpty())
153         continue;
154 
155       // Find matching field data and web input element.
156       if (field_data.name == element_name) {
157         next_element_range_begin = i + 1;
158         possible_usernames_data->push_back(
159             ComputeUsernameFieldData(*input_element, field_data));
160         break;
161       }
162     }
163   }
164 }
165 
166 // Check if any word from |dictionary| is encountered in computed field
167 // information (i.e. |value|, |tokens|).
CheckFieldWithDictionary(const base::string16 & value,const base::flat_set<base::string16> & short_tokens,const char * const * dictionary,const size_t & dictionary_size)168 bool CheckFieldWithDictionary(
169     const base::string16& value,
170     const base::flat_set<base::string16>& short_tokens,
171     const char* const* dictionary,
172     const size_t& dictionary_size) {
173   for (size_t i = 0; i < dictionary_size; ++i) {
174     const base::string16 word = base::UTF8ToUTF16(dictionary[i]);
175     if (word.length() < kMinimumWordLength) {
176       // Treat short words by looking them up in the tokens set.
177       if (short_tokens.find(word) != short_tokens.end())
178         return true;
179     } else {
180       // Treat long words by looking them up as a substring in |value|.
181       if (value.find(word) != std::string::npos)
182         return true;
183     }
184   }
185   return false;
186 }
187 
188 // Check if any word from |category| is encountered in computed field
189 // information (|possible_username|).
ContainsWordFromCategory(const UsernameFieldData & possible_username,const CategoryOfWords & category)190 bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
191                               const CategoryOfWords& category) {
192   // For user value, search in latin and non-latin dictionaries, because this
193   // value is user visible. For developer value, only look up in latin
194   /// dictionaries.
195   return CheckFieldWithDictionary(
196              possible_username.user_value, possible_username.user_short_tokens,
197              category.latin_dictionary, category.latin_dictionary_size) ||
198          CheckFieldWithDictionary(possible_username.user_value,
199                                   possible_username.user_short_tokens,
200                                   category.non_latin_dictionary,
201                                   category.non_latin_dictionary_size) ||
202          CheckFieldWithDictionary(possible_username.developer_value,
203                                   possible_username.developer_short_tokens,
204                                   category.latin_dictionary,
205                                   category.latin_dictionary_size);
206 }
207 
208 // Remove from |possible_usernames_data| the elements that definitely cannot be
209 // usernames, because their computed values contain at least one negative word.
RemoveFieldsWithNegativeWords(std::vector<UsernameFieldData> * possible_usernames_data)210 void RemoveFieldsWithNegativeWords(
211     std::vector<UsernameFieldData>* possible_usernames_data) {
212   static const CategoryOfWords kNegativeCategory = {
213       kNegativeLatin, kNegativeLatinSize, kNegativeNonLatin,
214       kNegativeNonLatinSize};
215 
216   base::EraseIf(
217       *possible_usernames_data, [](const UsernameFieldData& possible_username) {
218         return ContainsWordFromCategory(possible_username, kNegativeCategory);
219       });
220 }
221 
222 // Check if any word from the given category (|category|) appears in fields from
223 // the form (|possible_usernames_data|). If the category words appear in more
224 // than 2 fields, do nothing, because it may just be a prefix. If the words
225 // appears in 1 or 2 fields, the first field is added to |username_predictions|.
FindWordsFromCategoryInForm(const std::vector<UsernameFieldData> & possible_usernames_data,const CategoryOfWords & category,std::vector<uint32_t> * username_predictions)226 void FindWordsFromCategoryInForm(
227     const std::vector<UsernameFieldData>& possible_usernames_data,
228     const CategoryOfWords& category,
229     std::vector<uint32_t>* username_predictions) {
230   // Auxiliary element that contains the first field (in order of appearance in
231   // the form) in which a substring is encountered.
232   uint32_t chosen_field_renderer_id = FormData::kNotSetRendererId;
233 
234   size_t fields_found = 0;
235   for (const UsernameFieldData& field_data : possible_usernames_data) {
236     if (ContainsWordFromCategory(field_data, category)) {
237       if (fields_found == 0) {
238         chosen_field_renderer_id =
239             field_data.input_element.UniqueRendererFormControlId();
240       }
241       fields_found++;
242     }
243   }
244 
245   if (fields_found > 0 && fields_found <= 2)
246     if (!base::Contains(*username_predictions, chosen_field_renderer_id))
247       username_predictions->push_back(chosen_field_renderer_id);
248 }
249 
250 // Find username elements if there is no cached result for the given form and
251 // add them to |username_predictions| in the order of decreasing relibility.
FindUsernameFieldInternal(const std::vector<blink::WebFormControlElement> & all_control_elements,const FormData & form_data,std::vector<uint32_t> * username_predictions)252 void FindUsernameFieldInternal(
253     const std::vector<blink::WebFormControlElement>& all_control_elements,
254     const FormData& form_data,
255     std::vector<uint32_t>* username_predictions) {
256   DCHECK(username_predictions);
257   DCHECK(username_predictions->empty());
258 
259   static const CategoryOfWords kUsernameCategory = {
260       kUsernameLatin, kUsernameLatinSize, kUsernameNonLatin,
261       kUsernameNonLatinSize};
262   static const CategoryOfWords kUserCategory = {
263       kUserLatin, kUserLatinSize, kUserNonLatin, kUserNonLatinSize};
264   static const CategoryOfWords kTechnicalCategory = {
265       kTechnicalWords, kTechnicalWordsSize, nullptr, 0};
266   static const CategoryOfWords kWeakCategory = {kWeakWords, kWeakWordsSize,
267                                                 nullptr, 0};
268   // These categories contain words that point to username field.
269   // Order of categories is vital: the detector searches for words in descending
270   // order of probability to point to a username field.
271   static const CategoryOfWords kPositiveCategories[] = {
272       kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory};
273 
274   std::vector<UsernameFieldData> possible_usernames_data;
275 
276   InferUsernameFieldData(all_control_elements, form_data,
277                          &possible_usernames_data);
278   RemoveFieldsWithNegativeWords(&possible_usernames_data);
279 
280   // These are the searches performed by the username detector.
281   for (const CategoryOfWords& category : kPositiveCategories) {
282     FindWordsFromCategoryInForm(possible_usernames_data, category,
283                                 username_predictions);
284   }
285 }
286 
287 // Returns the |unique_renderer_id| of a given |WebFormElement|. If
288 // |WebFormElement::IsNull()| return |kNotSetRendererId|.
GetFormRendererId(WebFormElement form)289 uint32_t GetFormRendererId(WebFormElement form) {
290   return form.IsNull() ? FormData::kNotSetRendererId
291                        : form.UniqueRendererFormId();
292 }
293 
294 }  // namespace
295 
GetPredictionsFieldBasedOnHtmlAttributes(const std::vector<WebFormControlElement> & all_control_elements,const FormData & form_data,UsernameDetectorCache * username_detector_cache)296 const std::vector<uint32_t>& GetPredictionsFieldBasedOnHtmlAttributes(
297     const std::vector<WebFormControlElement>& all_control_elements,
298     const FormData& form_data,
299     UsernameDetectorCache* username_detector_cache) {
300   // The cache will store the object referenced in the return value, so it must
301   // exist. It can be empty.
302   DCHECK(username_detector_cache);
303 
304   DCHECK(!all_control_elements.empty());
305 
306   // All elements in |all_control_elements| should have the same |Form()|.
307   DCHECK(AllElementsBelongsToSameForm(all_control_elements));
308   const WebFormElement form = all_control_elements.at(0).Form();
309 
310   // True if the cache has no entry for |form|.
311   bool cache_miss = true;
312   // Iterator pointing to the entry for |form| if the entry for |form| is found.
313   UsernameDetectorCache::iterator form_position;
314   std::tie(form_position, cache_miss) = username_detector_cache->insert(
315       std::make_pair(GetFormRendererId(form), std::vector<uint32_t>()));
316 
317   if (cache_miss) {
318     std::vector<uint32_t> username_predictions;
319     FindUsernameFieldInternal(all_control_elements, form_data,
320                               &username_predictions);
321     if (!username_predictions.empty())
322       form_position->second = std::move(username_predictions);
323   }
324   return form_position->second;
325 }
326 
327 }  // namespace autofill
328