1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/autofill/core/browser/data_model/autofill_structured_address_component.h"
6 
7 #include <algorithm>
8 #include <map>
9 #include <string>
10 #include <utility>
11 
12 #include "base/feature_list.h"
13 #include "base/notreached.h"
14 #include "base/strings/strcat.h"
15 #include "base/strings/string16.h"
16 #include "base/strings/string_piece.h"
17 #include "base/strings/string_split.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/utf_string_conversions.h"
20 #include "components/autofill/core/browser/autofill_type.h"
21 #include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
22 #include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"
23 #include "components/autofill/core/browser/field_types.h"
24 #include "components/autofill/core/common/autofill_features.h"
25 
26 namespace autofill {
27 
28 namespace structured_address {
29 
IsLessSignificantVerificationStatus(VerificationStatus left,VerificationStatus right)30 bool IsLessSignificantVerificationStatus(VerificationStatus left,
31                                          VerificationStatus right) {
32   // Both the KUserVerified and kObserved are larger then kServerParsed although
33   // the underlying integer suggests differently.
34   if (left == VerificationStatus::kServerParsed &&
35       (right == VerificationStatus::kObserved ||
36        right == VerificationStatus::kUserVerified)) {
37     return true;
38   }
39 
40   if (right == VerificationStatus::kServerParsed &&
41       (left == VerificationStatus::kObserved ||
42        left == VerificationStatus::kUserVerified)) {
43     return false;
44   }
45 
46   // In all other cases, it is sufficient to compare the underlying integer
47   // values.
48   return static_cast<std::underlying_type_t<VerificationStatus>>(left) <
49          static_cast<std::underlying_type_t<VerificationStatus>>(right);
50 }
51 
AddressComponent(ServerFieldType storage_type,AddressComponent * parent,std::vector<AddressComponent * > subcomponents,unsigned int merge_mode)52 AddressComponent::AddressComponent(ServerFieldType storage_type,
53                                    AddressComponent* parent,
54                                    std::vector<AddressComponent*> subcomponents,
55                                    unsigned int merge_mode)
56     : value_verification_status_(VerificationStatus::kNoStatus),
57       storage_type_(storage_type),
58       subcomponents_(subcomponents),
59       parent_(parent),
60       merge_mode_(merge_mode) {}
61 
62 AddressComponent::~AddressComponent() = default;
63 
GetStorageType() const64 ServerFieldType AddressComponent::GetStorageType() const {
65   return storage_type_;
66 }
67 
GetStorageTypeName() const68 std::string AddressComponent::GetStorageTypeName() const {
69   return AutofillType(storage_type_).ToString();
70 }
71 
operator =(const AddressComponent & right)72 AddressComponent& AddressComponent::operator=(const AddressComponent& right) {
73   DCHECK(GetStorageType() == right.GetStorageType());
74   if (this == &right)
75     return *this;
76 
77   if (right.IsValueAssigned()) {
78     value_ = right.value_;
79     value_verification_status_ = right.value_verification_status_;
80     sorted_normalized_tokens_ = right.sorted_normalized_tokens_;
81   } else {
82     UnsetValue();
83   }
84 
85   DCHECK(right.subcomponents_.size() == subcomponents_.size());
86 
87   for (size_t i = 0; i < right.subcomponents_.size(); i++)
88     *subcomponents_[i] = *right.subcomponents_[i];
89 
90   PostAssignSanitization();
91 
92   return *this;
93 }
94 
operator ==(const AddressComponent & right) const95 bool AddressComponent::operator==(const AddressComponent& right) const {
96   if (this == &right)
97     return true;
98 
99   if (GetStorageType() != right.GetStorageType())
100     return false;
101 
102   if (GetValue() != right.GetValue() ||
103       value_verification_status_ != right.value_verification_status_) {
104     return false;
105   }
106 
107   DCHECK(right.subcomponents_.size() == subcomponents_.size());
108   for (size_t i = 0; i < right.subcomponents_.size(); i++)
109     if (!(*subcomponents_[i] == *right.subcomponents_[i]))
110       return false;
111   return true;
112 }
113 
operator !=(const AddressComponent & right) const114 bool AddressComponent::operator!=(const AddressComponent& right) const {
115   return !(*this == right);
116 }
117 
IsAtomic() const118 bool AddressComponent::IsAtomic() const {
119   return subcomponents_.empty();
120 }
121 
IsValueValid() const122 bool AddressComponent::IsValueValid() const {
123   return true;
124 }
125 
IsValueForTypeValid(const std::string & field_type_name,bool wipe_if_not)126 bool AddressComponent::IsValueForTypeValid(const std::string& field_type_name,
127                                            bool wipe_if_not) {
128   bool validity_status;
129   if (GetIsValueForTypeValidIfPossible(field_type_name, &validity_status,
130                                        wipe_if_not))
131     return validity_status;
132   return false;
133 }
134 
IsValueForTypeValid(ServerFieldType field_type,bool wipe_if_not)135 bool AddressComponent::IsValueForTypeValid(ServerFieldType field_type,
136                                            bool wipe_if_not) {
137   return IsValueForTypeValid(AutofillType(field_type).ToString(), wipe_if_not);
138 }
139 
GetIsValueForTypeValidIfPossible(const std::string & field_type_name,bool * validity_status,bool wipe_if_not)140 bool AddressComponent::GetIsValueForTypeValidIfPossible(
141     const std::string& field_type_name,
142     bool* validity_status,
143     bool wipe_if_not) {
144   if (field_type_name == GetStorageTypeName()) {
145     *validity_status = IsValueValid();
146     if (!(*validity_status) && wipe_if_not)
147       UnsetValue();
148     return true;
149   }
150 
151   for (auto* subcomponent : subcomponents_) {
152     if (subcomponent->GetIsValueForTypeValidIfPossible(
153             field_type_name, validity_status, wipe_if_not))
154       return true;
155   }
156   return false;
157 }
158 
GetVerificationStatus() const159 VerificationStatus AddressComponent::GetVerificationStatus() const {
160   return value_verification_status_;
161 }
162 
GetValue() const163 const base::string16& AddressComponent::GetValue() const {
164   if (value_.has_value())
165     return value_.value();
166   return base::EmptyString16();
167 }
168 
IsValueAssigned() const169 bool AddressComponent::IsValueAssigned() const {
170   return value_.has_value();
171 }
172 
SetValue(base::string16 value,VerificationStatus status)173 void AddressComponent::SetValue(base::string16 value,
174                                 VerificationStatus status) {
175   value_ = std::move(value);
176   value_verification_status_ = status;
177 }
178 
UnsetValue()179 void AddressComponent::UnsetValue() {
180   value_.reset();
181   value_verification_status_ = VerificationStatus::kNoStatus;
182   sorted_normalized_tokens_.reset();
183 }
184 
GetSupportedTypes(ServerFieldTypeSet * supported_types) const185 void AddressComponent::GetSupportedTypes(
186     ServerFieldTypeSet* supported_types) const {
187   // A proper AddressComponent tree contains every type only once.
188   DCHECK(supported_types->find(storage_type_) == supported_types->end())
189       << "The AddressComponent already contains a node that supports this "
190          "type: "
191       << storage_type_;
192   supported_types->insert(storage_type_);
193   GetAdditionalSupportedFieldTypes(supported_types);
194   for (auto* subcomponent : subcomponents_)
195     subcomponent->GetSupportedTypes(supported_types);
196 }
197 
ConvertAndSetValueForAdditionalFieldTypeName(const std::string & field_type_name,const base::string16 & value,const VerificationStatus & status)198 bool AddressComponent::ConvertAndSetValueForAdditionalFieldTypeName(
199     const std::string& field_type_name,
200     const base::string16& value,
201     const VerificationStatus& status) {
202   return false;
203 }
204 
ConvertAndGetTheValueForAdditionalFieldTypeName(const std::string & field_type_name,base::string16 * value) const205 bool AddressComponent::ConvertAndGetTheValueForAdditionalFieldTypeName(
206     const std::string& field_type_name,
207     base::string16* value) const {
208   return false;
209 }
210 
GetBestFormatString() const211 base::string16 AddressComponent::GetBestFormatString() const {
212   // If the component is atomic, the format string is just the value.
213   if (IsAtomic())
214     return base::ASCIIToUTF16(GetPlaceholderToken(GetStorageTypeName()));
215 
216   // Otherwise, the canonical format string is the concatenation of all
217   // subcomponents by their natural order.
218   std::vector<std::string> format_pieces;
219   for (const auto* subcomponent : subcomponents_) {
220     std::string format_piece = GetPlaceholderToken(
221         AutofillType(subcomponent->GetStorageType()).ToString());
222     format_pieces.emplace_back(std::move(format_piece));
223   }
224   return base::ASCIIToUTF16(base::JoinString(format_pieces, " "));
225 }
226 
GetSubcomponentTypes() const227 std::vector<ServerFieldType> AddressComponent::GetSubcomponentTypes() const {
228   std::vector<ServerFieldType> subcomponent_types;
229   subcomponent_types.reserve(subcomponents_.size());
230   for (const auto* subcomponent : subcomponents_) {
231     subcomponent_types.emplace_back(subcomponent->GetStorageType());
232   }
233   return subcomponent_types;
234 }
235 
SetValueForTypeIfPossible(const ServerFieldType & type,const base::string16 & value,const VerificationStatus & verification_status,bool invalidate_child_nodes,bool invalidate_parent_nodes)236 bool AddressComponent::SetValueForTypeIfPossible(
237     const ServerFieldType& type,
238     const base::string16& value,
239     const VerificationStatus& verification_status,
240     bool invalidate_child_nodes,
241     bool invalidate_parent_nodes) {
242   return SetValueForTypeIfPossible(AutofillType(type).ToString(), value,
243                                    verification_status, invalidate_child_nodes,
244                                    invalidate_parent_nodes);
245 }
246 
SetValueForTypeIfPossible(const ServerFieldType & type,const std::string & value,const VerificationStatus & verification_status,bool invalidate_child_nodes,bool invalidate_parent_nodes)247 bool AddressComponent::SetValueForTypeIfPossible(
248     const ServerFieldType& type,
249     const std::string& value,
250     const VerificationStatus& verification_status,
251     bool invalidate_child_nodes,
252     bool invalidate_parent_nodes) {
253   return SetValueForTypeIfPossible(type, base::UTF8ToUTF16(value),
254                                    verification_status, invalidate_child_nodes,
255                                    invalidate_parent_nodes);
256 }
257 
SetValueForTypeIfPossible(const std::string & type_name,const base::string16 & value,const VerificationStatus & verification_status,bool invalidate_child_nodes,bool invalidate_parent_nodes)258 bool AddressComponent::SetValueForTypeIfPossible(
259     const std::string& type_name,
260     const base::string16& value,
261     const VerificationStatus& verification_status,
262     bool invalidate_child_nodes,
263     bool invalidate_parent_nodes) {
264   bool value_set = false;
265   // If the type is the storage type of the component, it can directly be
266   // returned.
267   if (type_name == GetStorageTypeName()) {
268     SetValue(value, verification_status);
269     value_set = true;
270   } else if (ConvertAndSetValueForAdditionalFieldTypeName(
271                  type_name, value, verification_status)) {
272     // The conversion using a field type was successful.
273     value_set = true;
274   }
275 
276   if (value_set) {
277     if (invalidate_child_nodes)
278       UnsetSubcomponents();
279     return true;
280   }
281 
282   // Finally, probe if the type is supported by one of the subcomponents.
283   for (auto* subcomponent : subcomponents_) {
284     if (subcomponent->SetValueForTypeIfPossible(
285             type_name, value, verification_status, invalidate_child_nodes,
286             invalidate_parent_nodes)) {
287       if (invalidate_parent_nodes)
288         UnsetValue();
289       return true;
290     }
291   }
292 
293   return false;
294 }
295 
SetValueForTypeIfPossible(const std::string & type_name,const std::string & value,const VerificationStatus & verification_status,bool invalidate_child_nodes,bool invalidate_parent_nodes)296 bool AddressComponent::SetValueForTypeIfPossible(
297     const std::string& type_name,
298     const std::string& value,
299     const VerificationStatus& verification_status,
300     bool invalidate_child_nodes,
301     bool invalidate_parent_nodes) {
302   return SetValueForTypeIfPossible(type_name, base::UTF8ToUTF16(value),
303                                    verification_status, invalidate_child_nodes,
304                                    invalidate_parent_nodes);
305 }
306 
UnsetAddressComponentAndItsSubcomponents()307 void AddressComponent::UnsetAddressComponentAndItsSubcomponents() {
308   UnsetValue();
309   UnsetSubcomponents();
310 }
311 
UnsetSubcomponents()312 void AddressComponent::UnsetSubcomponents() {
313   for (auto* component : subcomponents_)
314     component->UnsetAddressComponentAndItsSubcomponents();
315 }
316 
GetValueAndStatusForTypeIfPossible(const ServerFieldType & type,base::string16 * value,VerificationStatus * status) const317 bool AddressComponent::GetValueAndStatusForTypeIfPossible(
318     const ServerFieldType& type,
319     base::string16* value,
320     VerificationStatus* status) const {
321   return GetValueAndStatusForTypeIfPossible(AutofillType(type).ToString(),
322                                             value, status);
323 }
324 
GetValueAndStatusForTypeIfPossible(const std::string & type_name,base::string16 * value,VerificationStatus * status) const325 bool AddressComponent::GetValueAndStatusForTypeIfPossible(
326     const std::string& type_name,
327     base::string16* value,
328     VerificationStatus* status) const {
329   // If the value is the storage type, it can be simply returned.
330   if (type_name == GetStorageTypeName()) {
331     if (value)
332       *value = value_.value_or(base::string16());
333     if (status)
334       *status = GetVerificationStatus();
335     return true;
336   }
337 
338   // Otherwise, probe if it is a supported field type that can be converted.
339   if (this->ConvertAndGetTheValueForAdditionalFieldTypeName(type_name, value)) {
340     if (status)
341       *status = GetVerificationStatus();
342     return true;
343   }
344 
345   // Finally, try to retrieve the value from one of the subcomponents.
346   for (const auto* subcomponent : subcomponents_) {
347     if (subcomponent->GetValueAndStatusForTypeIfPossible(type_name, value,
348                                                          status))
349       return true;
350   }
351   return false;
352 }
353 
GetValueForType(const ServerFieldType & type) const354 base::string16 AddressComponent::GetValueForType(
355     const ServerFieldType& type) const {
356   return GetValueForType(AutofillType(type).ToString());
357 }
358 
GetValueForType(const std::string & type_name) const359 base::string16 AddressComponent::GetValueForType(
360     const std::string& type_name) const {
361   base::string16 value;
362   bool success = GetValueAndStatusForTypeIfPossible(type_name, &value, nullptr);
363   // TODO(crbug.com/1113617): Honorifics are temporally disabled.
364   DCHECK(success || type_name == AutofillType(NAME_HONORIFIC_PREFIX).ToString())
365       << type_name;
366   return value;
367 }
368 
GetVerificationStatusForType(const ServerFieldType & type) const369 VerificationStatus AddressComponent::GetVerificationStatusForType(
370     const ServerFieldType& type) const {
371   return GetVerificationStatusForType(AutofillType(type).ToString());
372 }
373 
GetVerificationStatusForType(const std::string & type_name) const374 VerificationStatus AddressComponent::GetVerificationStatusForType(
375     const std::string& type_name) const {
376   VerificationStatus status = VerificationStatus::kNoStatus;
377   bool success =
378       GetValueAndStatusForTypeIfPossible(type_name, nullptr, &status);
379   // TODO(crbug.com/1113617): Honorifics are temporally disabled.
380   DCHECK(success ||
381          type_name == AutofillType(NAME_HONORIFIC_PREFIX).ToString());
382   return status;
383 }
384 
UnsetValueForTypeIfSupported(const ServerFieldType & type)385 bool AddressComponent::UnsetValueForTypeIfSupported(
386     const ServerFieldType& type) {
387   if (type == storage_type_) {
388     UnsetAddressComponentAndItsSubcomponents();
389     return true;
390   }
391 
392   for (auto* subcomponent : subcomponents_) {
393     if (subcomponent->UnsetValueForTypeIfSupported(type))
394       return true;
395   }
396 
397   return false;
398 }
399 
ParseValueAndAssignSubcomponentsByMethod()400 bool AddressComponent::ParseValueAndAssignSubcomponentsByMethod() {
401   return false;
402 }
403 
404 std::vector<const re2::RE2*>
GetParseRegularExpressionsByRelevance() const405 AddressComponent::GetParseRegularExpressionsByRelevance() const {
406   return {};
407 }
408 
ParseValueAndAssignSubcomponents()409 void AddressComponent::ParseValueAndAssignSubcomponents() {
410   // Set the values of all subcomponents to the empty string and set the
411   // verification status to kParsed.
412   for (auto* subcomponent : subcomponents_)
413     subcomponent->SetValue(base::string16(), VerificationStatus::kParsed);
414 
415   // First attempt, try to parse by method.
416   if (ParseValueAndAssignSubcomponentsByMethod())
417     return;
418 
419   // Second attempt, try to parse by expressions.
420   if (ParseValueAndAssignSubcomponentsByRegularExpressions())
421     return;
422 
423   // As a final fallback, parse using the fallback method.
424   ParseValueAndAssignSubcomponentsByFallbackMethod();
425 }
426 
ParseValueAndAssignSubcomponentsByRegularExpressions()427 bool AddressComponent::ParseValueAndAssignSubcomponentsByRegularExpressions() {
428   for (const auto* parse_expression : GetParseRegularExpressionsByRelevance()) {
429     if (!parse_expression)
430       continue;
431     if (ParseValueAndAssignSubcomponentsByRegularExpression(GetValue(),
432                                                             parse_expression))
433       return true;
434   }
435   return false;
436 }
437 
ParseValueAndAssignSubcomponentsByRegularExpression(const base::string16 & value,const RE2 * parse_expression)438 bool AddressComponent::ParseValueAndAssignSubcomponentsByRegularExpression(
439     const base::string16& value,
440     const RE2* parse_expression) {
441   std::map<std::string, std::string> result_map;
442   if (ParseValueByRegularExpression(base::UTF16ToUTF8(value), parse_expression,
443                                     &result_map)) {
444     // Parsing was successful and results from the result map can be written
445     // to the structure.
446     for (const auto& result_entry : result_map) {
447       const std::string& field_type = result_entry.first;
448       base::string16 field_value = base::UTF8ToUTF16(result_entry.second);
449       // Do not reassign the value of this node.
450       if (field_type == GetStorageTypeName())
451         continue;
452       // crbug.com(1113617): Honorifics are temporarily disabled.
453       if (field_type == AutofillType(NAME_HONORIFIC_PREFIX).ToString())
454         continue;
455       bool success = SetValueForTypeIfPossible(field_type, field_value,
456                                                VerificationStatus::kParsed);
457       // Setting the value should always work unless the regular expression is
458       // invalid.
459       DCHECK(success);
460     }
461     return true;
462   }
463   return false;
464 }
465 
ParseValueAndAssignSubcomponentsByFallbackMethod()466 void AddressComponent::ParseValueAndAssignSubcomponentsByFallbackMethod() {
467   // There is nothing to do for an atomic component.
468   if (IsAtomic())
469     return;
470 
471   // An empty string is trivially parsable.
472   if (GetValue().empty())
473     return;
474 
475   // Split the string by spaces.
476   std::vector<base::string16> space_separated_tokens =
477       base::SplitString(GetValue(), base::UTF8ToUTF16(" "),
478                         base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
479 
480   auto token_iterator = space_separated_tokens.begin();
481   auto subcomponent_types = GetSubcomponentTypes();
482 
483   // Assign one space-separated token each to all but the last subcomponent.
484   for (size_t i = 0; (i + 1) < subcomponent_types.size(); i++) {
485     // If there are no tokens left, parsing is done.
486     if (token_iterator == space_separated_tokens.end())
487       return;
488     // Set the current token to the type and advance the token iterator.
489     bool success = SetValueForTypeIfPossible(
490         subcomponent_types[i], *token_iterator, VerificationStatus::kParsed);
491     // By design, setting the value should never fail.
492     DCHECK(success);
493     token_iterator++;
494   }
495 
496   // Collect all remaining tokens in the last subcomponent.
497   base::string16 remaining_tokens = base::JoinString(
498       std::vector<base::string16>(token_iterator, space_separated_tokens.end()),
499       base::ASCIIToUTF16(" "));
500   // By design, it should be possible to assign the value unless the regular
501   // expression is wrong.
502   bool success = SetValueForTypeIfPossible(
503       subcomponent_types.back(), remaining_tokens, VerificationStatus::kParsed);
504   DCHECK(success);
505 }
506 
WipeInvalidStructure()507 bool AddressComponent::WipeInvalidStructure() {
508   if (IsAtomic()) {
509     return false;
510   }
511 
512   // Test that each structured token is part of the subcomponent.
513   // This is not perfect, because different components can match with an
514   // overlapping portion of the unstructured string, but it guarantees that all
515   // information in the components is contained in the unstructured
516   // representation.
517   for (const auto* component : Subcomponents()) {
518     if (GetValue().find(component->GetValue()) == base::string16::npos) {
519       // If the value of one component could not have been found, wipe the full
520       // structure.
521       RecursivelyUnsetSubcomponents();
522       return true;
523     }
524   }
525   return false;
526 }
527 
FormatValueFromSubcomponents()528 void AddressComponent::FormatValueFromSubcomponents() {
529   // Get the most suited format string.
530   base::string16 format_string = GetBestFormatString();
531 
532   // Perform the following steps on a copy of the format string.
533   // * Replace all the placeholders of the form ${TYPE_NAME} with the
534   // corresponding value.
535   // * Strip away double spaces as they may occur after replacing a placeholder
536   // with an empty value.
537 
538   base::string16 result = ReplacePlaceholderTypesWithValues(format_string);
539   result = base::CollapseWhitespace(result, /*trim_line_breaks=*/false);
540   SetValue(result, VerificationStatus::kFormatted);
541 }
542 
ReplacePlaceholderTypesWithValues(const base::string16 & format) const543 base::string16 AddressComponent::ReplacePlaceholderTypesWithValues(
544     const base::string16& format) const {
545   // Replaces placeholders using the following rules.
546   // Assumptions: Placeholder values are not nested.
547   //
548   // * Search for a substring of the form "{$[^}]*}".
549   // The substring can contain semicolon-separated tokens. The first token is
550   // always the type name. If present, the second token is a prefix that is only
551   // inserted if the corresponding value is not empty. Accordingly, the third
552   // token is a suffix.
553   //
554   // * Check if this substring is a supported type of this component.
555   //
556   // * If yes, replace the substring with the corresponding value.
557   //
558   // * If the corresponding value is empty, return false.
559 
560   auto control_parmater = base::ASCIIToUTF16("$").at(0);
561   auto control_parmater_open_delimitor = base::ASCIIToUTF16("{").at(0);
562   auto control_parmater_close_delimitor = base::ASCIIToUTF16("}").at(0);
563 
564   // Create a result vector for the tokens that are joined in the end.
565   std::vector<base::StringPiece16> result_pieces;
566 
567   // Store the token pieces that are joined in the end.
568   std::vector<base::string16> inserted_values;
569   inserted_values.reserve(20);
570 
571   // Use a StringPiece rather than the string since this allows for getting
572   // cheap views onto substrings.
573   const base::StringPiece16 format_piece = format;
574 
575   bool started_control_sequence = false;
576   // Track until which index the format string was fully processed.
577   size_t processed_until_index = 0;
578 
579   for (size_t i = 0; i < format_piece.size(); ++i) {
580     // Check if a control sequence is started by '${'
581     if (format_piece[i] == control_parmater && i < format_piece.size() - 1 &&
582         format_piece[i + 1] == control_parmater_open_delimitor) {
583       // A control sequence is started.
584       started_control_sequence = true;
585       // Append the preceding string since it can't be a valid placeholder.
586       if (i > 0) {
587         inserted_values.emplace_back(format_piece.substr(
588             processed_until_index, i - processed_until_index));
589       }
590       processed_until_index = i;
591       ++i;
592     } else if (started_control_sequence &&
593                format_piece[i] == control_parmater_close_delimitor) {
594       // The control sequence came to an end.
595       started_control_sequence = false;
596       size_t placeholder_start = processed_until_index + 2;
597       base::string16 placeholder(
598           format_piece.substr(placeholder_start, i - placeholder_start));
599 
600       std::vector<base::string16> placeholder_tokens =
601           base::SplitString(placeholder, base::ASCIIToUTF16(";"),
602                             base::KEEP_WHITESPACE, base::SPLIT_WANT_ALL);
603       DCHECK(placeholder_tokens.size() > 0);
604 
605       // By convention, the first token is the type of the placeholder.
606       base::string16 type_name = placeholder_tokens.at(0);
607       // If present, the second token is the prefix.
608       base::string16 prefix = placeholder_tokens.size() > 1
609                                   ? placeholder_tokens.at(1)
610                                   : base::string16();
611       // And the third token the suffix.
612       base::string16 suffix = placeholder_tokens.size() > 2
613                                   ? placeholder_tokens.at(2)
614                                   : base::string16();
615 
616       base::string16 value;
617       if (GetValueAndStatusForTypeIfPossible(base::UTF16ToASCII(type_name),
618                                              &value, nullptr)) {
619         // The type is valid and should be substituted.
620         if (!value.empty()) {
621           // Add the prefix if present.
622           if (!prefix.empty())
623             inserted_values.emplace_back(std::move(prefix));
624 
625           // Add the substituted value.
626           inserted_values.emplace_back(std::move(value));
627           // Add the suffix if present.
628           if (!suffix.empty())
629             inserted_values.emplace_back(std::move(suffix));
630         }
631       } else {
632         // Append the control sequence as it is, because the type is not
633         // supported by the component tree.
634         inserted_values.emplace_back(format_piece.substr(
635             processed_until_index, i - processed_until_index + 1));
636       }
637       processed_until_index = i + 1;
638     }
639   }
640   // Append the rest of the string.
641   inserted_values.emplace_back(
642       format_piece.substr(processed_until_index, base::string16::npos));
643 
644   // Build the final result.
645   return base::JoinString(inserted_values, base::ASCIIToUTF16(""));
646 }
647 
CompleteFullTree()648 bool AddressComponent::CompleteFullTree() {
649   int max_nodes_on_root_to_leaf_path =
650       GetRootNode().MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths();
651   // With more than one node the tree cannot be completed.
652   switch (max_nodes_on_root_to_leaf_path) {
653     // An empty tree is already complete.
654     case 0:
655       return true;
656     // With a single node, the tree is completable.
657     case 1:
658       GetRootNode().RecursivelyCompleteTree();
659       return true;
660     // In any other case, the tree is not completable.
661     default:
662       return false;
663   }
664 }
665 
RecursivelyCompleteTree()666 void AddressComponent::RecursivelyCompleteTree() {
667   if (IsAtomic())
668     return;
669 
670   // If the value is assigned, parse the subcomponents from the value.
671   if (!GetValue().empty() &&
672       MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() == 1)
673     ParseValueAndAssignSubcomponents();
674 
675   // First call completion on all subcomponents.
676   for (auto* subcomponent : subcomponents_)
677     subcomponent->RecursivelyCompleteTree();
678 
679   // Finally format the value from the sucomponents if it is not already
680   // assigned.
681   if (GetValue().empty())
682     FormatValueFromSubcomponents();
683 }
684 
685 int AddressComponent::
MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() const686     MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() const {
687   int result = 0;
688 
689   for (auto* subcomponent : subcomponents_) {
690     result = std::max(
691         result,
692         subcomponent
693             ->MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths());
694   }
695 
696   // Only count non-empty nodes.
697   if (!GetValue().empty())
698     ++result;
699 
700   return result;
701 }
702 
IsTreeCompletable()703 bool AddressComponent::IsTreeCompletable() {
704   // An empty tree is also a completable tree.
705   return MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() <= 1;
706 }
707 
GetRootNode() const708 const AddressComponent& AddressComponent::GetRootNode() const {
709   if (!parent_)
710     return *this;
711   return parent_->GetRootNode();
712 }
713 
GetRootNode()714 AddressComponent& AddressComponent::GetRootNode() {
715   return const_cast<AddressComponent&>(
716       const_cast<const AddressComponent*>(this)->GetRootNode());
717 }
718 
RecursivelyUnsetParsedAndFormattedValues()719 void AddressComponent::RecursivelyUnsetParsedAndFormattedValues() {
720   if (IsValueAssigned() &&
721       (GetVerificationStatus() == VerificationStatus::kFormatted ||
722        GetVerificationStatus() == VerificationStatus::kParsed))
723     UnsetValue();
724 
725   for (auto* component : subcomponents_)
726     component->RecursivelyUnsetParsedAndFormattedValues();
727 }
728 
RecursivelyUnsetSubcomponents()729 void AddressComponent::RecursivelyUnsetSubcomponents() {
730   for (auto* subcomponent : subcomponents_) {
731     subcomponent->UnsetValue();
732     subcomponent->RecursivelyUnsetSubcomponents();
733   }
734 }
735 
UnsetParsedAndFormattedValuesInEntireTree()736 void AddressComponent::UnsetParsedAndFormattedValuesInEntireTree() {
737   GetRootNode().RecursivelyUnsetParsedAndFormattedValues();
738 }
739 
MergeVerificationStatuses(const AddressComponent & newer_component)740 void AddressComponent::MergeVerificationStatuses(
741     const AddressComponent& newer_component) {
742   if (IsValueAssigned() && (GetValue() == newer_component.GetValue()) &&
743       HasNewerValuePrecendenceInMerging(newer_component)) {
744     value_verification_status_ = newer_component.GetVerificationStatus();
745   }
746 
747   DCHECK(newer_component.subcomponents_.size() == subcomponents_.size());
748   for (size_t i = 0; i < newer_component.subcomponents_.size(); i++) {
749     subcomponents_[i]->MergeVerificationStatuses(
750         *newer_component.subcomponents_.at(i));
751   }
752 }
753 
GetSortedTokens() const754 const std::vector<AddressToken> AddressComponent::GetSortedTokens() const {
755   return TokenizeValue(GetValue());
756 }
757 
IsMergeableWithComponent(const AddressComponent & newer_component) const758 bool AddressComponent::IsMergeableWithComponent(
759     const AddressComponent& newer_component) const {
760   const base::string16 value = ValueForComparison();
761   const base::string16 value_newer = newer_component.ValueForComparison();
762 
763   // If both components are the same, there is nothing to do.
764   if (*this == newer_component)
765     return true;
766 
767   if ((merge_mode_ & kReplaceEmpty) && (value.empty() || value_newer.empty())) {
768     return true;
769   }
770 
771   if (merge_mode_ & kUseBetterOrNewerForSameValue) {
772     if (base::ToUpperASCII(value) == base::ToUpperASCII(value_newer)) {
773       return true;
774     }
775   }
776 
777   SortedTokenComparisonResult token_comparison_result =
778       CompareSortedTokens(value, value_newer);
779 
780   if ((merge_mode_ & (kRecursivelyMergeTokenEquivalentValues |
781                       kRecursivelyMergeSingleTokenSubset)) &&
782       token_comparison_result.status == MATCH) {
783     return true;
784   }
785 
786   if ((merge_mode_ & (kReplaceSubset | kReplaceSuperset)) &&
787       (token_comparison_result.OneIsSubset() ||
788        token_comparison_result.status == MATCH)) {
789     return true;
790   }
791 
792   if ((merge_mode_ & kRecursivelyMergeSingleTokenSubset) &&
793       token_comparison_result.IsSingleTokenSuperset()) {
794     return true;
795   }
796 
797   if (merge_mode_ == kUseNewerIfDifferent)
798     return true;
799 
800   // If the one value is a substring of the other, use the substring of the
801   // corresponding mode is active.
802   if ((merge_mode_ & kUseMostRecentSubstring) &&
803       (value.find(value_newer) != base::string16::npos ||
804        value_newer.find(value) != base::string16::npos)) {
805     return true;
806   }
807 
808   if ((merge_mode_ & kPickShorterIfOneContainsTheOther) &&
809       token_comparison_result.ContainEachOther()) {
810     return true;
811   }
812 
813   if (merge_mode_ == kMergeChildrenAndReformat) {
814     bool is_mergeable = true;
815     DCHECK(newer_component.subcomponents_.size() == subcomponents_.size());
816     for (size_t i = 0; i < newer_component.subcomponents_.size(); i++) {
817       if (!subcomponents_[i]->IsMergeableWithComponent(
818               *newer_component.subcomponents_[i])) {
819         is_mergeable = false;
820         break;
821       }
822     }
823     if (is_mergeable)
824       return true;
825   }
826   return false;
827 }
828 
MergeWithComponent(const AddressComponent & newer_component,bool newer_was_more_recently_used)829 bool AddressComponent::MergeWithComponent(
830     const AddressComponent& newer_component,
831     bool newer_was_more_recently_used) {
832   // If both components are the same, there is nothing to do.
833 
834   const base::string16 value = ValueForComparison();
835   const base::string16 value_newer = newer_component.ValueForComparison();
836 
837   if (*this == newer_component)
838     return true;
839 
840   // Now, it is guaranteed that both values are not identical.
841   // Use the non empty one if the corresponding mode is active.
842   if (merge_mode_ & kReplaceEmpty) {
843     if (value.empty()) {
844       *this = newer_component;
845       return true;
846     }
847     if (value_newer.empty())
848       return true;
849   }
850 
851   // If the normalized values are the same, optimize the verification status.
852   if ((merge_mode_ & kUseBetterOrNewerForSameValue) && (value == value_newer)) {
853     if (HasNewerValuePrecendenceInMerging(newer_component)) {
854       *this = newer_component;
855     }
856     return true;
857   }
858 
859   // Compare the tokens of both values.
860   SortedTokenComparisonResult token_comparison_result =
861       CompareSortedTokens(value, value_newer);
862 
863   // Use the recursive merge strategy for token equivalent values if the
864   // corresponding mode is active.
865   if ((merge_mode_ & kRecursivelyMergeTokenEquivalentValues) &&
866       (token_comparison_result.status == MATCH)) {
867     return MergeTokenEquivalentComponent(newer_component);
868   }
869 
870   // Replace the subset with the superset if the corresponding mode is active.
871   if ((merge_mode_ & kReplaceSubset) && token_comparison_result.OneIsSubset()) {
872     if (token_comparison_result.status == SUBSET)
873       *this = newer_component;
874     return true;
875   }
876 
877   // Replace the superset with the subset if the corresponding mode is active.
878   if ((merge_mode_ & kReplaceSuperset) &&
879       token_comparison_result.OneIsSubset()) {
880     if (token_comparison_result.status == SUPERSET)
881       *this = newer_component;
882     return true;
883   }
884 
885   // If the tokens are already equivalent, use the more recently used one.
886   if ((merge_mode_ & (kReplaceSuperset | kReplaceSubset)) &&
887       token_comparison_result.status == MATCH) {
888     if (newer_was_more_recently_used)
889       *this = newer_component;
890     return true;
891   }
892 
893   // Recursively merge a single-token subset if the corresponding mode is
894   // active.
895   if ((merge_mode_ & kRecursivelyMergeSingleTokenSubset) &&
896       token_comparison_result.IsSingleTokenSuperset()) {
897     // For the merging of subset token, the tokenization must be done without
898     // prior normalization of the values.
899     SortedTokenComparisonResult token_comparison_result =
900         CompareSortedTokens(GetValue(), newer_component.GetValue());
901     return MergeSubsetComponent(newer_component, token_comparison_result);
902   }
903 
904   // Replace the older value with the newer one if the corresponding mode is
905   // active.
906   if (merge_mode_ & kUseNewerIfDifferent) {
907     *this = newer_component;
908     return true;
909   }
910 
911   // If the one value is a substring of the other, use the substring of the
912   // corresponding mode is active.
913   if ((merge_mode_ & kUseMostRecentSubstring) &&
914       (value.find(value_newer) != base::string16::npos ||
915        value_newer.find(value) != base::string16::npos)) {
916     if (newer_was_more_recently_used)
917       *this = newer_component;
918     return true;
919   }
920 
921   if ((merge_mode_ & kPickShorterIfOneContainsTheOther) &&
922       token_comparison_result.ContainEachOther()) {
923     if (newer_component.GetValue().size() <= GetValue().size())
924       *this = newer_component;
925     return true;
926   }
927 
928   // If the corresponding mode is active, ignore this mode and pair-wise merge
929   // the child tokens. Reformat this nodes from its children after the merge.
930   if (merge_mode_ & kMergeChildrenAndReformat) {
931     DCHECK(newer_component.subcomponents_.size() == subcomponents_.size());
932     for (size_t i = 0; i < newer_component.subcomponents_.size(); i++) {
933       bool success = subcomponents_[i]->MergeWithComponent(
934           *newer_component.subcomponents_[i], newer_was_more_recently_used);
935       if (!success)
936         return false;
937     }
938     FormatValueFromSubcomponents();
939     return true;
940   }
941   return false;
942 }
943 
HasNewerValuePrecendenceInMerging(const AddressComponent & newer_component) const944 bool AddressComponent::HasNewerValuePrecendenceInMerging(
945     const AddressComponent& newer_component) const {
946   return !IsLessSignificantVerificationStatus(
947       newer_component.GetVerificationStatus(), GetVerificationStatus());
948 }
949 
MergeTokenEquivalentComponent(const AddressComponent & newer_component)950 bool AddressComponent::MergeTokenEquivalentComponent(
951     const AddressComponent& newer_component) {
952   if (!AreSortedTokensEqual(
953           TokenizeValue(ValueForComparison()),
954           TokenizeValue(newer_component.ValueForComparison()))) {
955     return false;
956   }
957 
958   // Assumption:
959   // The values of both components are a permutation of the same tokens.
960   // The componentization of the components can be different in terms of
961   // how the tokens are divided between the subomponents. The valdiation
962   // status of the component and its subcomponent can be different.
963   //
964   // Merge Strategy:
965   // * Adopt the exact value (and validation status) of the node with the higher
966   // validation status.
967   //
968   // * For all subcomponents that have the same value, make a recursive call and
969   // use the result.
970   //
971   // * For the set of all non-matching subcomponents. Either use the ones from
972   // this component or the other depending on which substructure is better in
973   // terms of the number of validated tokens.
974 
975   if (HasNewerValuePrecendenceInMerging(newer_component)) {
976     SetValue(newer_component.GetValue(),
977              newer_component.GetVerificationStatus());
978   }
979 
980   // Now, the substructure of the node must be merged. There are three cases:
981   //
982   // * All nodes of the substructure are pairwise mergeable. In this case it
983   // is sufficient to apply a recursive merging strategy.
984   //
985   // * None of the nodes of the substructure are pairwise mergeable. In this
986   // case, either the complete substructure of |this| or |newer_component|
987   // must be used. Which one to use can be decided by the higher validation
988   // score.
989   //
990   // * In a mixed scenario, there is at least one pair of mergeable nodes
991   // in the substructure and at least on pair of non-mergeable nodes. Here,
992   // the mergeable nodes are merged while all other nodes are taken either
993   // from |this| or the |newer_component| decided by the higher validation
994   // score of the unmerged nodes.
995   //
996   // The following algorithm combines the three cases by first trying to merge
997   // all components pair-wise. For all components that couldn't be merged, the
998   // verification score is summed for this and the other component. If the other
999   // component has an equal or larger score, finalize the merge by using its
1000   // components. It is assumed that the other component is the newer of the two
1001   // components. By favoring the other component in a tie, the most recently
1002   // used structure wins.
1003 
1004   const std::vector<AddressComponent*> other_subcomponents =
1005       newer_component.Subcomponents();
1006 
1007   DCHECK(subcomponents_.size() == other_subcomponents.size());
1008 
1009   int this_component_verification_score = 0;
1010   int newer_component_verification_score = 0;
1011 
1012   std::vector<int> unmerged_indices;
1013   unmerged_indices.reserve(subcomponents_.size());
1014 
1015   for (size_t i = 0; i < subcomponents_.size(); i++) {
1016     DCHECK(subcomponents_[i]->GetStorageType() ==
1017            other_subcomponents.at(i)->GetStorageType());
1018 
1019     // If the components can't be merged directly, store the ungermed index and
1020     // sum the verification scores to decide which component's substructure to
1021     // use.
1022     if (!subcomponents_[i]->MergeTokenEquivalentComponent(
1023             *other_subcomponents.at(i))) {
1024       this_component_verification_score +=
1025           subcomponents_[i]->GetStructureVerificationScore();
1026       newer_component_verification_score +=
1027           other_subcomponents.at(i)->GetStructureVerificationScore();
1028       unmerged_indices.emplace_back(i);
1029     }
1030   }
1031 
1032   // If the total verification score of all unmerged components of the other
1033   // component is equal or larger than the score of this component, use its
1034   // subcomponents including their substructure for all unmerged components.
1035   if (newer_component_verification_score >= this_component_verification_score) {
1036     for (size_t i : unmerged_indices)
1037       *subcomponents_[i] = *other_subcomponents[i];
1038   }
1039 
1040   return true;
1041 }
1042 
ConsumeAdditionalToken(const base::string16 & token_value)1043 void AddressComponent::ConsumeAdditionalToken(
1044     const base::string16& token_value) {
1045   if (IsAtomic()) {
1046     if (GetValue().empty()) {
1047       SetValue(token_value, VerificationStatus::kParsed);
1048     } else {
1049       SetValue(base::StrCat({GetValue(), base::ASCIIToUTF16(" "), token_value}),
1050                VerificationStatus::kParsed);
1051     }
1052     return;
1053   }
1054 
1055   // Try the first free subcomponent.
1056   for (auto* subcomponent : subcomponents_) {
1057     if (subcomponent->GetValue().empty()) {
1058       subcomponent->SetValue(token_value, VerificationStatus::kParsed);
1059       return;
1060     }
1061   }
1062 
1063   // Otherwise append the value to the first component.
1064   subcomponents_[0]->SetValue(
1065       base::StrCat({GetValue(), base::ASCIIToUTF16(" "), token_value}),
1066       VerificationStatus::kParsed);
1067 }
1068 
MergeSubsetComponent(const AddressComponent & subset_component,const SortedTokenComparisonResult & token_comparison_result)1069 bool AddressComponent::MergeSubsetComponent(
1070     const AddressComponent& subset_component,
1071     const SortedTokenComparisonResult& token_comparison_result) {
1072   DCHECK(token_comparison_result.IsSingleTokenSuperset());
1073   DCHECK(token_comparison_result.additional_tokens.size() == 1);
1074 
1075   base::string16 token_to_consume =
1076       token_comparison_result.additional_tokens.back().value;
1077 
1078   int this_component_verification_score = 0;
1079   int newer_component_verification_score = 0;
1080   bool found_subset_component = false;
1081 
1082   std::vector<int> unmerged_indices;
1083   unmerged_indices.reserve(subcomponents_.size());
1084 
1085   const std::vector<AddressComponent*>& subset_subcomponents =
1086       subset_component.Subcomponents();
1087 
1088   unmerged_indices.reserve(subcomponents_.size());
1089 
1090   for (size_t i = 0; i < subcomponents_.size(); i++) {
1091     DCHECK(subcomponents_[i]->GetStorageType() ==
1092            subset_subcomponents.at(i)->GetStorageType());
1093 
1094     AddressComponent* subcomponent = subcomponents_[i];
1095     const AddressComponent* subset_subcomponent = subset_subcomponents.at(i);
1096 
1097     base::string16 additional_token;
1098 
1099     // If the additional token is the value of this token. Just leave it in.
1100     if (!found_subset_component &&
1101         subcomponent->GetValue() == token_to_consume &&
1102         subset_subcomponent->GetValue().empty()) {
1103       found_subset_component = true;
1104       continue;
1105     }
1106 
1107     SortedTokenComparisonResult subtoken_comparison_result =
1108         CompareSortedTokens(subcomponent->GetSortedTokens(),
1109                             subset_subcomponent->GetSortedTokens());
1110 
1111     // Recursive case.
1112     if (!found_subset_component &&
1113         subtoken_comparison_result.IsSingleTokenSuperset()) {
1114       found_subset_component = true;
1115       subcomponent->MergeSubsetComponent(*subset_subcomponent,
1116                                          subtoken_comparison_result);
1117       continue;
1118     }
1119 
1120     // If the tokens are the equivalent, they can directly be merged.
1121     if (subtoken_comparison_result.status == MATCH) {
1122       subcomponent->MergeTokenEquivalentComponent(*subset_subcomponent);
1123       continue;
1124     }
1125 
1126     // Otherwise calculate the verification score.
1127     this_component_verification_score +=
1128         subcomponent->GetStructureVerificationScore();
1129     newer_component_verification_score +=
1130         subset_subcomponent->GetStructureVerificationScore();
1131     unmerged_indices.emplace_back(i);
1132   }
1133 
1134   // If the total verification score of all unmerged components of the other
1135   // component is equal or larger than the score of this component, use its
1136   // subcomponents including their substructure for all unmerged components.
1137   if (newer_component_verification_score >= this_component_verification_score) {
1138     for (size_t i : unmerged_indices)
1139       *subcomponents_[i] = *subset_subcomponents[i];
1140 
1141     if (!found_subset_component)
1142       this->ConsumeAdditionalToken(token_to_consume);
1143   }
1144 
1145   // In the current implementation it is always possible to merge.
1146   // Once more tokens are supported this may change.
1147   return true;
1148 }
1149 
GetStructureVerificationScore() const1150 int AddressComponent::GetStructureVerificationScore() const {
1151   int result = 0;
1152   switch (GetVerificationStatus()) {
1153     case VerificationStatus::kNoStatus:
1154     case VerificationStatus::kParsed:
1155     case VerificationStatus::kFormatted:
1156     case VerificationStatus::kServerParsed:
1157       break;
1158     case VerificationStatus::kObserved:
1159       result += 1;
1160       break;
1161     case VerificationStatus::kUserVerified:
1162       // In the current implementation, only the root not can be verified by
1163       // the user.
1164       NOTREACHED();
1165       break;
1166   }
1167   for (const AddressComponent* component : subcomponents_)
1168     result += component->GetStructureVerificationScore();
1169 
1170   return result;
1171 }
1172 
NormalizedValue() const1173 base::string16 AddressComponent::NormalizedValue() const {
1174   return NormalizeValue(GetValue());
1175 }
1176 
ValueForComparison() const1177 base::string16 AddressComponent::ValueForComparison() const {
1178   return NormalizedValue();
1179 }
1180 
1181 }  // namespace structured_address
1182 
1183 }  // namespace autofill
1184