1 // Copyright 2020 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_
6 #define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_
7 
8 #include <map>
9 #include <string>
10 #include <vector>
11 
12 #include "base/optional.h"
13 #include "base/strings/string_piece.h"
14 #include "components/autofill/core/browser/field_types.h"
15 
16 namespace re2 {
17 class RE2;
18 }  // namespace re2
19 
20 namespace autofill {
21 namespace structured_address {
22 
23 struct AddressToken;
24 struct SortedTokenComparisonResult;
25 
26 // Represents the validation status of value stored in the AutofillProfile.
27 // The associated integer values used to store the verification code in SQL and
28 // should not be modified.
29 enum class VerificationStatus {
30   // No verification status assigned.
31   kNoStatus = 0,
32   // The value token was parsed from a parent token.
33   kParsed = 1,
34   // Value was built from its subcomponents.
35   kFormatted = 2,
36   // The value was observed in a form transmission.
37   kObserved = 3,
38   // The user used the autofill settings to verify and store this token.
39   kUserVerified = 4,
40   // The token was parsed by the server.
41   kServerParsed = 5,
42 };
43 
44 // Returns true if |left| has a less significant verification status compared to
45 // |right|.
46 bool IsLessSignificantVerificationStatus(VerificationStatus left,
47                                          VerificationStatus right);
48 
49 // The merge mode defines if and how two components are merged.
50 enum MergeMode {
51   // If one component has an empty value, use the non-empty one.
52   kReplaceEmpty = 1,
53   // Recursively merge two components that have the same tokens in arbitrary
54   // order. This is used as the default merge mode.
55   kRecursivelyMergeTokenEquivalentValues = 1 << 1,
56   // If both tokens have the same normalized value, use the one with the better
57   // verification status. If both statuses are the same, use the newer one.
58   kUseBetterOrNewerForSameValue = 1 << 2,
59   // If one component is a superset of the other, use the subset.
60   kReplaceSuperset = 1 << 3,
61   // If one component is a subset of the other, use the superset.
62   kReplaceSubset = 1 << 4,
63   // If both components have a different value, is the newer one.
64   kUseNewerIfDifferent = 1 << 5,
65   // If the newer component contains one token more, apply a recursive strategy
66   // to merge the tokens.
67   kRecursivelyMergeSingleTokenSubset = 1 << 6,
68   // If one is a substring use the most recent one.
69   kUseMostRecentSubstring = 1 << 7,
70   // Merge the child nodes and reformat the node from its children after merge.
71   kMergeChildrenAndReformat = 1 << 8,
72   // If the tokens match or one is a subset of the other, pick the shorter one.
73   kPickShorterIfOneContainsTheOther = 1 << 9,
74   // Defines the default merging behavior.
75   kDefault = kRecursivelyMergeTokenEquivalentValues
76 };
77 
78 // An AddressComponent is a tree structure that represents a semi-structured
79 // address token. Such an address token can either be an atomic leaf node or
80 // have a set of children, each representing a more granular subtoken of the
81 // component.
82 //
83 // An AddressComponent has a string representation stored in |value_| and a
84 // VerificationStatus stored in |verification_status_|.
85 // The latter indicates if the value was user-verified, observed in a form
86 // submission event, parsed from its parent component or was formatted from its
87 // child components.
88 //
89 // In a proper component tree, each AddressComponent has a unique
90 // ServerFieldType. Additionally, an AddressComponent may be associated with a
91 // list of additional field types that allow for retrieving and setting the
92 // Component's value in specific formats. For example, NAME_MIDDLE may be the
93 // storage type and NAME_MIDDLE_INITIAL is an additional field type.
94 //
95 // The usage pattern of such an address tree is as follows:
96 //
97 //  * Create a tree from an observed form submission or a profile editing or
98 //  creation event in the Chrome settings. It is assumed that the created
99 //  tree does not have values for competing field types. Two types are competing
100 //  iff they are on a common root-to-leaf path. For example, an imported profile
101 //  with a value for NAME_FULL and NAME_LAST has conflicting types that
102 //  carry redundant information.
103 //
104 //  * After the creation of the tree, the values of unassigned nodes in the tree
105 //  are deducted from the values of assigned nodes. This happens by parsing
106 //  (taking a string and splitting it into components) or by formatting (taking
107 //  one or multiple strings and combining them into one string).
108 //
109 //  * After the completion, there should be no need to modify the tree.
110 //
111 //  * A tree may be mergeable with another tree of the same type. This
112 //  operation incorporates complementing observations. For example, in the first
113 //  tree NAME_FIRST, NAME_MIDDLE and NAME_LAST may be parsed from an observed
114 //  unstructured name (NAME_FULL). The second tree may be built from observing
115 //  the structured name, and contain observed NAME_FIRST, NAME_MIDDLE and
116 //  NAME_LAST values but only a formatted NAME_FULL value.
117 class AddressComponent {
118  public:
119   // Constructor for a compound child node.
120   AddressComponent(ServerFieldType storage_type,
121                    AddressComponent* parent,
122                    std::vector<AddressComponent*> subcomponents,
123                    unsigned int merge_mode);
124 
125   // Disallows copies since they are not needed in the current Autofill design.
126   AddressComponent(const AddressComponent& other) = delete;
127 
128   virtual ~AddressComponent();
129 
130   // Assignment operator that works recursively down the tree and assigns the
131   // |value_| and |verification_status_| of every node in right to the
132   // corresponding nodes in |this|. For an assignment it is required that both
133   // nodes have the same |storage_type_|.
134   AddressComponent& operator=(const AddressComponent& right);
135 
136   // Comparison operator that works recursively down the tree.
137   bool operator==(const AddressComponent& right) const;
138 
139   // Inequality operator that works recursively down the tree.
140   bool operator!=(const AddressComponent& right) const;
141 
142   // Returns the autofill storage type stored in |storage_type_|.
143   ServerFieldType GetStorageType() const;
144 
145   // Returns the string representation of |storage_type_|.
146   std::string GetStorageTypeName() const;
147 
148   // Returns the value verification status of the component's value;
149   VerificationStatus GetVerificationStatus() const;
150 
151   // Returns true if the component has no subcomponents.
152   bool IsAtomic() const;
153 
154   // Returns a constant reference to |value_.value()|. If the value is not
155   // assigned, an empty string is returned.
156   const base::string16& GetValue() const;
157 
158   // Returns true if the value of this AddressComponent is assigned.
159   bool IsValueAssigned() const;
160 
161   // Sets the value corresponding to the storage type of this AddressComponent.
162   virtual void SetValue(base::string16 value, VerificationStatus status);
163 
164   // Sets the value to an empty string, marks it unassigned and sets the
165   // verification status to |kNoStatus|.
166   virtual void UnsetValue();
167 
168   // The method sets the value of the current node if its |storage_type_| is
169   // |type| or if |ConvertAndGetTheValueForAdditionalFieldTypeName()| supports
170   // retrieving |type|. Otherwise, the call is delegated recursively to the
171   // node's children.
172   // Returns true if the |value_| and |verification_status_| were successfully
173   // set for this or an ancestor node with the storage type |type|. If
174   // |invalidate_child_nodes|, all child nodes of the assigned node are
175   // unassigned. If |invalidate_parent_nodes|, all ancestor nodes of the
176   // assigned node as unassigned.
177   bool SetValueForTypeIfPossible(const ServerFieldType& type,
178                                  const base::string16& value,
179                                  const VerificationStatus& verification_status,
180                                  bool invalidate_child_nodes = false,
181                                  bool invalidate_parent_nodes = false);
182 
183   // Same as |SetValueForTypeIfPossible()| but the type is supplied in the
184   // corresponding string representation.
185   bool SetValueForTypeIfPossible(const std::string& type_name,
186                                  const base::string16& value,
187                                  const VerificationStatus& verification_status,
188                                  bool invalidate_child_nodes = false,
189                                  bool invalidate_parent_nodes = false);
190 
191   // Convenience wrapper to allow setting the value using a std::string.
192   bool SetValueForTypeIfPossible(const ServerFieldType& type,
193                                  const std::string& value,
194                                  const VerificationStatus& verification_status,
195                                  bool invalidate_child_nodes = false,
196                                  bool invalidate_parent_nodes = false);
197 
198   // Convenience wrapper to allow setting the value using a std::string.
199   bool SetValueForTypeIfPossible(const std::string& type_name,
200                                  const std::string& value,
201                                  const VerificationStatus& verification_status,
202                                  bool invalidate_child_nodes = false,
203                                  bool invalidate_parent_nodes = false);
204 
205   // Convenience method to get the value of |type|.
206   // Returns an empty string if |type| is not supported.
207   base::string16 GetValueForType(const ServerFieldType& type) const;
208 
209   // Convenience method to get the value of |type| identified by its string
210   // representation name. Returns an empty string if |type| is not supported.
211   base::string16 GetValueForType(const std::string& type) const;
212 
213   // Convenience method to get the verification status of |type|.
214   // Returns |VerificationStatus::kNoStatus| if |type| is not supported.
215   VerificationStatus GetVerificationStatusForType(
216       const ServerFieldType& type) const;
217 
218   // Convenience method to get the verification status of |type| identified by
219   // its name. Returns |VerificationStatus::kNoStatus| if |type| is not
220   // supported.
221   VerificationStatus GetVerificationStatusForType(
222       const std::string& type) const;
223 
224   // Get the value and status of a |type|,
225   // Returns false if the |type| is not supported by the structure.
226   // The method returns |value_| and |validation_status_| of the current node if
227   // its |storage_type_| is |type| or if
228   // |ConvertAndSetTheValueForAdditionalFieldTypeName()| supports setting
229   // |type|. Otherwise, the call is delegated recursively to the node's
230   // children. Returns false if the neither the node or one of its ancestors
231   // supports |type|.
232   bool GetValueAndStatusForTypeIfPossible(const ServerFieldType& type,
233                                           base::string16* value,
234                                           VerificationStatus* status) const;
235 
236   // Get the value and status of a |type| identified by its name.
237   // Returns false if the |type| is not supported by the structure.
238   bool GetValueAndStatusForTypeIfPossible(const std::string& type_name,
239                                           base::string16* value,
240                                           VerificationStatus* status) const;
241 
242   // Returns true if the |value| and |verification_status| were successfully
243   // unset for |type|.
244   bool UnsetValueForTypeIfSupported(const ServerFieldType& type);
245 
246   // Parses |value_| to assign values to the subcomponents.
247   // The method uses 3 stages:
248   //
249   // * Use |ParseValueAndAssignSubcomponentsByMethod()|. This stage exists
250   // to catch special cases and may fail. The method is virtual and can be
251   // implemented on the type level.
252   //
253   // * Use |ParseValueAndAssignSubcomponentsByRegularExpressions()|. This stage
254   // uses a list of regular expressions acquired by the virtual method
255   // |GetParseRegularExpressionsByRelevance()|. This stage my fail.
256   //
257   // * Use |ParseValueAndAssignSubcomponentsByFallbackMethod()| as the last
258   // resort to parse |value_|. This method must produce a valid result.
259   void ParseValueAndAssignSubcomponents();
260 
261   // This methods populated the unassigned entries in the subtree of this node
262   // by either parsing unknown values for subcomponents from their parents, or
263   // vice versa, formatting unknown values from known subcomponents. The method
264   // is virtual and can be reimplemented on the type level.
265   virtual void RecursivelyCompleteTree();
266 
267   // Completes the full tree by calling |RecursivelyCompleteTree()| starting
268   // form the root node. Returns true if the completion was successful.
269   virtual bool CompleteFullTree();
270 
271   // Checks if a tree is completable in the sense that there are no conflicting
272   // observed or verified types. This means that there is not more than one
273   // observed or verified node on any root-to-leaf path in the tree.
274   bool IsTreeCompletable();
275 
276   // Recursively adds the supported types to the set. Calls
277   // |GetAdditionalSupportedFieldTypes()| to add field types.
278   void GetSupportedTypes(ServerFieldTypeSet* supported_types) const;
279 
280   // Adds the additional supported field types to |supported_types|.
281   // The method should DCHECK that the added types are not part of the set yet.
GetAdditionalSupportedFieldTypes(ServerFieldTypeSet * supported_types)282   virtual void GetAdditionalSupportedFieldTypes(
283       ServerFieldTypeSet* supported_types) const {}
284 
285   // Unassigns all nodes with parsed or formatted values.
286   void UnsetParsedAndFormattedValuesInEntireTree();
287 
288   // Unassigns all nodes with parsed or formatted values.
289   void RecursivelyUnsetParsedAndFormattedValues();
290 
291   // Returns true if both components are mergeable.
292   virtual bool IsMergeableWithComponent(
293       const AddressComponent& newer_component) const;
294 
295   // Recursively updates the verification statuses to the higher one, for nodes
296   // in |newer_component| that have the same values as the nodes in |this|.
297   virtual void MergeVerificationStatuses(
298       const AddressComponent& newer_component);
299 
300   // Merge |newer_component| into this AddressComponent.
301   // Returns false if the merging is not possible.
302   // The state of the component is not altered by a failed merging attempt.
303   // |newer_was_more_recently_used| indicates that the newer component was also
304   // more recently used for filling a form.
305   virtual bool MergeWithComponent(const AddressComponent& newer_component,
306                                   bool newer_was_more_recently_used = true);
307 
308   // Merge |newer_component| into this AddressComponent.
309   // The merging is possible iff the value of both root nodes is token
310   // equivalent, meaning they contain the same tokens in an arbitrary order.
311   // Returns false if the merging is not possible.
312   // The state of the component is not altered by a failed merging attempt.
313   bool MergeTokenEquivalentComponent(const AddressComponent& newer_component);
314 
315   // Returns a constant vector of pointers to the child nodes of the component.
Subcomponents()316   const std::vector<AddressComponent*>& Subcomponents() const {
317     return subcomponents_;
318   }
319 
320   // Returns a vector containing sorted normalized tokens of the
321   // value of the component. The tokens are lazily calculated when first needed.
322   const std::vector<AddressToken> GetSortedTokens() const;
323 
324   // Recursively unsets all subcomponents.
325   void RecursivelyUnsetSubcomponents();
326 
327   // Return if the value associated with |field_type_name| is valid.
328   // If |wipe_if_not|, the value is unset if invalid.
329   bool IsValueForTypeValid(const std::string& field_type_name,
330                            bool wipe_if_not = false);
331 
332   // Convenience wrapper to work the ServerFieldTypes.
333   bool IsValueForTypeValid(ServerFieldType field_type,
334                            bool wipe_if_not = false);
335 
336   // Recursively determines the validity status of a component value associated
337   // with |field_type_name|.  If |wipe_if_not|, the value is unset if invalid.
338   // Returns true if it is possible to determine the validity status of the
339   // value in this subcomponent.
340   bool GetIsValueForTypeValidIfPossible(const std::string& field_type_name,
341                                         bool* validity_status,
342                                         bool wipe_if_not = false);
343 
344   // Deletes the stored structure if it contains strings that are not a
345   // substring of the unstructured representation.
346   // Return true if a wipe operation was performed.
347   virtual bool WipeInvalidStructure();
348 
349 #ifdef UNIT_TEST
350   // Initiates the formatting of the values from the subcomponents.
FormatValueFromSubcomponentsForTesting()351   void FormatValueFromSubcomponentsForTesting() {
352     FormatValueFromSubcomponents();
353   }
354 
355   // Returns the best format string for testing.
GetBestFormatStringForTesting()356   base::string16 GetBestFormatStringForTesting() {
357     return GetBestFormatString();
358   }
359 
360   // Returns the parse expressions by relevance for testing.
361   std::vector<const re2::RE2*>
GetParseRegularExpressionsByRelevanceForTesting()362   GetParseRegularExpressionsByRelevanceForTesting() {
363     return GetParseRegularExpressionsByRelevance();
364   }
365 
366   // Returns a reference to the root node of the tree for testing.
GetRootNodeForTesting()367   AddressComponent& GetRootNodeForTesting() { return GetRootNode(); }
368 
369   // Replaces placeholder values in the best format string with the
370   // corresponding values.
GetReplacedPlaceholderTypesWithValuesForTesting()371   base::string16 GetReplacedPlaceholderTypesWithValuesForTesting() const {
372     return ReplacePlaceholderTypesWithValues(GetBestFormatString());
373   }
374 
375   // Returns a vector containing the |storage_types_| of all direct
376   // subcomponents.
GetSubcomponentTypesForTesting()377   std::vector<ServerFieldType> GetSubcomponentTypesForTesting() const {
378     return GetSubcomponentTypes();
379   }
380 
381   // Sets the merge mode for testing purposes.
SetMergeModeForTesting(int merge_mode)382   void SetMergeModeForTesting(int merge_mode) { merge_mode_ = merge_mode; }
383 
384 #endif
385 
386  protected:
387   // Returns the verification score of this component and its substructure.
388   // Each observed node contributes to the validation score by 1.
389   virtual int GetStructureVerificationScore() const;
390 
391   // Returns a vector containing the |storage_types_| of all direct
392   // subcomponents.
393   std::vector<ServerFieldType> GetSubcomponentTypes() const;
394 
395   // Heuristic method to get the best suited format string.
396   // This method is virtual and can be reimplemented for each type.
397   virtual base::string16 GetBestFormatString() const;
398 
399   // Returns pointers to regular expressions sorted by their relevance.
400   // This method is virtual and can be reimplemented for each type.
401   virtual std::vector<const re2::RE2*> GetParseRegularExpressionsByRelevance()
402       const;
403 
404   // Method to parse |value_| into the values of |subcomponents_|. The
405   // purpose of this method is to cover special cases. This method returns true
406   // on success and is allowed to fail. On failure, the |subcomponents_| are not
407   // altered.
408   virtual bool ParseValueAndAssignSubcomponentsByMethod();
409 
410   // This method parses |value_| to assign values to the subcomponents.
411   // The method is virtual and can be reimplemented per type.
412   // It must succeed.
413   virtual void ParseValueAndAssignSubcomponentsByFallbackMethod();
414 
415   // This method is used to set the value given by a type different than the
416   // storage type. It must implement the conversion logic specific to each type.
417   // It returns true if conversion logic exists and the type can be set.
418   virtual bool ConvertAndSetValueForAdditionalFieldTypeName(
419       const std::string& field_type_name,
420       const base::string16& value,
421       const VerificationStatus& status);
422 
423   // This method is used to retrieve the value for a supported field type
424   // different from the storage type. It must implement the conversion logic
425   // specific to each type. It returns true if the type is supported and the
426   // value can be written back to value.
427   // The method must handle |nullptr|s for both the value and status.
428   virtual bool ConvertAndGetTheValueForAdditionalFieldTypeName(
429       const std::string& field_type_name,
430       base::string16* value) const;
431 
432   // Clears all parsed and formatted values.
433   void ClearAllParsedAndFormattedValues();
434 
435   // Merge a component that has exactly one token less.
436   bool MergeSubsetComponent(
437       const AddressComponent& subset_component,
438       const SortedTokenComparisonResult& token_comparison_result);
439 
440   // Consumes an additional token into the most appropriate subcomponent.
441   // Can be implemented by the specific node types.
442   // The fall-back solution uses the first empty node.
443   // If no empty node is available, it appends the value to the first node.
444   virtual void ConsumeAdditionalToken(const base::string16& token_value);
445 
446   // Returns a reference to the root node of the tree.
447   AddressComponent& GetRootNode();
448 
449   // Returns a reference to the root node of the tree.
450   const AddressComponent& GetRootNode() const;
451 
452   // Function to determine if the value stored in this component is valid.
453   // Return true be default but can be overloaded by a subclass.
454   virtual bool IsValueValid() const;
455 
456   // Function to be called post assign to do sanitization.
PostAssignSanitization()457   virtual void PostAssignSanitization() {}
458 
459   // Returns a normalized value for comparison.
460   // In the default implementation, this converts the value to lower case and
461   // removes white spaces. This function may be reimplemented to perform
462   // different normalization operations.
463   virtual base::string16 NormalizedValue() const;
464 
465   // Returns a value used for comparison.
466   // In the default implementation this is just the normalized value but this
467   // function can be overridden in subclasses to apply further operations on
468   // the normalized value.
469   virtual base::string16 ValueForComparison() const;
470 
471   // Returns true if the merging of two token identical values should give
472   // precedence to the newer value. By default, the newer component gets
473   // precedence if it has the same or better verification status.
474   virtual bool HasNewerValuePrecendenceInMerging(
475       const AddressComponent& newer_component) const;
476 
477   // Parses |value| by using |parse_expressions| and assigns the values.
478   // Returns true on success.
479   bool ParseValueAndAssignSubcomponentsByRegularExpression(
480       const base::string16& value,
481       const re2::RE2* parse_expression);
482 
483  private:
484   // Unsets the node and all of its children.
485   void UnsetAddressComponentAndItsSubcomponents();
486 
487   // Unsets the children of a node.
488   void UnsetSubcomponents();
489 
490   // Determines the |value_| from the values of the subcomponents by using the
491   // most suitable format string determined by |GetBestFormatString()|.
492   void FormatValueFromSubcomponents();
493 
494   // Replaces placeholder values with the corresponding values.
495   base::string16 ReplacePlaceholderTypesWithValues(
496       const base::string16& format) const;
497 
498   // Replaces placeholder values with the corresponding values.
499   base::string16 ReplacePlaceholderTypesWithValuesRegexVersion(
500       const base::string16& format) const;
501 
502   // This method uses regular expressions acquired by
503   // |GetParseRegularExpressionsByRelevance| to parse |value_| into the values
504   // of the subcomponents. Returns true on success and is allowed to fail.
505   bool ParseValueAndAssignSubcomponentsByRegularExpressions();
506 
507   // Returns the maximum number of components with assigned values on the path
508   // from the component to a leaf node.
509   int MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() const;
510 
511   // The unstructured value of this component.
512   base::Optional<base::string16> value_;
513 
514   // The verification status of |value_| indicates the certainty of the value
515   // to be correct.
516   VerificationStatus value_verification_status_;
517 
518   // The storable Autofill type of the component.
519   const ServerFieldType storage_type_;
520 
521   // A vector of pointers to the subcomponents.
522   std::vector<AddressComponent*> subcomponents_;
523 
524   // A vector that contains the tokens of |value_| after normalization,
525   // meaning that it was converted to lower case and diacritics have been
526   // removed. |value_| is tokenized by splitting the string by white spaces and
527   // commas. It is calculated when |value_| is set.
528   base::Optional<std::vector<AddressToken>> sorted_normalized_tokens_;
529 
530   // A pointer to the parent node. It is set to nullptr if the node is the root
531   // node of the AddressComponent tree.
532   AddressComponent* const parent_;
533 
534   // Defines if and how two components can be merged.
535   int merge_mode_;
536 };
537 
538 }  // namespace structured_address
539 
540 }  // namespace autofill
541 #endif  // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_
542