1 // Copyright 2020 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_ 6 #define COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_ 7 8 #include <map> 9 #include <string> 10 #include <vector> 11 12 #include "base/optional.h" 13 #include "base/strings/string_piece.h" 14 #include "components/autofill/core/browser/field_types.h" 15 16 namespace re2 { 17 class RE2; 18 } // namespace re2 19 20 namespace autofill { 21 namespace structured_address { 22 23 struct AddressToken; 24 struct SortedTokenComparisonResult; 25 26 // Represents the validation status of value stored in the AutofillProfile. 27 // The associated integer values used to store the verification code in SQL and 28 // should not be modified. 29 enum class VerificationStatus { 30 // No verification status assigned. 31 kNoStatus = 0, 32 // The value token was parsed from a parent token. 33 kParsed = 1, 34 // Value was built from its subcomponents. 35 kFormatted = 2, 36 // The value was observed in a form transmission. 37 kObserved = 3, 38 // The user used the autofill settings to verify and store this token. 39 kUserVerified = 4, 40 // The token was parsed by the server. 41 kServerParsed = 5, 42 }; 43 44 // Returns true if |left| has a less significant verification status compared to 45 // |right|. 46 bool IsLessSignificantVerificationStatus(VerificationStatus left, 47 VerificationStatus right); 48 49 // The merge mode defines if and how two components are merged. 50 enum MergeMode { 51 // If one component has an empty value, use the non-empty one. 52 kReplaceEmpty = 1, 53 // Recursively merge two components that have the same tokens in arbitrary 54 // order. This is used as the default merge mode. 55 kRecursivelyMergeTokenEquivalentValues = 1 << 1, 56 // If both tokens have the same normalized value, use the one with the better 57 // verification status. If both statuses are the same, use the newer one. 58 kUseBetterOrNewerForSameValue = 1 << 2, 59 // If one component is a superset of the other, use the subset. 60 kReplaceSuperset = 1 << 3, 61 // If one component is a subset of the other, use the superset. 62 kReplaceSubset = 1 << 4, 63 // If both components have a different value, is the newer one. 64 kUseNewerIfDifferent = 1 << 5, 65 // If the newer component contains one token more, apply a recursive strategy 66 // to merge the tokens. 67 kRecursivelyMergeSingleTokenSubset = 1 << 6, 68 // If one is a substring use the most recent one. 69 kUseMostRecentSubstring = 1 << 7, 70 // Merge the child nodes and reformat the node from its children after merge. 71 kMergeChildrenAndReformat = 1 << 8, 72 // If the tokens match or one is a subset of the other, pick the shorter one. 73 kPickShorterIfOneContainsTheOther = 1 << 9, 74 // Defines the default merging behavior. 75 kDefault = kRecursivelyMergeTokenEquivalentValues 76 }; 77 78 // An AddressComponent is a tree structure that represents a semi-structured 79 // address token. Such an address token can either be an atomic leaf node or 80 // have a set of children, each representing a more granular subtoken of the 81 // component. 82 // 83 // An AddressComponent has a string representation stored in |value_| and a 84 // VerificationStatus stored in |verification_status_|. 85 // The latter indicates if the value was user-verified, observed in a form 86 // submission event, parsed from its parent component or was formatted from its 87 // child components. 88 // 89 // In a proper component tree, each AddressComponent has a unique 90 // ServerFieldType. Additionally, an AddressComponent may be associated with a 91 // list of additional field types that allow for retrieving and setting the 92 // Component's value in specific formats. For example, NAME_MIDDLE may be the 93 // storage type and NAME_MIDDLE_INITIAL is an additional field type. 94 // 95 // The usage pattern of such an address tree is as follows: 96 // 97 // * Create a tree from an observed form submission or a profile editing or 98 // creation event in the Chrome settings. It is assumed that the created 99 // tree does not have values for competing field types. Two types are competing 100 // iff they are on a common root-to-leaf path. For example, an imported profile 101 // with a value for NAME_FULL and NAME_LAST has conflicting types that 102 // carry redundant information. 103 // 104 // * After the creation of the tree, the values of unassigned nodes in the tree 105 // are deducted from the values of assigned nodes. This happens by parsing 106 // (taking a string and splitting it into components) or by formatting (taking 107 // one or multiple strings and combining them into one string). 108 // 109 // * After the completion, there should be no need to modify the tree. 110 // 111 // * A tree may be mergeable with another tree of the same type. This 112 // operation incorporates complementing observations. For example, in the first 113 // tree NAME_FIRST, NAME_MIDDLE and NAME_LAST may be parsed from an observed 114 // unstructured name (NAME_FULL). The second tree may be built from observing 115 // the structured name, and contain observed NAME_FIRST, NAME_MIDDLE and 116 // NAME_LAST values but only a formatted NAME_FULL value. 117 class AddressComponent { 118 public: 119 // Constructor for a compound child node. 120 AddressComponent(ServerFieldType storage_type, 121 AddressComponent* parent, 122 std::vector<AddressComponent*> subcomponents, 123 unsigned int merge_mode); 124 125 // Disallows copies since they are not needed in the current Autofill design. 126 AddressComponent(const AddressComponent& other) = delete; 127 128 virtual ~AddressComponent(); 129 130 // Assignment operator that works recursively down the tree and assigns the 131 // |value_| and |verification_status_| of every node in right to the 132 // corresponding nodes in |this|. For an assignment it is required that both 133 // nodes have the same |storage_type_|. 134 AddressComponent& operator=(const AddressComponent& right); 135 136 // Comparison operator that works recursively down the tree. 137 bool operator==(const AddressComponent& right) const; 138 139 // Inequality operator that works recursively down the tree. 140 bool operator!=(const AddressComponent& right) const; 141 142 // Returns the autofill storage type stored in |storage_type_|. 143 ServerFieldType GetStorageType() const; 144 145 // Returns the string representation of |storage_type_|. 146 std::string GetStorageTypeName() const; 147 148 // Returns the value verification status of the component's value; 149 VerificationStatus GetVerificationStatus() const; 150 151 // Returns true if the component has no subcomponents. 152 bool IsAtomic() const; 153 154 // Returns a constant reference to |value_.value()|. If the value is not 155 // assigned, an empty string is returned. 156 const base::string16& GetValue() const; 157 158 // Returns true if the value of this AddressComponent is assigned. 159 bool IsValueAssigned() const; 160 161 // Sets the value corresponding to the storage type of this AddressComponent. 162 virtual void SetValue(base::string16 value, VerificationStatus status); 163 164 // Sets the value to an empty string, marks it unassigned and sets the 165 // verification status to |kNoStatus|. 166 virtual void UnsetValue(); 167 168 // The method sets the value of the current node if its |storage_type_| is 169 // |type| or if |ConvertAndGetTheValueForAdditionalFieldTypeName()| supports 170 // retrieving |type|. Otherwise, the call is delegated recursively to the 171 // node's children. 172 // Returns true if the |value_| and |verification_status_| were successfully 173 // set for this or an ancestor node with the storage type |type|. If 174 // |invalidate_child_nodes|, all child nodes of the assigned node are 175 // unassigned. If |invalidate_parent_nodes|, all ancestor nodes of the 176 // assigned node as unassigned. 177 bool SetValueForTypeIfPossible(const ServerFieldType& type, 178 const base::string16& value, 179 const VerificationStatus& verification_status, 180 bool invalidate_child_nodes = false, 181 bool invalidate_parent_nodes = false); 182 183 // Same as |SetValueForTypeIfPossible()| but the type is supplied in the 184 // corresponding string representation. 185 bool SetValueForTypeIfPossible(const std::string& type_name, 186 const base::string16& value, 187 const VerificationStatus& verification_status, 188 bool invalidate_child_nodes = false, 189 bool invalidate_parent_nodes = false); 190 191 // Convenience wrapper to allow setting the value using a std::string. 192 bool SetValueForTypeIfPossible(const ServerFieldType& type, 193 const std::string& value, 194 const VerificationStatus& verification_status, 195 bool invalidate_child_nodes = false, 196 bool invalidate_parent_nodes = false); 197 198 // Convenience wrapper to allow setting the value using a std::string. 199 bool SetValueForTypeIfPossible(const std::string& type_name, 200 const std::string& value, 201 const VerificationStatus& verification_status, 202 bool invalidate_child_nodes = false, 203 bool invalidate_parent_nodes = false); 204 205 // Convenience method to get the value of |type|. 206 // Returns an empty string if |type| is not supported. 207 base::string16 GetValueForType(const ServerFieldType& type) const; 208 209 // Convenience method to get the value of |type| identified by its string 210 // representation name. Returns an empty string if |type| is not supported. 211 base::string16 GetValueForType(const std::string& type) const; 212 213 // Convenience method to get the verification status of |type|. 214 // Returns |VerificationStatus::kNoStatus| if |type| is not supported. 215 VerificationStatus GetVerificationStatusForType( 216 const ServerFieldType& type) const; 217 218 // Convenience method to get the verification status of |type| identified by 219 // its name. Returns |VerificationStatus::kNoStatus| if |type| is not 220 // supported. 221 VerificationStatus GetVerificationStatusForType( 222 const std::string& type) const; 223 224 // Get the value and status of a |type|, 225 // Returns false if the |type| is not supported by the structure. 226 // The method returns |value_| and |validation_status_| of the current node if 227 // its |storage_type_| is |type| or if 228 // |ConvertAndSetTheValueForAdditionalFieldTypeName()| supports setting 229 // |type|. Otherwise, the call is delegated recursively to the node's 230 // children. Returns false if the neither the node or one of its ancestors 231 // supports |type|. 232 bool GetValueAndStatusForTypeIfPossible(const ServerFieldType& type, 233 base::string16* value, 234 VerificationStatus* status) const; 235 236 // Get the value and status of a |type| identified by its name. 237 // Returns false if the |type| is not supported by the structure. 238 bool GetValueAndStatusForTypeIfPossible(const std::string& type_name, 239 base::string16* value, 240 VerificationStatus* status) const; 241 242 // Returns true if the |value| and |verification_status| were successfully 243 // unset for |type|. 244 bool UnsetValueForTypeIfSupported(const ServerFieldType& type); 245 246 // Parses |value_| to assign values to the subcomponents. 247 // The method uses 3 stages: 248 // 249 // * Use |ParseValueAndAssignSubcomponentsByMethod()|. This stage exists 250 // to catch special cases and may fail. The method is virtual and can be 251 // implemented on the type level. 252 // 253 // * Use |ParseValueAndAssignSubcomponentsByRegularExpressions()|. This stage 254 // uses a list of regular expressions acquired by the virtual method 255 // |GetParseRegularExpressionsByRelevance()|. This stage my fail. 256 // 257 // * Use |ParseValueAndAssignSubcomponentsByFallbackMethod()| as the last 258 // resort to parse |value_|. This method must produce a valid result. 259 void ParseValueAndAssignSubcomponents(); 260 261 // This methods populated the unassigned entries in the subtree of this node 262 // by either parsing unknown values for subcomponents from their parents, or 263 // vice versa, formatting unknown values from known subcomponents. The method 264 // is virtual and can be reimplemented on the type level. 265 virtual void RecursivelyCompleteTree(); 266 267 // Completes the full tree by calling |RecursivelyCompleteTree()| starting 268 // form the root node. Returns true if the completion was successful. 269 virtual bool CompleteFullTree(); 270 271 // Checks if a tree is completable in the sense that there are no conflicting 272 // observed or verified types. This means that there is not more than one 273 // observed or verified node on any root-to-leaf path in the tree. 274 bool IsTreeCompletable(); 275 276 // Recursively adds the supported types to the set. Calls 277 // |GetAdditionalSupportedFieldTypes()| to add field types. 278 void GetSupportedTypes(ServerFieldTypeSet* supported_types) const; 279 280 // Adds the additional supported field types to |supported_types|. 281 // The method should DCHECK that the added types are not part of the set yet. GetAdditionalSupportedFieldTypes(ServerFieldTypeSet * supported_types)282 virtual void GetAdditionalSupportedFieldTypes( 283 ServerFieldTypeSet* supported_types) const {} 284 285 // Unassigns all nodes with parsed or formatted values. 286 void UnsetParsedAndFormattedValuesInEntireTree(); 287 288 // Unassigns all nodes with parsed or formatted values. 289 void RecursivelyUnsetParsedAndFormattedValues(); 290 291 // Returns true if both components are mergeable. 292 virtual bool IsMergeableWithComponent( 293 const AddressComponent& newer_component) const; 294 295 // Recursively updates the verification statuses to the higher one, for nodes 296 // in |newer_component| that have the same values as the nodes in |this|. 297 virtual void MergeVerificationStatuses( 298 const AddressComponent& newer_component); 299 300 // Merge |newer_component| into this AddressComponent. 301 // Returns false if the merging is not possible. 302 // The state of the component is not altered by a failed merging attempt. 303 // |newer_was_more_recently_used| indicates that the newer component was also 304 // more recently used for filling a form. 305 virtual bool MergeWithComponent(const AddressComponent& newer_component, 306 bool newer_was_more_recently_used = true); 307 308 // Merge |newer_component| into this AddressComponent. 309 // The merging is possible iff the value of both root nodes is token 310 // equivalent, meaning they contain the same tokens in an arbitrary order. 311 // Returns false if the merging is not possible. 312 // The state of the component is not altered by a failed merging attempt. 313 bool MergeTokenEquivalentComponent(const AddressComponent& newer_component); 314 315 // Returns a constant vector of pointers to the child nodes of the component. Subcomponents()316 const std::vector<AddressComponent*>& Subcomponents() const { 317 return subcomponents_; 318 } 319 320 // Returns a vector containing sorted normalized tokens of the 321 // value of the component. The tokens are lazily calculated when first needed. 322 const std::vector<AddressToken> GetSortedTokens() const; 323 324 // Recursively unsets all subcomponents. 325 void RecursivelyUnsetSubcomponents(); 326 327 // Return if the value associated with |field_type_name| is valid. 328 // If |wipe_if_not|, the value is unset if invalid. 329 bool IsValueForTypeValid(const std::string& field_type_name, 330 bool wipe_if_not = false); 331 332 // Convenience wrapper to work the ServerFieldTypes. 333 bool IsValueForTypeValid(ServerFieldType field_type, 334 bool wipe_if_not = false); 335 336 // Recursively determines the validity status of a component value associated 337 // with |field_type_name|. If |wipe_if_not|, the value is unset if invalid. 338 // Returns true if it is possible to determine the validity status of the 339 // value in this subcomponent. 340 bool GetIsValueForTypeValidIfPossible(const std::string& field_type_name, 341 bool* validity_status, 342 bool wipe_if_not = false); 343 344 // Deletes the stored structure if it contains strings that are not a 345 // substring of the unstructured representation. 346 // Return true if a wipe operation was performed. 347 virtual bool WipeInvalidStructure(); 348 349 #ifdef UNIT_TEST 350 // Initiates the formatting of the values from the subcomponents. FormatValueFromSubcomponentsForTesting()351 void FormatValueFromSubcomponentsForTesting() { 352 FormatValueFromSubcomponents(); 353 } 354 355 // Returns the best format string for testing. GetBestFormatStringForTesting()356 base::string16 GetBestFormatStringForTesting() { 357 return GetBestFormatString(); 358 } 359 360 // Returns the parse expressions by relevance for testing. 361 std::vector<const re2::RE2*> GetParseRegularExpressionsByRelevanceForTesting()362 GetParseRegularExpressionsByRelevanceForTesting() { 363 return GetParseRegularExpressionsByRelevance(); 364 } 365 366 // Returns a reference to the root node of the tree for testing. GetRootNodeForTesting()367 AddressComponent& GetRootNodeForTesting() { return GetRootNode(); } 368 369 // Replaces placeholder values in the best format string with the 370 // corresponding values. GetReplacedPlaceholderTypesWithValuesForTesting()371 base::string16 GetReplacedPlaceholderTypesWithValuesForTesting() const { 372 return ReplacePlaceholderTypesWithValues(GetBestFormatString()); 373 } 374 375 // Returns a vector containing the |storage_types_| of all direct 376 // subcomponents. GetSubcomponentTypesForTesting()377 std::vector<ServerFieldType> GetSubcomponentTypesForTesting() const { 378 return GetSubcomponentTypes(); 379 } 380 381 // Sets the merge mode for testing purposes. SetMergeModeForTesting(int merge_mode)382 void SetMergeModeForTesting(int merge_mode) { merge_mode_ = merge_mode; } 383 384 #endif 385 386 protected: 387 // Returns the verification score of this component and its substructure. 388 // Each observed node contributes to the validation score by 1. 389 virtual int GetStructureVerificationScore() const; 390 391 // Returns a vector containing the |storage_types_| of all direct 392 // subcomponents. 393 std::vector<ServerFieldType> GetSubcomponentTypes() const; 394 395 // Heuristic method to get the best suited format string. 396 // This method is virtual and can be reimplemented for each type. 397 virtual base::string16 GetBestFormatString() const; 398 399 // Returns pointers to regular expressions sorted by their relevance. 400 // This method is virtual and can be reimplemented for each type. 401 virtual std::vector<const re2::RE2*> GetParseRegularExpressionsByRelevance() 402 const; 403 404 // Method to parse |value_| into the values of |subcomponents_|. The 405 // purpose of this method is to cover special cases. This method returns true 406 // on success and is allowed to fail. On failure, the |subcomponents_| are not 407 // altered. 408 virtual bool ParseValueAndAssignSubcomponentsByMethod(); 409 410 // This method parses |value_| to assign values to the subcomponents. 411 // The method is virtual and can be reimplemented per type. 412 // It must succeed. 413 virtual void ParseValueAndAssignSubcomponentsByFallbackMethod(); 414 415 // This method is used to set the value given by a type different than the 416 // storage type. It must implement the conversion logic specific to each type. 417 // It returns true if conversion logic exists and the type can be set. 418 virtual bool ConvertAndSetValueForAdditionalFieldTypeName( 419 const std::string& field_type_name, 420 const base::string16& value, 421 const VerificationStatus& status); 422 423 // This method is used to retrieve the value for a supported field type 424 // different from the storage type. It must implement the conversion logic 425 // specific to each type. It returns true if the type is supported and the 426 // value can be written back to value. 427 // The method must handle |nullptr|s for both the value and status. 428 virtual bool ConvertAndGetTheValueForAdditionalFieldTypeName( 429 const std::string& field_type_name, 430 base::string16* value) const; 431 432 // Clears all parsed and formatted values. 433 void ClearAllParsedAndFormattedValues(); 434 435 // Merge a component that has exactly one token less. 436 bool MergeSubsetComponent( 437 const AddressComponent& subset_component, 438 const SortedTokenComparisonResult& token_comparison_result); 439 440 // Consumes an additional token into the most appropriate subcomponent. 441 // Can be implemented by the specific node types. 442 // The fall-back solution uses the first empty node. 443 // If no empty node is available, it appends the value to the first node. 444 virtual void ConsumeAdditionalToken(const base::string16& token_value); 445 446 // Returns a reference to the root node of the tree. 447 AddressComponent& GetRootNode(); 448 449 // Returns a reference to the root node of the tree. 450 const AddressComponent& GetRootNode() const; 451 452 // Function to determine if the value stored in this component is valid. 453 // Return true be default but can be overloaded by a subclass. 454 virtual bool IsValueValid() const; 455 456 // Function to be called post assign to do sanitization. PostAssignSanitization()457 virtual void PostAssignSanitization() {} 458 459 // Returns a normalized value for comparison. 460 // In the default implementation, this converts the value to lower case and 461 // removes white spaces. This function may be reimplemented to perform 462 // different normalization operations. 463 virtual base::string16 NormalizedValue() const; 464 465 // Returns a value used for comparison. 466 // In the default implementation this is just the normalized value but this 467 // function can be overridden in subclasses to apply further operations on 468 // the normalized value. 469 virtual base::string16 ValueForComparison() const; 470 471 // Returns true if the merging of two token identical values should give 472 // precedence to the newer value. By default, the newer component gets 473 // precedence if it has the same or better verification status. 474 virtual bool HasNewerValuePrecendenceInMerging( 475 const AddressComponent& newer_component) const; 476 477 // Parses |value| by using |parse_expressions| and assigns the values. 478 // Returns true on success. 479 bool ParseValueAndAssignSubcomponentsByRegularExpression( 480 const base::string16& value, 481 const re2::RE2* parse_expression); 482 483 private: 484 // Unsets the node and all of its children. 485 void UnsetAddressComponentAndItsSubcomponents(); 486 487 // Unsets the children of a node. 488 void UnsetSubcomponents(); 489 490 // Determines the |value_| from the values of the subcomponents by using the 491 // most suitable format string determined by |GetBestFormatString()|. 492 void FormatValueFromSubcomponents(); 493 494 // Replaces placeholder values with the corresponding values. 495 base::string16 ReplacePlaceholderTypesWithValues( 496 const base::string16& format) const; 497 498 // Replaces placeholder values with the corresponding values. 499 base::string16 ReplacePlaceholderTypesWithValuesRegexVersion( 500 const base::string16& format) const; 501 502 // This method uses regular expressions acquired by 503 // |GetParseRegularExpressionsByRelevance| to parse |value_| into the values 504 // of the subcomponents. Returns true on success and is allowed to fail. 505 bool ParseValueAndAssignSubcomponentsByRegularExpressions(); 506 507 // Returns the maximum number of components with assigned values on the path 508 // from the component to a leaf node. 509 int MaximumNumberOfAssignedAddressComponentsOnNodeToLeafPaths() const; 510 511 // The unstructured value of this component. 512 base::Optional<base::string16> value_; 513 514 // The verification status of |value_| indicates the certainty of the value 515 // to be correct. 516 VerificationStatus value_verification_status_; 517 518 // The storable Autofill type of the component. 519 const ServerFieldType storage_type_; 520 521 // A vector of pointers to the subcomponents. 522 std::vector<AddressComponent*> subcomponents_; 523 524 // A vector that contains the tokens of |value_| after normalization, 525 // meaning that it was converted to lower case and diacritics have been 526 // removed. |value_| is tokenized by splitting the string by white spaces and 527 // commas. It is calculated when |value_| is set. 528 base::Optional<std::vector<AddressToken>> sorted_normalized_tokens_; 529 530 // A pointer to the parent node. It is set to nullptr if the node is the root 531 // node of the AddressComponent tree. 532 AddressComponent* const parent_; 533 534 // Defines if and how two components can be merged. 535 int merge_mode_; 536 }; 537 538 } // namespace structured_address 539 540 } // namespace autofill 541 #endif // COMPONENTS_AUTOFILL_CORE_BROWSER_DATA_MODEL_AUTOFILL_STRUCTURED_ADDRESS_COMPONENT_H_ 542