/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

// NOTE: `var` is kept deliberately for the two top-level declarations below;
// a module loader may read them as globals, which `let`/`const` would hide.
var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];

// FormAutofillNameUtils is initially translated from
// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
//
// It splits a full name into { given, middle, family } parts and joins such
// parts back into a full name, with special handling for CJK names.
var FormAutofillNameUtils = {
  // Honorifics/titles stripped from the front of a name. Compared lower-cased
  // and without a trailing period (see _containsString).
  NAME_PREFIXES: [
    "1lt",
    "1st",
    "2lt",
    "2nd",
    "3rd",
    "admiral",
    "capt",
    "captain",
    "col",
    "cpt",
    "dr",
    "gen",
    "general",
    "lcdr",
    "lt",
    "ltc",
    "ltg",
    "ltjg",
    "maj",
    "major",
    "mg",
    "mr",
    "mrs",
    "ms",
    "pastor",
    "prof",
    "rep",
    "reverend",
    "rev",
    "sen",
    "st",
  ],

  // Generational/academic suffixes stripped from the end of a name. Compared
  // lower-cased and without a trailing period (see _containsString).
  NAME_SUFFIXES: [
    "b.a",
    "ba",
    "d.d.s",
    "dds",
    "i",
    "ii",
    "iii",
    "iv",
    "ix",
    "jr",
    "m.a",
    "m.d",
    "ma",
    "md",
    "ms",
    "ph.d",
    "phd",
    "sr",
    "v",
    "vi",
    "vii",
    "viii",
    "x",
  ],

  // Particles that belong to the family name when they directly precede it
  // (e.g. "van der Berg").
  FAMILY_NAME_PREFIXES: [
    "d'",
    "de",
    "del",
    "der",
    "di",
    "la",
    "le",
    "mc",
    "san",
    "st",
    "ter",
    "van",
    "von",
  ],

  // The common and non-ambiguous CJK surnames (last names) that have more than
  // one character.
  COMMON_CJK_MULTI_CHAR_SURNAMES: [
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁",
    "사공",
    "서문",
    "선우",
    "제갈",
    "황보",
    "독고",
    "망절",

    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳",
    "令狐",
    "皇甫",
    "上官",
    "司徒",
    "诸葛",
    "司马",
    "宇文",
    "呼延",
    "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡",
    "歐陽",
    "諸葛",
    "申屠",
    "尉遲",
    "司馬",
    "軒轅",
    "夏侯",
  ],

  // All Korean surnames that have more than one character, even the
  // rare/ambiguous ones.
  KOREAN_MULTI_CHAR_SURNAMES: [
    "강전",
    "남궁",
    "독고",
    "동방",
    "망절",
    "사공",
    "서문",
    "선우",
    "소봉",
    "어금",
    "장곡",
    "제갈",
    "황목",
    "황보",
  ],

  // The whitespace definition based on
  // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
  WHITESPACE: [
    "\u0009", // CHARACTER TABULATION
    "\u000A", // LINE FEED (LF)
    "\u000B", // LINE TABULATION
    "\u000C", // FORM FEED (FF)
    "\u000D", // CARRIAGE RETURN (CR)
    "\u0020", // SPACE
    "\u0085", // NEXT LINE (NEL)
    "\u00A0", // NO-BREAK SPACE
    "\u1680", // OGHAM SPACE MARK
    "\u2000", // EN QUAD
    "\u2001", // EM QUAD
    "\u2002", // EN SPACE
    "\u2003", // EM SPACE
    "\u2004", // THREE-PER-EM SPACE
    "\u2005", // FOUR-PER-EM SPACE
    "\u2006", // SIX-PER-EM SPACE
    "\u2007", // FIGURE SPACE
    "\u2008", // PUNCTUATION SPACE
    "\u2009", // THIN SPACE
    "\u200A", // HAIR SPACE
    "\u2028", // LINE SEPARATOR
    "\u2029", // PARAGRAPH SEPARATOR
    "\u202F", // NARROW NO-BREAK SPACE
    "\u205F", // MEDIUM MATHEMATICAL SPACE
    "\u3000", // IDEOGRAPHIC SPACE
  ],

  // The middle dot is used as a separator for foreign names in Japanese.
  MIDDLE_DOT: [
    "\u30FB", // KATAKANA MIDDLE DOT
    "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
  ],

  // The Unicode range is based on Wiki:
  // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
  // https://en.wikipedia.org/wiki/Hangul
  // https://en.wikipedia.org/wiki/Japanese_writing_system
  //
  // Each entry is a character-class fragment; init() joins them into reCJK.
  CJK_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3040-\u309F", // Hiragana
    "\u30A0-\u30FF", // Katakana
    "\u3105-\u312C", // Bopomofo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\u31F0-\u31FF", // Katakana Phonetic Extensions
    "\u3200-\u32FF", // Enclosed CJK Letters and Months
    "\u3400-\u4DBF", // CJK unified ideographs Extension A
    "\u4E00-\u9FFF", // CJK Unified Ideographs
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
    "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
  ],

  // Character-class fragments for Hangul; init() joins them (together with
  // WHITESPACE) into reHangulName.
  HANGUL_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
  ],

  // Set to true by init() once the regular expressions have been compiled.
  _dataLoaded: false,

  /**
   * Returns true if |set| contains |token|, modulo a final period and
   * letter case.
   *
   * @param {string[]} set Lower-case candidates to match against.
   * @param {string} token The token to look up.
   * @returns {boolean}
   */
  _containsString(set, token) {
    const target = token.replace(/\.$/, "").toLowerCase();
    return set.includes(target);
  },

  /**
   * Removes common name prefixes from the front of |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} A new array starting at the first token that is not
   *          a known prefix; empty if every token is a prefix.
   */
  _stripPrefixes(nameTokens) {
    const firstKept = nameTokens.findIndex(
      token => !this._containsString(this.NAME_PREFIXES, token)
    );
    return firstKept === -1 ? [] : nameTokens.slice(firstKept);
  },

  /**
   * Removes common name suffixes from the end of |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} A new array ending at the last token that is not a
   *          known suffix; empty if every token is a suffix.
   */
  _stripSuffixes(nameTokens) {
    for (let i = nameTokens.length - 1; i >= 0; i--) {
      if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
        return nameTokens.slice(0, i + 1);
      }
    }
    return [];
  },

  /**
   * Tells whether |name| looks like a CJK name.
   *
   * The name is considered to be a CJK name if it is only CJK characters,
   * spaces, and "middle dot" separators, with at least one CJK character, and
   * no more than 2 words.
   *
   * Chinese and Japanese names are usually spelled out using the Han
   * characters (logographs), which constitute the "CJK Unified Ideographs"
   * block in Unicode, also referred to as Unihan. Korean names are usually
   * spelled out in the Korean alphabet (Hangul), although they do have a Han
   * equivalent as well.
   *
   * Relies on this.reCJK, which is created by init().
   *
   * @param {string} name
   * @returns {boolean}
   */
  _isCJKName(name) {
    if (!name) {
      return false;
    }

    let previousWasCJK = false;
    let wordCount = 0;

    // for...of walks the string by code point.
    for (const c of name) {
      const isMiddleDot = this.MIDDLE_DOT.includes(c);
      const isCJK = !isMiddleDot && this.reCJK.test(c);
      if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
        return false;
      }
      // A new word starts at every transition from non-CJK to CJK.
      if (isCJK && !previousWasCJK) {
        wordCount++;
      }
      previousWasCJK = isCJK;
    }

    return wordCount > 0 && wordCount < 3;
  },

  /**
   * Tries to split a Chinese, Japanese, or Korean name into its given name &
   * surname parts. If splitting did not work for whatever reason, returns
   * null.
   *
   * The convention for CJK languages is to put the surname (last name) first,
   * and the given name (first name) second. In a continuous text, there is
   * normally no space between the two parts of the name. When entering their
   * name into a field, though, some people add a space to disambiguate. CJK
   * names (almost) never have a middle name.
   *
   * Relies on this.reHangulName, which is created by init().
   *
   * @param {string[]} nameTokens The tokens of a name already known to be CJK.
   * @returns {?{given: string, middle: string, family: string}}
   *          The parts, or null unless there are exactly 1 or 2 tokens.
   */
  _splitCJKName(nameTokens) {
    const nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (nameTokens.length === 1) {
      // There is no space between the surname and given name. Try to infer
      // where to separate between the two. Most Chinese and Korean surnames
      // have only one character, but there are a few that have 2. If the name
      // does not start with a surname from a known list, default to one
      // character.
      const name = nameTokens[0];
      const isKorean = this.reHangulName.test(name);

      // 4-character Korean names are more likely to be 2/2 than 1/3, so use
      // the full list of Korean 2-char surnames. (instead of only the common
      // ones)
      const multiCharSurnames =
        isKorean && name.length > 3
          ? this.KOREAN_MULTI_CHAR_SURNAMES
          : this.COMMON_CJK_MULTI_CHAR_SURNAMES;

      // Default to 1 character if the surname is not in the list.
      const surnameLength = multiCharSurnames.some(surname =>
        name.startsWith(surname)
      )
        ? 2
        : 1;

      nameParts.family = name.slice(0, surnameLength);
      nameParts.given = name.slice(surnameLength);
      return nameParts;
    }

    if (nameTokens.length === 2) {
      // The user entered a space between the two name parts. This makes our
      // job easier. Family name first, given name second.
      nameParts.family = nameTokens[0];
      nameParts.given = nameTokens[1];
      return nameParts;
    }

    return null;
  },

  /**
   * Compiles the regular expressions used by the name helpers. Safe to call
   * more than once; only the first call does any work.
   */
  init() {
    if (this._dataLoaded) {
      return;
    }
    this._dataLoaded = true;

    // Matches a single CJK character.
    this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
    // Matches a string made only of Hangul characters and whitespace.
    // Compiled once here instead of on every _splitCJKName() call.
    this.reHangulName = new RegExp(
      "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
      "u"
    );
  },

  /**
   * Splits a full name into given, middle and family parts.
   *
   * @param {string} name The full name as entered by the user.
   * @returns {{given: string, middle: string, family: string}}
   *          Parts that could not be determined are empty strings.
   */
  splitName(name) {
    const nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (!name) {
      return nameParts;
    }

    // Split on space, comma, ideographic space and the two middle dots.
    // String.prototype.split() yields empty strings when the input starts or
    // ends with a separator that trim() does not remove (comma, middle dot);
    // those must be dropped or they get mistaken for real name parts
    // (e.g. "John Smith," would end up with an empty family name).
    const rawTokens = name
      .trim()
      .split(/[ ,\u3000\u30FB\u00B7]+/)
      .filter(token => token.length);

    if (!rawTokens.length) {
      // Nothing but separators/whitespace: there is no name to report.
      return nameParts;
    }

    let nameTokens = this._stripPrefixes(rawTokens);

    if (this._isCJKName(name)) {
      const parts = this._splitCJKName(nameTokens);
      if (parts) {
        return parts;
      }
    }

    // Don't assume "Ma" is a suffix in John Ma.
    if (nameTokens.length > 2) {
      nameTokens = this._stripSuffixes(nameTokens);
    }

    if (!nameTokens.length) {
      // Bad things have happened; just assume the whole thing is a given name.
      nameParts.given = name;
      return nameParts;
    }

    // Only one token, assume given name.
    if (nameTokens.length === 1) {
      nameParts.given = nameTokens[0];
      return nameParts;
    }

    // 2 or more tokens. Grab the family, which is the last word plus any
    // recognizable family prefixes.
    const familyTokens = [nameTokens.pop()];
    while (nameTokens.length) {
      const lastToken = nameTokens[nameTokens.length - 1];
      if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
        break;
      }
      familyTokens.unshift(lastToken);
      nameTokens.pop();
    }
    nameParts.family = familyTokens.join(" ");

    // Take the last remaining token as the middle name (if there are at least 2
    // tokens).
    if (nameTokens.length >= 2) {
      nameParts.middle = nameTokens.pop();
    }

    // Remainder is given name.
    nameParts.given = nameTokens.join(" ");

    return nameParts;
  },

  /**
   * Joins name parts back into a full name.
   *
   * When both |given| and |family| are CJK names and there is no middle name,
   * the result is family followed by given with no separator; otherwise the
   * non-empty parts are joined with single spaces in given/middle/family
   * order.
   *
   * @param {object} parts
   * @param {string} parts.given
   * @param {string} parts.middle
   * @param {string} parts.family
   * @returns {string}
   */
  joinNameParts({ given, middle, family }) {
    if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
      return family + given;
    }
    return [given, middle, family]
      .filter(part => part && part.length)
      .join(" ");
  },
};

FormAutofillNameUtils.init();