1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5"use strict";
6
// Names this file exposes to its importers; the only entry is the
// FormAutofillNameUtils object defined below.
// NOTE(review): presumably read by the JSM loader, which would require this
// to remain a `var` on the module global — confirm before changing.
var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];
8
9// FormAutofillNameUtils is initially translated from
10// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
var FormAutofillNameUtils = {
  // Titles and honorifics that _stripPrefixes() drops from the front of a
  // name. Entries are lowercase and have no trailing period because
  // _containsString() normalizes tokens that way before comparing.
  NAME_PREFIXES: [
    "1lt",
    "1st",
    "2lt",
    "2nd",
    "3rd",
    "admiral",
    "capt",
    "captain",
    "col",
    "cpt",
    "dr",
    "gen",
    "general",
    "lcdr",
    "lt",
    "ltc",
    "ltg",
    "ltjg",
    "maj",
    "major",
    "mg",
    "mr",
    "mrs",
    "ms",
    "pastor",
    "prof",
    "rep",
    "reverend",
    "rev",
    "sen",
    "st",
  ],

  // Generational and academic suffixes that _stripSuffixes() drops from the
  // end of a name. Same normalization rules as NAME_PREFIXES: lowercase, and
  // only a *final* period is ignored (inner periods such as "ph.d" matter).
  NAME_SUFFIXES: [
    "b.a",
    "ba",
    "d.d.s",
    "dds",
    "i",
    "ii",
    "iii",
    "iv",
    "ix",
    "jr",
    "m.a",
    "m.d",
    "ma",
    "md",
    "ms",
    "ph.d",
    "phd",
    "sr",
    "v",
    "vi",
    "vii",
    "viii",
    "x",
  ],

  // Particles that belong to the family name when they directly precede the
  // last token (e.g. "de la Vega", "van Gogh"). Used by splitName().
  FAMILY_NAME_PREFIXES: [
    "d'",
    "de",
    "del",
    "der",
    "di",
    "la",
    "le",
    "mc",
    "san",
    "st",
    "ter",
    "van",
    "von",
  ],

  // The common and non-ambiguous CJK surnames (last names) that have more than
  // one character.
  COMMON_CJK_MULTI_CHAR_SURNAMES: [
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁",
    "사공",
    "서문",
    "선우",
    "제갈",
    "황보",
    "독고",
    "망절",

    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳",
    "令狐",
    "皇甫",
    "上官",
    "司徒",
    "诸葛",
    "司马",
    "宇文",
    "呼延",
    "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡",
    "歐陽",
    "諸葛",
    "申屠",
    "尉遲",
    "司馬",
    "軒轅",
    "夏侯",
  ],

  // All Korean surnames that have more than one character, even the
  // rare/ambiguous ones.
  KOREAN_MULTI_CHAR_SURNAMES: [
    "강전",
    "남궁",
    "독고",
    "동방",
    "망절",
    "사공",
    "서문",
    "선우",
    "소봉",
    "어금",
    "장곡",
    "제갈",
    "황목",
    "황보",
  ],

  // The whitespace definition based on
  // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
  WHITESPACE: [
    "\u0009", // CHARACTER TABULATION
    "\u000A", // LINE FEED (LF)
    "\u000B", // LINE TABULATION
    "\u000C", // FORM FEED (FF)
    "\u000D", // CARRIAGE RETURN (CR)
    "\u0020", // SPACE
    "\u0085", // NEXT LINE (NEL)
    "\u00A0", // NO-BREAK SPACE
    "\u1680", // OGHAM SPACE MARK
    "\u2000", // EN QUAD
    "\u2001", // EM QUAD
    "\u2002", // EN SPACE
    "\u2003", // EM SPACE
    "\u2004", // THREE-PER-EM SPACE
    "\u2005", // FOUR-PER-EM SPACE
    "\u2006", // SIX-PER-EM SPACE
    "\u2007", // FIGURE SPACE
    "\u2008", // PUNCTUATION SPACE
    "\u2009", // THIN SPACE
    "\u200A", // HAIR SPACE
    "\u2028", // LINE SEPARATOR
    "\u2029", // PARAGRAPH SEPARATOR
    "\u202F", // NARROW NO-BREAK SPACE
    "\u205F", // MEDIUM MATHEMATICAL SPACE
    "\u3000", // IDEOGRAPHIC SPACE
  ],

  // The middle dot is used as a separator for foreign names in Japanese.
  MIDDLE_DOT: [
    "\u30FB", // KATAKANA MIDDLE DOT
    "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
  ],

  // Character-class fragments (joined into a RegExp by init()).
  // The Unicode range is based on Wiki:
  // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
  // https://en.wikipedia.org/wiki/Hangul
  // https://en.wikipedia.org/wiki/Japanese_writing_system
  CJK_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3040-\u309F", // Hiragana
    "\u30A0-\u30FF", // Katakana
    "\u3105-\u312C", // Bopomofo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\u31F0-\u31FF", // Katakana Phonetic Extensions
    "\u3200-\u32FF", // Enclosed CJK Letters and Months
    "\u3400-\u4DBF", // CJK unified ideographs Extension A
    "\u4E00-\u9FFF", // CJK Unified Ideographs
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
    "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
  ],

  // Character-class fragments covering Hangul only; used to decide whether a
  // name is Korean in _splitCJKName().
  HANGUL_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
  ],

  // Set by init() so the regular expressions are only compiled once.
  _dataLoaded: false,

  /**
   * Returns true if |set| contains |token|, modulo a final period and
   * letter case.
   *
   * @param {string[]} set Lowercase entries without a trailing period.
   * @param {string} token The token to look up.
   * @returns {boolean}
   */
  _containsString(set, token) {
    let target = token.replace(/\.$/, "").toLowerCase();
    return set.includes(target);
  },

  /**
   * Removes common name prefixes from the front of |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} A new array starting at the first token that is not a
   *          known prefix; empty if every token is a prefix.
   */
  _stripPrefixes(nameTokens) {
    // findIndex() yields a numeric index and only visits array elements,
    // unlike for...in which hands out string keys.
    let firstKept = nameTokens.findIndex(
      token => !this._containsString(this.NAME_PREFIXES, token)
    );
    return firstKept === -1 ? [] : nameTokens.slice(firstKept);
  },

  /**
   * Removes common name suffixes from the end of |nameTokens|.
   *
   * @param {string[]} nameTokens
   * @returns {string[]} A new array ending at the last token that is not a
   *          known suffix; empty if every token is a suffix.
   */
  _stripSuffixes(nameTokens) {
    for (let i = nameTokens.length - 1; i >= 0; i--) {
      if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
        return nameTokens.slice(0, i + 1);
      }
    }
    return [];
  },

  /**
   * Decides whether |name| should be treated as a CJK name.
   *
   * The name is considered to be a CJK name if it is only CJK characters,
   * spaces, and "middle dot" separators, with at least one CJK character, and
   * no more than 2 words.
   *
   * Chinese and Japanese names are usually spelled out using the Han
   * characters (logographs), which constitute the "CJK Unified Ideographs"
   * block in Unicode, also referred to as Unihan. Korean names are usually
   * spelled out in the Korean alphabet (Hangul), although they do have a Han
   * equivalent as well.
   *
   * Requires init() to have run, since it reads this.reCJK.
   *
   * @param {string} name
   * @returns {boolean}
   */
  _isCJKName(name) {
    if (!name) {
      return false;
    }

    let previousWasCJK = false;
    let wordCount = 0;

    // for...of walks code points, so each |c| is one full character.
    for (let c of name) {
      let isMiddleDot = this.MIDDLE_DOT.includes(c);
      // U+30FB lies inside the Katakana range of CJK_RANGE, so the middle dot
      // has to be ruled out first or it would count as a CJK letter.
      let isCJK = !isMiddleDot && this.reCJK.test(c);
      if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
        return false;
      }
      // A word starts wherever a CJK character follows a non-CJK one.
      if (isCJK && !previousWasCJK) {
        wordCount++;
      }
      previousWasCJK = isCJK;
    }

    return wordCount > 0 && wordCount < 3;
  },

  /**
   * Tries to split a Chinese, Japanese, or Korean name into its given name &
   * surname parts.
   *
   * The convention for CJK languages is to put the surname (last name) first,
   * and the given name (first name) second. In a continuous text, there is
   * normally no space between the two parts of the name. When entering their
   * name into a field, though, some people add a space to disambiguate. CJK
   * names (almost) never have a middle name.
   *
   * Requires init() to have run, since it reads this.reHangulName.
   *
   * @param {string[]} nameTokens The name, already split on separators.
   * @returns {?{given: string, middle: string, family: string}} The parts, or
   *          null when there are neither one nor two tokens.
   */
  _splitCJKName(nameTokens) {
    let nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (nameTokens.length === 1) {
      // There is no space between the surname and given name. Try to infer
      // where to separate between the two. Most Chinese and Korean surnames
      // have only one character, but there are a few that have 2. If the name
      // does not start with a surname from a known list, default to one
      // character.
      let name = nameTokens[0];
      let isKorean = this.reHangulName.test(name);

      // 4-character Korean names are more likely to be 2/2 than 1/3, so use
      // the full list of Korean 2-char surnames. (instead of only the common
      // ones)
      let multiCharSurnames =
        isKorean && name.length > 3
          ? this.KOREAN_MULTI_CHAR_SURNAMES
          : this.COMMON_CJK_MULTI_CHAR_SURNAMES;

      // Default to 1 character if the surname is not in the list.
      let surnameLength = multiCharSurnames.some(surname =>
        name.startsWith(surname)
      )
        ? 2
        : 1;

      nameParts.family = name.slice(0, surnameLength);
      nameParts.given = name.slice(surnameLength);
    } else if (nameTokens.length === 2) {
      // The user entered a space between the two name parts. This makes our job
      // easier. Family name first, given name second.
      nameParts.family = nameTokens[0];
      nameParts.given = nameTokens[1];
    } else {
      return null;
    }

    return nameParts;
  },

  /**
   * Compiles the regular expressions used by the CJK helpers. Safe to call
   * repeatedly; only the first call does any work.
   */
  init() {
    if (this._dataLoaded) {
      return;
    }
    this._dataLoaded = true;

    // Matches a single CJK character.
    this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
    // Matches a string made only of Hangul and whitespace. Compiled here once
    // rather than on every _splitCJKName() call.
    this.reHangulName = new RegExp(
      "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
      "u"
    );
  },

  /**
   * Splits a full name into given, middle and family parts.
   *
   * @param {string} name The full name as entered by the user.
   * @returns {{given: string, middle: string, family: string}} Parts that
   *          could not be determined are empty strings.
   */
  splitName(name) {
    let nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (!name) {
      return nameParts;
    }

    // split() emits an empty string when the trimmed name begins or ends with
    // a separator (e.g. the comma in "John Smith,"). Such empty tokens must be
    // dropped, otherwise they get picked as the family or given name.
    let nameTokens = name
      .trim()
      .split(/[ ,\u3000\u30FB\u00B7]+/)
      .filter(token => token !== "");
    if (!nameTokens.length) {
      // Nothing but whitespace and separators: there is no name to report.
      return nameParts;
    }
    nameTokens = this._stripPrefixes(nameTokens);

    if (this._isCJKName(name)) {
      let parts = this._splitCJKName(nameTokens);
      if (parts) {
        return parts;
      }
    }

    // Don't assume "Ma" is a suffix in John Ma.
    if (nameTokens.length > 2) {
      nameTokens = this._stripSuffixes(nameTokens);
    }

    if (!nameTokens.length) {
      // Bad things have happened; just assume the whole thing is a given name.
      nameParts.given = name;
      return nameParts;
    }

    // Only one token, assume given name.
    if (nameTokens.length === 1) {
      nameParts.given = nameTokens[0];
      return nameParts;
    }

    // 2 or more tokens. Grab the family, which is the last word plus any
    // recognizable family prefixes.
    let familyTokens = [nameTokens.pop()];
    while (nameTokens.length) {
      let lastToken = nameTokens[nameTokens.length - 1];
      if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
        break;
      }
      familyTokens.unshift(lastToken);
      nameTokens.pop();
    }
    nameParts.family = familyTokens.join(" ");

    // Take the last remaining token as the middle name (if there are at least 2
    // tokens).
    if (nameTokens.length >= 2) {
      nameParts.middle = nameTokens.pop();
    }

    // Remainder is given name.
    nameParts.given = nameTokens.join(" ");

    return nameParts;
  },

  /**
   * Builds a full name from its parts.
   *
   * When both the given and family names are CJK and there is no middle name,
   * the result is family followed by given with no separator. Otherwise the
   * non-empty parts are joined with single spaces in given/middle/family
   * order.
   *
   * @param {{given: string, middle: string, family: string}} parts
   * @returns {string}
   */
  joinNameParts({ given, middle, family }) {
    if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
      return family + given;
    }
    return [given, middle, family]
      .filter(part => part && part.length)
      .join(" ");
  },
};
409
// Run the one-time setup when this file is loaded so that the reCJK regular
// expression exists before any caller uses the name helpers. init() guards
// itself with _dataLoaded, so later calls are no-ops.
FormAutofillNameUtils.init();
411