1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_SPECIAL_CASE_H_
6 #define V8_REGEXP_SPECIAL_CASE_H_
7 
8 #ifdef V8_INTL_SUPPORT
9 #include "new-regexp/regexp-shim.h"
10 
11 #include "unicode/uchar.h"
12 #include "unicode/uniset.h"
13 #include "unicode/unistr.h"
14 
15 namespace v8 {
16 namespace internal {
17 
18 // Sets of Unicode characters that need special handling under "i" mode
19 
20 // For non-unicode ignoreCase matches (aka "i", not "iu"), ECMA 262
21 // defines slightly different case-folding rules than Unicode. An
22 // input character should match a pattern character if the result of
23 // the Canonicalize algorithm is the same for both characters.
24 //
25 // Roughly speaking, for "i" regexps, Canonicalize(c) is the same as
26 // c.toUpperCase(), unless a) c.toUpperCase() is a multi-character
27 // string, or b) c is non-ASCII, and c.toUpperCase() is ASCII. See
28 // https://tc39.es/ecma262/#sec-runtime-semantics-canonicalize-ch for
29 // the precise definition.
30 //
31 // While compiling such regular expressions, we need to compute the
32 // set of characters that should match a given input character. (See
33 // GetCaseIndependentLetters and CharacterRange::AddCaseEquivalents.)
34 // For almost all characters, this can be efficiently computed using
35 // UnicodeSet::closeOver(USET_CASE_INSENSITIVE). These sets represent
36 // the remaining special cases.
37 //
38 // For a character c, the rules are as follows:
39 //
40 // 1. If c is in neither IgnoreSet nor SpecialAddSet, then calling
41 //    UnicodeSet::closeOver(USET_CASE_INSENSITIVE) on a UnicodeSet
42 //    containing c will produce the set of characters that should
43 //    match /c/i (or /[c]/i), and only those characters.
44 //
45 // 2. If c is in IgnoreSet, then the only character it should match is
46 //    itself. However, closeOver will add additional incorrect
47 //    matches. For example, consider SHARP S: 'ß' (U+00DF) and 'ẞ'
48 //    (U+1E9E). Although closeOver('ß') = "ßẞ", uppercase('ß') is
49 //    "SS".  Step 3.e therefore requires that 'ß' canonicalizes to
50 //    itself, and should not match 'ẞ'. In these cases, we can skip
51 //    the closeOver entirely, because it will never add an equivalent
52 //    character.
53 //
54 // 3. If c is in SpecialAddSet, then it should match at least one
55 //    character other than itself. However, closeOver will add at
56 //    least one additional incorrect match. For example, consider the
57 //    letter 'k'. Closing over 'k' gives "kKK" (lowercase k, uppercase
58 //    K, U+212A KELVIN SIGN). However, because of step 3.g, KELVIN
59 //    SIGN should not match either of the other two characters. As a
60 //    result, "k" and "K" are in SpecialAddSet (and KELVIN SIGN is in
61 //    IgnoreSet). To find the correct matches for characters in
62 //    SpecialAddSet, we closeOver the original character, but filter
63 //    out the results that do not have the same canonical value.
64 //
65 // The contents of these sets are calculated at build time by
66 // src/regexp/gen-regexp-special-case.cc, which generates
67 // gen/src/regexp/special-case.cc. This is done by iterating over the
68 // result of closeOver for each BMP character, and finding sets for
69 // which at least one character has a different canonical value than
70 // another character. Characters that match no other characters in
71 // their equivalence class are added to IgnoreSet. Characters that
72 // match at least one other character are added to SpecialAddSet.
73 
74 class RegExpCaseFolding final : public AllStatic {
75  public:
76   static const icu::UnicodeSet& IgnoreSet();
77   static const icu::UnicodeSet& SpecialAddSet();
78 
79   // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
80   // Canonicalize) step 3, which is used to determine whether
81   // characters match when ignoreCase is true and unicode is false.
Canonicalize(UChar32 ch)82   static UChar32 Canonicalize(UChar32 ch) {
83     // a. Assert: ch is a UTF-16 code unit.
84     CHECK_LE(ch, 0xffff);
85 
86     // b. Let s be the String value consisting of the single code unit ch.
87     icu::UnicodeString s(ch);
88 
89     // c. Let u be the same result produced as if by performing the algorithm
90     // for String.prototype.toUpperCase using s as the this value.
91     // d. Assert: Type(u) is String.
92     icu::UnicodeString& u = s.toUpper();
93 
94     // e. If u does not consist of a single code unit, return ch.
95     if (u.length() != 1) {
96       return ch;
97     }
98 
99     // f. Let cu be u's single code unit element.
100     UChar32 cu = u.char32At(0);
101 
102     // g. If the value of ch >= 128 and the value of cu < 128, return ch.
103     if (ch >= 128 && cu < 128) {
104       return ch;
105     }
106 
107     // h. Return cu.
108     return cu;
109   }
110 };
111 
112 }  // namespace internal
113 }  // namespace v8
114 
115 #endif  // V8_INTL_SUPPORT
116 
117 #endif  // V8_REGEXP_SPECIAL_CASE_H_
118