1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
9
10 #include "irregexp/imported/special-case.h"
11
12 namespace v8 {
13 namespace internal {
14
15 static const uc32 kSurrogateStart = 0xd800;
16 static const uc32 kSurrogateEnd = 0xdfff;
17 static const uc32 kNonBmpStart = 0x10000;
18
19 // The following code generates "src/regexp/special-case.cc".
PrintSet(std::ofstream & out,const char * name,const icu::UnicodeSet & set)20 void PrintSet(std::ofstream& out, const char* name,
21 const icu::UnicodeSet& set) {
22 out << "icu::UnicodeSet Build" << name << "() {\n"
23 << " icu::UnicodeSet set;\n";
24 for (int32_t i = 0; i < set.getRangeCount(); i++) {
25 if (set.getRangeStart(i) == set.getRangeEnd(i)) {
26 out << " set.add(0x" << set.getRangeStart(i) << ");\n";
27 } else {
28 out << " set.add(0x" << set.getRangeStart(i) << ", 0x"
29 << set.getRangeEnd(i) << ");\n";
30 }
31 }
32 out << " set.freeze();\n"
33 << " return set;\n"
34 << "}\n\n";
35
36 out << "struct " << name << "Data {\n"
37 << " " << name << "Data() : set(Build" << name << "()) {}\n"
38 << " const icu::UnicodeSet set;\n"
39 << "};\n\n";
40
41 out << "//static\n"
42 << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
43 << " static base::LazyInstance<" << name << "Data>::type set =\n"
44 << " LAZY_INSTANCE_INITIALIZER;\n"
45 << " return set.Pointer()->set;\n"
46 << "}\n\n";
47 }
48
PrintSpecial(std::ofstream & out)49 void PrintSpecial(std::ofstream& out) {
50 icu::UnicodeSet current;
51 icu::UnicodeSet special_add;
52 icu::UnicodeSet ignore;
53 UErrorCode status = U_ZERO_ERROR;
54 icu::UnicodeSet upper("[\\p{Lu}]", status);
55 CHECK(U_SUCCESS(status));
56
57 // Iterate through all chars in BMP except surrogates.
58 for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
59 if (i >= static_cast<UChar32>(kSurrogateStart) &&
60 i <= static_cast<UChar32>(kSurrogateEnd)) {
61 continue; // Ignore surrogate range
62 }
63 current.set(i, i);
64 current.closeOver(USET_CASE_INSENSITIVE);
65
66 // Check to see if all characters in the case-folding equivalence
67 // class as defined by UnicodeSet::closeOver all map to the same
68 // canonical value.
69 UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
70 bool class_has_matching_canonical_char = false;
71 bool class_has_non_matching_canonical_char = false;
72 for (int32_t j = 0; j < current.getRangeCount(); j++) {
73 for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
74 c++) {
75 if (c == i) {
76 continue;
77 }
78 UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
79 if (canonical == other_canonical) {
80 class_has_matching_canonical_char = true;
81 } else {
82 class_has_non_matching_canonical_char = true;
83 }
84 }
85 }
86 // If any other character in i's equivalence class has a
87 // different canonical value, then i needs special handling. If
88 // no other character shares a canonical value with i, we can
89 // ignore i when adding alternatives for case-independent
90 // comparison. If at least one other character shares a
91 // canonical value, then i needs special handling.
92 if (class_has_non_matching_canonical_char) {
93 if (class_has_matching_canonical_char) {
94 special_add.add(i);
95 } else {
96 ignore.add(i);
97 }
98 }
99 }
100
101 // Verify that no Unicode equivalence class contains two non-trivial
102 // JS equivalence classes. Every character in SpecialAddSet has the
103 // same canonical value as every other non-IgnoreSet character in
104 // its Unicode equivalence class. Therefore, if we call closeOver on
105 // a set containing no IgnoreSet characters, the only characters
106 // that must be removed from the result are in IgnoreSet. This fact
107 // is used in CharacterRange::AddCaseEquivalents.
108 for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
109 for (UChar32 c = special_add.getRangeStart(i);
110 c <= special_add.getRangeEnd(i); c++) {
111 UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
112 current.set(c, c);
113 current.closeOver(USET_CASE_INSENSITIVE);
114 current.removeAll(ignore);
115 for (int32_t j = 0; j < current.getRangeCount(); j++) {
116 for (UChar32 c2 = current.getRangeStart(j);
117 c2 <= current.getRangeEnd(j); c2++) {
118 CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
119 }
120 }
121 }
122 }
123
124 PrintSet(out, "IgnoreSet", ignore);
125 PrintSet(out, "SpecialAddSet", special_add);
126 }
127
WriteHeader(const char * header_filename)128 void WriteHeader(const char* header_filename) {
129 std::ofstream out(header_filename);
130 out << std::hex << std::setfill('0') << std::setw(4);
131 out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
132 << "// Use of this source code is governed by a BSD-style license that\n"
133 << "// can be found in the LICENSE file.\n\n"
134 << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
135 << "// The following functions are used to build UnicodeSets\n"
136 << "// for special cases where the case-folding algorithm used by\n"
137 << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
138 << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
139 << "// Semantics: Canonicalize) step 3.\n\n"
140 << "#ifdef V8_INTL_SUPPORT\n"
141 << "#include \"src/base/lazy-instance.h\"\n\n"
142 << "#include \"src/regexp/special-case.h\"\n\n"
143 << "#include \"unicode/uniset.h\"\n"
144 << "namespace v8 {\n"
145 << "namespace internal {\n\n";
146
147 PrintSpecial(out);
148
149 out << "\n"
150 << "} // namespace internal\n"
151 << "} // namespace v8\n"
152 << "#endif // V8_INTL_SUPPORT\n";
153 }
154
155 } // namespace internal
156 } // namespace v8
157
main(int argc,const char ** argv)158 int main(int argc, const char** argv) {
159 if (argc != 2) {
160 std::cerr << "Usage: " << argv[0] << " <output filename>\n";
161 std::exit(1);
162 }
163 v8::internal::WriteHeader(argv[1]);
164
165 return 0;
166 }
167