1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
9 
10 #include "irregexp/imported/special-case.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 static const uc32 kSurrogateStart = 0xd800;
16 static const uc32 kSurrogateEnd = 0xdfff;
17 static const uc32 kNonBmpStart = 0x10000;
18 
19 // The following code generates "src/regexp/special-case.cc".
PrintSet(std::ofstream & out,const char * name,const icu::UnicodeSet & set)20 void PrintSet(std::ofstream& out, const char* name,
21               const icu::UnicodeSet& set) {
22   out << "icu::UnicodeSet Build" << name << "() {\n"
23       << "  icu::UnicodeSet set;\n";
24   for (int32_t i = 0; i < set.getRangeCount(); i++) {
25     if (set.getRangeStart(i) == set.getRangeEnd(i)) {
26       out << "  set.add(0x" << set.getRangeStart(i) << ");\n";
27     } else {
28       out << "  set.add(0x" << set.getRangeStart(i) << ", 0x"
29           << set.getRangeEnd(i) << ");\n";
30     }
31   }
32   out << "  set.freeze();\n"
33       << "  return set;\n"
34       << "}\n\n";
35 
36   out << "struct " << name << "Data {\n"
37       << "  " << name << "Data() : set(Build" << name << "()) {}\n"
38       << "  const icu::UnicodeSet set;\n"
39       << "};\n\n";
40 
41   out << "//static\n"
42       << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n"
43       << "  static base::LazyInstance<" << name << "Data>::type set =\n"
44       << "      LAZY_INSTANCE_INITIALIZER;\n"
45       << "  return set.Pointer()->set;\n"
46       << "}\n\n";
47 }
48 
PrintSpecial(std::ofstream & out)49 void PrintSpecial(std::ofstream& out) {
50   icu::UnicodeSet current;
51   icu::UnicodeSet special_add;
52   icu::UnicodeSet ignore;
53   UErrorCode status = U_ZERO_ERROR;
54   icu::UnicodeSet upper("[\\p{Lu}]", status);
55   CHECK(U_SUCCESS(status));
56 
57   // Iterate through all chars in BMP except surrogates.
58   for (UChar32 i = 0; i < static_cast<UChar32>(kNonBmpStart); i++) {
59     if (i >= static_cast<UChar32>(kSurrogateStart) &&
60         i <= static_cast<UChar32>(kSurrogateEnd)) {
61       continue;  // Ignore surrogate range
62     }
63     current.set(i, i);
64     current.closeOver(USET_CASE_INSENSITIVE);
65 
66     // Check to see if all characters in the case-folding equivalence
67     // class as defined by UnicodeSet::closeOver all map to the same
68     // canonical value.
69     UChar32 canonical = RegExpCaseFolding::Canonicalize(i);
70     bool class_has_matching_canonical_char = false;
71     bool class_has_non_matching_canonical_char = false;
72     for (int32_t j = 0; j < current.getRangeCount(); j++) {
73       for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j);
74            c++) {
75         if (c == i) {
76           continue;
77         }
78         UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c);
79         if (canonical == other_canonical) {
80           class_has_matching_canonical_char = true;
81         } else {
82           class_has_non_matching_canonical_char = true;
83         }
84       }
85     }
86     // If any other character in i's equivalence class has a
87     // different canonical value, then i needs special handling.  If
88     // no other character shares a canonical value with i, we can
89     // ignore i when adding alternatives for case-independent
90     // comparison.  If at least one other character shares a
91     // canonical value, then i needs special handling.
92     if (class_has_non_matching_canonical_char) {
93       if (class_has_matching_canonical_char) {
94         special_add.add(i);
95       } else {
96         ignore.add(i);
97       }
98     }
99   }
100 
101   // Verify that no Unicode equivalence class contains two non-trivial
102   // JS equivalence classes. Every character in SpecialAddSet has the
103   // same canonical value as every other non-IgnoreSet character in
104   // its Unicode equivalence class. Therefore, if we call closeOver on
105   // a set containing no IgnoreSet characters, the only characters
106   // that must be removed from the result are in IgnoreSet. This fact
107   // is used in CharacterRange::AddCaseEquivalents.
108   for (int32_t i = 0; i < special_add.getRangeCount(); i++) {
109     for (UChar32 c = special_add.getRangeStart(i);
110          c <= special_add.getRangeEnd(i); c++) {
111       UChar32 canonical = RegExpCaseFolding::Canonicalize(c);
112       current.set(c, c);
113       current.closeOver(USET_CASE_INSENSITIVE);
114       current.removeAll(ignore);
115       for (int32_t j = 0; j < current.getRangeCount(); j++) {
116         for (UChar32 c2 = current.getRangeStart(j);
117              c2 <= current.getRangeEnd(j); c2++) {
118           CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2));
119         }
120       }
121     }
122   }
123 
124   PrintSet(out, "IgnoreSet", ignore);
125   PrintSet(out, "SpecialAddSet", special_add);
126 }
127 
WriteHeader(const char * header_filename)128 void WriteHeader(const char* header_filename) {
129   std::ofstream out(header_filename);
130   out << std::hex << std::setfill('0') << std::setw(4);
131   out << "// Copyright 2020 the V8 project authors. All rights reserved.\n"
132       << "// Use of this source code is governed by a BSD-style license that\n"
133       << "// can be found in the LICENSE file.\n\n"
134       << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n"
135       << "// The following functions are used to build UnicodeSets\n"
136       << "// for special cases where the case-folding algorithm used by\n"
137       << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n"
138       << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n"
139       << "// Semantics: Canonicalize) step 3.\n\n"
140       << "#ifdef V8_INTL_SUPPORT\n"
141       << "#include \"src/base/lazy-instance.h\"\n\n"
142       << "#include \"src/regexp/special-case.h\"\n\n"
143       << "#include \"unicode/uniset.h\"\n"
144       << "namespace v8 {\n"
145       << "namespace internal {\n\n";
146 
147   PrintSpecial(out);
148 
149   out << "\n"
150       << "}  // namespace internal\n"
151       << "}  // namespace v8\n"
152       << "#endif  // V8_INTL_SUPPORT\n";
153 }
154 
155 }  // namespace internal
156 }  // namespace v8
157 
main(int argc,const char ** argv)158 int main(int argc, const char** argv) {
159   if (argc != 2) {
160     std::cerr << "Usage: " << argv[0] << " <output filename>\n";
161     std::exit(1);
162   }
163   v8::internal::WriteHeader(argv[1]);
164 
165   return 0;
166 }
167