1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
//  regexst.cpp
5 //
6 //  Copyright (C) 2004-2015, International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains class RegexStaticSets
10 //
11 //  This class is internal to the regular expression implementation.
12 //  For the public Regular Expression API, see the file "unicode/regex.h"
13 //
14 //  RegexStaticSets groups together the common UnicodeSets that are needed
15 //   for compiling or executing RegularExpressions.  This grouping simplifies
16 //   the thread safe lazy creation and sharing of these sets across
17 //   all instances of regular expressions.
18 //
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
22 
23 #include "unicode/unistr.h"
24 #include "unicode/uniset.h"
25 #include "unicode/uchar.h"
26 #include "unicode/regex.h"
27 #include "uprops.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "uassert.h"
31 #include "ucln_in.h"
32 #include "umutex.h"
33 
34 #include "regexcst.h"   // Contains state table for the regex pattern parser.
35                         //   generated by a Perl script.
36 #include "regexst.h"
37 
38 U_NAMESPACE_BEGIN
39 
40 // "Rule Char" Characters are those with special meaning, and therefore
41 //    need to be escaped to appear as literals in a regexp.
constexpr const char16_t *gRuleSet_rule_chars = u"*?+[(){}^$|\\.";

// Letters that, when they follow a backslash, form an escape sequence
//   that ICU's unescape() function will handle.
constexpr const char16_t *gUnescapeChars = u"acefnrtuUx";

// UnicodeSet pattern backing the regular expression \w (word character).
constexpr const char16_t *gIsWordPattern = u"[\\p{Alphabetic}\\p{M}\\p{Nd}\\p{Pc}\\u200c\\u200d]";

// UnicodeSet pattern backing the regular expression \s (white space).
constexpr const char16_t *gIsSpacePattern = u"[\\p{WhiteSpace}]";

// UnicodeSet patterns used by the Grapheme Cluster detection, \X.
//   One pattern per character category the cluster logic distinguishes.
constexpr const char16_t *gGC_ControlPattern = u"[[:Zl:][:Zp:][:Cc:][:Cf:]-[:Grapheme_Extend:]]";
constexpr const char16_t *gGC_ExtendPattern  = u"[\\p{Grapheme_Extend}]";
constexpr const char16_t *gGC_LPattern       = u"[\\p{Hangul_Syllable_Type=L}]";
constexpr const char16_t *gGC_VPattern       = u"[\\p{Hangul_Syllable_Type=V}]";
constexpr const char16_t *gGC_TPattern       = u"[\\p{Hangul_Syllable_Type=T}]";
constexpr const char16_t *gGC_LVPattern      = u"[\\p{Hangul_Syllable_Type=LV}]";
constexpr const char16_t *gGC_LVTPattern     = u"[\\p{Hangul_Syllable_Type=LVT}]";
69 
70 
// The single shared RegexStaticSets instance. It is created by initStaticSets()
//   and deleted (and set back to nullptr) by regex_cleanup().
RegexStaticSets *RegexStaticSets::gStaticSets = nullptr;
// Init-once guard handed to umtx_initOnce() by RegexStaticSets::initGlobals();
//   regex_cleanup() resets it.
UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
73 
74 
RegexStaticSets(UErrorCode * status)75 RegexStaticSets::RegexStaticSets(UErrorCode *status) {
76     // Initialize the shared static sets to their correct values.
77     fUnescapeCharSet.addAll(UnicodeString(true, gUnescapeChars, -1)).freeze();
78     fPropSets[URX_ISWORD_SET].applyPattern(UnicodeString(true, gIsWordPattern, -1), *status).freeze();
79     fPropSets[URX_ISSPACE_SET].applyPattern(UnicodeString(true, gIsSpacePattern, -1), *status).freeze();
80     fPropSets[URX_GC_EXTEND].applyPattern(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status).freeze();
81     fPropSets[URX_GC_CONTROL].applyPattern(UnicodeString(TRUE, gGC_ControlPattern, -1), *status).freeze();
82     fPropSets[URX_GC_L].applyPattern(UnicodeString(TRUE, gGC_LPattern, -1), *status).freeze();
83     fPropSets[URX_GC_V].applyPattern(UnicodeString(TRUE, gGC_VPattern, -1), *status).freeze();
84     fPropSets[URX_GC_T].applyPattern(UnicodeString(TRUE, gGC_TPattern, -1), *status).freeze();
85     fPropSets[URX_GC_LV].applyPattern(UnicodeString(TRUE, gGC_LVPattern, -1), *status).freeze();
86     fPropSets[URX_GC_LVT].applyPattern(UnicodeString(TRUE, gGC_LVTPattern, -1), *status).freeze();
87 
88 
89     //
90     //  "Normal" is the set of characters that don't need special handling
91     //            when finding grapheme cluster boundaries.
92     //
93     fPropSets[URX_GC_NORMAL].complement();
94     fPropSets[URX_GC_NORMAL].remove(0xac00, 0xd7a4);
95     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_CONTROL]);
96     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_L]);
97     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_V]);
98     fPropSets[URX_GC_NORMAL].removeAll(fPropSets[URX_GC_T]);
99     fPropSets[URX_GC_NORMAL].freeze();
100 
101     // Initialize the 8-bit fast bit sets from the parallel full
102     //   UnicodeSets.
103     //
104     // TODO: 25 Oct 2019 are these fast 8-bit sets worth keeping?
105     //       Measured 3.5% gain on (non) matching with the pattern "x(?:\\S+)+x"
106     //       This runs in exponential time, making it easy to adjust the time for
107     //       convenient measuring.
108     //
109     //       This 8 bit optimization dates from the early days of ICU,
110     //       with a less optimized UnicodeSet. At the time, the difference
111     //       was substantial.
112 
113     for (int32_t i=0; i<URX_LAST_SET; i++) {
114         fPropSets8[i].init(&fPropSets[i]);
115     }
116 
117     // Sets used while parsing rules, but not referenced from the parse state table
118     fRuleSets[kRuleSet_rule_char-128]
119             .addAll(UnicodeString(gRuleSet_rule_chars)).complement().freeze();
120 
121     fRuleSets[kRuleSet_digit_char-128].add(u'0', u'9').freeze();
122     fRuleSets[kRuleSet_ascii_letter-128].add(u'A', u'Z').add(u'a', u'z').freeze();
123     fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
124 
125     // Finally, initialize an empty UText string for utility purposes
126     fEmptyText = utext_openUChars(nullptr, nullptr, 0, status);
127 
128 }
129 
130 
~RegexStaticSets()131 RegexStaticSets::~RegexStaticSets() {
132     fRuleDigitsAlias = nullptr;
133     utext_close(fEmptyText);
134 }
135 
136 
137 //------------------------------------------------------------------------------
138 //
139 //   regex_cleanup      Memory cleanup function, free/delete all
140 //                      cached memory.  Called by ICU's u_cleanup() function.
141 //
142 //------------------------------------------------------------------------------
143 
144 U_CDECL_BEGIN
145 static UBool U_CALLCONV
regex_cleanup(void)146 regex_cleanup(void) {
147     delete RegexStaticSets::gStaticSets;
148     RegexStaticSets::gStaticSets = nullptr;
149     gStaticSetsInitOnce.reset();
150     return TRUE;
151 }
152 
initStaticSets(UErrorCode & status)153 static void U_CALLCONV initStaticSets(UErrorCode &status) {
154     U_ASSERT(RegexStaticSets::gStaticSets == nullptr);
155     ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
156     RegexStaticSets::gStaticSets = new RegexStaticSets(&status);
157     if (U_FAILURE(status)) {
158         delete RegexStaticSets::gStaticSets;
159         RegexStaticSets::gStaticSets = nullptr;
160     }
161     if (RegexStaticSets::gStaticSets == nullptr && U_SUCCESS(status)) {
162         status = U_MEMORY_ALLOCATION_ERROR;
163     }
164 }
165 U_CDECL_END
166 
initGlobals(UErrorCode * status)167 void RegexStaticSets::initGlobals(UErrorCode *status) {
168     umtx_initOnce(gStaticSetsInitOnce, &initStaticSets, *status);
169 }
170 
171 U_NAMESPACE_END
172 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
173