1 /*
2  * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package java.util.regex;
27 
28 import java.util.HashMap;
29 import java.util.Locale;
30 import java.util.regex.Pattern.CharPredicate;
31 import java.util.regex.Pattern.BmpCharPredicate;
32 
33 class CharPredicates {
34 
ALPHABETIC()35     static final CharPredicate ALPHABETIC() {
36         return Character::isAlphabetic;
37     }
38 
39     // \p{gc=Decimal_Number}
DIGIT()40     static final CharPredicate DIGIT() {
41         return Character::isDigit;
42     }
43 
LETTER()44     static final CharPredicate LETTER() {
45         return Character::isLetter;
46     }
47 
IDEOGRAPHIC()48     static final CharPredicate IDEOGRAPHIC() {
49         return Character::isIdeographic;
50     }
51 
LOWERCASE()52     static final CharPredicate LOWERCASE() {
53         return Character::isLowerCase;
54     }
55 
UPPERCASE()56     static final CharPredicate UPPERCASE() {
57         return Character::isUpperCase;
58     }
59 
TITLECASE()60     static final CharPredicate TITLECASE() {
61         return Character::isTitleCase;
62     }
63 
64     // \p{Whitespace}
WHITE_SPACE()65     static final CharPredicate WHITE_SPACE() {
66         return ch ->
67             ((((1 << Character.SPACE_SEPARATOR) |
68                (1 << Character.LINE_SEPARATOR) |
69                (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
70             != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
71     }
72 
73     // \p{gc=Control}
CONTROL()74     static final CharPredicate CONTROL() {
75         return ch -> Character.getType(ch) == Character.CONTROL;
76     }
77 
78     // \p{gc=Punctuation}
PUNCTUATION()79     static final CharPredicate PUNCTUATION() {
80         return ch ->
81             ((((1 << Character.CONNECTOR_PUNCTUATION) |
82                (1 << Character.DASH_PUNCTUATION) |
83                (1 << Character.START_PUNCTUATION) |
84                (1 << Character.END_PUNCTUATION) |
85                (1 << Character.OTHER_PUNCTUATION) |
86                (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
87                (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
88             != 0;
89     }
90 
91     // \p{gc=Decimal_Number}
92     // \p{Hex_Digit}    -> PropList.txt: Hex_Digit
HEX_DIGIT()93     static final CharPredicate HEX_DIGIT() {
94         return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) ||
95                 (ch >= 0x0041 && ch <= 0x0046) ||
96                 (ch >= 0x0061 && ch <= 0x0066) ||
97                 (ch >= 0xFF10 && ch <= 0xFF19) ||
98                 (ch >= 0xFF21 && ch <= 0xFF26) ||
99                 (ch >= 0xFF41 && ch <= 0xFF46));
100     }
101 
ASSIGNED()102     static final CharPredicate ASSIGNED() {
103         return ch -> Character.getType(ch) != Character.UNASSIGNED;
104     }
105 
106     // PropList.txt:Noncharacter_Code_Point
NONCHARACTER_CODE_POINT()107     static final CharPredicate NONCHARACTER_CODE_POINT() {
108         return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
109     }
110 
111     // \p{alpha}
112     // \p{digit}
ALNUM()113     static final CharPredicate ALNUM() {
114         return ALPHABETIC().union(DIGIT());
115     }
116 
117     // \p{Whitespace} --
118     // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL}  -> 0xa, 0xb, 0xc, 0xd, 0x85
119     //  \p{gc=Line_Separator}
120     //  \p{gc=Paragraph_Separator}]
BLANK()121     static final CharPredicate BLANK() {
122         return ch ->
123             Character.getType(ch) == Character.SPACE_SEPARATOR ||
124             ch == 0x9; // \N{HT}
125     }
126 
127     // [^
128     //  \p{space}
129     //  \p{gc=Control}
130     //  \p{gc=Surrogate}
131     //  \p{gc=Unassigned}]
GRAPH()132     static final CharPredicate GRAPH() {
133         return ch ->
134             ((((1 << Character.SPACE_SEPARATOR) |
135                (1 << Character.LINE_SEPARATOR) |
136                (1 << Character.PARAGRAPH_SEPARATOR) |
137                (1 << Character.CONTROL) |
138                (1 << Character.SURROGATE) |
139                (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
140             == 0;
141     }
142 
143     // \p{graph}
144     // \p{blank}
145     // -- \p{cntrl}
PRINT()146     static final CharPredicate PRINT() {
147         return GRAPH().union(BLANK()).and(CONTROL().negate());
148     }
149 
150     //  200C..200D    PropList.txt:Join_Control
JOIN_CONTROL()151     static final CharPredicate JOIN_CONTROL() {
152         return ch -> ch == 0x200C || ch == 0x200D;
153     }
154 
155     //  \p{alpha}
156     //  \p{gc=Mark}
157     //  \p{digit}
158     //  \p{gc=Connector_Punctuation}
159     //  \p{Join_Control}    200C..200D
WORD()160     static final CharPredicate WORD() {
161         return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) |
162                                   (1 << Character.ENCLOSING_MARK) |
163                                   (1 << Character.COMBINING_SPACING_MARK) |
164                                   (1 << Character.DECIMAL_DIGIT_NUMBER) |
165                                   (1 << Character.CONNECTOR_PUNCTUATION))
166                                  >> Character.getType(ch)) & 1) != 0,
167                          JOIN_CONTROL());
168     }
169 
170     /////////////////////////////////////////////////////////////////////////////
171 
getPosixPredicate(String name, boolean caseIns)172     private static CharPredicate getPosixPredicate(String name, boolean caseIns) {
173         return switch (name) {
174             case "ALPHA" -> ALPHABETIC();
175             case "LOWER" -> caseIns
176                                 ? LOWERCASE().union(UPPERCASE(), TITLECASE())
177                                 : LOWERCASE();
178             case "UPPER" -> caseIns
179                                 ? UPPERCASE().union(LOWERCASE(), TITLECASE())
180                                 : UPPERCASE();
181             case "SPACE" -> WHITE_SPACE();
182             case "PUNCT" -> PUNCTUATION();
183             case "XDIGIT" -> HEX_DIGIT();
184             case "ALNUM" -> ALNUM();
185             case "CNTRL" -> CONTROL();
186             case "DIGIT" -> DIGIT();
187             case "BLANK" -> BLANK();
188             case "GRAPH" -> GRAPH();
189             case "PRINT" -> PRINT();
190             default -> null;
191         };
192     }
193 
getUnicodePredicate(String name, boolean caseIns)194     private static CharPredicate getUnicodePredicate(String name, boolean caseIns) {
195         return switch (name) {
196             case "ALPHABETIC" -> ALPHABETIC();
197             case "ASSIGNED" -> ASSIGNED();
198             case "CONTROL" -> CONTROL();
199             case "HEXDIGIT", "HEX_DIGIT" -> HEX_DIGIT();
200             case "IDEOGRAPHIC" -> IDEOGRAPHIC();
201             case "JOINCONTROL", "JOIN_CONTROL" -> JOIN_CONTROL();
202             case "LETTER" -> LETTER();
203             case "LOWERCASE" -> caseIns
204                                     ? LOWERCASE().union(UPPERCASE(), TITLECASE())
205                                     : LOWERCASE();
206             case "NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT" -> NONCHARACTER_CODE_POINT();
207             case "TITLECASE" -> caseIns
208                                     ? TITLECASE().union(LOWERCASE(), UPPERCASE())
209                                     : TITLECASE();
210             case "PUNCTUATION" -> PUNCTUATION();
211             case "UPPERCASE" -> caseIns
212                                     ? UPPERCASE().union(LOWERCASE(), TITLECASE())
213                                     : UPPERCASE();
214             case "WHITESPACE", "WHITE_SPACE" -> WHITE_SPACE();
215             case "WORD" -> WORD();
216             default -> null;
217         };
218     }
219 
220     public static CharPredicate forUnicodeProperty(String propName, boolean caseIns) {
221         propName = propName.toUpperCase(Locale.ROOT);
222         CharPredicate p = getUnicodePredicate(propName, caseIns);
223         if (p != null)
224             return p;
225         return getPosixPredicate(propName, caseIns);
226     }
227 
228     public static CharPredicate forPOSIXName(String propName, boolean caseIns) {
229         return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH), caseIns);
230     }
231 
232     /////////////////////////////////////////////////////////////////////////////
233 
234     /**
235      * Returns a predicate matching all characters belong to a named
236      * UnicodeScript.
237      */
238     static CharPredicate forUnicodeScript(String name) {
239         final Character.UnicodeScript script;
240         try {
241             script = Character.UnicodeScript.forName(name);
242             return ch -> script == Character.UnicodeScript.of(ch);
243         } catch (IllegalArgumentException iae) {}
244         return null;
245     }
246 
247     /**
248      * Returns a predicate matching all characters in a UnicodeBlock.
249      */
250     static CharPredicate forUnicodeBlock(String name) {
251         final Character.UnicodeBlock block;
252         try {
253             block = Character.UnicodeBlock.forName(name);
254             return ch -> block == Character.UnicodeBlock.of(ch);
255         } catch (IllegalArgumentException iae) {}
256          return null;
257     }
258 
259     /////////////////////////////////////////////////////////////////////////////
260 
261     // unicode categories, aliases, properties, java methods ...
262 
263     static CharPredicate forProperty(String name, boolean caseIns) {
264         // Unicode character property aliases, defined in
265         // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
266         return switch (name) {
267             case "Cn" -> category(1 << Character.UNASSIGNED);
268             case "Lu" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) |
269                                             (1 << Character.UPPERCASE_LETTER) |
270                                             (1 << Character.TITLECASE_LETTER)
271                                           : (1 << Character.UPPERCASE_LETTER));
272             case "Ll" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) |
273                                             (1 << Character.UPPERCASE_LETTER) |
274                                             (1 << Character.TITLECASE_LETTER)
275                                           : (1 << Character.LOWERCASE_LETTER));
276             case "Lt" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) |
277                                             (1 << Character.UPPERCASE_LETTER) |
278                                             (1 << Character.TITLECASE_LETTER)
279                                           : (1 << Character.TITLECASE_LETTER));
280             case "Lm" -> category(1 << Character.MODIFIER_LETTER);
281             case "Lo" -> category(1 << Character.OTHER_LETTER);
282             case "Mn" -> category(1 << Character.NON_SPACING_MARK);
283             case "Me" -> category(1 << Character.ENCLOSING_MARK);
284             case "Mc" -> category(1 << Character.COMBINING_SPACING_MARK);
285             case "Nd" -> category(1 << Character.DECIMAL_DIGIT_NUMBER);
286             case "Nl" -> category(1 << Character.LETTER_NUMBER);
287             case "No" -> category(1 << Character.OTHER_NUMBER);
288             case "Zs" -> category(1 << Character.SPACE_SEPARATOR);
289             case "Zl" -> category(1 << Character.LINE_SEPARATOR);
290             case "Zp" -> category(1 << Character.PARAGRAPH_SEPARATOR);
291             case "Cc" -> category(1 << Character.CONTROL);
292             case "Cf" -> category(1 << Character.FORMAT);
293             case "Co" -> category(1 << Character.PRIVATE_USE);
294             case "Cs" -> category(1 << Character.SURROGATE);
295             case "Pd" -> category(1 << Character.DASH_PUNCTUATION);
296             case "Ps" -> category(1 << Character.START_PUNCTUATION);
297             case "Pe" -> category(1 << Character.END_PUNCTUATION);
298             case "Pc" -> category(1 << Character.CONNECTOR_PUNCTUATION);
299             case "Po" -> category(1 << Character.OTHER_PUNCTUATION);
300             case "Sm" -> category(1 << Character.MATH_SYMBOL);
301             case "Sc" -> category(1 << Character.CURRENCY_SYMBOL);
302             case "Sk" -> category(1 << Character.MODIFIER_SYMBOL);
303             case "So" -> category(1 << Character.OTHER_SYMBOL);
304             case "Pi" -> category(1 << Character.INITIAL_QUOTE_PUNCTUATION);
305             case "Pf" -> category(1 << Character.FINAL_QUOTE_PUNCTUATION);
306             case "L" -> category(((1 << Character.UPPERCASE_LETTER) |
307                                   (1 << Character.LOWERCASE_LETTER) |
308                                   (1 << Character.TITLECASE_LETTER) |
309                                   (1 << Character.MODIFIER_LETTER) |
310                                   (1 << Character.OTHER_LETTER)));
311             case "M" -> category(((1 << Character.NON_SPACING_MARK) |
312                                   (1 << Character.ENCLOSING_MARK) |
313                                   (1 << Character.COMBINING_SPACING_MARK)));
314             case "N" -> category(((1 << Character.DECIMAL_DIGIT_NUMBER) |
315                                   (1 << Character.LETTER_NUMBER) |
316                                   (1 << Character.OTHER_NUMBER)));
317             case "Z" -> category(((1 << Character.SPACE_SEPARATOR) |
318                                   (1 << Character.LINE_SEPARATOR) |
319                                   (1 << Character.PARAGRAPH_SEPARATOR)));
320             case "C" -> category(((1 << Character.CONTROL) |
321                                   (1 << Character.FORMAT) |
322                                   (1 << Character.PRIVATE_USE) |
323                                   (1 << Character.SURROGATE) |
324                                   (1 << Character.UNASSIGNED))); // Other
325             case "P" -> category(((1 << Character.DASH_PUNCTUATION) |
326                                   (1 << Character.START_PUNCTUATION) |
327                                   (1 << Character.END_PUNCTUATION) |
328                                   (1 << Character.CONNECTOR_PUNCTUATION) |
329                                   (1 << Character.OTHER_PUNCTUATION) |
330                                   (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
331                                   (1 << Character.FINAL_QUOTE_PUNCTUATION)));
332             case "S" -> category(((1 << Character.MATH_SYMBOL) |
333                                   (1 << Character.CURRENCY_SYMBOL) |
334                                   (1 << Character.MODIFIER_SYMBOL) |
335                                   (1 << Character.OTHER_SYMBOL)));
336             case "LC" -> category(((1 << Character.UPPERCASE_LETTER) |
337                                    (1 << Character.LOWERCASE_LETTER) |
338                                    (1 << Character.TITLECASE_LETTER)));
339             case "LD" -> category(((1 << Character.UPPERCASE_LETTER) |
340                                    (1 << Character.LOWERCASE_LETTER) |
341                                    (1 << Character.TITLECASE_LETTER) |
342                                    (1 << Character.MODIFIER_LETTER) |
343                                    (1 << Character.OTHER_LETTER) |
344                                    (1 << Character.DECIMAL_DIGIT_NUMBER)));
345             case "L1" -> range(0x00, 0xFF); // Latin-1
346             case "all" -> Pattern.ALL();
347             // Posix regular expression character classes, defined in
348             // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
349             case "ASCII" -> range(0x00, 0x7F);    // ASCII
350             case "Alnum" -> ctype(ASCII.ALNUM);   // Alphanumeric characters
351             case "Alpha" -> ctype(ASCII.ALPHA);   // Alphabetic characters
352             case "Blank" -> ctype(ASCII.BLANK);   // Space and tab characters
353             case "Cntrl" -> ctype(ASCII.CNTRL);   // Control characters
354             case "Digit" -> range('0', '9');      // Numeric characters
355             case "Graph" -> ctype(ASCII.GRAPH);   // printable and visible
356             case "Lower" -> caseIns ? ctype(ASCII.ALPHA)
357                                     : range('a', 'z'); // Lower-case alphabetic
358             case "Print" -> range(0x20, 0x7E);    // Printable characters
359             case "Punct" -> ctype(ASCII.PUNCT);   // Punctuation characters
360             case "Space" -> ctype(ASCII.SPACE);   // Space characters
361             case "Upper" -> caseIns ? ctype(ASCII.ALPHA)
362                                     : range('A', 'Z'); // Upper-case alphabetic
363             case "XDigit" -> ctype(ASCII.XDIGIT); // hexadecimal digits
364 
365             // Java character properties, defined by methods in Character.java
366             case "javaLowerCase" -> caseIns ? c -> Character.isLowerCase(c) ||
367                                                    Character.isUpperCase(c) ||
368                                                    Character.isTitleCase(c)
369                                             : Character::isLowerCase;
370             case "javaUpperCase" -> caseIns ? c -> Character.isUpperCase(c) ||
371                                                    Character.isLowerCase(c) ||
372                                                    Character.isTitleCase(c)
373                                             : Character::isUpperCase;
374             case "javaAlphabetic" -> Character::isAlphabetic;
375             case "javaIdeographic" -> Character::isIdeographic;
376             case "javaTitleCase" -> caseIns ? c -> Character.isTitleCase(c) ||
377                                                    Character.isLowerCase(c) ||
378                                                    Character.isUpperCase(c)
379                                             : Character::isTitleCase;
380             case "javaDigit" -> Character::isDigit;
381             case "javaDefined" -> Character::isDefined;
382             case "javaLetter" -> Character::isLetter;
383             case "javaLetterOrDigit" -> Character::isLetterOrDigit;
384             case "javaJavaIdentifierStart" -> Character::isJavaIdentifierStart;
385             case "javaJavaIdentifierPart" -> Character::isJavaIdentifierPart;
386             case "javaUnicodeIdentifierStart" -> Character::isUnicodeIdentifierStart;
387             case "javaUnicodeIdentifierPart" -> Character::isUnicodeIdentifierPart;
388             case "javaIdentifierIgnorable" -> Character::isIdentifierIgnorable;
389             case "javaSpaceChar" -> Character::isSpaceChar;
390             case "javaWhitespace" -> Character::isWhitespace;
391             case "javaISOControl" -> Character::isISOControl;
392             case "javaMirrored" -> Character::isMirrored;
393             default -> null;
394         };
395     }
396 
397     private static CharPredicate category(final int typeMask) {
398         return ch -> (typeMask & (1 << Character.getType(ch))) != 0;
399     }
400 
401     private static CharPredicate range(final int lower, final int upper) {
402         return (BmpCharPredicate)ch -> lower <= ch && ch <= upper;
403     }
404 
405     private static CharPredicate ctype(final int ctype) {
406         return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype);
407     }
408 
409     /////////////////////////////////////////////////////////////////////////////
410 
411     /**
412      * Posix ASCII variants, not in the lookup map
413      */
414     static final BmpCharPredicate ASCII_DIGIT() {
415         return ch -> ch < 128 && ASCII.isDigit(ch);
416     }
417     static final BmpCharPredicate ASCII_WORD() {
418         return ch -> ch < 128 && ASCII.isWord(ch);
419     }
420     static final BmpCharPredicate ASCII_SPACE() {
421         return ch -> ch < 128 && ASCII.isSpace(ch);
422     }
423 
424 }
425