1 /* 2 * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.util.HashMap; 29 import java.util.Locale; 30 import java.util.regex.Pattern.CharPredicate; 31 import java.util.regex.Pattern.BmpCharPredicate; 32 33 class CharPredicates { 34 ALPHABETIC()35 static final CharPredicate ALPHABETIC() { 36 return Character::isAlphabetic; 37 } 38 39 // \p{gc=Decimal_Number} DIGIT()40 static final CharPredicate DIGIT() { 41 return Character::isDigit; 42 } 43 LETTER()44 static final CharPredicate LETTER() { 45 return Character::isLetter; 46 } 47 IDEOGRAPHIC()48 static final CharPredicate IDEOGRAPHIC() { 49 return Character::isIdeographic; 50 } 51 LOWERCASE()52 static final CharPredicate LOWERCASE() { 53 return Character::isLowerCase; 54 } 55 UPPERCASE()56 static final CharPredicate UPPERCASE() { 57 return Character::isUpperCase; 58 } 59 TITLECASE()60 static final CharPredicate TITLECASE() { 61 return Character::isTitleCase; 62 } 63 64 // \p{Whitespace} WHITE_SPACE()65 static final CharPredicate WHITE_SPACE() { 66 return ch -> 67 ((((1 << Character.SPACE_SEPARATOR) | 68 (1 << Character.LINE_SEPARATOR) | 69 (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) 70 != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); 71 } 72 73 // \p{gc=Control} CONTROL()74 static final CharPredicate CONTROL() { 75 return ch -> Character.getType(ch) == Character.CONTROL; 76 } 77 78 // \p{gc=Punctuation} PUNCTUATION()79 static final CharPredicate PUNCTUATION() { 80 return ch -> 81 ((((1 << Character.CONNECTOR_PUNCTUATION) | 82 (1 << Character.DASH_PUNCTUATION) | 83 (1 << Character.START_PUNCTUATION) | 84 (1 << Character.END_PUNCTUATION) | 85 (1 << Character.OTHER_PUNCTUATION) | 86 (1 << Character.INITIAL_QUOTE_PUNCTUATION) | 87 (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) 88 != 0; 89 } 90 91 // \p{gc=Decimal_Number} 92 // \p{Hex_Digit} -> PropList.txt: Hex_Digit HEX_DIGIT()93 static final CharPredicate HEX_DIGIT() { 94 return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) || 95 (ch >= 0x0041 && ch <= 0x0046) || 96 (ch >= 0x0061 && ch <= 0x0066) || 97 (ch >= 0xFF10 && ch <= 0xFF19) || 98 (ch >= 0xFF21 && ch <= 0xFF26) || 99 (ch >= 0xFF41 && ch <= 0xFF46)); 100 } 101 ASSIGNED()102 static final CharPredicate ASSIGNED() { 103 return ch -> Character.getType(ch) != Character.UNASSIGNED; 104 } 105 106 // PropList.txt:Noncharacter_Code_Point NONCHARACTER_CODE_POINT()107 static final CharPredicate NONCHARACTER_CODE_POINT() { 108 return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); 109 } 110 111 // \p{alpha} 112 // \p{digit} ALNUM()113 static final CharPredicate ALNUM() { 114 return ALPHABETIC().union(DIGIT()); 115 } 116 117 // \p{Whitespace} -- 118 // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 119 // \p{gc=Line_Separator} 120 // \p{gc=Paragraph_Separator}] BLANK()121 static final CharPredicate BLANK() { 122 return ch -> 123 Character.getType(ch) == Character.SPACE_SEPARATOR || 124 ch == 0x9; // \N{HT} 125 } 126 127 // [^ 128 // \p{space} 129 // \p{gc=Control} 130 // \p{gc=Surrogate} 131 // \p{gc=Unassigned}] GRAPH()132 static final CharPredicate GRAPH() { 133 return ch -> 134 ((((1 << Character.SPACE_SEPARATOR) | 135 (1 << Character.LINE_SEPARATOR) | 136 (1 << Character.PARAGRAPH_SEPARATOR) | 137 (1 << Character.CONTROL) | 138 (1 << Character.SURROGATE) | 139 (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) 140 == 0; 141 } 142 143 // \p{graph} 144 // \p{blank} 145 // -- \p{cntrl} PRINT()146 static final CharPredicate PRINT() { 147 return GRAPH().union(BLANK()).and(CONTROL().negate()); 148 } 149 150 // 200C..200D PropList.txt:Join_Control JOIN_CONTROL()151 static final CharPredicate JOIN_CONTROL() { 152 return ch -> ch == 0x200C || ch == 0x200D; 153 } 154 155 // \p{alpha} 156 // \p{gc=Mark} 157 // \p{digit} 158 // \p{gc=Connector_Punctuation} 159 // \p{Join_Control} 200C..200D WORD()160 static final CharPredicate WORD() { 161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) | 162 (1 << Character.ENCLOSING_MARK) | 163 (1 << Character.COMBINING_SPACING_MARK) | 164 (1 << Character.DECIMAL_DIGIT_NUMBER) | 165 (1 << Character.CONNECTOR_PUNCTUATION)) 166 >> Character.getType(ch)) & 1) != 0, 167 JOIN_CONTROL()); 168 } 169 170 ///////////////////////////////////////////////////////////////////////////// 171 getPosixPredicate(String name, boolean caseIns)172 private static CharPredicate getPosixPredicate(String name, boolean caseIns) { 173 return switch (name) { 174 case "ALPHA" -> ALPHABETIC(); 175 case "LOWER" -> caseIns 176 ? LOWERCASE().union(UPPERCASE(), TITLECASE()) 177 : LOWERCASE(); 178 case "UPPER" -> caseIns 179 ? UPPERCASE().union(LOWERCASE(), TITLECASE()) 180 : UPPERCASE(); 181 case "SPACE" -> WHITE_SPACE(); 182 case "PUNCT" -> PUNCTUATION(); 183 case "XDIGIT" -> HEX_DIGIT(); 184 case "ALNUM" -> ALNUM(); 185 case "CNTRL" -> CONTROL(); 186 case "DIGIT" -> DIGIT(); 187 case "BLANK" -> BLANK(); 188 case "GRAPH" -> GRAPH(); 189 case "PRINT" -> PRINT(); 190 default -> null; 191 }; 192 } 193 getUnicodePredicate(String name, boolean caseIns)194 private static CharPredicate getUnicodePredicate(String name, boolean caseIns) { 195 return switch (name) { 196 case "ALPHABETIC" -> ALPHABETIC(); 197 case "ASSIGNED" -> ASSIGNED(); 198 case "CONTROL" -> CONTROL(); 199 case "HEXDIGIT", "HEX_DIGIT" -> HEX_DIGIT(); 200 case "IDEOGRAPHIC" -> IDEOGRAPHIC(); 201 case "JOINCONTROL", "JOIN_CONTROL" -> JOIN_CONTROL(); 202 case "LETTER" -> LETTER(); 203 case "LOWERCASE" -> caseIns 204 ? LOWERCASE().union(UPPERCASE(), TITLECASE()) 205 : LOWERCASE(); 206 case "NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT" -> NONCHARACTER_CODE_POINT(); 207 case "TITLECASE" -> caseIns 208 ? TITLECASE().union(LOWERCASE(), UPPERCASE()) 209 : TITLECASE(); 210 case "PUNCTUATION" -> PUNCTUATION(); 211 case "UPPERCASE" -> caseIns 212 ? UPPERCASE().union(LOWERCASE(), TITLECASE()) 213 : UPPERCASE(); 214 case "WHITESPACE", "WHITE_SPACE" -> WHITE_SPACE(); 215 case "WORD" -> WORD(); 216 default -> null; 217 }; 218 } 219 220 public static CharPredicate forUnicodeProperty(String propName, boolean caseIns) { 221 propName = propName.toUpperCase(Locale.ROOT); 222 CharPredicate p = getUnicodePredicate(propName, caseIns); 223 if (p != null) 224 return p; 225 return getPosixPredicate(propName, caseIns); 226 } 227 228 public static CharPredicate forPOSIXName(String propName, boolean caseIns) { 229 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH), caseIns); 230 } 231 232 ///////////////////////////////////////////////////////////////////////////// 233 234 /** 235 * Returns a predicate matching all characters belong to a named 236 * UnicodeScript. 237 */ 238 static CharPredicate forUnicodeScript(String name) { 239 final Character.UnicodeScript script; 240 try { 241 script = Character.UnicodeScript.forName(name); 242 return ch -> script == Character.UnicodeScript.of(ch); 243 } catch (IllegalArgumentException iae) {} 244 return null; 245 } 246 247 /** 248 * Returns a predicate matching all characters in a UnicodeBlock. 249 */ 250 static CharPredicate forUnicodeBlock(String name) { 251 final Character.UnicodeBlock block; 252 try { 253 block = Character.UnicodeBlock.forName(name); 254 return ch -> block == Character.UnicodeBlock.of(ch); 255 } catch (IllegalArgumentException iae) {} 256 return null; 257 } 258 259 ///////////////////////////////////////////////////////////////////////////// 260 261 // unicode categories, aliases, properties, java methods ... 262 263 static CharPredicate forProperty(String name, boolean caseIns) { 264 // Unicode character property aliases, defined in 265 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt 266 return switch (name) { 267 case "Cn" -> category(1 << Character.UNASSIGNED); 268 case "Lu" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) | 269 (1 << Character.UPPERCASE_LETTER) | 270 (1 << Character.TITLECASE_LETTER) 271 : (1 << Character.UPPERCASE_LETTER)); 272 case "Ll" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) | 273 (1 << Character.UPPERCASE_LETTER) | 274 (1 << Character.TITLECASE_LETTER) 275 : (1 << Character.LOWERCASE_LETTER)); 276 case "Lt" -> category(caseIns ? (1 << Character.LOWERCASE_LETTER) | 277 (1 << Character.UPPERCASE_LETTER) | 278 (1 << Character.TITLECASE_LETTER) 279 : (1 << Character.TITLECASE_LETTER)); 280 case "Lm" -> category(1 << Character.MODIFIER_LETTER); 281 case "Lo" -> category(1 << Character.OTHER_LETTER); 282 case "Mn" -> category(1 << Character.NON_SPACING_MARK); 283 case "Me" -> category(1 << Character.ENCLOSING_MARK); 284 case "Mc" -> category(1 << Character.COMBINING_SPACING_MARK); 285 case "Nd" -> category(1 << Character.DECIMAL_DIGIT_NUMBER); 286 case "Nl" -> category(1 << Character.LETTER_NUMBER); 287 case "No" -> category(1 << Character.OTHER_NUMBER); 288 case "Zs" -> category(1 << Character.SPACE_SEPARATOR); 289 case "Zl" -> category(1 << Character.LINE_SEPARATOR); 290 case "Zp" -> category(1 << Character.PARAGRAPH_SEPARATOR); 291 case "Cc" -> category(1 << Character.CONTROL); 292 case "Cf" -> category(1 << Character.FORMAT); 293 case "Co" -> category(1 << Character.PRIVATE_USE); 294 case "Cs" -> category(1 << Character.SURROGATE); 295 case "Pd" -> category(1 << Character.DASH_PUNCTUATION); 296 case "Ps" -> category(1 << Character.START_PUNCTUATION); 297 case "Pe" -> category(1 << Character.END_PUNCTUATION); 298 case "Pc" -> category(1 << Character.CONNECTOR_PUNCTUATION); 299 case "Po" -> category(1 << Character.OTHER_PUNCTUATION); 300 case "Sm" -> category(1 << Character.MATH_SYMBOL); 301 case "Sc" -> category(1 << Character.CURRENCY_SYMBOL); 302 case "Sk" -> category(1 << Character.MODIFIER_SYMBOL); 303 case "So" -> category(1 << Character.OTHER_SYMBOL); 304 case "Pi" -> category(1 << Character.INITIAL_QUOTE_PUNCTUATION); 305 case "Pf" -> category(1 << Character.FINAL_QUOTE_PUNCTUATION); 306 case "L" -> category(((1 << Character.UPPERCASE_LETTER) | 307 (1 << Character.LOWERCASE_LETTER) | 308 (1 << Character.TITLECASE_LETTER) | 309 (1 << Character.MODIFIER_LETTER) | 310 (1 << Character.OTHER_LETTER))); 311 case "M" -> category(((1 << Character.NON_SPACING_MARK) | 312 (1 << Character.ENCLOSING_MARK) | 313 (1 << Character.COMBINING_SPACING_MARK))); 314 case "N" -> category(((1 << Character.DECIMAL_DIGIT_NUMBER) | 315 (1 << Character.LETTER_NUMBER) | 316 (1 << Character.OTHER_NUMBER))); 317 case "Z" -> category(((1 << Character.SPACE_SEPARATOR) | 318 (1 << Character.LINE_SEPARATOR) | 319 (1 << Character.PARAGRAPH_SEPARATOR))); 320 case "C" -> category(((1 << Character.CONTROL) | 321 (1 << Character.FORMAT) | 322 (1 << Character.PRIVATE_USE) | 323 (1 << Character.SURROGATE) | 324 (1 << Character.UNASSIGNED))); // Other 325 case "P" -> category(((1 << Character.DASH_PUNCTUATION) | 326 (1 << Character.START_PUNCTUATION) | 327 (1 << Character.END_PUNCTUATION) | 328 (1 << Character.CONNECTOR_PUNCTUATION) | 329 (1 << Character.OTHER_PUNCTUATION) | 330 (1 << Character.INITIAL_QUOTE_PUNCTUATION) | 331 (1 << Character.FINAL_QUOTE_PUNCTUATION))); 332 case "S" -> category(((1 << Character.MATH_SYMBOL) | 333 (1 << Character.CURRENCY_SYMBOL) | 334 (1 << Character.MODIFIER_SYMBOL) | 335 (1 << Character.OTHER_SYMBOL))); 336 case "LC" -> category(((1 << Character.UPPERCASE_LETTER) | 337 (1 << Character.LOWERCASE_LETTER) | 338 (1 << Character.TITLECASE_LETTER))); 339 case "LD" -> category(((1 << Character.UPPERCASE_LETTER) | 340 (1 << Character.LOWERCASE_LETTER) | 341 (1 << Character.TITLECASE_LETTER) | 342 (1 << Character.MODIFIER_LETTER) | 343 (1 << Character.OTHER_LETTER) | 344 (1 << Character.DECIMAL_DIGIT_NUMBER))); 345 case "L1" -> range(0x00, 0xFF); // Latin-1 346 case "all" -> Pattern.ALL(); 347 // Posix regular expression character classes, defined in 348 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html 349 case "ASCII" -> range(0x00, 0x7F); // ASCII 350 case "Alnum" -> ctype(ASCII.ALNUM); // Alphanumeric characters 351 case "Alpha" -> ctype(ASCII.ALPHA); // Alphabetic characters 352 case "Blank" -> ctype(ASCII.BLANK); // Space and tab characters 353 case "Cntrl" -> ctype(ASCII.CNTRL); // Control characters 354 case "Digit" -> range('0', '9'); // Numeric characters 355 case "Graph" -> ctype(ASCII.GRAPH); // printable and visible 356 case "Lower" -> caseIns ? ctype(ASCII.ALPHA) 357 : range('a', 'z'); // Lower-case alphabetic 358 case "Print" -> range(0x20, 0x7E); // Printable characters 359 case "Punct" -> ctype(ASCII.PUNCT); // Punctuation characters 360 case "Space" -> ctype(ASCII.SPACE); // Space characters 361 case "Upper" -> caseIns ? ctype(ASCII.ALPHA) 362 : range('A', 'Z'); // Upper-case alphabetic 363 case "XDigit" -> ctype(ASCII.XDIGIT); // hexadecimal digits 364 365 // Java character properties, defined by methods in Character.java 366 case "javaLowerCase" -> caseIns ? c -> Character.isLowerCase(c) || 367 Character.isUpperCase(c) || 368 Character.isTitleCase(c) 369 : Character::isLowerCase; 370 case "javaUpperCase" -> caseIns ? c -> Character.isUpperCase(c) || 371 Character.isLowerCase(c) || 372 Character.isTitleCase(c) 373 : Character::isUpperCase; 374 case "javaAlphabetic" -> Character::isAlphabetic; 375 case "javaIdeographic" -> Character::isIdeographic; 376 case "javaTitleCase" -> caseIns ? c -> Character.isTitleCase(c) || 377 Character.isLowerCase(c) || 378 Character.isUpperCase(c) 379 : Character::isTitleCase; 380 case "javaDigit" -> Character::isDigit; 381 case "javaDefined" -> Character::isDefined; 382 case "javaLetter" -> Character::isLetter; 383 case "javaLetterOrDigit" -> Character::isLetterOrDigit; 384 case "javaJavaIdentifierStart" -> Character::isJavaIdentifierStart; 385 case "javaJavaIdentifierPart" -> Character::isJavaIdentifierPart; 386 case "javaUnicodeIdentifierStart" -> Character::isUnicodeIdentifierStart; 387 case "javaUnicodeIdentifierPart" -> Character::isUnicodeIdentifierPart; 388 case "javaIdentifierIgnorable" -> Character::isIdentifierIgnorable; 389 case "javaSpaceChar" -> Character::isSpaceChar; 390 case "javaWhitespace" -> Character::isWhitespace; 391 case "javaISOControl" -> Character::isISOControl; 392 case "javaMirrored" -> Character::isMirrored; 393 default -> null; 394 }; 395 } 396 397 private static CharPredicate category(final int typeMask) { 398 return ch -> (typeMask & (1 << Character.getType(ch))) != 0; 399 } 400 401 private static CharPredicate range(final int lower, final int upper) { 402 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper; 403 } 404 405 private static CharPredicate ctype(final int ctype) { 406 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype); 407 } 408 409 ///////////////////////////////////////////////////////////////////////////// 410 411 /** 412 * Posix ASCII variants, not in the lookup map 413 */ 414 static final BmpCharPredicate ASCII_DIGIT() { 415 return ch -> ch < 128 && ASCII.isDigit(ch); 416 } 417 static final BmpCharPredicate ASCII_WORD() { 418 return ch -> ch < 128 && ASCII.isWord(ch); 419 } 420 static final BmpCharPredicate ASCII_SPACE() { 421 return ch -> ch < 128 && ASCII.isSpace(ch); 422 } 423 424 } 425