1 /* 2 * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xerces.internal.impl.xpath.regex; 22 23 import java.text.CharacterIterator; 24 import java.util.Locale; 25 import java.util.Stack; 26 27 import com.sun.org.apache.xerces.internal.util.IntStack; 28 29 /** 30 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). 31 * This engine does not conform to the POSIX regular expression. 32 * 33 * <hr width="50%"> 34 * <h2>How to use</h2> 35 * 36 * <dl> 37 * <dt>A. Standard way 38 * <dd> 39 * <pre> 40 * RegularExpression re = new RegularExpression(<var>regex</var>); 41 * if (re.matches(text)) { ... } 42 * </pre> 43 * 44 * <dt>B. Capturing groups 45 * <dd> 46 * <pre> 47 * RegularExpression re = new RegularExpression(<var>regex</var>); 48 * Match match = new Match(); 49 * if (re.matches(text, match)) { 50 * ... // You can refer captured texts with methods of the <code>Match</code> class. 51 * } 52 * </pre> 53 * 54 * </dl> 55 * 56 * <h4>Case-insensitive matching</h4> 57 * <pre> 58 * RegularExpression re = new RegularExpression(<var>regex</var>, "i"); 59 * if (re.matches(text) >= 0) { ...} 60 * </pre> 61 * 62 * <h4>Options</h4> 63 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> 64 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. 65 * This <var>options</var> parameter consists of the following characters. 66 * </p> 67 * <dl> 68 * <dt><a name="I_OPTION"><code>"i"</code></a> 69 * <dd>This option indicates case-insensitive matching. 70 * <dt><a name="M_OPTION"><code>"m"</code></a> 71 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. 72 * <dt><a name="S_OPTION"><code>"s"</code></a> 73 * <dd class="REGEX"><kbd>.</kbd> matches any one character. 74 * <dt><a name="U_OPTION"><code>"u"</code></a> 75 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode. 76 * <dt><a name="W_OPTION"><code>"w"</code></a> 77 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of 78 * 'Unicode Regular Expression Guidelines' Revision 4. 79 * When "w" and "u" are specified at the same time, 80 * <kbd>\b \B \< \></kbd> are processed for the "w" option. 81 * <dt><a name="COMMA_OPTION"><code>","</code></a> 82 * <dd>The parser treats a comma in a character class as a range separator. 83 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. 84 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. 85 * 86 * <dt><a name="X_OPTION"><code>"X"</code></a> 87 * <dd class="REGEX"> 88 * By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>. 89 * The <code>match()</code> method does not do subsring matching 90 * but entire string matching. 91 * 92 * </dl> 93 * 94 * <hr width="50%"> 95 * <h3>Syntax</h3> 96 * <table border="1" bgcolor="#ddeeff"> 97 * <tr> 98 * <td> 99 * <h4>Differences from the Perl 5 regular expression</h4> 100 * <ul> 101 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) 102 * <li>Supports subtraction, union, and intersection operations for character classes. 103 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), 104 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, 105 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, 106 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, 107 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd> 108 * </ul> 109 * </td> 110 * </tr> 111 * </table> 112 * 113 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> 114 * <ul> 115 * <li>Character 116 * <dl> 117 * <dt class="REGEX"><kbd>.</kbd> (A period) 118 * <dd>Matches any one character except the following characters. 119 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), 120 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) 121 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. 122 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified, 123 * it matches any character including the above four characters. 124 * 125 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> 126 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), 127 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) 128 * 129 * <dt class="REGEX"><kbd>\c</kbd><var>C</var> 130 * <dd>Matches a control character. 131 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', 132 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. 133 * It matches a control character of which the character code is less than 134 * the character code of the <var>C</var> by 0x0040. 135 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), 136 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). 137 * 138 * <dt class="REGEX">a non-meta character 139 * <dd>Matches the character. 140 * 141 * <dt class="REGEX"><KBD>\</KBD> + a meta character 142 * <dd>Matches the meta character. 143 * 144 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> 145 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. 146 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and 147 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. 148 * 149 * <!-- 150 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> 151 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. 152 * --> 153 * 154 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> 155 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. 156 * 157 * <dt class="REGEX"><kbd>\g</kbd> 158 * <dd>Matches a grapheme. 159 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> 160 * 161 * <dt class="REGEX"><kbd>\X</kbd> 162 * <dd class="REGEX">Matches a combining character sequence. 163 * It is equivalent to <kbd>(?:\PM\pM*)</kbd> 164 * </dl> 165 * </li> 166 * 167 * <li>Character class 168 * <dl> 169 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) 170 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) 171 * <dd>Positive character class. It matches a character in ranges. 172 * <dd><var>R<sub>n</sub></var>: 173 * <ul> 174 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) 175 * <p>This range matches the character. 176 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> 177 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. 178 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, 179 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> 180 * <p>... 181 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> 182 * <p>These expressions specifies the same ranges as the following expressions. 183 * </ul> 184 * <p class="REGEX">Enumerated ranges are merged (union operation). 185 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> 186 * 187 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) 188 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) 189 * <dd>Negative character class. It matches a character not in ranges. 190 * 191 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> 192 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) 193 * <dd>Subtraction or union or intersection for character classes. 194 * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>. 195 * <dd>The result of this operations is a <u>positive character class</u> 196 * even if an expression includes any negative character classes. 197 * You have to take care on this in case-insensitive matching. 198 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, 199 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. 200 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because 201 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' 202 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. 203 * 204 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> 205 * <dd>Character class subtraction for the XML Schema. 206 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. 207 * 208 * <dt class="REGEX"><kbd>\d</kbd> 209 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. 210 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 211 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>. 212 * 213 * <dt class="REGEX"><kbd>\D</kbd> 214 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> 215 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 216 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>. 217 * 218 * <dt class="REGEX"><kbd>\s</kbd> 219 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> 220 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 221 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. 222 * 223 * <dt class="REGEX"><kbd>\S</kbd> 224 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> 225 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 226 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. 227 * 228 * <dt class="REGEX"><kbd>\w</kbd> 229 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> 230 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 231 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 232 * 233 * <dt class="REGEX"><kbd>\W</kbd> 234 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> 235 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 236 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 237 * 238 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> 239 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. 240 * The following names are available: 241 * <dl> 242 * <dt>Unicode General Categories: 243 * <dd><kbd> 244 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, 245 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, 246 * </kbd> 247 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) 248 * <dt>Unicode Blocks: 249 * <dd><kbd> 250 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, 251 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, 252 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, 253 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, 254 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, 255 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, 256 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, 257 * Miscellaneous Technical, Control Pictures, Optical Character Recognition, 258 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, 259 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, 260 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, 261 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, 262 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, 263 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, 264 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, 265 * Small Form Variants, Arabic Presentation Forms-B, Specials, 266 * Halfwidth and Fullwidth Forms 267 * </kbd> 268 * <dt>Others: 269 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) 270 * <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>) 271 * <dd><kbd>UNASSGINED</kbd> 272 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) 273 * </dl> 274 * 275 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> 276 * <dd>Matches one character not in the specified General Category or the specified Block. 277 * </dl> 278 * </li> 279 * 280 * <li>Selection and Quantifier 281 * <dl> 282 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> 283 * <dd>... 284 * 285 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> 286 * <dd>Matches 0 or more <var>X</var>. 287 * 288 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> 289 * <dd>Matches 1 or more <var>X</var>. 290 * 291 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> 292 * <dd>Matches 0 or 1 <var>X</var>. 293 * 294 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> 295 * <dd>Matches <var>number</var> times. 296 * 297 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> 298 * <dd>... 299 * 300 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> 301 * <dd>... 302 * 303 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> 304 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> 305 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> 306 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> 307 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> 308 * <dd>Non-greedy matching. 309 * </dl> 310 * </li> 311 * 312 * <li>Grouping, Capturing, and Back-reference 313 * <dl> 314 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> 315 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". 316 * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>", 317 * you have to write "<KBD>(?:foo)+</KBD>". 318 * 319 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> 320 * <dd>Grouping with capturing. 321 * It make a group and applications can know 322 * where in target text a group matched with methods of a <code>Match</code> instance 323 * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>. 324 * The 0th group means whole of this regular expression. 325 * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis. 326 * 327 * <p>For instance, a regular expression is 328 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" 329 * and target text is 330 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": 331 * <ul> 332 * <li><code>Match.getCapturedText(0)</code>: 333 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" 334 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" 335 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" 336 * </ul> 337 * 338 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> 339 * <dd> 340 * 341 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> 342 * <dd>Independent expression group. ................ 343 * 344 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 345 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 346 * <dd>............................ 347 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. 348 * Note that it can not contain 'u'. 349 * 350 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> 351 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> 352 * <dd>...... 353 * <dd>These expressions must be at the beginning of a group. 354 * </dl> 355 * </li> 356 * 357 * <li>Anchor 358 * <dl> 359 * <dt class="REGEX"><kbd>\A</kbd> 360 * <dd>Matches the beginnig of the text. 361 * 362 * <dt class="REGEX"><kbd>\Z</kbd> 363 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 364 * or CARRIAGE RETURN + LINE FEED at the end of the text. 365 * 366 * <dt class="REGEX"><kbd>\z</kbd> 367 * <dd>Matches the end of the text. 368 * 369 * <dt class="REGEX"><kbd>^</kbd> 370 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. 371 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 372 * it matches the beginning of the text, or after one of EOL characters ( 373 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), 374 * PARAGRAPH SEPARATOR (U+2029).) 375 * 376 * <dt class="REGEX"><kbd>$</kbd> 377 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 378 * or CARRIAGE RETURN + LINE FEED at the end of the text. 379 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 380 * it matches the end of the text, or before an EOL character. 381 * 382 * <dt class="REGEX"><kbd>\b</kbd> 383 * <dd>Matches word boundary. 384 * (See <a href="#W_OPTION">a "w" option</a>) 385 * 386 * <dt class="REGEX"><kbd>\B</kbd> 387 * <dd>Matches non word boundary. 388 * (See <a href="#W_OPTION">a "w" option</a>) 389 * 390 * <dt class="REGEX"><kbd>\<</kbd> 391 * <dd>Matches the beginning of a word. 392 * (See <a href="#W_OPTION">a "w" option</a>) 393 * 394 * <dt class="REGEX"><kbd>\></kbd> 395 * <dd>Matches the end of a word. 396 * (See <a href="#W_OPTION">a "w" option</a>) 397 * </dl> 398 * </li> 399 * <li>Lookahead and lookbehind 400 * <dl> 401 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> 402 * <dd>Lookahead. 403 * 404 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> 405 * <dd>Negative lookahead. 406 * 407 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> 408 * <dd>Lookbehind. 409 * <dd>(Note for text capturing......) 410 * 411 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> 412 * <dd>Negative lookbehind. 413 * </dl> 414 * </li> 415 * 416 * <li>Misc. 417 * <dl> 418 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, 419 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> 420 * <dd>...... 421 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> 422 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. 423 * You can not write comments in character classes and before quantifiers. 424 * </dl> 425 * </li> 426 * </ul> 427 * 428 * 429 * <hr width="50%"> 430 * <h3>BNF for the regular expression</h3> 431 * <pre> 432 * regex ::= ('(?' options ')')? term ('|' term)* 433 * term ::= factor+ 434 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? 435 * | '(?#' [^)]* ')' 436 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' 437 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 438 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' 439 * | '(?>' regex ')' | '(?' options ':' regex ')' 440 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' 441 * options ::= [imsw]* ('-' [imsw]+)? 442 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 443 * looks ::= '(?=' regex ')' | '(?!' regex ')' 444 * | '(?<=' regex ')' | '(?<!' regex ')' 445 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 446 * category-block ::= '\' [pP] category-symbol-1 447 * | ('\p{' | '\P{') (category-symbol | block-name 448 * | other-properties) '}' 449 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' 450 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' 451 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' 452 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' 453 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' 454 * | 'Sm' | 'Sc' | 'Sk' | 'So' 455 * block-name ::= (See above) 456 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' 457 * character-1 ::= (any character except meta-characters) 458 * 459 * char-class ::= '[' ranges ']' 460 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' 461 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ 462 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block 463 * | range-char | range-char '-' range-char 464 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 465 * code-point ::= '\x' hex-char hex-char 466 * | '\x{' hex-char+ '}' 467 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char 468 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char 469 * hex-char ::= [0-9a-fA-F] 470 * character-2 ::= (any character except \[]-,) 471 * </pre> 472 * 473 * <hr width="50%"> 474 * <h3>TODO</h3> 475 * <ul> 476 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> 477 * <ul> 478 * <li>2.4 Canonical Equivalents 479 * <li>Level 3 480 * </ul> 481 * <li>Parsing performance 482 * </ul> 483 * 484 * <hr width="50%"> 485 * 486 * @xerces.internal 487 * 488 * @author TAMURA Kent <kent@trl.ibm.co.jp> 489 */ 490 public class RegularExpression implements java.io.Serializable { 491 492 private static final long serialVersionUID = 6242499334195006401L; 493 494 static final boolean DEBUG = false; 495 496 /** 497 * Compiles a token tree into an operation flow. 498 */ compile(Token tok)499 private synchronized void compile(Token tok) { 500 if (this.operations != null) 501 return; 502 this.numberOfClosures = 0; 503 this.operations = this.compile(tok, null, false); 504 } 505 506 /** 507 * Converts a token to an operation. 508 */ compile(Token tok, Op next, boolean reverse)509 private Op compile(Token tok, Op next, boolean reverse) { 510 Op ret; 511 switch (tok.type) { 512 case Token.DOT: 513 ret = Op.createDot(); 514 ret.next = next; 515 break; 516 517 case Token.CHAR: 518 ret = Op.createChar(tok.getChar()); 519 ret.next = next; 520 break; 521 522 case Token.ANCHOR: 523 ret = Op.createAnchor(tok.getChar()); 524 ret.next = next; 525 break; 526 527 case Token.RANGE: 528 case Token.NRANGE: 529 ret = Op.createRange(tok); 530 ret.next = next; 531 break; 532 533 case Token.CONCAT: 534 ret = next; 535 if (!reverse) { 536 for (int i = tok.size()-1; i >= 0; i --) { 537 ret = compile(tok.getChild(i), ret, false); 538 } 539 } else { 540 for (int i = 0; i < tok.size(); i ++) { 541 ret = compile(tok.getChild(i), ret, true); 542 } 543 } 544 break; 545 546 case Token.UNION: 547 Op.UnionOp uni = Op.createUnion(tok.size()); 548 for (int i = 0; i < tok.size(); i ++) { 549 uni.addElement(compile(tok.getChild(i), next, reverse)); 550 } 551 ret = uni; // ret.next is null. 552 break; 553 554 case Token.CLOSURE: 555 case Token.NONGREEDYCLOSURE: 556 Token child = tok.getChild(0); 557 int min = tok.getMin(); 558 int max = tok.getMax(); 559 if (min >= 0 && min == max) { // {n} 560 ret = next; 561 for (int i = 0; i < min; i ++) { 562 ret = compile(child, ret, reverse); 563 } 564 break; 565 } 566 if (min > 0 && max > 0) 567 max -= min; 568 if (max > 0) { 569 // X{2,6} -> XX(X(X(XX?)?)?)? 570 ret = next; 571 for (int i = 0; i < max; i ++) { 572 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); 573 q.next = next; 574 q.setChild(compile(child, ret, reverse)); 575 ret = q; 576 } 577 } else { 578 Op.ChildOp op; 579 if (tok.type == Token.NONGREEDYCLOSURE) { 580 op = Op.createNonGreedyClosure(); 581 } else { // Token.CLOSURE 582 op = Op.createClosure(this.numberOfClosures++); 583 } 584 op.next = next; 585 op.setChild(compile(child, op, reverse)); 586 ret = op; 587 } 588 if (min > 0) { 589 for (int i = 0; i < min; i ++) { 590 ret = compile(child, ret, reverse); 591 } 592 } 593 break; 594 595 case Token.EMPTY: 596 ret = next; 597 break; 598 599 case Token.STRING: 600 ret = Op.createString(tok.getString()); 601 ret.next = next; 602 break; 603 604 case Token.BACKREFERENCE: 605 ret = Op.createBackReference(tok.getReferenceNumber()); 606 ret.next = next; 607 break; 608 609 case Token.PAREN: 610 if (tok.getParenNumber() == 0) { 611 ret = compile(tok.getChild(0), next, reverse); 612 } else if (reverse) { 613 next = Op.createCapture(tok.getParenNumber(), next); 614 next = compile(tok.getChild(0), next, reverse); 615 ret = Op.createCapture(-tok.getParenNumber(), next); 616 } else { 617 next = Op.createCapture(-tok.getParenNumber(), next); 618 next = compile(tok.getChild(0), next, reverse); 619 ret = Op.createCapture(tok.getParenNumber(), next); 620 } 621 break; 622 623 case Token.LOOKAHEAD: 624 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); 625 break; 626 case Token.NEGATIVELOOKAHEAD: 627 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); 628 break; 629 case Token.LOOKBEHIND: 630 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); 631 break; 632 case Token.NEGATIVELOOKBEHIND: 633 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); 634 break; 635 636 case Token.INDEPENDENT: 637 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); 638 break; 639 640 case Token.MODIFIERGROUP: 641 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), 642 ((Token.ModifierToken)tok).getOptions(), 643 ((Token.ModifierToken)tok).getOptionsMask()); 644 break; 645 646 case Token.CONDITION: 647 Token.ConditionToken ctok = (Token.ConditionToken)tok; 648 int ref = ctok.refNumber; 649 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); 650 Op yes = compile(ctok.yes, next, reverse); 651 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); 652 ret = Op.createCondition(next, ref, condition, yes, no); 653 break; 654 655 default: 656 throw new RuntimeException("Unknown token type: "+tok.type); 657 } // switch (tok.type) 658 return ret; 659 } 660 661 662 //Public 663 664 /** 665 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 666 * 667 * @return true if the target is matched to this regular expression. 668 */ matches(char[] target)669 public boolean matches(char[] target) { 670 return this.matches(target, 0, target .length , (Match)null); 671 } 672 673 /** 674 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 675 * in specified range or not. 676 * 677 * @param start Start offset of the range. 678 * @param end End offset +1 of the range. 679 * @return true if the target is matched to this regular expression. 680 */ matches(char[] target, int start, int end)681 public boolean matches(char[] target, int start, int end) { 682 return this.matches(target, start, end, (Match)null); 683 } 684 685 /** 686 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 687 * 688 * @param match A Match instance for storing matching result. 689 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 690 */ matches(char[] target, Match match)691 public boolean matches(char[] target, Match match) { 692 return this.matches(target, 0, target .length , match); 693 } 694 695 696 /** 697 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 698 * in specified range or not. 699 * 700 * @param start Start offset of the range. 701 * @param end End offset +1 of the range. 702 * @param match A Match instance for storing matching result. 703 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 704 */ matches(char[] target, int start, int end, Match match)705 public boolean matches(char[] target, int start, int end, Match match) { 706 707 synchronized (this) { 708 if (this.operations == null) 709 this.prepare(); 710 if (this.context == null) 711 this.context = new Context(); 712 } 713 Context con = null; 714 synchronized (this.context) { 715 con = this.context.inuse ? new Context() : this.context; 716 con.reset(target, start, end, this.numberOfClosures); 717 } 718 if (match != null) { 719 match.setNumberOfGroups(this.nofparen); 720 match.setSource(target); 721 } else if (this.hasBackReferences) { 722 match = new Match(); 723 match.setNumberOfGroups(this.nofparen); 724 // Need not to call setSource() because 725 // a caller can not access this match instance. 726 } 727 con.match = match; 728 729 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 730 int matchEnd = this. match(con, this.operations, con.start, 1, this.options); 731 //System.err.println("DEBUG: matchEnd="+matchEnd); 732 if (matchEnd == con.limit) { 733 if (con.match != null) { 734 con.match.setBeginning(0, con.start); 735 con.match.setEnd(0, matchEnd); 736 } 737 con.setInUse(false); 738 return true; 739 } 740 return false; 741 } 742 743 /* 744 * The pattern has only fixed string. 745 * The engine uses Boyer-Moore. 746 */ 747 if (this.fixedStringOnly) { 748 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 749 int o = this.fixedStringTable.matches(target, con.start, con.limit); 750 if (o >= 0) { 751 if (con.match != null) { 752 con.match.setBeginning(0, o); 753 con.match.setEnd(0, o+this.fixedString.length()); 754 } 755 con.setInUse(false); 756 return true; 757 } 758 con.setInUse(false); 759 return false; 760 } 761 762 /* 763 * The pattern contains a fixed string. 764 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 765 * If not, it return with false. 766 */ 767 if (this.fixedString != null) { 768 int o = this.fixedStringTable.matches(target, con.start, con.limit); 769 if (o < 0) { 770 //System.err.println("Non-match in fixed-string search."); 771 con.setInUse(false); 772 return false; 773 } 774 } 775 776 int limit = con.limit-this.minlength; 777 int matchStart; 778 int matchEnd = -1; 779 780 /* 781 * Checks whether the expression starts with ".*". 782 */ 783 if (this.operations != null 784 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 785 if (isSet(this.options, SINGLE_LINE)) { 786 matchStart = con.start; 787 matchEnd = this. match(con, this.operations, con.start, 1, this.options); 788 } else { 789 boolean previousIsEOL = true; 790 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 791 int ch = target [ matchStart ] ; 792 if (isEOLChar(ch)) { 793 previousIsEOL = true; 794 } else { 795 if (previousIsEOL) { 796 if (0 <= (matchEnd = this. match(con, this.operations, 797 matchStart, 1, this.options))) 798 break; 799 } 800 previousIsEOL = false; 801 } 802 } 803 } 804 } 805 806 /* 807 * Optimization against the first character. 808 */ 809 else if (this.firstChar != null) { 810 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 811 RangeToken range = this.firstChar; 812 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 813 int ch = target [matchStart] ; 814 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 815 ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]); 816 } 817 if (!range.match(ch)) { 818 continue; 819 } 820 if (0 <= (matchEnd = this. match(con, this.operations, 821 matchStart, 1, this.options))) { 822 break; 823 } 824 } 825 } 826 827 /* 828 * Straightforward matching. 829 */ 830 else { 831 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 832 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) 833 break; 834 } 835 } 836 837 if (matchEnd >= 0) { 838 if (con.match != null) { 839 con.match.setBeginning(0, matchStart); 840 con.match.setEnd(0, matchEnd); 841 } 842 con.setInUse(false); 843 return true; 844 } else { 845 con.setInUse(false); 846 return false; 847 } 848 } 849 850 /** 851 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 852 * 853 * @return true if the target is matched to this regular expression. 854 */ matches(String target)855 public boolean matches(String target) { 856 return this.matches(target, 0, target .length() , (Match)null); 857 } 858 859 /** 860 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 861 * in specified range or not. 862 * 863 * @param start Start offset of the range. 864 * @param end End offset +1 of the range. 865 * @return true if the target is matched to this regular expression. 866 */ matches(String target, int start, int end)867 public boolean matches(String target, int start, int end) { 868 return this.matches(target, start, end, (Match)null); 869 } 870 871 /** 872 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 873 * 874 * @param match A Match instance for storing matching result. 875 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 876 */ matches(String target, Match match)877 public boolean matches(String target, Match match) { 878 return this.matches(target, 0, target .length() , match); 879 } 880 881 /** 882 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 883 * in specified range or not. 884 * 885 * @param start Start offset of the range. 886 * @param end End offset +1 of the range. 887 * @param match A Match instance for storing matching result. 888 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 889 */ matches(String target, int start, int end, Match match)890 public boolean matches(String target, int start, int end, Match match) { 891 892 synchronized (this) { 893 if (this.operations == null) 894 this.prepare(); 895 if (this.context == null) 896 this.context = new Context(); 897 } 898 Context con = null; 899 synchronized (this.context) { 900 con = this.context.inuse ? new Context() : this.context; 901 con.reset(target, start, end, this.numberOfClosures); 902 } 903 if (match != null) { 904 match.setNumberOfGroups(this.nofparen); 905 match.setSource(target); 906 } else if (this.hasBackReferences) { 907 match = new Match(); 908 match.setNumberOfGroups(this.nofparen); 909 // Need not to call setSource() because 910 // a caller can not access this match instance. 911 } 912 con.match = match; 913 914 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 915 if (DEBUG) { 916 System.err.println("target string="+target); 917 } 918 int matchEnd = this. match(con, this.operations, con.start, 1, this.options); 919 if (DEBUG) { 920 System.err.println("matchEnd="+matchEnd); 921 System.err.println("con.limit="+con.limit); 922 } 923 if (matchEnd == con.limit) { 924 if (con.match != null) { 925 con.match.setBeginning(0, con.start); 926 con.match.setEnd(0, matchEnd); 927 } 928 con.setInUse(false); 929 return true; 930 } 931 return false; 932 } 933 934 /* 935 * The pattern has only fixed string. 936 * The engine uses Boyer-Moore. 937 */ 938 if (this.fixedStringOnly) { 939 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 940 int o = this.fixedStringTable.matches(target, con.start, con.limit); 941 if (o >= 0) { 942 if (con.match != null) { 943 con.match.setBeginning(0, o); 944 con.match.setEnd(0, o+this.fixedString.length()); 945 } 946 con.setInUse(false); 947 return true; 948 } 949 con.setInUse(false); 950 return false; 951 } 952 953 /* 954 * The pattern contains a fixed string. 955 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 956 * If not, it return with false. 957 */ 958 if (this.fixedString != null) { 959 int o = this.fixedStringTable.matches(target, con.start, con.limit); 960 if (o < 0) { 961 //System.err.println("Non-match in fixed-string search."); 962 con.setInUse(false); 963 return false; 964 } 965 } 966 967 int limit = con.limit-this.minlength; 968 int matchStart; 969 int matchEnd = -1; 970 971 /* 972 * Checks whether the expression starts with ".*". 973 */ 974 if (this.operations != null 975 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 976 if (isSet(this.options, SINGLE_LINE)) { 977 matchStart = con.start; 978 matchEnd = this.match(con, this.operations, con.start, 1, this.options); 979 } else { 980 boolean previousIsEOL = true; 981 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 982 int ch = target .charAt( matchStart ) ; 983 if (isEOLChar(ch)) { 984 previousIsEOL = true; 985 } else { 986 if (previousIsEOL) { 987 if (0 <= (matchEnd = this.match(con, this.operations, 988 matchStart, 1, this.options))) 989 break; 990 } 991 previousIsEOL = false; 992 } 993 } 994 } 995 } 996 997 /* 998 * Optimization against the first character. 999 */ 1000 else if (this.firstChar != null) { 1001 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1002 RangeToken range = this.firstChar; 1003 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1004 int ch = target .charAt( matchStart ) ; 1005 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1006 ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1)); 1007 } 1008 if (!range.match(ch)) { 1009 continue; 1010 } 1011 if (0 <= (matchEnd = this.match(con, this.operations, 1012 matchStart, 1, this.options))) { 1013 break; 1014 } 1015 } 1016 } 1017 1018 /* 1019 * Straightforward matching. 1020 */ 1021 else { 1022 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1023 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options))) 1024 break; 1025 } 1026 } 1027 1028 if (matchEnd >= 0) { 1029 if (con.match != null) { 1030 con.match.setBeginning(0, matchStart); 1031 con.match.setEnd(0, matchEnd); 1032 } 1033 con.setInUse(false); 1034 return true; 1035 } else { 1036 con.setInUse(false); 1037 return false; 1038 } 1039 } 1040 1041 /** 1042 * @return -1 when not match; offset of the end of matched string when match. 1043 */ 1044 @SuppressWarnings("fallthrough") match(Context con, Op op, int offset, int dx, int opts)1045 private int match(Context con, Op op, int offset, int dx, int opts) { 1046 final ExpressionTarget target = con.target; 1047 final Stack<Op> opStack = new Stack<>(); 1048 final IntStack dataStack = new IntStack(); 1049 final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE); 1050 int retValue = -1; 1051 boolean returned = false; 1052 1053 for (;;) { 1054 if (op == null || offset > con.limit || offset < con.start) { 1055 if (op == null) { 1056 retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 1057 } 1058 else { 1059 retValue = -1; 1060 } 1061 returned = true; 1062 } 1063 else { 1064 retValue = -1; 1065 // dx value is either 1 or -1 1066 switch (op.type) { 1067 case Op.CHAR: 1068 { 1069 final int o1 = (dx > 0) ? offset : offset -1; 1070 if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) { 1071 returned = true; 1072 break; 1073 } 1074 offset += dx; 1075 op = op.next; 1076 } 1077 break; 1078 1079 case Op.DOT: 1080 { 1081 int o1 = (dx > 0) ? offset : offset - 1; 1082 if (o1 >= con.limit || o1 < 0) { 1083 returned = true; 1084 break; 1085 } 1086 if (isSet(opts, SINGLE_LINE)) { 1087 if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) { 1088 o1 += dx; 1089 } 1090 } 1091 else { 1092 int ch = target.charAt(o1); 1093 if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) { 1094 o1 += dx; 1095 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); 1096 } 1097 if (isEOLChar(ch)) { 1098 returned = true; 1099 break; 1100 } 1101 } 1102 offset = (dx > 0) ? o1 + 1 : o1; 1103 op = op.next; 1104 } 1105 break; 1106 1107 case Op.RANGE: 1108 case Op.NRANGE: 1109 { 1110 int o1 = (dx > 0) ? offset : offset -1; 1111 if (o1 >= con.limit || o1 < 0) { 1112 returned = true; 1113 break; 1114 } 1115 int ch = target.charAt(offset); 1116 if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) { 1117 o1 += dx; 1118 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1)); 1119 } 1120 final RangeToken tok = op.getToken(); 1121 if (!tok.match(ch)) { 1122 returned = true; 1123 break; 1124 } 1125 offset = (dx > 0) ? o1+1 : o1; 1126 op = op.next; 1127 } 1128 break; 1129 1130 case Op.ANCHOR: 1131 { 1132 if (!matchAnchor(target, op, con, offset, opts)) { 1133 returned = true; 1134 break; 1135 } 1136 op = op.next; 1137 } 1138 break; 1139 1140 case Op.BACKREFERENCE: 1141 { 1142 int refno = op.getData(); 1143 if (refno <= 0 || refno >= this.nofparen) { 1144 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1145 } 1146 if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) { 1147 returned = true; 1148 break; 1149 } 1150 int o2 = con.match.getBeginning(refno); 1151 int literallen = con.match.getEnd(refno)-o2; 1152 if (dx > 0) { 1153 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) { 1154 returned = true; 1155 break; 1156 } 1157 offset += literallen; 1158 } 1159 else { 1160 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) { 1161 returned = true; 1162 break; 1163 } 1164 offset -= literallen; 1165 } 1166 op = op.next; 1167 } 1168 break; 1169 1170 case Op.STRING: 1171 { 1172 String literal = op.getString(); 1173 int literallen = literal.length(); 1174 if (dx > 0) { 1175 if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) { 1176 returned = true; 1177 break; 1178 } 1179 offset += literallen; 1180 } 1181 else { 1182 if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) { 1183 returned = true; 1184 break; 1185 } 1186 offset -= literallen; 1187 } 1188 op = op.next; 1189 } 1190 break; 1191 1192 case Op.CLOSURE: 1193 { 1194 // Saves current position to avoid zero-width repeats. 1195 final int id = op.getData(); 1196 if (con.closureContexts[id].contains(offset)) { 1197 returned = true; 1198 break; 1199 } 1200 1201 con.closureContexts[id].addOffset(offset); 1202 } 1203 // fall through 1204 1205 case Op.QUESTION: 1206 { 1207 opStack.push(op); 1208 dataStack.push(offset); 1209 op = op.getChild(); 1210 } 1211 break; 1212 1213 case Op.NONGREEDYCLOSURE: 1214 case Op.NONGREEDYQUESTION: 1215 { 1216 opStack.push(op); 1217 dataStack.push(offset); 1218 op = op.next; 1219 } 1220 break; 1221 1222 case Op.UNION: 1223 if (op.size() == 0) { 1224 returned = true; 1225 } 1226 else { 1227 opStack.push(op); 1228 dataStack.push(0); 1229 dataStack.push(offset); 1230 op = op.elementAt(0); 1231 } 1232 break; 1233 1234 case Op.CAPTURE: 1235 { 1236 final int refno = op.getData(); 1237 if (con.match != null) { 1238 if (refno > 0) { 1239 dataStack.push(con.match.getBeginning(refno)); 1240 con.match.setBeginning(refno, offset); 1241 } 1242 else { 1243 final int index = -refno; 1244 dataStack.push(con.match.getEnd(index)); 1245 con.match.setEnd(index, offset); 1246 } 1247 opStack.push(op); 1248 dataStack.push(offset); 1249 } 1250 op = op.next; 1251 } 1252 break; 1253 1254 case Op.LOOKAHEAD: 1255 case Op.NEGATIVELOOKAHEAD: 1256 case Op.LOOKBEHIND: 1257 case Op.NEGATIVELOOKBEHIND: 1258 { 1259 opStack.push(op); 1260 dataStack.push(dx); 1261 dataStack.push(offset); 1262 dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1; 1263 op = op.getChild(); 1264 } 1265 break; 1266 1267 case Op.INDEPENDENT: 1268 { 1269 opStack.push(op); 1270 dataStack.push(offset); 1271 op = op.getChild(); 1272 } 1273 break; 1274 1275 case Op.MODIFIER: 1276 { 1277 int localopts = opts; 1278 localopts |= op.getData(); 1279 localopts &= ~op.getData2(); 1280 opStack.push(op); 1281 dataStack.push(opts); 1282 dataStack.push(offset); 1283 opts = localopts; 1284 op = op.getChild(); 1285 } 1286 break; 1287 1288 case Op.CONDITION: 1289 { 1290 Op.ConditionOp cop = (Op.ConditionOp)op; 1291 if (cop.refNumber > 0) { 1292 if (cop.refNumber >= this.nofparen) { 1293 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1294 } 1295 if (con.match.getBeginning(cop.refNumber) >= 0 1296 && con.match.getEnd(cop.refNumber) >= 0) { 1297 op = cop.yes; 1298 } 1299 else if (cop.no != null) { 1300 op = cop.no; 1301 } 1302 else { 1303 op = cop.next; 1304 } 1305 } 1306 else { 1307 opStack.push(op); 1308 dataStack.push(offset); 1309 op = cop.condition; 1310 } 1311 } 1312 break; 1313 1314 default: 1315 throw new RuntimeException("Unknown operation type: " + op.type); 1316 } 1317 } 1318 1319 // handle recursive operations 1320 while (returned) { 1321 // exhausted all the operations 1322 if (opStack.isEmpty()) { 1323 return retValue; 1324 } 1325 1326 op = opStack.pop(); 1327 offset = dataStack.pop(); 1328 1329 switch (op.type) { 1330 case Op.CLOSURE: 1331 case Op.QUESTION: 1332 if (retValue < 0) { 1333 op = op.next; 1334 returned = false; 1335 } 1336 break; 1337 1338 case Op.NONGREEDYCLOSURE: 1339 case Op.NONGREEDYQUESTION: 1340 if (retValue < 0) { 1341 op = op.getChild(); 1342 returned = false; 1343 } 1344 break; 1345 1346 case Op.UNION: 1347 { 1348 int unionIndex = dataStack.pop(); 1349 if (DEBUG) { 1350 System.err.println("UNION: "+unionIndex+", ret="+retValue); 1351 } 1352 1353 if (retValue < 0) { 1354 if (++unionIndex < op.size()) { 1355 opStack.push(op); 1356 dataStack.push(unionIndex); 1357 dataStack.push(offset); 1358 op = op.elementAt(unionIndex); 1359 returned = false; 1360 } 1361 else { 1362 retValue = -1; 1363 } 1364 } 1365 } 1366 break; 1367 1368 case Op.CAPTURE: 1369 final int refno = op.getData(); 1370 final int saved = dataStack.pop(); 1371 if (retValue < 0) { 1372 if (refno > 0) { 1373 con.match.setBeginning(refno, saved); 1374 } 1375 else { 1376 con.match.setEnd(-refno, saved); 1377 } 1378 } 1379 break; 1380 1381 case Op.LOOKAHEAD: 1382 case Op.LOOKBEHIND: 1383 { 1384 dx = dataStack.pop(); 1385 if (0 <= retValue) { 1386 op = op.next; 1387 returned = false; 1388 } 1389 retValue = -1; 1390 } 1391 break; 1392 1393 case Op.NEGATIVELOOKAHEAD: 1394 case Op.NEGATIVELOOKBEHIND: 1395 { 1396 dx = dataStack.pop(); 1397 if (0 > retValue) { 1398 op = op.next; 1399 returned = false; 1400 } 1401 retValue = -1; 1402 } 1403 break; 1404 1405 case Op.MODIFIER: 1406 opts = dataStack.pop(); 1407 // fall through 1408 1409 case Op.INDEPENDENT: 1410 if (retValue >= 0) { 1411 offset = retValue; 1412 op = op.next; 1413 returned = false; 1414 } 1415 break; 1416 1417 case Op.CONDITION: 1418 { 1419 final Op.ConditionOp cop = (Op.ConditionOp)op; 1420 if (0 <= retValue) { 1421 op = cop.yes; 1422 } 1423 else if (cop.no != null) { 1424 op = cop.no; 1425 } 1426 else { 1427 op = cop.next; 1428 } 1429 } 1430 returned = false; 1431 break; 1432 1433 default: 1434 break; 1435 } 1436 } 1437 } 1438 } 1439 matchChar(int ch, int other, boolean ignoreCase)1440 private boolean matchChar(int ch, int other, boolean ignoreCase) { 1441 return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other; 1442 } 1443 matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts)1444 boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) { 1445 boolean go = false; 1446 switch (op.getData()) { 1447 case '^': 1448 if (isSet(opts, MULTIPLE_LINES)) { 1449 if (!(offset == con.start 1450 || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1)))) 1451 return false; 1452 } else { 1453 if (offset != con.start) 1454 return false; 1455 } 1456 break; 1457 1458 case '@': // Internal use only. 1459 // The @ always matches line beginnings. 1460 if (!(offset == con.start 1461 || offset > con.start && isEOLChar(target.charAt(offset-1)))) 1462 return false; 1463 break; 1464 1465 case '$': 1466 if (isSet(opts, MULTIPLE_LINES)) { 1467 if (!(offset == con.limit 1468 || offset < con.limit && isEOLChar(target.charAt(offset)))) 1469 return false; 1470 } else { 1471 if (!(offset == con.limit 1472 || offset+1 == con.limit && isEOLChar(target.charAt(offset)) 1473 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN 1474 && target.charAt(offset+1) == LINE_FEED)) 1475 return false; 1476 } 1477 break; 1478 1479 case 'A': 1480 if (offset != con.start) return false; 1481 break; 1482 1483 case 'Z': 1484 if (!(offset == con.limit 1485 || offset+1 == con.limit && isEOLChar(target.charAt(offset)) 1486 || offset+2 == con.limit && target.charAt(offset) == CARRIAGE_RETURN 1487 && target.charAt(offset+1) == LINE_FEED)) 1488 return false; 1489 break; 1490 1491 case 'z': 1492 if (offset != con.limit) return false; 1493 break; 1494 1495 case 'b': 1496 if (con.length == 0) 1497 return false; 1498 { 1499 int after = getWordType(target, con.start, con.limit, offset, opts); 1500 if (after == WT_IGNORE) return false; 1501 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1502 if (after == before) return false; 1503 } 1504 break; 1505 1506 case 'B': 1507 if (con.length == 0) 1508 go = true; 1509 else { 1510 int after = getWordType(target, con.start, con.limit, offset, opts); 1511 go = after == WT_IGNORE 1512 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1513 } 1514 if (!go) return false; 1515 break; 1516 1517 case '<': 1518 if (con.length == 0 || offset == con.limit) return false; 1519 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1520 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1521 return false; 1522 break; 1523 1524 case '>': 1525 if (con.length == 0 || offset == con.start) return false; 1526 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 1527 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1528 return false; 1529 break; 1530 } // switch anchor type 1531 1532 return true; 1533 } 1534 getPreviousWordType(ExpressionTarget target, int begin, int end, int offset, int opts)1535 private static final int getPreviousWordType(ExpressionTarget target, int begin, int end, 1536 int offset, int opts) { 1537 int ret = getWordType(target, begin, end, --offset, opts); 1538 while (ret == WT_IGNORE) 1539 ret = getWordType(target, begin, end, --offset, opts); 1540 return ret; 1541 } 1542 getWordType(ExpressionTarget target, int begin, int end, int offset, int opts)1543 private static final int getWordType(ExpressionTarget target, int begin, int end, 1544 int offset, int opts) { 1545 if (offset < begin || offset >= end) return WT_OTHER; 1546 return getWordType0(target.charAt(offset) , opts); 1547 } 1548 1549 1550 /** 1551 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1552 * 1553 * @return true if the target is matched to this regular expression. 1554 */ matches(CharacterIterator target)1555 public boolean matches(CharacterIterator target) { 1556 return this.matches(target, (Match)null); 1557 } 1558 1559 1560 /** 1561 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1562 * 1563 * @param match A Match instance for storing matching result. 1564 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 1565 */ matches(CharacterIterator target, Match match)1566 public boolean matches(CharacterIterator target, Match match) { 1567 int start = target.getBeginIndex(); 1568 int end = target.getEndIndex(); 1569 1570 1571 1572 synchronized (this) { 1573 if (this.operations == null) 1574 this.prepare(); 1575 if (this.context == null) 1576 this.context = new Context(); 1577 } 1578 Context con = null; 1579 synchronized (this.context) { 1580 con = this.context.inuse ? new Context() : this.context; 1581 con.reset(target, start, end, this.numberOfClosures); 1582 } 1583 if (match != null) { 1584 match.setNumberOfGroups(this.nofparen); 1585 match.setSource(target); 1586 } else if (this.hasBackReferences) { 1587 match = new Match(); 1588 match.setNumberOfGroups(this.nofparen); 1589 // Need not to call setSource() because 1590 // a caller can not access this match instance. 1591 } 1592 con.match = match; 1593 1594 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 1595 int matchEnd = this.match(con, this.operations, con.start, 1, this.options); 1596 //System.err.println("DEBUG: matchEnd="+matchEnd); 1597 if (matchEnd == con.limit) { 1598 if (con.match != null) { 1599 con.match.setBeginning(0, con.start); 1600 con.match.setEnd(0, matchEnd); 1601 } 1602 con.setInUse(false); 1603 return true; 1604 } 1605 return false; 1606 } 1607 1608 /* 1609 * The pattern has only fixed string. 1610 * The engine uses Boyer-Moore. 1611 */ 1612 if (this.fixedStringOnly) { 1613 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 1614 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1615 if (o >= 0) { 1616 if (con.match != null) { 1617 con.match.setBeginning(0, o); 1618 con.match.setEnd(0, o+this.fixedString.length()); 1619 } 1620 con.setInUse(false); 1621 return true; 1622 } 1623 con.setInUse(false); 1624 return false; 1625 } 1626 1627 /* 1628 * The pattern contains a fixed string. 1629 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 1630 * If not, it return with false. 1631 */ 1632 if (this.fixedString != null) { 1633 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1634 if (o < 0) { 1635 //System.err.println("Non-match in fixed-string search."); 1636 con.setInUse(false); 1637 return false; 1638 } 1639 } 1640 1641 int limit = con.limit-this.minlength; 1642 int matchStart; 1643 int matchEnd = -1; 1644 1645 /* 1646 * Checks whether the expression starts with ".*". 1647 */ 1648 if (this.operations != null 1649 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 1650 if (isSet(this.options, SINGLE_LINE)) { 1651 matchStart = con.start; 1652 matchEnd = this.match(con, this.operations, con.start, 1, this.options); 1653 } else { 1654 boolean previousIsEOL = true; 1655 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1656 int ch = target .setIndex( matchStart ) ; 1657 if (isEOLChar(ch)) { 1658 previousIsEOL = true; 1659 } else { 1660 if (previousIsEOL) { 1661 if (0 <= (matchEnd = this.match(con, this.operations, 1662 matchStart, 1, this.options))) 1663 break; 1664 } 1665 previousIsEOL = false; 1666 } 1667 } 1668 } 1669 } 1670 1671 /* 1672 * Optimization against the first character. 1673 */ 1674 else if (this.firstChar != null) { 1675 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1676 RangeToken range = this.firstChar; 1677 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1678 int ch = target .setIndex( matchStart ) ; 1679 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1680 ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1)); 1681 } 1682 if (!range.match(ch)) { 1683 continue; 1684 } 1685 if (0 <= (matchEnd = this.match(con, this.operations, 1686 matchStart, 1, this.options))) { 1687 break; 1688 } 1689 } 1690 } 1691 1692 /* 1693 * Straightforward matching. 1694 */ 1695 else { 1696 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1697 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options))) 1698 break; 1699 } 1700 } 1701 1702 if (matchEnd >= 0) { 1703 if (con.match != null) { 1704 con.match.setBeginning(0, matchStart); 1705 con.match.setEnd(0, matchEnd); 1706 } 1707 con.setInUse(false); 1708 return true; 1709 } else { 1710 con.setInUse(false); 1711 return false; 1712 } 1713 } 1714 1715 // ================================================================ 1716 1717 /** 1718 * A regular expression. 1719 * @serial 1720 */ 1721 String regex; 1722 /** 1723 * @serial 1724 */ 1725 int options; 1726 1727 /** 1728 * The number of parenthesis in the regular expression. 1729 * @serial 1730 */ 1731 int nofparen; 1732 /** 1733 * Internal representation of the regular expression. 1734 * @serial 1735 */ 1736 Token tokentree; 1737 1738 boolean hasBackReferences = false; 1739 1740 transient int minlength; 1741 transient Op operations = null; 1742 transient int numberOfClosures; 1743 transient Context context = null; 1744 transient RangeToken firstChar = null; 1745 1746 transient String fixedString = null; 1747 transient int fixedStringOptions; 1748 transient BMPattern fixedStringTable = null; 1749 transient boolean fixedStringOnly = false; 1750 1751 static abstract class ExpressionTarget { charAt(int index)1752 abstract char charAt(int index); regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1753 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen); regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1754 abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen); 1755 } 1756 1757 static final class StringTarget extends ExpressionTarget { 1758 1759 private String target; 1760 StringTarget(String target)1761 StringTarget(String target) { 1762 this.target = target; 1763 } 1764 resetTarget(String target)1765 final void resetTarget(String target) { 1766 this.target = target; 1767 } 1768 charAt(int index)1769 final char charAt(int index) { 1770 return target.charAt(index); 1771 } 1772 regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1773 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1774 String part, int partlen) { 1775 if (limit-offset < partlen) { 1776 return false; 1777 } 1778 return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen); 1779 } 1780 regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1781 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1782 int offset2, int partlen) { 1783 if (limit-offset < partlen) { 1784 return false; 1785 } 1786 return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen) 1787 : target.regionMatches(offset, target, offset2, partlen); 1788 } 1789 } 1790 1791 static final class CharArrayTarget extends ExpressionTarget { 1792 1793 char[] target; 1794 CharArrayTarget(char[] target)1795 CharArrayTarget(char[] target) { 1796 this.target = target; 1797 } 1798 resetTarget(char[] target)1799 final void resetTarget(char[] target) { 1800 this.target = target; 1801 } 1802 charAt(int index)1803 char charAt(int index) { 1804 return target[index]; 1805 } 1806 regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1807 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1808 String part, int partlen) { 1809 if (offset < 0 || limit-offset < partlen) { 1810 return false; 1811 } 1812 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) 1813 : regionMatches(offset, limit, part, partlen); 1814 } 1815 regionMatches(int offset, int limit, String part, int partlen)1816 private final boolean regionMatches(int offset, int limit, String part, int partlen) { 1817 int i = 0; 1818 while (partlen-- > 0) { 1819 if (target[offset++] != part.charAt(i++)) { 1820 return false; 1821 } 1822 } 1823 return true; 1824 } 1825 regionMatchesIgnoreCase(int offset, int limit, String part, int partlen)1826 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { 1827 int i = 0; 1828 while (partlen-- > 0) { 1829 final char ch1 = target[offset++] ; 1830 final char ch2 = part.charAt(i++); 1831 if (ch1 == ch2) { 1832 continue; 1833 } 1834 final char uch1 = Character.toUpperCase(ch1); 1835 final char uch2 = Character.toUpperCase(ch2); 1836 if (uch1 == uch2) { 1837 continue; 1838 } 1839 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1840 return false; 1841 } 1842 } 1843 return true; 1844 } 1845 regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1846 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { 1847 if (offset < 0 || limit-offset < partlen) { 1848 return false; 1849 } 1850 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) 1851 : regionMatches(offset, limit, offset2, partlen); 1852 } 1853 regionMatches(int offset, int limit, int offset2, int partlen)1854 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { 1855 int i = offset2; 1856 while (partlen-- > 0) { 1857 if ( target [ offset++ ] != target [ i++ ] ) 1858 return false; 1859 } 1860 return true; 1861 } 1862 regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen)1863 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { 1864 int i = offset2; 1865 while (partlen-- > 0) { 1866 final char ch1 = target[offset++] ; 1867 final char ch2 = target[i++] ; 1868 if (ch1 == ch2) { 1869 continue; 1870 } 1871 final char uch1 = Character.toUpperCase(ch1); 1872 final char uch2 = Character.toUpperCase(ch2); 1873 if (uch1 == uch2) { 1874 continue; 1875 } 1876 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1877 return false; 1878 } 1879 } 1880 return true; 1881 } 1882 } 1883 1884 static final class CharacterIteratorTarget extends ExpressionTarget { 1885 CharacterIterator target; 1886 CharacterIteratorTarget(CharacterIterator target)1887 CharacterIteratorTarget(CharacterIterator target) { 1888 this.target = target; 1889 } 1890 resetTarget(CharacterIterator target)1891 final void resetTarget(CharacterIterator target) { 1892 this.target = target; 1893 } 1894 charAt(int index)1895 final char charAt(int index) { 1896 return target.setIndex(index); 1897 } 1898 regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1899 final boolean regionMatches(boolean ignoreCase, int offset, int limit, 1900 String part, int partlen) { 1901 if (offset < 0 || limit-offset < partlen) { 1902 return false; 1903 } 1904 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen) 1905 : regionMatches(offset, limit, part, partlen); 1906 } 1907 regionMatches(int offset, int limit, String part, int partlen)1908 private final boolean regionMatches(int offset, int limit, String part, int partlen) { 1909 int i = 0; 1910 while (partlen-- > 0) { 1911 if (target.setIndex(offset++) != part.charAt(i++)) { 1912 return false; 1913 } 1914 } 1915 return true; 1916 } 1917 regionMatchesIgnoreCase(int offset, int limit, String part, int partlen)1918 private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) { 1919 int i = 0; 1920 while (partlen-- > 0) { 1921 final char ch1 = target.setIndex(offset++) ; 1922 final char ch2 = part.charAt(i++); 1923 if (ch1 == ch2) { 1924 continue; 1925 } 1926 final char uch1 = Character.toUpperCase(ch1); 1927 final char uch2 = Character.toUpperCase(ch2); 1928 if (uch1 == uch2) { 1929 continue; 1930 } 1931 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1932 return false; 1933 } 1934 } 1935 return true; 1936 } 1937 regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1938 final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) { 1939 if (offset < 0 || limit-offset < partlen) { 1940 return false; 1941 } 1942 return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen) 1943 : regionMatches(offset, limit, offset2, partlen); 1944 } 1945 regionMatches(int offset, int limit, int offset2, int partlen)1946 private final boolean regionMatches(int offset, int limit, int offset2, int partlen) { 1947 int i = offset2; 1948 while (partlen-- > 0) { 1949 if (target.setIndex(offset++) != target.setIndex(i++)) { 1950 return false; 1951 } 1952 } 1953 return true; 1954 } 1955 regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen)1956 private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) { 1957 int i = offset2; 1958 while (partlen-- > 0) { 1959 final char ch1 = target.setIndex(offset++) ; 1960 final char ch2 = target.setIndex(i++) ; 1961 if (ch1 == ch2) { 1962 continue; 1963 } 1964 final char uch1 = Character.toUpperCase(ch1); 1965 final char uch2 = Character.toUpperCase(ch2); 1966 if (uch1 == uch2) { 1967 continue; 1968 } 1969 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) { 1970 return false; 1971 } 1972 } 1973 return true; 1974 } 1975 } 1976 1977 static final class ClosureContext { 1978 1979 int[] offsets = new int[4]; 1980 int currentIndex = 0; 1981 contains(int offset)1982 boolean contains(int offset) { 1983 for (int i=0; i<currentIndex;++i) { 1984 if (offsets[i] == offset) { 1985 return true; 1986 } 1987 } 1988 return false; 1989 } 1990 reset()1991 void reset() { 1992 currentIndex = 0; 1993 } 1994 addOffset(int offset)1995 void addOffset(int offset) { 1996 // We do not check for duplicates, caller is responsible for that 1997 if (currentIndex == offsets.length) { 1998 offsets = expandOffsets(); 1999 } 2000 offsets[currentIndex++] = offset; 2001 } 2002 expandOffsets()2003 private int[] expandOffsets() { 2004 final int len = offsets.length; 2005 final int newLen = len << 1; 2006 int[] newOffsets = new int[newLen]; 2007 2008 System.arraycopy(offsets, 0, newOffsets, 0, currentIndex); 2009 return newOffsets; 2010 } 2011 } 2012 2013 static final class Context { 2014 int start; 2015 int limit; 2016 int length; 2017 Match match; 2018 boolean inuse = false; 2019 ClosureContext[] closureContexts; 2020 2021 private StringTarget stringTarget; 2022 private CharArrayTarget charArrayTarget; 2023 private CharacterIteratorTarget characterIteratorTarget; 2024 2025 ExpressionTarget target; 2026 Context()2027 Context() { 2028 } 2029 resetCommon(int nofclosures)2030 private void resetCommon(int nofclosures) { 2031 this.length = this.limit-this.start; 2032 setInUse(true); 2033 this.match = null; 2034 if (this.closureContexts == null || this.closureContexts.length != nofclosures) { 2035 this.closureContexts = new ClosureContext[nofclosures]; 2036 } 2037 for (int i = 0; i < nofclosures; i ++) { 2038 if (this.closureContexts[i] == null) { 2039 this.closureContexts[i] = new ClosureContext(); 2040 } 2041 else { 2042 this.closureContexts[i].reset(); 2043 } 2044 } 2045 } 2046 reset(CharacterIterator target, int start, int limit, int nofclosures)2047 void reset(CharacterIterator target, int start, int limit, int nofclosures) { 2048 if (characterIteratorTarget == null) { 2049 characterIteratorTarget = new CharacterIteratorTarget(target); 2050 } 2051 else { 2052 characterIteratorTarget.resetTarget(target); 2053 } 2054 this.target = characterIteratorTarget; 2055 this.start = start; 2056 this.limit = limit; 2057 this.resetCommon(nofclosures); 2058 } 2059 reset(String target, int start, int limit, int nofclosures)2060 void reset(String target, int start, int limit, int nofclosures) { 2061 if (stringTarget == null) { 2062 stringTarget = new StringTarget(target); 2063 } 2064 else { 2065 stringTarget.resetTarget(target); 2066 } 2067 this.target = stringTarget; 2068 this.start = start; 2069 this.limit = limit; 2070 this.resetCommon(nofclosures); 2071 } 2072 reset(char[] target, int start, int limit, int nofclosures)2073 void reset(char[] target, int start, int limit, int nofclosures) { 2074 if (charArrayTarget == null) { 2075 charArrayTarget = new CharArrayTarget(target); 2076 } 2077 else { 2078 charArrayTarget.resetTarget(target); 2079 } 2080 this.target = charArrayTarget; 2081 this.start = start; 2082 this.limit = limit; 2083 this.resetCommon(nofclosures); 2084 } setInUse(boolean inUse)2085 synchronized void setInUse(boolean inUse) { 2086 this.inuse = inUse; 2087 } 2088 } 2089 2090 /** 2091 * Prepares for matching. This method is called just before starting matching. 2092 */ prepare()2093 void prepare() { 2094 if (Op.COUNT) Op.nofinstances = 0; 2095 this.compile(this.tokentree); 2096 /* 2097 if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* 2098 Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); 2099 anchor.next = this.operations; 2100 this.operations = anchor; 2101 } 2102 */ 2103 if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); 2104 2105 this.minlength = this.tokentree.getMinLength(); 2106 2107 this.firstChar = null; 2108 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) 2109 && !isSet(this.options, XMLSCHEMA_MODE)) { 2110 RangeToken firstChar = Token.createRange(); 2111 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); 2112 if (fresult == Token.FC_TERMINAL) { 2113 firstChar.compactRanges(); 2114 this.firstChar = firstChar; 2115 if (DEBUG) 2116 System.err.println("DEBUG: Use the first character optimization: "+firstChar); 2117 } 2118 } 2119 2120 if (this.operations != null 2121 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) 2122 && this.operations.next == null) { 2123 if (DEBUG) 2124 System.err.print(" *** Only fixed string! *** "); 2125 this.fixedStringOnly = true; 2126 if (this.operations.type == Op.STRING) 2127 this.fixedString = this.operations.getString(); 2128 else if (this.operations.getData() >= 0x10000) { // Op.CHAR 2129 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); 2130 } else { 2131 char[] ac = new char[1]; 2132 ac[0] = (char)this.operations.getData(); 2133 this.fixedString = new String(ac); 2134 } 2135 this.fixedStringOptions = this.options; 2136 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2137 isSet(this.fixedStringOptions, IGNORE_CASE)); 2138 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) 2139 && !isSet(this.options, XMLSCHEMA_MODE)) { 2140 Token.FixedStringContainer container = new Token.FixedStringContainer(); 2141 this.tokentree.findFixedString(container, this.options); 2142 this.fixedString = container.token == null ? null : container.token.getString(); 2143 this.fixedStringOptions = container.options; 2144 if (this.fixedString != null && this.fixedString.length() < 2) 2145 this.fixedString = null; 2146 // This pattern has a fixed string of which length is more than one. 2147 if (this.fixedString != null) { 2148 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2149 isSet(this.fixedStringOptions, IGNORE_CASE)); 2150 if (DEBUG) { 2151 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() 2152 +"/" //+this.fixedString 2153 +"/"+REUtil.createOptionString(this.fixedStringOptions)); 2154 System.err.print("String: "); 2155 REUtil.dumpString(this.fixedString); 2156 } 2157 } 2158 } 2159 } 2160 2161 /** 2162 * An option. 2163 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> 2164 * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> 2165 * does not capture. 2166 * 2167 * @see #RegularExpression(java.lang.String,int) 2168 * @see #setPattern(java.lang.String,int) 2169 static final int MARK_PARENS = 1<<0; 2170 */ 2171 2172 /** 2173 * "i" 2174 */ 2175 static final int IGNORE_CASE = 1<<1; 2176 2177 /** 2178 * "s" 2179 */ 2180 static final int SINGLE_LINE = 1<<2; 2181 2182 /** 2183 * "m" 2184 */ 2185 static final int MULTIPLE_LINES = 1<<3; 2186 2187 /** 2188 * "x" 2189 */ 2190 static final int EXTENDED_COMMENT = 1<<4; 2191 2192 /** 2193 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. 2194 * 2195 * @see #RegularExpression(java.lang.String,int) 2196 * @see #setPattern(java.lang.String,int) 2197 * @see #UNICODE_WORD_BOUNDARY 2198 */ 2199 static final int USE_UNICODE_CATEGORY = 1<<5; // "u" 2200 2201 /** 2202 * An option. 2203 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. 2204 * <p>By default, the engine considers a position between a word character 2205 * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character 2206 * is a word boundary. 2207 * <p>By this option, the engine checks word boundaries with the method of 2208 * 'Unicode Regular Expression Guidelines' Revision 4. 2209 * 2210 * @see #RegularExpression(java.lang.String,int) 2211 * @see #setPattern(java.lang.String,int) 2212 */ 2213 static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" 2214 2215 /** 2216 * "H" 2217 */ 2218 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; 2219 /** 2220 * "F" 2221 */ 2222 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; 2223 /** 2224 * "X". XML Schema mode. 2225 */ 2226 static final int XMLSCHEMA_MODE = 1<<9; 2227 /** 2228 * ",". 2229 */ 2230 static final int SPECIAL_COMMA = 1<<10; 2231 2232 isSet(int options, int flag)2233 private static final boolean isSet(int options, int flag) { 2234 return (options & flag) == flag; 2235 } 2236 2237 /** 2238 * Creates a new RegularExpression instance. 2239 * 2240 * @param regex A regular expression 2241 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2242 */ RegularExpression(String regex)2243 public RegularExpression(String regex) throws ParseException { 2244 this(regex, null); 2245 } 2246 2247 /** 2248 * Creates a new RegularExpression instance with options. 2249 * 2250 * @param regex A regular expression 2251 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2252 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2253 */ RegularExpression(String regex, String options)2254 public RegularExpression(String regex, String options) throws ParseException { 2255 this.setPattern(regex, options); 2256 } 2257 2258 /** 2259 * Creates a new RegularExpression instance with options. 2260 * 2261 * @param regex A regular expression 2262 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2263 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2264 */ RegularExpression(String regex, String options, Locale locale)2265 public RegularExpression(String regex, String options, Locale locale) throws ParseException { 2266 this.setPattern(regex, options, locale); 2267 } 2268 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options)2269 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { 2270 this.regex = regex; 2271 this.tokentree = tok; 2272 this.nofparen = parens; 2273 this.options = options; 2274 this.hasBackReferences = hasBackReferences; 2275 } 2276 2277 /** 2278 * 2279 */ setPattern(String newPattern)2280 public void setPattern(String newPattern) throws ParseException { 2281 this.setPattern(newPattern, Locale.getDefault()); 2282 } 2283 setPattern(String newPattern, Locale locale)2284 public void setPattern(String newPattern, Locale locale) throws ParseException { 2285 this.setPattern(newPattern, this.options, locale); 2286 } 2287 setPattern(String newPattern, int options, Locale locale)2288 private void setPattern(String newPattern, int options, Locale locale) throws ParseException { 2289 this.regex = newPattern; 2290 this.options = options; 2291 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) 2292 ? new ParserForXMLSchema(locale) : new RegexParser(locale); 2293 this.tokentree = rp.parse(this.regex, this.options); 2294 this.nofparen = rp.parennumber; 2295 this.hasBackReferences = rp.hasBackReferences; 2296 2297 this.operations = null; 2298 this.context = null; 2299 } 2300 /** 2301 * 2302 */ setPattern(String newPattern, String options)2303 public void setPattern(String newPattern, String options) throws ParseException { 2304 this.setPattern(newPattern, options, Locale.getDefault()); 2305 } 2306 setPattern(String newPattern, String options, Locale locale)2307 public void setPattern(String newPattern, String options, Locale locale) throws ParseException { 2308 this.setPattern(newPattern, REUtil.parseOptions(options), locale); 2309 } 2310 2311 /** 2312 * 2313 */ getPattern()2314 public String getPattern() { 2315 return this.regex; 2316 } 2317 2318 /** 2319 * Represents this instence in String. 2320 */ toString()2321 public String toString() { 2322 return this.tokentree.toString(this.options); 2323 } 2324 2325 /** 2326 * Returns a option string. 2327 * The order of letters in it may be different from a string specified 2328 * in a constructor or <code>setPattern()</code>. 2329 * 2330 * @see #RegularExpression(java.lang.String,java.lang.String) 2331 * @see #setPattern(java.lang.String,java.lang.String) 2332 */ getOptions()2333 public String getOptions() { 2334 return REUtil.createOptionString(this.options); 2335 } 2336 2337 /** 2338 * Return true if patterns are the same and the options are equivalent. 2339 */ equals(Object obj)2340 public boolean equals(Object obj) { 2341 if (obj == null) return false; 2342 if (!(obj instanceof RegularExpression)) 2343 return false; 2344 RegularExpression r = (RegularExpression)obj; 2345 return this.regex.equals(r.regex) && this.options == r.options; 2346 } 2347 equals(String pattern, int options)2348 boolean equals(String pattern, int options) { 2349 return this.regex.equals(pattern) && this.options == options; 2350 } 2351 2352 /** 2353 * 2354 */ hashCode()2355 public int hashCode() { 2356 return (this.regex+"/"+this.getOptions()).hashCode(); 2357 } 2358 2359 /** 2360 * Return the number of regular expression groups. 2361 * This method returns 1 when the regular expression has no capturing-parenthesis. 2362 * 2363 */ getNumberOfGroups()2364 public int getNumberOfGroups() { 2365 return this.nofparen; 2366 } 2367 2368 // ================================================================ 2369 2370 private static final int WT_IGNORE = 0; 2371 private static final int WT_LETTER = 1; 2372 private static final int WT_OTHER = 2; getWordType0(char ch, int opts)2373 private static final int getWordType0(char ch, int opts) { 2374 if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { 2375 if (isSet(opts, USE_UNICODE_CATEGORY)) { 2376 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; 2377 } 2378 return isWordChar(ch) ? WT_LETTER : WT_OTHER; 2379 } 2380 2381 switch (Character.getType(ch)) { 2382 case Character.UPPERCASE_LETTER: // L 2383 case Character.LOWERCASE_LETTER: // L 2384 case Character.TITLECASE_LETTER: // L 2385 case Character.MODIFIER_LETTER: // L 2386 case Character.OTHER_LETTER: // L 2387 case Character.LETTER_NUMBER: // N 2388 case Character.DECIMAL_DIGIT_NUMBER: // N 2389 case Character.OTHER_NUMBER: // N 2390 case Character.COMBINING_SPACING_MARK: // Mc 2391 return WT_LETTER; 2392 2393 case Character.FORMAT: // Cf 2394 case Character.NON_SPACING_MARK: // Mn 2395 case Character.ENCLOSING_MARK: // Mc 2396 return WT_IGNORE; 2397 2398 case Character.CONTROL: // Cc 2399 switch (ch) { 2400 case '\t': 2401 case '\n': 2402 case '\u000B': 2403 case '\f': 2404 case '\r': 2405 return WT_OTHER; 2406 default: 2407 return WT_IGNORE; 2408 } 2409 2410 default: 2411 return WT_OTHER; 2412 } 2413 } 2414 2415 // ================================================================ 2416 2417 static final int LINE_FEED = 0x000A; 2418 static final int CARRIAGE_RETURN = 0x000D; 2419 static final int LINE_SEPARATOR = 0x2028; 2420 static final int PARAGRAPH_SEPARATOR = 0x2029; 2421 isEOLChar(int ch)2422 private static final boolean isEOLChar(int ch) { 2423 return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR 2424 || ch == PARAGRAPH_SEPARATOR; 2425 } 2426 isWordChar(int ch)2427 private static final boolean isWordChar(int ch) { // Legacy word characters 2428 if (ch == '_') return true; 2429 if (ch < '0') return false; 2430 if (ch > 'z') return false; 2431 if (ch <= '9') return true; 2432 if (ch < 'A') return false; 2433 if (ch <= 'Z') return true; 2434 if (ch < 'a') return false; 2435 return true; 2436 } 2437 matchIgnoreCase(int chardata, int ch)2438 private static final boolean matchIgnoreCase(int chardata, int ch) { 2439 if (chardata == ch) return true; 2440 if (chardata > 0xffff || ch > 0xffff) return false; 2441 char uch1 = Character.toUpperCase((char)chardata); 2442 char uch2 = Character.toUpperCase((char)ch); 2443 if (uch1 == uch2) return true; 2444 return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); 2445 } 2446 } 2447