1 /*
2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22 
23 import java.text.CharacterIterator;
24 import java.util.Locale;
25 import java.util.Stack;
26 
27 import com.sun.org.apache.xerces.internal.util.IntStack;
28 
29 /**
30  * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
31  * This engine does not conform to the POSIX regular expression.
32  *
33  * <hr width="50%">
34  * <h2>How to use</h2>
35  *
36  * <dl>
37  *   <dt>A. Standard way
38  *   <dd>
39  * <pre>
40  * RegularExpression re = new RegularExpression(<var>regex</var>);
41  * if (re.matches(text)) { ... }
42  * </pre>
43  *
44  *   <dt>B. Capturing groups
45  *   <dd>
46  * <pre>
47  * RegularExpression re = new RegularExpression(<var>regex</var>);
48  * Match match = new Match();
49  * if (re.matches(text, match)) {
50  *     ... // You can refer captured texts with methods of the <code>Match</code> class.
51  * }
52  * </pre>
53  *
54  * </dl>
55  *
56  * <h4>Case-insensitive matching</h4>
57  * <pre>
58  * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
59  * if (re.matches(text) >= 0) { ...}
60  * </pre>
61  *
62  * <h4>Options</h4>
63  * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
64  *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
65  *    This <var>options</var> parameter consists of the following characters.
66  * </p>
67  * <dl>
68  *   <dt><a name="I_OPTION"><code>"i"</code></a>
69  *   <dd>This option indicates case-insensitive matching.
70  *   <dt><a name="M_OPTION"><code>"m"</code></a>
71  *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
72  *   <dt><a name="S_OPTION"><code>"s"</code></a>
73  *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
74  *   <dt><a name="U_OPTION"><code>"u"</code></a>
75  *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
76  *   <dt><a name="W_OPTION"><code>"w"</code></a>
77  *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
78  *      'Unicode Regular Expression Guidelines' Revision 4.
79  *      When "w" and "u" are specified at the same time,
80  *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
81  *   <dt><a name="COMMA_OPTION"><code>","</code></a>
82  *   <dd>The parser treats a comma in a character class as a range separator.
83  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
84  *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
85  *
86  *   <dt><a name="X_OPTION"><code>"X"</code></a>
87  *   <dd class="REGEX">
88  *       By this option, the engine confoms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
89  *       The <code>match()</code> method does not do subsring matching
90  *       but entire string matching.
91  *
92  * </dl>
93  *
94  * <hr width="50%">
95  * <h3>Syntax</h3>
96  * <table border="1" bgcolor="#ddeeff">
97  *   <tr>
98  *    <td>
99  *     <h4>Differences from the Perl 5 regular expression</h4>
100  *     <ul>
101  *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
102  *      <li>Supports subtraction, union, and intersection operations for character classes.
103  *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
104  *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
105  *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
106  *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
107  *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
108  *     </ul>
109  *    </td>
110  *   </tr>
111  * </table>
112  *
113  * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
114  * <ul>
115  *   <li>Character
116  *     <dl>
117  *       <dt class="REGEX"><kbd>.</kbd> (A period)
118  *       <dd>Matches any one character except the following characters.
119  *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
120  *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
121  *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
122  *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
123  *           it matches any character including the above four characters.
124  *
125  *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
126  *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
127  *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
128  *
129  *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
130  *       <dd>Matches a control character.
131  *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
132  *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
133  *           It matches a control character of which the character code is less than
134  *           the character code of the <var>C</var> by 0x0040.
135  *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
136  *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
137  *
138  *       <dt class="REGEX">a non-meta character
139  *       <dd>Matches the character.
140  *
141  *       <dt class="REGEX"><KBD>\</KBD> + a meta character
142  *       <dd>Matches the meta character.
143  *
144  *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
145  *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
146  *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
147  *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
148  *
149  *       <!--
150  *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
151  *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
152  *       -->
153  *
154  *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
155  *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
156  *
157  *       <dt class="REGEX"><kbd>\g</kbd>
158  *       <dd>Matches a grapheme.
159  *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
160  *
161  *       <dt class="REGEX"><kbd>\X</kbd>
162  *       <dd class="REGEX">Matches a combining character sequence.
163  *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
164  *     </dl>
165  *   </li>
166  *
167  *   <li>Character class
168  *     <dl>
169 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
170 + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
171  *       <dd>Positive character class.  It matches a character in ranges.
172  *       <dd><var>R<sub>n</sub></var>:
173  *       <ul>
174  *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
175  *             <p>This range matches the character.
176  *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
177  *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
178 + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
179 + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
180  *             <p>...
181  *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
182  *             <p>These expressions specifies the same ranges as the following expressions.
183  *       </ul>
184  *       <p class="REGEX">Enumerated ranges are merged (union operation).
185  *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
186  *
187  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
188  *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
189  *       <dd>Negative character class.  It matches a character not in ranges.
190  *
191  *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
192  *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
193  *       <dd>Subtraction or union or intersection for character classes.
194  *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
195  *       <dd>The result of this operations is a <u>positive character class</u>
196  *           even if an expression includes any negative character classes.
197  *           You have to take care on this in case-insensitive matching.
198  *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
199  *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
200  *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
201  *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
202  *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
203  *
204  *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
205  *       <dd>Character class subtraction for the XML Schema.
206  *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
207  *
208  *       <dt class="REGEX"><kbd>\d</kbd>
209  *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
210  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
211  *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
212  *
213  *       <dt class="REGEX"><kbd>\D</kbd>
214  *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
215  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
216  *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
217  *
218  *       <dt class="REGEX"><kbd>\s</kbd>
219  *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
220  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
221  *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
222  *
223  *       <dt class="REGEX"><kbd>\S</kbd>
224  *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
225  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
226  *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
227  *
228  *       <dt class="REGEX"><kbd>\w</kbd>
229  *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
230  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
231  *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
232  *
233  *       <dt class="REGEX"><kbd>\W</kbd>
234  *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
235  *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
236  *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
237  *
238  *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
239  *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
240  *       The following names are available:
241  *       <dl>
242  *         <dt>Unicode General Categories:
243  *         <dd><kbd>
244  *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
245  *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
246  *         </kbd>
247  *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
248  *         <dt>Unicode Blocks:
249  *         <dd><kbd>
250  *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
251  *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
252  *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
253  *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
254  *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
255  *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
256  *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
257  *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
258  *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
259  *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
260  *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
261  *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
262  *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
263  *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
264  *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
265  *       Small Form Variants, Arabic Presentation Forms-B, Specials,
266  *       Halfwidth and Fullwidth Forms
267  *         </kbd>
268  *         <dt>Others:
269  *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
270  *         <dd><kbd>ASSGINED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
271  *         <dd><kbd>UNASSGINED</kbd>
272  *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
273  *       </dl>
274  *
275  *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
276  *       <dd>Matches one character not in the specified General Category or the specified Block.
277  *     </dl>
278  *   </li>
279  *
280  *   <li>Selection and Quantifier
281  *     <dl>
282  *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
283  *       <dd>...
284  *
285  *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
286  *       <dd>Matches 0 or more <var>X</var>.
287  *
288  *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
289  *       <dd>Matches 1 or more <var>X</var>.
290  *
291  *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
292  *       <dd>Matches 0 or 1 <var>X</var>.
293  *
294  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
295  *       <dd>Matches <var>number</var> times.
296  *
297  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
298  *       <dd>...
299  *
300  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
301  *       <dd>...
302  *
303  *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
304  *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
305  *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
306  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
307  *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
308  *       <dd>Non-greedy matching.
309  *     </dl>
310  *   </li>
311  *
312  *   <li>Grouping, Capturing, and Back-reference
313  *     <dl>
314  *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
315  *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
316  *       If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
317  *       you have to write "<KBD>(?:foo)+</KBD>".
318  *
319  *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
320  *       <dd>Grouping with capturing.
321  * It make a group and applications can know
322  * where in target text a group matched with methods of a <code>Match</code> instance
323  * after <code><a href="#matches(java.lang.String, com.sun.org.apache.xerces.internal.utils.regex.Match)">matches(String,Match)</a></code>.
324  * The 0th group means whole of this regular expression.
325  * The <VAR>N</VAR>th gorup is the inside of the <VAR>N</VAR>th left parenthesis.
326  *
327  *   <p>For instance, a regular expression is
328  *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
329  *   and target text is
330  *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
331  *   <ul>
332  *     <li><code>Match.getCapturedText(0)</code>:
333  *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
334  *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
335  *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
336  *   </ul>
337  *
338  *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
339  *       <dd>
340  *
341  *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
342  *       <dd>Independent expression group. ................
343  *
344  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
345  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
346  *       <dd>............................
347  *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
348  *           Note that it can not contain 'u'.
349  *
350  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
351  *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
352  *       <dd>......
353  *       <dd>These expressions must be at the beginning of a group.
354  *     </dl>
355  *   </li>
356  *
357  *   <li>Anchor
358  *     <dl>
359  *       <dt class="REGEX"><kbd>\A</kbd>
360  *       <dd>Matches the beginnig of the text.
361  *
362  *       <dt class="REGEX"><kbd>\Z</kbd>
363  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
364  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
365  *
366  *       <dt class="REGEX"><kbd>\z</kbd>
367  *       <dd>Matches the end of the text.
368  *
369  *       <dt class="REGEX"><kbd>^</kbd>
370  *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
371  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
372  *           it matches the beginning of the text, or after one of EOL characters (
373  *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
374  *           PARAGRAPH SEPARATOR (U+2029).)
375  *
376  *       <dt class="REGEX"><kbd>$</kbd>
377  *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
378  *           or CARRIAGE RETURN + LINE FEED at the end of the text.
379  *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
380  *           it matches the end of the text, or before an EOL character.
381  *
382  *       <dt class="REGEX"><kbd>\b</kbd>
383  *       <dd>Matches word boundary.
384  *           (See <a href="#W_OPTION">a "w" option</a>)
385  *
386  *       <dt class="REGEX"><kbd>\B</kbd>
387  *       <dd>Matches non word boundary.
388  *           (See <a href="#W_OPTION">a "w" option</a>)
389  *
390  *       <dt class="REGEX"><kbd>\&lt;</kbd>
391  *       <dd>Matches the beginning of a word.
392  *           (See <a href="#W_OPTION">a "w" option</a>)
393  *
394  *       <dt class="REGEX"><kbd>\&gt;</kbd>
395  *       <dd>Matches the end of a word.
396  *           (See <a href="#W_OPTION">a "w" option</a>)
397  *     </dl>
398  *   </li>
399  *   <li>Lookahead and lookbehind
400  *     <dl>
401  *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
402  *       <dd>Lookahead.
403  *
404  *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
405  *       <dd>Negative lookahead.
406  *
407  *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
408  *       <dd>Lookbehind.
409  *       <dd>(Note for text capturing......)
410  *
411  *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
412  *       <dd>Negative lookbehind.
413  *     </dl>
414  *   </li>
415  *
416  *   <li>Misc.
417  *     <dl>
418  *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
419  *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
420  *       <dd>......
421  *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
422  *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
423  *           You can not write comments in character classes and before quantifiers.
424  *     </dl>
425  *   </li>
426  * </ul>
427  *
428  *
429  * <hr width="50%">
430  * <h3>BNF for the regular expression</h3>
431  * <pre>
432  * regex ::= ('(?' options ')')? term ('|' term)*
433  * term ::= factor+
434  * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
435  *            | '(?#' [^)]* ')'
436  * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
437  * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
438  *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
439  *          | '(?>' regex ')' | '(?' options ':' regex ')'
440  *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
441  * options ::= [imsw]* ('-' [imsw]+)?
442  * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
443  * looks ::= '(?=' regex ')'  | '(?!' regex ')'
444  *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
445  * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
446  * category-block ::= '\' [pP] category-symbol-1
447  *                    | ('\p{' | '\P{') (category-symbol | block-name
448  *                                       | other-properties) '}'
449  * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
450  * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo'
451  *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
452  *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
453  *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
454  *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
455  * block-name ::= (See above)
456  * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
457  * character-1 ::= (any character except meta-characters)
458  *
459  * char-class ::= '[' ranges ']'
460  *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
461  * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
462  * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
463  *           | range-char | range-char '-' range-char
464  * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
465  * code-point ::= '\x' hex-char hex-char
466  *                | '\x{' hex-char+ '}'
467  * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
468  * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
469  * hex-char ::= [0-9a-fA-F]
470  * character-2 ::= (any character except \[]-,)
471  * </pre>
472  *
473  * <hr width="50%">
474  * <h3>TODO</h3>
475  * <ul>
476  *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
477  *     <ul>
478  *       <li>2.4 Canonical Equivalents
479  *       <li>Level 3
480  *     </ul>
481  *   <li>Parsing performance
482  * </ul>
483  *
484  * <hr width="50%">
485  *
486  * @xerces.internal
487  *
488  * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
489  */
490 public class RegularExpression implements java.io.Serializable {
491 
492     private static final long serialVersionUID = 6242499334195006401L;
493 
494     static final boolean DEBUG = false;
495 
496     /**
497      * Compiles a token tree into an operation flow.
498      */
compile(Token tok)499     private synchronized void compile(Token tok) {
500         if (this.operations != null)
501             return;
502         this.numberOfClosures = 0;
503         this.operations = this.compile(tok, null, false);
504     }
505 
506     /**
507      * Converts a token to an operation.
508      */
compile(Token tok, Op next, boolean reverse)509     private Op compile(Token tok, Op next, boolean reverse) {
510         Op ret;
511         switch (tok.type) {
512         case Token.DOT:
513             ret = Op.createDot();
514             ret.next = next;
515             break;
516 
517         case Token.CHAR:
518             ret = Op.createChar(tok.getChar());
519             ret.next = next;
520             break;
521 
522         case Token.ANCHOR:
523             ret = Op.createAnchor(tok.getChar());
524             ret.next = next;
525             break;
526 
527         case Token.RANGE:
528         case Token.NRANGE:
529             ret = Op.createRange(tok);
530             ret.next = next;
531             break;
532 
533         case Token.CONCAT:
534             ret = next;
535             if (!reverse) {
536                 for (int i = tok.size()-1;  i >= 0;  i --) {
537                     ret = compile(tok.getChild(i), ret, false);
538                 }
539             } else {
540                 for (int i = 0;  i < tok.size();  i ++) {
541                     ret = compile(tok.getChild(i), ret, true);
542                 }
543             }
544             break;
545 
546         case Token.UNION:
547             Op.UnionOp uni = Op.createUnion(tok.size());
548             for (int i = 0;  i < tok.size();  i ++) {
549                 uni.addElement(compile(tok.getChild(i), next, reverse));
550             }
551             ret = uni;                          // ret.next is null.
552             break;
553 
554         case Token.CLOSURE:
555         case Token.NONGREEDYCLOSURE:
556             Token child = tok.getChild(0);
557             int min = tok.getMin();
558             int max = tok.getMax();
559             if (min >= 0 && min == max) { // {n}
560                 ret = next;
561                 for (int i = 0; i < min;  i ++) {
562                     ret = compile(child, ret, reverse);
563                 }
564                 break;
565             }
566             if (min > 0 && max > 0)
567                 max -= min;
568             if (max > 0) {
569                 // X{2,6} -> XX(X(X(XX?)?)?)?
570                 ret = next;
571                 for (int i = 0;  i < max;  i ++) {
572                     Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
573                     q.next = next;
574                     q.setChild(compile(child, ret, reverse));
575                     ret = q;
576                 }
577             } else {
578                 Op.ChildOp op;
579                 if (tok.type == Token.NONGREEDYCLOSURE) {
580                     op = Op.createNonGreedyClosure();
581                 } else {                        // Token.CLOSURE
582                     op = Op.createClosure(this.numberOfClosures++);
583                 }
584                 op.next = next;
585                 op.setChild(compile(child, op, reverse));
586                 ret = op;
587             }
588             if (min > 0) {
589                 for (int i = 0;  i < min;  i ++) {
590                     ret = compile(child, ret, reverse);
591                 }
592             }
593             break;
594 
595         case Token.EMPTY:
596             ret = next;
597             break;
598 
599         case Token.STRING:
600             ret = Op.createString(tok.getString());
601             ret.next = next;
602             break;
603 
604         case Token.BACKREFERENCE:
605             ret = Op.createBackReference(tok.getReferenceNumber());
606             ret.next = next;
607             break;
608 
609         case Token.PAREN:
610             if (tok.getParenNumber() == 0) {
611                 ret = compile(tok.getChild(0), next, reverse);
612             } else if (reverse) {
613                 next = Op.createCapture(tok.getParenNumber(), next);
614                 next = compile(tok.getChild(0), next, reverse);
615                 ret = Op.createCapture(-tok.getParenNumber(), next);
616             } else {
617                 next = Op.createCapture(-tok.getParenNumber(), next);
618                 next = compile(tok.getChild(0), next, reverse);
619                 ret = Op.createCapture(tok.getParenNumber(), next);
620             }
621             break;
622 
623         case Token.LOOKAHEAD:
624             ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
625             break;
626         case Token.NEGATIVELOOKAHEAD:
627             ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
628             break;
629         case Token.LOOKBEHIND:
630             ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
631             break;
632         case Token.NEGATIVELOOKBEHIND:
633             ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
634             break;
635 
636         case Token.INDEPENDENT:
637             ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
638             break;
639 
640         case Token.MODIFIERGROUP:
641             ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
642                                     ((Token.ModifierToken)tok).getOptions(),
643                                     ((Token.ModifierToken)tok).getOptionsMask());
644             break;
645 
646         case Token.CONDITION:
647             Token.ConditionToken ctok = (Token.ConditionToken)tok;
648             int ref = ctok.refNumber;
649             Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
650             Op yes = compile(ctok.yes, next, reverse);
651             Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
652             ret = Op.createCondition(next, ref, condition, yes, no);
653             break;
654 
655         default:
656             throw new RuntimeException("Unknown token type: "+tok.type);
657         } // switch (tok.type)
658         return ret;
659     }
660 
661 
662 //Public
663 
664     /**
665      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
666      *
667      * @return true if the target is matched to this regular expression.
668      */
matches(char[] target)669     public boolean matches(char[]  target) {
670         return this.matches(target, 0,  target .length , (Match)null);
671     }
672 
673     /**
674      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
675      * in specified range or not.
676      *
677      * @param start Start offset of the range.
678      * @param end  End offset +1 of the range.
679      * @return true if the target is matched to this regular expression.
680      */
matches(char[] target, int start, int end)681     public boolean matches(char[]  target, int start, int end) {
682         return this.matches(target, start, end, (Match)null);
683     }
684 
685     /**
686      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
687      *
688      * @param match A Match instance for storing matching result.
689      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
690      */
matches(char[] target, Match match)691     public boolean matches(char[]  target, Match match) {
692         return this.matches(target, 0,  target .length , match);
693     }
694 
695 
696     /**
697      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
698      * in specified range or not.
699      *
700      * @param start Start offset of the range.
701      * @param end  End offset +1 of the range.
702      * @param match A Match instance for storing matching result.
703      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
704      */
matches(char[] target, int start, int end, Match match)705     public boolean matches(char[] target, int start, int end, Match match) {
706 
707         synchronized (this) {
708             if (this.operations == null)
709                 this.prepare();
710             if (this.context == null)
711                 this.context = new Context();
712         }
713         Context con = null;
714         synchronized (this.context) {
715             con = this.context.inuse ? new Context() : this.context;
716             con.reset(target, start, end, this.numberOfClosures);
717         }
718         if (match != null) {
719             match.setNumberOfGroups(this.nofparen);
720             match.setSource(target);
721         } else if (this.hasBackReferences) {
722             match = new Match();
723             match.setNumberOfGroups(this.nofparen);
724             // Need not to call setSource() because
725             // a caller can not access this match instance.
726         }
727         con.match = match;
728 
729         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
730             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
731             //System.err.println("DEBUG: matchEnd="+matchEnd);
732             if (matchEnd == con.limit) {
733                 if (con.match != null) {
734                     con.match.setBeginning(0, con.start);
735                     con.match.setEnd(0, matchEnd);
736                 }
737                 con.setInUse(false);
738                 return true;
739             }
740             return false;
741         }
742 
743         /*
744          * The pattern has only fixed string.
745          * The engine uses Boyer-Moore.
746          */
747         if (this.fixedStringOnly) {
748             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
749             int o = this.fixedStringTable.matches(target, con.start, con.limit);
750             if (o >= 0) {
751                 if (con.match != null) {
752                     con.match.setBeginning(0, o);
753                     con.match.setEnd(0, o+this.fixedString.length());
754                 }
755                 con.setInUse(false);
756                 return true;
757             }
758             con.setInUse(false);
759             return false;
760         }
761 
762         /*
763          * The pattern contains a fixed string.
764          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
765          * If not, it return with false.
766          */
767         if (this.fixedString != null) {
768             int o = this.fixedStringTable.matches(target, con.start, con.limit);
769             if (o < 0) {
770                 //System.err.println("Non-match in fixed-string search.");
771                 con.setInUse(false);
772                 return false;
773             }
774         }
775 
776         int limit = con.limit-this.minlength;
777         int matchStart;
778         int matchEnd = -1;
779 
780         /*
781          * Checks whether the expression starts with ".*".
782          */
783         if (this.operations != null
784             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
785             if (isSet(this.options, SINGLE_LINE)) {
786                 matchStart = con.start;
787                 matchEnd = this. match(con, this.operations, con.start, 1, this.options);
788             } else {
789                 boolean previousIsEOL = true;
790                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
791                     int ch =  target [  matchStart ] ;
792                     if (isEOLChar(ch)) {
793                         previousIsEOL = true;
794                     } else {
795                         if (previousIsEOL) {
796                             if (0 <= (matchEnd = this. match(con, this.operations,
797                                                              matchStart, 1, this.options)))
798                                 break;
799                         }
800                         previousIsEOL = false;
801                     }
802                 }
803             }
804         }
805 
806         /*
807          * Optimization against the first character.
808          */
809         else if (this.firstChar != null) {
810             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
811             RangeToken range = this.firstChar;
812             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
813                 int ch =  target [matchStart] ;
814                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
815                     ch = REUtil.composeFromSurrogates(ch, target[matchStart+1]);
816                 }
817                 if (!range.match(ch))  {
818                     continue;
819                 }
820                 if (0 <= (matchEnd = this. match(con, this.operations,
821                                                  matchStart, 1, this.options))) {
822                         break;
823                 }
824             }
825         }
826 
827         /*
828          * Straightforward matching.
829          */
830         else {
831             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
832                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
833                     break;
834             }
835         }
836 
837         if (matchEnd >= 0) {
838             if (con.match != null) {
839                 con.match.setBeginning(0, matchStart);
840                 con.match.setEnd(0, matchEnd);
841             }
842             con.setInUse(false);
843             return true;
844         } else {
845             con.setInUse(false);
846             return false;
847         }
848     }
849 
850     /**
851      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
852      *
853      * @return true if the target is matched to this regular expression.
854      */
matches(String target)855     public boolean matches(String  target) {
856         return this.matches(target, 0,  target .length() , (Match)null);
857     }
858 
859     /**
860      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
861      * in specified range or not.
862      *
863      * @param start Start offset of the range.
864      * @param end  End offset +1 of the range.
865      * @return true if the target is matched to this regular expression.
866      */
matches(String target, int start, int end)867     public boolean matches(String  target, int start, int end) {
868         return this.matches(target, start, end, (Match)null);
869     }
870 
871     /**
872      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
873      *
874      * @param match A Match instance for storing matching result.
875      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
876      */
matches(String target, Match match)877     public boolean matches(String  target, Match match) {
878         return this.matches(target, 0,  target .length() , match);
879     }
880 
881     /**
882      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
883      * in specified range or not.
884      *
885      * @param start Start offset of the range.
886      * @param end  End offset +1 of the range.
887      * @param match A Match instance for storing matching result.
888      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
889      */
matches(String target, int start, int end, Match match)890     public boolean matches(String  target, int start, int end, Match match) {
891 
892         synchronized (this) {
893             if (this.operations == null)
894                 this.prepare();
895             if (this.context == null)
896                 this.context = new Context();
897         }
898         Context con = null;
899         synchronized (this.context) {
900             con = this.context.inuse ? new Context() : this.context;
901             con.reset(target, start, end, this.numberOfClosures);
902         }
903         if (match != null) {
904             match.setNumberOfGroups(this.nofparen);
905             match.setSource(target);
906         } else if (this.hasBackReferences) {
907             match = new Match();
908             match.setNumberOfGroups(this.nofparen);
909             // Need not to call setSource() because
910             // a caller can not access this match instance.
911         }
912         con.match = match;
913 
914         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
915             if (DEBUG) {
916                 System.err.println("target string="+target);
917             }
918             int matchEnd = this. match(con, this.operations, con.start, 1, this.options);
919             if (DEBUG) {
920                 System.err.println("matchEnd="+matchEnd);
921                 System.err.println("con.limit="+con.limit);
922             }
923             if (matchEnd == con.limit) {
924                 if (con.match != null) {
925                     con.match.setBeginning(0, con.start);
926                     con.match.setEnd(0, matchEnd);
927                 }
928                 con.setInUse(false);
929                 return true;
930             }
931             return false;
932         }
933 
934         /*
935          * The pattern has only fixed string.
936          * The engine uses Boyer-Moore.
937          */
938         if (this.fixedStringOnly) {
939             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
940             int o = this.fixedStringTable.matches(target, con.start, con.limit);
941             if (o >= 0) {
942                 if (con.match != null) {
943                     con.match.setBeginning(0, o);
944                     con.match.setEnd(0, o+this.fixedString.length());
945                 }
946                 con.setInUse(false);
947                 return true;
948             }
949             con.setInUse(false);
950             return false;
951         }
952 
953         /*
954          * The pattern contains a fixed string.
955          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
956          * If not, it return with false.
957          */
958         if (this.fixedString != null) {
959             int o = this.fixedStringTable.matches(target, con.start, con.limit);
960             if (o < 0) {
961                 //System.err.println("Non-match in fixed-string search.");
962                 con.setInUse(false);
963                 return false;
964             }
965         }
966 
967         int limit = con.limit-this.minlength;
968         int matchStart;
969         int matchEnd = -1;
970 
971         /*
972          * Checks whether the expression starts with ".*".
973          */
974         if (this.operations != null
975             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
976             if (isSet(this.options, SINGLE_LINE)) {
977                 matchStart = con.start;
978                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
979             } else {
980                 boolean previousIsEOL = true;
981                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
982                     int ch =  target .charAt(  matchStart ) ;
983                     if (isEOLChar(ch)) {
984                         previousIsEOL = true;
985                     } else {
986                         if (previousIsEOL) {
987                             if (0 <= (matchEnd = this.match(con, this.operations,
988                                                             matchStart, 1, this.options)))
989                                 break;
990                         }
991                         previousIsEOL = false;
992                     }
993                 }
994             }
995         }
996 
997         /*
998          * Optimization against the first character.
999          */
1000         else if (this.firstChar != null) {
1001             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1002             RangeToken range = this.firstChar;
1003             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1004                 int ch =  target .charAt(  matchStart ) ;
1005                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1006                     ch = REUtil.composeFromSurrogates(ch, target.charAt(matchStart+1));
1007                 }
1008                 if (!range.match(ch)) {
1009                     continue;
1010                 }
1011                 if (0 <= (matchEnd = this.match(con, this.operations,
1012                                                 matchStart, 1, this.options))) {
1013                         break;
1014                 }
1015             }
1016         }
1017 
1018         /*
1019          * Straightforward matching.
1020          */
1021         else {
1022             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1023                 if (0 <= (matchEnd = this.match(con, this.operations, matchStart, 1, this.options)))
1024                     break;
1025             }
1026         }
1027 
1028         if (matchEnd >= 0) {
1029             if (con.match != null) {
1030                 con.match.setBeginning(0, matchStart);
1031                 con.match.setEnd(0, matchEnd);
1032             }
1033             con.setInUse(false);
1034             return true;
1035         } else {
1036             con.setInUse(false);
1037             return false;
1038         }
1039     }
1040 
1041     /**
1042      * @return -1 when not match; offset of the end of matched string when match.
1043      */
1044     @SuppressWarnings("fallthrough")
match(Context con, Op op, int offset, int dx, int opts)1045     private int match(Context con, Op op, int offset, int dx, int opts) {
1046         final ExpressionTarget target = con.target;
1047         final Stack<Op> opStack = new Stack<>();
1048         final IntStack dataStack = new IntStack();
1049         final boolean isSetIgnoreCase = isSet(opts, IGNORE_CASE);
1050         int retValue = -1;
1051         boolean returned = false;
1052 
1053         for (;;) {
1054             if (op == null || offset > con.limit || offset < con.start) {
1055                 if (op == null) {
1056                     retValue = isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
1057                 }
1058                 else {
1059                    retValue = -1;
1060                 }
1061                 returned = true;
1062             }
1063             else  {
1064                 retValue = -1;
1065                 // dx value is either 1 or -1
1066                 switch (op.type) {
1067                 case Op.CHAR:
1068                     {
1069                         final int o1 = (dx > 0) ? offset : offset -1;
1070                         if (o1 >= con.limit || o1 < 0 || !matchChar(op.getData(), target.charAt(o1), isSetIgnoreCase)) {
1071                             returned = true;
1072                             break;
1073                         }
1074                         offset += dx;
1075                         op = op.next;
1076                     }
1077                     break;
1078 
1079                 case Op.DOT:
1080                     {
1081                         int o1 = (dx > 0) ? offset : offset - 1;
1082                         if (o1 >= con.limit || o1 < 0) {
1083                             returned = true;
1084                             break;
1085                         }
1086                         if (isSet(opts, SINGLE_LINE)) {
1087                             if (REUtil.isHighSurrogate(target.charAt(o1)) && o1+dx >= 0 && o1+dx < con.limit) {
1088                                 o1 += dx;
1089                             }
1090                         }
1091                         else {
1092                             int ch = target.charAt(o1);
1093                             if (REUtil.isHighSurrogate(ch) && o1+dx >= 0 && o1+dx < con.limit) {
1094                                 o1 += dx;
1095                                 ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1096                             }
1097                             if (isEOLChar(ch)) {
1098                                 returned = true;
1099                                 break;
1100                             }
1101                         }
1102                         offset = (dx > 0) ? o1 + 1 : o1;
1103                         op = op.next;
1104                     }
1105                     break;
1106 
1107                 case Op.RANGE:
1108                 case Op.NRANGE:
1109                     {
1110                         int o1 = (dx > 0) ? offset : offset -1;
1111                         if (o1 >= con.limit || o1 < 0) {
1112                             returned = true;
1113                             break;
1114                         }
1115                         int ch = target.charAt(offset);
1116                         if (REUtil.isHighSurrogate(ch) && o1+dx < con.limit && o1+dx >=0) {
1117                             o1 += dx;
1118                             ch = REUtil.composeFromSurrogates(ch, target.charAt(o1));
1119                         }
1120                         final RangeToken tok = op.getToken();
1121                         if (!tok.match(ch)) {
1122                             returned = true;
1123                             break;
1124                         }
1125                         offset = (dx > 0) ? o1+1 : o1;
1126                         op = op.next;
1127                     }
1128                     break;
1129 
1130                 case Op.ANCHOR:
1131                     {
1132                         if (!matchAnchor(target, op, con, offset, opts)) {
1133                             returned = true;
1134                             break;
1135                         }
1136                         op = op.next;
1137                     }
1138                     break;
1139 
1140                 case Op.BACKREFERENCE:
1141                     {
1142                         int refno = op.getData();
1143                         if (refno <= 0 || refno >= this.nofparen) {
1144                             throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
1145                         }
1146                         if (con.match.getBeginning(refno) < 0 || con.match.getEnd(refno) < 0) {
1147                             returned = true;
1148                             break;
1149                         }
1150                         int o2 = con.match.getBeginning(refno);
1151                         int literallen = con.match.getEnd(refno)-o2;
1152                         if (dx > 0) {
1153                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, o2, literallen)) {
1154                                 returned = true;
1155                                 break;
1156                             }
1157                             offset += literallen;
1158                         }
1159                         else {
1160                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, o2, literallen)) {
1161                                 returned = true;
1162                                 break;
1163                             }
1164                             offset -= literallen;
1165                         }
1166                         op = op.next;
1167                     }
1168                     break;
1169 
1170                 case Op.STRING:
1171                     {
1172                         String literal = op.getString();
1173                         int literallen = literal.length();
1174                         if (dx > 0) {
1175                             if (!target.regionMatches(isSetIgnoreCase, offset, con.limit, literal, literallen)) {
1176                                 returned = true;
1177                                 break;
1178                             }
1179                             offset += literallen;
1180                         }
1181                         else {
1182                             if (!target.regionMatches(isSetIgnoreCase, offset-literallen, con.limit, literal, literallen)) {
1183                                 returned = true;
1184                                 break;
1185                             }
1186                             offset -= literallen;
1187                         }
1188                         op = op.next;
1189                     }
1190                     break;
1191 
1192                 case Op.CLOSURE:
1193                     {
1194                         // Saves current position to avoid zero-width repeats.
1195                         final int id = op.getData();
1196                         if (con.closureContexts[id].contains(offset)) {
1197                             returned = true;
1198                             break;
1199                         }
1200 
1201                         con.closureContexts[id].addOffset(offset);
1202                     }
1203                     // fall through
1204 
1205                 case Op.QUESTION:
1206                     {
1207                         opStack.push(op);
1208                         dataStack.push(offset);
1209                         op = op.getChild();
1210                     }
1211                     break;
1212 
1213                 case Op.NONGREEDYCLOSURE:
1214                 case Op.NONGREEDYQUESTION:
1215                     {
1216                         opStack.push(op);
1217                         dataStack.push(offset);
1218                         op = op.next;
1219                     }
1220                     break;
1221 
1222                 case Op.UNION:
1223                     if (op.size() == 0) {
1224                         returned = true;
1225                     }
1226                     else {
1227                         opStack.push(op);
1228                         dataStack.push(0);
1229                         dataStack.push(offset);
1230                         op = op.elementAt(0);
1231                     }
1232                     break;
1233 
1234                 case Op.CAPTURE:
1235                     {
1236                         final int refno = op.getData();
1237                         if (con.match != null) {
1238                             if (refno > 0) {
1239                                 dataStack.push(con.match.getBeginning(refno));
1240                                 con.match.setBeginning(refno, offset);
1241                             }
1242                             else {
1243                                 final int index = -refno;
1244                                 dataStack.push(con.match.getEnd(index));
1245                                 con.match.setEnd(index, offset);
1246                             }
1247                             opStack.push(op);
1248                             dataStack.push(offset);
1249                         }
1250                         op = op.next;
1251                     }
1252                     break;
1253 
1254                 case Op.LOOKAHEAD:
1255                 case Op.NEGATIVELOOKAHEAD:
1256                 case Op.LOOKBEHIND:
1257                 case Op.NEGATIVELOOKBEHIND:
1258                     {
1259                         opStack.push(op);
1260                         dataStack.push(dx);
1261                         dataStack.push(offset);
1262                         dx = (op.type == Op.LOOKAHEAD || op.type == Op.NEGATIVELOOKAHEAD) ? 1 : -1;
1263                         op = op.getChild();
1264                     }
1265                     break;
1266 
1267                 case Op.INDEPENDENT:
1268                     {
1269                         opStack.push(op);
1270                         dataStack.push(offset);
1271                         op = op.getChild();
1272                     }
1273                     break;
1274 
1275                 case Op.MODIFIER:
1276                     {
1277                         int localopts = opts;
1278                         localopts |= op.getData();
1279                         localopts &= ~op.getData2();
1280                         opStack.push(op);
1281                         dataStack.push(opts);
1282                         dataStack.push(offset);
1283                         opts = localopts;
1284                         op = op.getChild();
1285                     }
1286                     break;
1287 
1288                 case Op.CONDITION:
1289                     {
1290                         Op.ConditionOp cop = (Op.ConditionOp)op;
1291                         if (cop.refNumber > 0) {
1292                             if (cop.refNumber >= this.nofparen) {
1293                                 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
1294                             }
1295                             if (con.match.getBeginning(cop.refNumber) >= 0
1296                                     && con.match.getEnd(cop.refNumber) >= 0) {
1297                                 op = cop.yes;
1298                             }
1299                             else if (cop.no != null) {
1300                                 op = cop.no;
1301                             }
1302                             else {
1303                                 op = cop.next;
1304                             }
1305                         }
1306                         else {
1307                             opStack.push(op);
1308                             dataStack.push(offset);
1309                             op = cop.condition;
1310                         }
1311                     }
1312                     break;
1313 
1314                 default:
1315                     throw new RuntimeException("Unknown operation type: " + op.type);
1316                 }
1317             }
1318 
1319             // handle recursive operations
1320             while (returned) {
1321                 // exhausted all the operations
1322                 if (opStack.isEmpty()) {
1323                     return retValue;
1324                 }
1325 
1326                 op = opStack.pop();
1327                 offset = dataStack.pop();
1328 
1329                 switch (op.type) {
1330                 case Op.CLOSURE:
1331                 case Op.QUESTION:
1332                     if (retValue < 0) {
1333                         op = op.next;
1334                         returned = false;
1335                     }
1336                     break;
1337 
1338                 case Op.NONGREEDYCLOSURE:
1339                 case Op.NONGREEDYQUESTION:
1340                     if (retValue < 0) {
1341                         op = op.getChild();
1342                         returned = false;
1343                     }
1344                     break;
1345 
1346                 case Op.UNION:
1347                     {
1348                         int unionIndex = dataStack.pop();
1349                         if (DEBUG) {
1350                             System.err.println("UNION: "+unionIndex+", ret="+retValue);
1351                         }
1352 
1353                         if (retValue < 0) {
1354                             if (++unionIndex < op.size()) {
1355                                 opStack.push(op);
1356                                 dataStack.push(unionIndex);
1357                                 dataStack.push(offset);
1358                                 op = op.elementAt(unionIndex);
1359                                 returned = false;
1360                             }
1361                             else {
1362                                 retValue = -1;
1363                             }
1364                         }
1365                     }
1366                     break;
1367 
1368                 case Op.CAPTURE:
1369                     final int refno = op.getData();
1370                     final int saved = dataStack.pop();
1371                     if (retValue < 0) {
1372                         if (refno > 0) {
1373                             con.match.setBeginning(refno, saved);
1374                         }
1375                         else {
1376                             con.match.setEnd(-refno, saved);
1377                         }
1378                     }
1379                     break;
1380 
1381                 case Op.LOOKAHEAD:
1382                 case Op.LOOKBEHIND:
1383                     {
1384                         dx = dataStack.pop();
1385                         if (0 <= retValue) {
1386                             op = op.next;
1387                             returned = false;
1388                         }
1389                         retValue = -1;
1390                     }
1391                     break;
1392 
1393                 case Op.NEGATIVELOOKAHEAD:
1394                 case Op.NEGATIVELOOKBEHIND:
1395                     {
1396                         dx = dataStack.pop();
1397                         if (0 > retValue)  {
1398                             op = op.next;
1399                             returned = false;
1400                         }
1401                         retValue = -1;
1402                     }
1403                     break;
1404 
1405                 case Op.MODIFIER:
1406                     opts = dataStack.pop();
1407                     // fall through
1408 
1409                 case Op.INDEPENDENT:
1410                     if (retValue >= 0)  {
1411                         offset = retValue;
1412                         op = op.next;
1413                         returned = false;
1414                     }
1415                     break;
1416 
1417                 case Op.CONDITION:
1418                     {
1419                         final Op.ConditionOp cop = (Op.ConditionOp)op;
1420                         if (0 <= retValue) {
1421                             op = cop.yes;
1422                         }
1423                         else if (cop.no != null) {
1424                             op = cop.no;
1425                         }
1426                         else {
1427                             op = cop.next;
1428                         }
1429                     }
1430                     returned = false;
1431                     break;
1432 
1433                 default:
1434                     break;
1435                 }
1436             }
1437         }
1438     }
1439 
matchChar(int ch, int other, boolean ignoreCase)1440     private boolean matchChar(int ch, int other, boolean ignoreCase) {
1441         return (ignoreCase) ? matchIgnoreCase(ch, other) : ch == other;
1442     }
1443 
matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts)1444     boolean matchAnchor(ExpressionTarget target, Op op, Context con, int offset, int opts) {
1445         boolean go = false;
1446         switch (op.getData()) {
1447         case '^':
1448             if (isSet(opts, MULTIPLE_LINES)) {
1449                 if (!(offset == con.start
1450                       || offset > con.start && offset < con.limit && isEOLChar(target.charAt(offset-1))))
1451                     return false;
1452             } else {
1453                 if (offset != con.start)
1454                     return false;
1455             }
1456             break;
1457 
1458         case '@':                         // Internal use only.
1459             // The @ always matches line beginnings.
1460             if (!(offset == con.start
1461                   || offset > con.start && isEOLChar(target.charAt(offset-1))))
1462                 return false;
1463             break;
1464 
1465         case '$':
1466             if (isSet(opts, MULTIPLE_LINES)) {
1467                 if (!(offset == con.limit
1468                       || offset < con.limit && isEOLChar(target.charAt(offset))))
1469                     return false;
1470             } else {
1471                 if (!(offset == con.limit
1472                       || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1473                       || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1474                       &&  target.charAt(offset+1) == LINE_FEED))
1475                     return false;
1476             }
1477             break;
1478 
1479         case 'A':
1480             if (offset != con.start)  return false;
1481             break;
1482 
1483         case 'Z':
1484             if (!(offset == con.limit
1485                   || offset+1 == con.limit && isEOLChar(target.charAt(offset))
1486                   || offset+2 == con.limit &&  target.charAt(offset) == CARRIAGE_RETURN
1487                   &&  target.charAt(offset+1) == LINE_FEED))
1488                 return false;
1489             break;
1490 
1491         case 'z':
1492             if (offset != con.limit)  return false;
1493             break;
1494 
1495         case 'b':
1496             if (con.length == 0)
1497                 return false;
1498             {
1499                 int after = getWordType(target, con.start, con.limit, offset, opts);
1500                 if (after == WT_IGNORE)  return false;
1501                 int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
1502                 if (after == before)  return false;
1503             }
1504             break;
1505 
1506         case 'B':
1507             if (con.length == 0)
1508                 go = true;
1509             else {
1510                 int after = getWordType(target, con.start, con.limit, offset, opts);
1511                 go = after == WT_IGNORE
1512                      || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
1513             }
1514             if (!go)  return false;
1515             break;
1516 
1517         case '<':
1518             if (con.length == 0 || offset == con.limit)  return false;
1519             if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
1520                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
1521                 return false;
1522             break;
1523 
1524         case '>':
1525             if (con.length == 0 || offset == con.start)  return false;
1526             if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
1527                 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
1528                 return false;
1529             break;
1530         } // switch anchor type
1531 
1532         return true;
1533     }
1534 
getPreviousWordType(ExpressionTarget target, int begin, int end, int offset, int opts)1535     private static final int getPreviousWordType(ExpressionTarget target, int begin, int end,
1536                                                  int offset, int opts) {
1537         int ret = getWordType(target, begin, end, --offset, opts);
1538         while (ret == WT_IGNORE)
1539             ret = getWordType(target, begin, end, --offset, opts);
1540         return ret;
1541     }
1542 
getWordType(ExpressionTarget target, int begin, int end, int offset, int opts)1543     private static final int getWordType(ExpressionTarget target, int begin, int end,
1544                                          int offset, int opts) {
1545         if (offset < begin || offset >= end)  return WT_OTHER;
1546         return getWordType0(target.charAt(offset) , opts);
1547     }
1548 
1549 
1550     /**
1551      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1552      *
1553      * @return true if the target is matched to this regular expression.
1554      */
matches(CharacterIterator target)1555     public boolean matches(CharacterIterator target) {
1556         return this.matches(target, (Match)null);
1557     }
1558 
1559 
1560     /**
1561      * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
1562      *
1563      * @param match A Match instance for storing matching result.
1564      * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
1565      */
matches(CharacterIterator target, Match match)1566     public boolean matches(CharacterIterator  target, Match match) {
1567         int start = target.getBeginIndex();
1568         int end = target.getEndIndex();
1569 
1570 
1571 
1572         synchronized (this) {
1573             if (this.operations == null)
1574                 this.prepare();
1575             if (this.context == null)
1576                 this.context = new Context();
1577         }
1578         Context con = null;
1579         synchronized (this.context) {
1580             con = this.context.inuse ? new Context() : this.context;
1581             con.reset(target, start, end, this.numberOfClosures);
1582         }
1583         if (match != null) {
1584             match.setNumberOfGroups(this.nofparen);
1585             match.setSource(target);
1586         } else if (this.hasBackReferences) {
1587             match = new Match();
1588             match.setNumberOfGroups(this.nofparen);
1589             // Need not to call setSource() because
1590             // a caller can not access this match instance.
1591         }
1592         con.match = match;
1593 
1594         if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
1595             int matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1596             //System.err.println("DEBUG: matchEnd="+matchEnd);
1597             if (matchEnd == con.limit) {
1598                 if (con.match != null) {
1599                     con.match.setBeginning(0, con.start);
1600                     con.match.setEnd(0, matchEnd);
1601                 }
1602                 con.setInUse(false);
1603                 return true;
1604             }
1605             return false;
1606         }
1607 
1608         /*
1609          * The pattern has only fixed string.
1610          * The engine uses Boyer-Moore.
1611          */
1612         if (this.fixedStringOnly) {
1613             //System.err.println("DEBUG: fixed-only: "+this.fixedString);
1614             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1615             if (o >= 0) {
1616                 if (con.match != null) {
1617                     con.match.setBeginning(0, o);
1618                     con.match.setEnd(0, o+this.fixedString.length());
1619                 }
1620                 con.setInUse(false);
1621                 return true;
1622             }
1623             con.setInUse(false);
1624             return false;
1625         }
1626 
1627         /*
1628          * The pattern contains a fixed string.
1629          * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
1630          * If not, it return with false.
1631          */
1632         if (this.fixedString != null) {
1633             int o = this.fixedStringTable.matches(target, con.start, con.limit);
1634             if (o < 0) {
1635                 //System.err.println("Non-match in fixed-string search.");
1636                 con.setInUse(false);
1637                 return false;
1638             }
1639         }
1640 
1641         int limit = con.limit-this.minlength;
1642         int matchStart;
1643         int matchEnd = -1;
1644 
1645         /*
1646          * Checks whether the expression starts with ".*".
1647          */
1648         if (this.operations != null
1649             && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
1650             if (isSet(this.options, SINGLE_LINE)) {
1651                 matchStart = con.start;
1652                 matchEnd = this.match(con, this.operations, con.start, 1, this.options);
1653             } else {
1654                 boolean previousIsEOL = true;
1655                 for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1656                     int ch =  target .setIndex(  matchStart ) ;
1657                     if (isEOLChar(ch)) {
1658                         previousIsEOL = true;
1659                     } else {
1660                         if (previousIsEOL) {
1661                             if (0 <= (matchEnd = this.match(con, this.operations,
1662                                                             matchStart, 1, this.options)))
1663                                 break;
1664                         }
1665                         previousIsEOL = false;
1666                     }
1667                 }
1668             }
1669         }
1670 
1671         /*
1672          * Optimization against the first character.
1673          */
1674         else if (this.firstChar != null) {
1675             //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
1676             RangeToken range = this.firstChar;
1677             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1678                 int ch =  target .setIndex(  matchStart ) ;
1679                 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
1680                     ch = REUtil.composeFromSurrogates(ch, target.setIndex(matchStart+1));
1681                 }
1682                 if (!range.match(ch)) {
1683                     continue;
1684                 }
1685                 if (0 <= (matchEnd = this.match(con, this.operations,
1686                                                 matchStart, 1, this.options))) {
1687                     break;
1688                 }
1689             }
1690         }
1691 
1692         /*
1693          * Straightforward matching.
1694          */
1695         else {
1696             for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
1697                 if (0 <= (matchEnd = this. match(con, this.operations, matchStart, 1, this.options)))
1698                     break;
1699             }
1700         }
1701 
1702         if (matchEnd >= 0) {
1703             if (con.match != null) {
1704                 con.match.setBeginning(0, matchStart);
1705                 con.match.setEnd(0, matchEnd);
1706             }
1707             con.setInUse(false);
1708             return true;
1709         } else {
1710             con.setInUse(false);
1711             return false;
1712         }
1713     }
1714 
1715     // ================================================================
1716 
1717     /**
1718      * A regular expression.
1719      * @serial
1720      */
1721     String regex;
1722     /**
1723      * @serial
1724      */
1725     int options;
1726 
1727     /**
1728      * The number of parenthesis in the regular expression.
1729      * @serial
1730      */
1731     int nofparen;
1732     /**
1733      * Internal representation of the regular expression.
1734      * @serial
1735      */
1736     Token tokentree;
1737 
1738     boolean hasBackReferences = false;
1739 
1740     transient int minlength;
1741     transient Op operations = null;
1742     transient int numberOfClosures;
1743     transient Context context = null;
1744     transient RangeToken firstChar = null;
1745 
1746     transient String fixedString = null;
1747     transient int fixedStringOptions;
1748     transient BMPattern fixedStringTable = null;
1749     transient boolean fixedStringOnly = false;
1750 
1751     static abstract class ExpressionTarget {
charAt(int index)1752         abstract char charAt(int index);
regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1753         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen);
regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1754         abstract boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen);
1755     }
1756 
1757     static final class StringTarget extends ExpressionTarget {
1758 
1759         private String target;
1760 
StringTarget(String target)1761         StringTarget(String target) {
1762             this.target = target;
1763         }
1764 
resetTarget(String target)1765         final void resetTarget(String target) {
1766             this.target = target;
1767         }
1768 
charAt(int index)1769         final char charAt(int index) {
1770             return target.charAt(index);
1771         }
1772 
regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1773         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1774                               String part, int partlen) {
1775             if (limit-offset < partlen) {
1776                 return false;
1777             }
1778             return (ignoreCase) ? target.regionMatches(true, offset, part, 0, partlen) : target.regionMatches(offset, part, 0, partlen);
1779         }
1780 
regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1781         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1782                                     int offset2, int partlen) {
1783             if (limit-offset < partlen) {
1784                 return false;
1785             }
1786             return (ignoreCase) ? target.regionMatches(true, offset, target, offset2, partlen)
1787                                 : target.regionMatches(offset, target, offset2, partlen);
1788         }
1789     }
1790 
1791     static final class CharArrayTarget extends ExpressionTarget {
1792 
1793         char[] target;
1794 
CharArrayTarget(char[] target)1795         CharArrayTarget(char[] target) {
1796             this.target = target;
1797         }
1798 
resetTarget(char[] target)1799         final void resetTarget(char[] target) {
1800             this.target = target;
1801         }
1802 
charAt(int index)1803         char charAt(int index) {
1804             return target[index];
1805         }
1806 
regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1807         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1808                 String part, int partlen) {
1809             if (offset < 0 || limit-offset < partlen)  {
1810                 return false;
1811             }
1812             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1813                                 : regionMatches(offset, limit, part, partlen);
1814         }
1815 
regionMatches(int offset, int limit, String part, int partlen)1816         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1817             int i = 0;
1818             while (partlen-- > 0) {
1819                 if (target[offset++] != part.charAt(i++)) {
1820                     return false;
1821                 }
1822             }
1823             return true;
1824         }
1825 
regionMatchesIgnoreCase(int offset, int limit, String part, int partlen)1826         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1827             int i = 0;
1828             while (partlen-- > 0) {
1829                 final char ch1 = target[offset++] ;
1830                 final char ch2 = part.charAt(i++);
1831                 if (ch1 == ch2) {
1832                     continue;
1833                 }
1834                 final char uch1 = Character.toUpperCase(ch1);
1835                 final char uch2 = Character.toUpperCase(ch2);
1836                 if (uch1 == uch2) {
1837                     continue;
1838                 }
1839                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1840                     return false;
1841                 }
1842             }
1843             return true;
1844         }
1845 
regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1846         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1847             if (offset < 0 || limit-offset < partlen) {
1848                 return false;
1849             }
1850             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1851                                 : regionMatches(offset, limit, offset2, partlen);
1852         }
1853 
regionMatches(int offset, int limit, int offset2, int partlen)1854         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1855             int i = offset2;
1856             while (partlen-- > 0) {
1857                 if ( target [  offset++ ]  !=  target [  i++ ] )
1858                     return false;
1859             }
1860             return true;
1861         }
1862 
regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen)1863         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1864             int i = offset2;
1865             while (partlen-- > 0) {
1866                 final char ch1 =  target[offset++] ;
1867                 final char ch2 =  target[i++] ;
1868                 if (ch1 == ch2) {
1869                     continue;
1870                 }
1871                 final char uch1 = Character.toUpperCase(ch1);
1872                 final char uch2 = Character.toUpperCase(ch2);
1873                 if (uch1 == uch2) {
1874                     continue;
1875                 }
1876                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1877                     return false;
1878                 }
1879             }
1880             return true;
1881         }
1882     }
1883 
1884     static final class CharacterIteratorTarget extends ExpressionTarget {
1885         CharacterIterator target;
1886 
CharacterIteratorTarget(CharacterIterator target)1887         CharacterIteratorTarget(CharacterIterator target) {
1888             this.target = target;
1889         }
1890 
resetTarget(CharacterIterator target)1891         final void resetTarget(CharacterIterator target) {
1892             this.target = target;
1893         }
1894 
charAt(int index)1895         final char charAt(int index) {
1896             return target.setIndex(index);
1897         }
1898 
regionMatches(boolean ignoreCase, int offset, int limit, String part, int partlen)1899         final boolean regionMatches(boolean ignoreCase, int offset, int limit,
1900                 String part, int partlen) {
1901             if (offset < 0 || limit-offset < partlen)  {
1902                 return false;
1903             }
1904             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, part, partlen)
1905                                 : regionMatches(offset, limit, part, partlen);
1906         }
1907 
regionMatches(int offset, int limit, String part, int partlen)1908         private final boolean regionMatches(int offset, int limit, String part, int partlen) {
1909             int i = 0;
1910             while (partlen-- > 0) {
1911                 if (target.setIndex(offset++) != part.charAt(i++)) {
1912                     return false;
1913                 }
1914             }
1915             return true;
1916         }
1917 
regionMatchesIgnoreCase(int offset, int limit, String part, int partlen)1918         private final boolean regionMatchesIgnoreCase(int offset, int limit, String part, int partlen) {
1919             int i = 0;
1920             while (partlen-- > 0) {
1921                 final char ch1 = target.setIndex(offset++) ;
1922                 final char ch2 = part.charAt(i++);
1923                 if (ch1 == ch2) {
1924                     continue;
1925                 }
1926                 final char uch1 = Character.toUpperCase(ch1);
1927                 final char uch2 = Character.toUpperCase(ch2);
1928                 if (uch1 == uch2) {
1929                     continue;
1930                 }
1931                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1932                     return false;
1933                 }
1934             }
1935             return true;
1936         }
1937 
regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen)1938         final boolean regionMatches(boolean ignoreCase, int offset, int limit, int offset2, int partlen) {
1939             if (offset < 0 || limit-offset < partlen) {
1940                 return false;
1941             }
1942             return (ignoreCase) ? regionMatchesIgnoreCase(offset, limit, offset2, partlen)
1943                                 : regionMatches(offset, limit, offset2, partlen);
1944         }
1945 
regionMatches(int offset, int limit, int offset2, int partlen)1946         private final boolean regionMatches(int offset, int limit, int offset2, int partlen) {
1947             int i = offset2;
1948             while (partlen-- > 0) {
1949                 if (target.setIndex(offset++) != target.setIndex(i++)) {
1950                     return false;
1951                 }
1952             }
1953             return true;
1954         }
1955 
regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen)1956         private final boolean regionMatchesIgnoreCase(int offset, int limit, int offset2, int partlen) {
1957             int i = offset2;
1958             while (partlen-- > 0) {
1959                 final char ch1 = target.setIndex(offset++) ;
1960                 final char ch2 = target.setIndex(i++) ;
1961                 if (ch1 == ch2) {
1962                     continue;
1963                 }
1964                 final char uch1 = Character.toUpperCase(ch1);
1965                 final char uch2 = Character.toUpperCase(ch2);
1966                 if (uch1 == uch2) {
1967                     continue;
1968                 }
1969                 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) {
1970                     return false;
1971                 }
1972             }
1973             return true;
1974         }
1975     }
1976 
1977     static final class ClosureContext {
1978 
1979         int[] offsets = new int[4];
1980         int currentIndex = 0;
1981 
contains(int offset)1982         boolean contains(int offset) {
1983             for (int i=0; i<currentIndex;++i) {
1984                 if (offsets[i] == offset) {
1985                     return true;
1986                 }
1987             }
1988             return false;
1989         }
1990 
reset()1991         void reset() {
1992             currentIndex = 0;
1993         }
1994 
addOffset(int offset)1995         void addOffset(int offset) {
1996             // We do not check for duplicates, caller is responsible for that
1997             if (currentIndex == offsets.length) {
1998                 offsets = expandOffsets();
1999             }
2000             offsets[currentIndex++] = offset;
2001         }
2002 
expandOffsets()2003         private int[] expandOffsets() {
2004             final int len = offsets.length;
2005             final int newLen = len << 1;
2006             int[] newOffsets = new int[newLen];
2007 
2008             System.arraycopy(offsets, 0, newOffsets, 0, currentIndex);
2009             return newOffsets;
2010         }
2011     }
2012 
2013     static final class Context {
2014         int start;
2015         int limit;
2016         int length;
2017         Match match;
2018         boolean inuse = false;
2019         ClosureContext[] closureContexts;
2020 
2021         private StringTarget stringTarget;
2022         private CharArrayTarget charArrayTarget;
2023         private CharacterIteratorTarget characterIteratorTarget;
2024 
2025         ExpressionTarget target;
2026 
Context()2027         Context() {
2028         }
2029 
resetCommon(int nofclosures)2030         private void resetCommon(int nofclosures) {
2031             this.length = this.limit-this.start;
2032             setInUse(true);
2033             this.match = null;
2034             if (this.closureContexts == null || this.closureContexts.length != nofclosures) {
2035                 this.closureContexts = new ClosureContext[nofclosures];
2036             }
2037             for (int i = 0;  i < nofclosures;  i ++)  {
2038                 if (this.closureContexts[i] == null) {
2039                     this.closureContexts[i] = new ClosureContext();
2040                 }
2041                 else {
2042                     this.closureContexts[i].reset();
2043                 }
2044             }
2045         }
2046 
reset(CharacterIterator target, int start, int limit, int nofclosures)2047         void reset(CharacterIterator target, int start, int limit, int nofclosures) {
2048             if (characterIteratorTarget == null) {
2049                 characterIteratorTarget = new CharacterIteratorTarget(target);
2050             }
2051             else {
2052                 characterIteratorTarget.resetTarget(target);
2053             }
2054             this.target = characterIteratorTarget;
2055             this.start = start;
2056             this.limit = limit;
2057             this.resetCommon(nofclosures);
2058         }
2059 
reset(String target, int start, int limit, int nofclosures)2060         void reset(String target, int start, int limit, int nofclosures) {
2061             if (stringTarget == null) {
2062                 stringTarget = new StringTarget(target);
2063             }
2064             else {
2065                 stringTarget.resetTarget(target);
2066             }
2067             this.target = stringTarget;
2068             this.start = start;
2069             this.limit = limit;
2070             this.resetCommon(nofclosures);
2071         }
2072 
reset(char[] target, int start, int limit, int nofclosures)2073         void reset(char[] target, int start, int limit, int nofclosures) {
2074             if (charArrayTarget == null) {
2075                 charArrayTarget = new CharArrayTarget(target);
2076             }
2077             else {
2078                 charArrayTarget.resetTarget(target);
2079             }
2080             this.target = charArrayTarget;
2081             this.start = start;
2082             this.limit = limit;
2083             this.resetCommon(nofclosures);
2084         }
setInUse(boolean inUse)2085         synchronized void setInUse(boolean inUse) {
2086             this.inuse = inUse;
2087         }
2088     }
2089 
2090     /**
2091      * Prepares for matching.  This method is called just before starting matching.
2092      */
prepare()2093     void prepare() {
2094         if (Op.COUNT)  Op.nofinstances = 0;
2095         this.compile(this.tokentree);
2096         /*
2097         if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
2098             Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
2099             anchor.next = this.operations;
2100             this.operations = anchor;
2101         }
2102         */
2103         if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
2104 
2105         this.minlength = this.tokentree.getMinLength();
2106 
2107         this.firstChar = null;
2108         if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
2109             && !isSet(this.options, XMLSCHEMA_MODE)) {
2110             RangeToken firstChar = Token.createRange();
2111             int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
2112             if (fresult == Token.FC_TERMINAL) {
2113                 firstChar.compactRanges();
2114                 this.firstChar = firstChar;
2115                 if (DEBUG)
2116                     System.err.println("DEBUG: Use the first character optimization: "+firstChar);
2117             }
2118         }
2119 
2120         if (this.operations != null
2121             && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
2122             && this.operations.next == null) {
2123             if (DEBUG)
2124                 System.err.print(" *** Only fixed string! *** ");
2125             this.fixedStringOnly = true;
2126             if (this.operations.type == Op.STRING)
2127                 this.fixedString = this.operations.getString();
2128             else if (this.operations.getData() >= 0x10000) { // Op.CHAR
2129                 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
2130             } else {
2131                 char[] ac = new char[1];
2132                 ac[0] = (char)this.operations.getData();
2133                 this.fixedString = new String(ac);
2134             }
2135             this.fixedStringOptions = this.options;
2136             this.fixedStringTable = new BMPattern(this.fixedString, 256,
2137                                                   isSet(this.fixedStringOptions, IGNORE_CASE));
2138         } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
2139                    && !isSet(this.options, XMLSCHEMA_MODE)) {
2140             Token.FixedStringContainer container = new Token.FixedStringContainer();
2141             this.tokentree.findFixedString(container, this.options);
2142             this.fixedString = container.token == null ? null : container.token.getString();
2143             this.fixedStringOptions = container.options;
2144             if (this.fixedString != null && this.fixedString.length() < 2)
2145                 this.fixedString = null;
2146             // This pattern has a fixed string of which length is more than one.
2147             if (this.fixedString != null) {
2148                 this.fixedStringTable = new BMPattern(this.fixedString, 256,
2149                                                       isSet(this.fixedStringOptions, IGNORE_CASE));
2150                 if (DEBUG) {
2151                     System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
2152                                        +"/" //+this.fixedString
2153                                        +"/"+REUtil.createOptionString(this.fixedStringOptions));
2154                     System.err.print("String: ");
2155                     REUtil.dumpString(this.fixedString);
2156                 }
2157             }
2158         }
2159     }
2160 
2161     /**
2162      * An option.
2163      * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
2164      * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span>
2165      * does not capture.
2166      *
2167      * @see #RegularExpression(java.lang.String,int)
2168      * @see #setPattern(java.lang.String,int)
2169     static final int MARK_PARENS = 1<<0;
2170      */
2171 
2172     /**
2173      * "i"
2174      */
2175     static final int IGNORE_CASE = 1<<1;
2176 
2177     /**
2178      * "s"
2179      */
2180     static final int SINGLE_LINE = 1<<2;
2181 
2182     /**
2183      * "m"
2184      */
2185     static final int MULTIPLE_LINES = 1<<3;
2186 
2187     /**
2188      * "x"
2189      */
2190     static final int EXTENDED_COMMENT = 1<<4;
2191 
2192     /**
2193      * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
2194      *
2195      * @see #RegularExpression(java.lang.String,int)
2196      * @see #setPattern(java.lang.String,int)
2197      * @see #UNICODE_WORD_BOUNDARY
2198      */
2199     static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
2200 
2201     /**
2202      * An option.
2203      * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
2204      * <p>By default, the engine considers a position between a word character
2205      * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
2206      * is a word boundary.
2207      * <p>By this option, the engine checks word boundaries with the method of
2208      * 'Unicode Regular Expression Guidelines' Revision 4.
2209      *
2210      * @see #RegularExpression(java.lang.String,int)
2211      * @see #setPattern(java.lang.String,int)
2212      */
2213     static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
2214 
2215     /**
2216      * "H"
2217      */
2218     static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
2219     /**
2220      * "F"
2221      */
2222     static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
2223     /**
2224      * "X". XML Schema mode.
2225      */
2226     static final int XMLSCHEMA_MODE = 1<<9;
2227     /**
2228      * ",".
2229      */
2230     static final int SPECIAL_COMMA = 1<<10;
2231 
2232 
isSet(int options, int flag)2233     private static final boolean isSet(int options, int flag) {
2234         return (options & flag) == flag;
2235     }
2236 
2237     /**
2238      * Creates a new RegularExpression instance.
2239      *
2240      * @param regex A regular expression
2241      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2242      */
RegularExpression(String regex)2243     public RegularExpression(String regex) throws ParseException {
2244         this(regex, null);
2245     }
2246 
2247     /**
2248      * Creates a new RegularExpression instance with options.
2249      *
2250      * @param regex A regular expression
2251      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2252      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2253      */
RegularExpression(String regex, String options)2254     public RegularExpression(String regex, String options) throws ParseException {
2255         this.setPattern(regex, options);
2256     }
2257 
2258     /**
2259      * Creates a new RegularExpression instance with options.
2260      *
2261      * @param regex A regular expression
2262      * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
2263      * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
2264      */
RegularExpression(String regex, String options, Locale locale)2265     public RegularExpression(String regex, String options, Locale locale) throws ParseException {
2266         this.setPattern(regex, options, locale);
2267     }
2268 
RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options)2269     RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
2270         this.regex = regex;
2271         this.tokentree = tok;
2272         this.nofparen = parens;
2273         this.options = options;
2274         this.hasBackReferences = hasBackReferences;
2275     }
2276 
2277     /**
2278      *
2279      */
setPattern(String newPattern)2280     public void setPattern(String newPattern) throws ParseException {
2281         this.setPattern(newPattern, Locale.getDefault());
2282     }
2283 
setPattern(String newPattern, Locale locale)2284     public void setPattern(String newPattern, Locale locale) throws ParseException {
2285         this.setPattern(newPattern, this.options, locale);
2286     }
2287 
setPattern(String newPattern, int options, Locale locale)2288     private void setPattern(String newPattern, int options, Locale locale) throws ParseException {
2289         this.regex = newPattern;
2290         this.options = options;
2291         RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
2292                          ? new ParserForXMLSchema(locale) : new RegexParser(locale);
2293         this.tokentree = rp.parse(this.regex, this.options);
2294         this.nofparen = rp.parennumber;
2295         this.hasBackReferences = rp.hasBackReferences;
2296 
2297         this.operations = null;
2298         this.context = null;
2299     }
2300     /**
2301      *
2302      */
setPattern(String newPattern, String options)2303     public void setPattern(String newPattern, String options) throws ParseException {
2304         this.setPattern(newPattern, options, Locale.getDefault());
2305     }
2306 
setPattern(String newPattern, String options, Locale locale)2307     public void setPattern(String newPattern, String options, Locale locale) throws ParseException {
2308         this.setPattern(newPattern, REUtil.parseOptions(options), locale);
2309     }
2310 
2311     /**
2312      *
2313      */
getPattern()2314     public String getPattern() {
2315         return this.regex;
2316     }
2317 
2318     /**
2319      * Represents this instence in String.
2320      */
toString()2321     public String toString() {
2322         return this.tokentree.toString(this.options);
2323     }
2324 
2325     /**
2326      * Returns a option string.
2327      * The order of letters in it may be different from a string specified
2328      * in a constructor or <code>setPattern()</code>.
2329      *
2330      * @see #RegularExpression(java.lang.String,java.lang.String)
2331      * @see #setPattern(java.lang.String,java.lang.String)
2332      */
getOptions()2333     public String getOptions() {
2334         return REUtil.createOptionString(this.options);
2335     }
2336 
2337     /**
2338      *  Return true if patterns are the same and the options are equivalent.
2339      */
equals(Object obj)2340     public boolean equals(Object obj) {
2341         if (obj == null)  return false;
2342         if (!(obj instanceof RegularExpression))
2343             return false;
2344         RegularExpression r = (RegularExpression)obj;
2345         return this.regex.equals(r.regex) && this.options == r.options;
2346     }
2347 
equals(String pattern, int options)2348     boolean equals(String pattern, int options) {
2349         return this.regex.equals(pattern) && this.options == options;
2350     }
2351 
2352     /**
2353      *
2354      */
hashCode()2355     public int hashCode() {
2356         return (this.regex+"/"+this.getOptions()).hashCode();
2357     }
2358 
2359     /**
2360      * Return the number of regular expression groups.
2361      * This method returns 1 when the regular expression has no capturing-parenthesis.
2362      *
2363      */
getNumberOfGroups()2364     public int getNumberOfGroups() {
2365         return this.nofparen;
2366     }
2367 
2368     // ================================================================
2369 
2370     private static final int WT_IGNORE = 0;
2371     private static final int WT_LETTER = 1;
2372     private static final int WT_OTHER = 2;
getWordType0(char ch, int opts)2373     private static final int getWordType0(char ch, int opts) {
2374         if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
2375             if (isSet(opts, USE_UNICODE_CATEGORY)) {
2376                 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
2377             }
2378             return isWordChar(ch) ? WT_LETTER : WT_OTHER;
2379         }
2380 
2381         switch (Character.getType(ch)) {
2382         case Character.UPPERCASE_LETTER:      // L
2383         case Character.LOWERCASE_LETTER:      // L
2384         case Character.TITLECASE_LETTER:      // L
2385         case Character.MODIFIER_LETTER:       // L
2386         case Character.OTHER_LETTER:          // L
2387         case Character.LETTER_NUMBER:         // N
2388         case Character.DECIMAL_DIGIT_NUMBER:  // N
2389         case Character.OTHER_NUMBER:          // N
2390         case Character.COMBINING_SPACING_MARK: // Mc
2391             return WT_LETTER;
2392 
2393         case Character.FORMAT:                // Cf
2394         case Character.NON_SPACING_MARK:      // Mn
2395         case Character.ENCLOSING_MARK:        // Mc
2396             return WT_IGNORE;
2397 
2398         case Character.CONTROL:               // Cc
2399             switch (ch) {
2400             case '\t':
2401             case '\n':
2402             case '\u000B':
2403             case '\f':
2404             case '\r':
2405                 return WT_OTHER;
2406             default:
2407                 return WT_IGNORE;
2408             }
2409 
2410         default:
2411             return WT_OTHER;
2412         }
2413     }
2414 
2415     // ================================================================
2416 
2417     static final int LINE_FEED = 0x000A;
2418     static final int CARRIAGE_RETURN = 0x000D;
2419     static final int LINE_SEPARATOR = 0x2028;
2420     static final int PARAGRAPH_SEPARATOR = 0x2029;
2421 
isEOLChar(int ch)2422     private static final boolean isEOLChar(int ch) {
2423         return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
2424         || ch == PARAGRAPH_SEPARATOR;
2425     }
2426 
isWordChar(int ch)2427     private static final boolean isWordChar(int ch) { // Legacy word characters
2428         if (ch == '_')  return true;
2429         if (ch < '0')  return false;
2430         if (ch > 'z')  return false;
2431         if (ch <= '9')  return true;
2432         if (ch < 'A')  return false;
2433         if (ch <= 'Z')  return true;
2434         if (ch < 'a')  return false;
2435         return true;
2436     }
2437 
matchIgnoreCase(int chardata, int ch)2438     private static final boolean matchIgnoreCase(int chardata, int ch) {
2439         if (chardata == ch)  return true;
2440         if (chardata > 0xffff || ch > 0xffff)  return false;
2441         char uch1 = Character.toUpperCase((char)chardata);
2442         char uch2 = Character.toUpperCase((char)ch);
2443         if (uch1 == uch2)  return true;
2444         return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
2445     }
2446 }
2447