1 /*
2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
3  */
4 /*
5  * Licensed to the Apache Software Foundation (ASF) under one or more
6  * contributor license agreements.  See the NOTICE file distributed with
7  * this work for additional information regarding copyright ownership.
8  * The ASF licenses this file to You under the Apache License, Version 2.0
9  * (the "License"); you may not use this file except in compliance with
10  * the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 package com.sun.org.apache.xerces.internal.impl.xpath.regex;
22 
23 import java.text.CharacterIterator;
24 
25 /**
26  * @xerces.internal
27  *
28  */
29 public final class REUtil {
REUtil()30     private REUtil() {
31     }
32 
composeFromSurrogates(int high, int low)33     static final int composeFromSurrogates(int high, int low) {
34         return 0x10000 + ((high-0xd800)<<10) + low-0xdc00;
35     }
36 
isLowSurrogate(int ch)37     static final boolean isLowSurrogate(int ch) {
38         return (ch & 0xfc00) == 0xdc00;
39     }
40 
isHighSurrogate(int ch)41     static final boolean isHighSurrogate(int ch) {
42         return (ch & 0xfc00) == 0xd800;
43     }
44 
decomposeToSurrogates(int ch)45     static final String decomposeToSurrogates(int ch) {
46         char[] chs = new char[2];
47         ch -= 0x10000;
48         chs[0] = (char)((ch>>10)+0xd800);
49         chs[1] = (char)((ch&0x3ff)+0xdc00);
50         return new String(chs);
51     }
52 
substring(CharacterIterator iterator, int begin, int end)53     static final String substring(CharacterIterator iterator, int begin, int end) {
54         char[] src = new char[end-begin];
55         for (int i = 0;  i < src.length;  i ++)
56             src[i] = iterator.setIndex(i+begin);
57         return new String(src);
58     }
59 
60     // ================================================================
61 
getOptionValue(int ch)62     static final int getOptionValue(int ch) {
63         int ret = 0;
64         switch (ch) {
65           case 'i':
66             ret = RegularExpression.IGNORE_CASE;
67             break;
68           case 'm':
69             ret = RegularExpression.MULTIPLE_LINES;
70             break;
71           case 's':
72             ret = RegularExpression.SINGLE_LINE;
73             break;
74           case 'x':
75             ret = RegularExpression.EXTENDED_COMMENT;
76             break;
77           case 'u':
78             ret = RegularExpression.USE_UNICODE_CATEGORY;
79             break;
80           case 'w':
81             ret = RegularExpression.UNICODE_WORD_BOUNDARY;
82             break;
83           case 'F':
84             ret = RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION;
85             break;
86           case 'H':
87             ret = RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
88             break;
89           case 'X':
90             ret = RegularExpression.XMLSCHEMA_MODE;
91             break;
92           case ',':
93             ret = RegularExpression.SPECIAL_COMMA;
94             break;
95           default:
96         }
97         return ret;
98     }
99 
parseOptions(String opts)100     static final int parseOptions(String opts) throws ParseException {
101         if (opts == null)  return 0;
102         int options = 0;
103         for (int i = 0;  i < opts.length();  i ++) {
104             int v = getOptionValue(opts.charAt(i));
105             if (v == 0)
106                 throw new ParseException("Unknown Option: "+opts.substring(i), -1);
107             options |= v;
108         }
109         return options;
110     }
111 
createOptionString(int options)112     static final String createOptionString(int options) {
113         StringBuilder sb = new StringBuilder(9);
114         if ((options & RegularExpression.PROHIBIT_FIXED_STRING_OPTIMIZATION) != 0)
115             sb.append('F');
116         if ((options & RegularExpression.PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) != 0)
117             sb.append('H');
118         if ((options & RegularExpression.XMLSCHEMA_MODE) != 0)
119             sb.append('X');
120         if ((options & RegularExpression.IGNORE_CASE) != 0)
121             sb.append('i');
122         if ((options & RegularExpression.MULTIPLE_LINES) != 0)
123             sb.append('m');
124         if ((options & RegularExpression.SINGLE_LINE) != 0)
125             sb.append('s');
126         if ((options & RegularExpression.USE_UNICODE_CATEGORY) != 0)
127             sb.append('u');
128         if ((options & RegularExpression.UNICODE_WORD_BOUNDARY) != 0)
129             sb.append('w');
130         if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
131             sb.append('x');
132         if ((options & RegularExpression.SPECIAL_COMMA) != 0)
133             sb.append(',');
134         return sb.toString().intern();
135     }
136 
137     // ================================================================
138 
stripExtendedComment(String regex)139     static String stripExtendedComment(String regex) {
140         int len = regex.length();
141         StringBuilder buffer = new StringBuilder(len);
142         int offset = 0;
143         int charClass = 0;
144         while (offset < len) {
145             int ch = regex.charAt(offset++);
146                                                 // Skips a white space.
147             if (ch == '\t' || ch == '\n' || ch == '\f' || ch == '\r' || ch == ' ') {
148                 // if we are inside a character class, we keep the white space
149                 if (charClass > 0) {
150                     buffer.append((char)ch);
151                 }
152                 continue;
153             }
154 
155             if (ch == '#') {                    // Skips chracters between '#' and a line end.
156                 while (offset < len) {
157                     ch = regex.charAt(offset++);
158                     if (ch == '\r' || ch == '\n')
159                         break;
160                 }
161                 continue;
162             }
163 
164             int next;                           // Strips an escaped white space.
165             if (ch == '\\' && offset < len) {
166                 if ((next = regex.charAt(offset)) == '#'
167                     || next == '\t' || next == '\n' || next == '\f'
168                     || next == '\r' || next == ' ') {
169                     buffer.append((char)next);
170                     offset ++;
171                 } else {                        // Other escaped character.
172                     buffer.append('\\');
173                     buffer.append((char)next);
174                     offset ++;
175                 }
176             }
177             else if (ch == '[') {
178                 charClass++;
179                 buffer.append((char)ch);
180                 if (offset < len) {
181                     next = regex.charAt(offset);
182                     if (next == '[' || next ==']') {
183                         buffer.append((char)next);
184                         offset ++;
185                     }
186                     else if (next == '^' && offset + 1 < len) {
187                         next = regex.charAt(offset + 1);
188                         if (next == '[' || next ==']') {
189                             buffer.append('^');
190                             buffer.append((char)next);
191                             offset += 2;
192                         }
193                     }
194                 }
195             }
196             else {
197                 if (charClass > 0 && ch == ']') {
198                     --charClass;
199                 }
200                 buffer.append((char)ch);
201             }
202         }
203         return buffer.toString();
204     }
205 
206     // ================================================================
207 
208     /**
209      * Sample entry.
210      * <div>Usage: <KBD>com.sun.org.apache.xerces.internal.utils.regex.REUtil &lt;regex&gt; &lt;string&gt;</KBD></div>
211      */
main(String[] argv)212     public static void main(String[] argv) {
213         String pattern = null;
214         try {
215             String options = "";
216             String target = null;
217             if( argv.length == 0 ) {
218                 System.out.println( "Error:Usage: java REUtil -i|-m|-s|-u|-w|-X regularExpression String" );
219                 System.exit( 0 );
220             }
221             for (int i = 0;  i < argv.length;  i ++) {
222                 if (argv[i].length() == 0 || argv[i].charAt(0) != '-') {
223                     if (pattern == null)
224                         pattern = argv[i];
225                     else if (target == null)
226                         target = argv[i];
227                     else
228                         System.err.println("Unnecessary: "+argv[i]);
229                 } else if (argv[i].equals("-i")) {
230                     options += "i";
231                 } else if (argv[i].equals("-m")) {
232                     options += "m";
233                 } else if (argv[i].equals("-s")) {
234                     options += "s";
235                 } else if (argv[i].equals("-u")) {
236                     options += "u";
237                 } else if (argv[i].equals("-w")) {
238                     options += "w";
239                 } else if (argv[i].equals("-X")) {
240                     options += "X";
241                 } else {
242                     System.err.println("Unknown option: "+argv[i]);
243                 }
244             }
245             RegularExpression reg = new RegularExpression(pattern, options);
246             System.out.println("RegularExpression: "+reg);
247             Match match = new Match();
248             reg.matches(target, match);
249             for (int i = 0;  i < match.getNumberOfGroups();  i ++) {
250                 if (i == 0 )  System.out.print("Matched range for the whole pattern: ");
251                 else System.out.print("["+i+"]: ");
252                 if (match.getBeginning(i) < 0)
253                     System.out.println("-1");
254                 else {
255                     System.out.print(match.getBeginning(i)+", "+match.getEnd(i)+", ");
256                     System.out.println("\""+match.getCapturedText(i)+"\"");
257                 }
258             }
259         } catch (ParseException pe) {
260             if (pattern == null) {
261                 pe.printStackTrace();
262             } else {
263                 System.err.println("com.sun.org.apache.xerces.internal.utils.regex.ParseException: "+pe.getMessage());
264                 String indent = "        ";
265                 System.err.println(indent+pattern);
266                 int loc = pe.getLocation();
267                 if (loc >= 0) {
268                     System.err.print(indent);
269                     for (int i = 0;  i < loc;  i ++)  System.err.print("-");
270                     System.err.println("^");
271                 }
272             }
273         } catch (Exception e) {
274             e.printStackTrace();
275         }
276     }
277 
278     static final int CACHESIZE = 20;
279     static final RegularExpression[] regexCache = new RegularExpression[CACHESIZE];
280     /**
281      * Creates a RegularExpression instance.
282      * This method caches created instances.
283      *
284      * @see RegularExpression#RegularExpression(java.lang.String, java.lang.String)
285      */
createRegex(String pattern, String options)286     public static RegularExpression createRegex(String pattern, String options)
287         throws ParseException {
288         RegularExpression re = null;
289         int intOptions = REUtil.parseOptions(options);
290         synchronized (REUtil.regexCache) {
291             int i;
292             for (i = 0;  i < REUtil.CACHESIZE;  i ++) {
293                 RegularExpression cached = REUtil.regexCache[i];
294                 if (cached == null) {
295                     i = -1;
296                     break;
297                 }
298                 if (cached.equals(pattern, intOptions)) {
299                     re = cached;
300                     break;
301                 }
302             }
303             if (re != null) {
304                 if (i != 0) {
305                     System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, i);
306                     REUtil.regexCache[0] = re;
307                 }
308             } else {
309                 re = new RegularExpression(pattern, options);
310                 System.arraycopy(REUtil.regexCache, 0, REUtil.regexCache, 1, REUtil.CACHESIZE-1);
311                 REUtil.regexCache[0] = re;
312             }
313         }
314         return re;
315     }
316 
317     /**
318      *
319      * @see RegularExpression#matches(java.lang.String)
320      */
matches(String regex, String target)321     public static boolean matches(String regex, String target) throws ParseException {
322         return REUtil.createRegex(regex, null).matches(target);
323     }
324 
325     /**
326      *
327      * @see RegularExpression#matches(java.lang.String)
328      */
matches(String regex, String options, String target)329     public static boolean matches(String regex, String options, String target) throws ParseException {
330         return REUtil.createRegex(regex, options).matches(target);
331     }
332 
333     // ================================================================
334 
335     /**
336      *
337      */
quoteMeta(String literal)338     public static String quoteMeta(String literal) {
339         int len = literal.length();
340         StringBuilder buffer = null;
341         for (int i = 0;  i < len;  i ++) {
342             int ch = literal.charAt(i);
343             if (".*+?{[()|\\^$".indexOf(ch) >= 0) {
344                 if (buffer == null) {
345                     buffer = new StringBuilder(i+(len-i)*2);
346                     if (i > 0)  buffer.append(literal.substring(0, i));
347                 }
348                 buffer.append('\\');
349                 buffer.append((char)ch);
350             } else if (buffer != null)
351                 buffer.append((char)ch);
352         }
353         return buffer != null ? buffer.toString() : literal;
354     }
355 
356     // ================================================================
357 
dumpString(String v)358     static void dumpString(String v) {
359         for (int i = 0;  i < v.length();  i ++) {
360             System.out.print(Integer.toHexString(v.charAt(i)));
361             System.out.print(" ");
362         }
363         System.out.println();
364     }
365 }
366