1 /*
2  * Permission is hereby granted, free of charge, to any person obtaining a copy of
3  * this software and associated documentation files (the "Software"), to deal in
4  * the Software without restriction, including without limitation the rights to
5  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
6  * of the Software, and to permit persons to whom the Software is furnished to do
7  * so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be included in all
10  * copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18  * SOFTWARE.
19  */
20 package jdk.nashorn.internal.runtime.regexp.joni;
21 
22 import java.util.Arrays;
23 import jdk.nashorn.internal.runtime.regexp.joni.encoding.CharacterType;
24 import jdk.nashorn.internal.runtime.regexp.joni.encoding.IntHolder;
25 
26 @SuppressWarnings("javadoc")
27 public final class EncodingHelper {
28 
29     final static int NEW_LINE            = 0x000a;
30     final static int RETURN              = 0x000d;
31     final static int LINE_SEPARATOR      = 0x2028;
32     final static int PARAGRAPH_SEPARATOR = 0x2029;
33 
34     final static char[] EMPTYCHARS = new char[0];
35     final static int[][] codeRanges = new int[15][];
36 
digitVal(final int code)37     public static int digitVal(final int code) {
38         return code - '0';
39     }
40 
odigitVal(final int code)41     public static int odigitVal(final int code) {
42         return digitVal(code);
43     }
44 
isXDigit(final int code)45     public static boolean isXDigit(final int code) {
46         return Character.isDigit(code) || (code >= 'a' && code <= 'f') || (code >= 'A' && code <= 'F');
47     }
48 
xdigitVal(final int code)49     public static int xdigitVal(final int code) {
50         if (Character.isDigit(code)) {
51             return code - '0';
52         } else if (code >= 'a' && code <= 'f') {
53             return code - 'a' + 10;
54         } else {
55             return code - 'A' + 10;
56         }
57     }
58 
isDigit(final int code)59     public static boolean isDigit(final int code) {
60         return code >= '0' && code <= '9';
61     }
62 
isWord(final int code)63     public static boolean isWord(final int code) {
64         // letter, digit, or '_'
65         return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
66     }
67 
isNewLine(final int code)68     public static boolean isNewLine(final int code) {
69         return code == NEW_LINE || code == RETURN || code == LINE_SEPARATOR || code == PARAGRAPH_SEPARATOR;
70     }
71 
isNewLine(final char[] chars, final int p, final int end)72     public static boolean isNewLine(final char[] chars, final int p, final int end) {
73         return p < end && isNewLine(chars[p]);
74     }
75 
76     // Encoding.prevCharHead
prevCharHead(final int p, final int s)77     public static int prevCharHead(final int p, final int s) {
78         return s <= p ? -1 : s - 1;
79     }
80 
81     /* onigenc_get_right_adjust_char_head_with_prev */
rightAdjustCharHeadWithPrev(final int s, final IntHolder prev)82     public static int rightAdjustCharHeadWithPrev(final int s, final IntHolder prev) {
83         if (prev != null) {
84             prev.value = -1; /* Sorry */
85         }
86         return s;
87     }
88 
89     // Encoding.stepBack
stepBack(final int p, final int sp, final int np)90     public static int stepBack(final int p, final int sp, final int np) {
91         int s = sp, n = np;
92         while (s != -1 && n-- > 0) {
93            if (s <= p) {
94             return -1;
95         }
96            s--;
97        }
98        return s;
99     }
100 
mbcodeStartPosition()101     public static int mbcodeStartPosition() {
102         return 0x80;
103     }
104 
caseFoldCodesByString(final int flag, final char c)105     public static char[] caseFoldCodesByString(final int flag, final char c) {
106         char[] codes = EMPTYCHARS;
107         final char upper = toUpperCase(c);
108 
109         if (upper != toLowerCase(upper)) {
110             int count = 0;
111             char ch = 0;
112 
113             do {
114                 final char u = toUpperCase(ch);
115                 if (u == upper && ch != c) {
116                     // Almost all characters will return array of length 1, very few 2 or 3, so growing by one is fine.
117                     codes = count == 0 ? new char[1] : Arrays.copyOf(codes, count + 1);
118                     codes[count++] = ch;
119                 }
120             } while (ch++ < 0xffff);
121         }
122         return codes;
123     }
124 
applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg)125     public static void applyAllCaseFold(final int flag, final ApplyCaseFold fun, final Object arg) {
126         for (int c = 0; c < 0xffff; c++) {
127             if (Character.isLowerCase(c)) {
128                 final int upper = toUpperCase(c);
129 
130                 if (upper != c) {
131                     ApplyCaseFold.apply(c, upper, arg);
132                 }
133             }
134         }
135 
136         // Some characters have multiple lower case variants, hence we need to do a second run
137         for (int c = 0; c < 0xffff; c++) {
138             if (Character.isLowerCase(c)) {
139                 final int upper = toUpperCase(c);
140 
141                 if (upper != c) {
142                     ApplyCaseFold.apply(upper, c, arg);
143                 }
144             }
145         }
146     }
147 
toLowerCase(final char c)148     public static char toLowerCase(final char c) {
149         return (char)toLowerCase((int)c);
150     }
151 
toLowerCase(final int c)152     public static int toLowerCase(final int c) {
153         if (c < 128) {
154             return ('A' <= c && c <= 'Z') ? (c + ('a' - 'A')) : c;
155         }
156         // Do not convert non-ASCII upper case character to ASCII lower case.
157         final int lower = Character.toLowerCase(c);
158         return (lower < 128) ? c : lower;
159 
160     }
161 
toUpperCase(final char c)162     public static char toUpperCase(final char c) {
163         return (char)toUpperCase((int)c);
164     }
165 
toUpperCase(final int c)166     public static int toUpperCase(final int c) {
167         if (c < 128) {
168             return ('a' <= c && c <= 'z') ? c + ('A' - 'a') : c;
169         }
170         // Do not convert non-ASCII lower case character to ASCII upper case.
171         final int upper = Character.toUpperCase(c);
172         return (upper < 128) ? c : upper;
173     }
174 
ctypeCodeRange(final int ctype, final IntHolder sbOut)175     public static int[] ctypeCodeRange(final int ctype, final IntHolder sbOut) {
176         sbOut.value = 0x100; // use bitset for codes smaller than 256
177         int[] range = null;
178 
179         if (ctype < codeRanges.length) {
180             range = codeRanges[ctype];
181 
182             if (range == null) {
183                 // format: [numberOfRanges, rangeStart, rangeEnd, ...]
184                 range = new int[16];
185                 int rangeCount = 0;
186                 int lastCode = -2;
187 
188                 for (int code = 0; code <= 0xffff; code++) {
189                     if (isCodeCType(code, ctype)) {
190                         if (lastCode < code -1) {
191                             if (rangeCount * 2 + 2 >= range.length) {
192                                 range = Arrays.copyOf(range, range.length * 2);
193                             }
194                             range[rangeCount * 2 + 1] = code;
195                             rangeCount++;
196                         }
197                         range[rangeCount * 2] = lastCode = code;
198                     }
199                 }
200 
201                 if (rangeCount * 2 + 1 < range.length) {
202                     range = Arrays.copyOf(range, rangeCount * 2 + 1);
203                 }
204 
205                 range[0] = rangeCount;
206                 codeRanges[ctype] = range;
207             }
208         }
209 
210         return range;
211     }
212 
213     // CodeRange.isInCodeRange
isInCodeRange(final int[] p, final int offset, final int code)214     public static boolean isInCodeRange(final int[] p, final int offset, final int code) {
215         int low = 0;
216         final int n = p[offset];
217         int high = n ;
218 
219         while (low < high) {
220             final int x = (low + high) >> 1;
221             if (code > p[(x << 1) + 2 + offset]) {
222                 low = x + 1;
223             } else {
224                 high = x;
225             }
226         }
227         return low < n && code >= p[(low << 1) + 1 + offset];
228     }
229 
230     /**
231      * @see <a href="http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt">http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt</a>
232      *
233      * @param code code
234      * @param ctype ctype
235      *
236      * @return isCodeCType
237      */
isCodeCType(final int code, final int ctype)238     public static boolean isCodeCType(final int code, final int ctype) {
239         int type;
240         switch (ctype) {
241             case CharacterType.NEWLINE:
242                 return isNewLine(code);
243             case CharacterType.ALPHA:
244                 return (1 << Character.getType(code) & CharacterType.ALPHA_MASK) != 0;
245             case CharacterType.BLANK:
246                 return code == 0x09 || Character.getType(code) == Character.SPACE_SEPARATOR;
247             case CharacterType.CNTRL:
248                 type = Character.getType(code);
249                 return (1 << type & CharacterType.CNTRL_MASK) != 0 || type == Character.UNASSIGNED;
250             case CharacterType.DIGIT:
251                 return EncodingHelper.isDigit(code);
252             case CharacterType.GRAPH:
253                 switch (code) {
254                     case 0x09:
255                     case 0x0a:
256                     case 0x0b:
257                     case 0x0c:
258                     case 0x0d:
259                         return false;
260                     default:
261                         type = Character.getType(code);
262                         return (1 << type & CharacterType.GRAPH_MASK) == 0 && type != Character.UNASSIGNED;
263                 }
264             case CharacterType.LOWER:
265                 return Character.isLowerCase(code);
266             case CharacterType.PRINT:
267                 type = Character.getType(code);
268                 return (1 << type & CharacterType.PRINT_MASK) == 0 && type != Character.UNASSIGNED;
269             case CharacterType.PUNCT:
270                 return (1 << Character.getType(code) & CharacterType.PUNCT_MASK) != 0;
271             case CharacterType.SPACE:
272                 // ECMA 7.2 and 7.3
273                 switch (code) {
274                     case 0x09:
275                     case 0x0a:
276                     case 0x0b:
277                     case 0x0c:
278                     case 0x0d:
279                         return true;
280                     default:
281                         // true if Unicode separator or BOM or U+180E (see JDK-8138758)
282                         return (1 << Character.getType(code) & CharacterType.SPACE_MASK) != 0
283                                 || code == 0xfeff || code == 0x180e;
284                 }
285             case CharacterType.UPPER:
286                 return Character.isUpperCase(code);
287             case CharacterType.XDIGIT:
288                 return EncodingHelper.isXDigit(code);
289             case CharacterType.WORD:
290                 return (1 << Character.getType(code) & CharacterType.WORD_MASK) != 0;
291             case CharacterType.ALNUM:
292                 return (1 << Character.getType(code) & CharacterType.ALNUM_MASK) != 0;
293             case CharacterType.ASCII:
294                 return code < 0x80;
295             default:
296                 throw new RuntimeException("illegal character type: " + ctype);
297         }
298     }
299 }
300 
301