1 /*
2  * Copyright (c) 2011, 2018, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package com.sun.tools.javac.parser;
27 
28 import java.nio.CharBuffer;
29 import java.util.Arrays;
30 
31 import com.sun.tools.javac.file.JavacFileManager;
32 import com.sun.tools.javac.resources.CompilerProperties.Errors;
33 import com.sun.tools.javac.util.ArrayUtils;
34 import com.sun.tools.javac.util.Log;
35 import com.sun.tools.javac.util.Name;
36 import com.sun.tools.javac.util.Names;
37 
38 import static com.sun.tools.javac.util.LayoutCharacters.*;
39 
/** The char reader used by the javac lexer/tokenizer. Returns the sequence of
 * characters contained in the input stream, translating Unicode escapes as it goes.
 * Additionally, it provides features for saving chars into a buffer and for
 * retrieving them at a later stage.
 *
 *  <p><b>This is NOT part of any supported API.
 *  If you write code that depends on this, you do so at your own risk.
 *  This code and its internal interfaces are subject to change or
 *  deletion without notice.</b>
 */
50 public class UnicodeReader {
51 
52     /** The input buffer, index of next character to be read,
53      *  index of one past last character in buffer.
54      */
55     protected char[] buf;
56     protected int bp;
57     protected final int buflen;
58 
59     /** The current character.
60      */
61     protected char ch;
62 
63     /** The buffer index of the last converted unicode character
64      */
65     protected int unicodeConversionBp = -1;
66 
67     protected Log log;
68     protected Names names;
69 
70     /** A character buffer for saved chars.
71      */
72     protected char[] sbuf = new char[128];
73     protected int realLength;
74     protected int sp;
75 
76     /**
77      * Create a scanner from the input array.  This method might
78      * modify the array.  To avoid copying the input array, ensure
79      * that {@code inputLength < input.length} or
80      * {@code input[input.length -1]} is a white space character.
81      *
82      * @param sf the factory which created this Scanner
83      * @param buffer the input, might be modified
84      * Must be positive and less than or equal to input.length.
85      */
UnicodeReader(ScannerFactory sf, CharBuffer buffer)86     protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
87         this(sf, JavacFileManager.toArray(buffer), buffer.limit());
88     }
89 
UnicodeReader(ScannerFactory sf, char[] input, int inputLength)90     protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
91         log = sf.log;
92         names = sf.names;
93         realLength = inputLength;
94         if (inputLength == input.length) {
95             if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
96                 inputLength--;
97             } else {
98                 input = Arrays.copyOf(input, inputLength + 1);
99             }
100         }
101         buf = input;
102         buflen = inputLength;
103         buf[buflen] = EOI;
104         bp = -1;
105         scanChar();
106     }
107 
108     /** Read next character.
109      */
scanChar()110     protected void scanChar() {
111         if (bp < buflen) {
112             ch = buf[++bp];
113             if (ch == '\\') {
114                 convertUnicode();
115             }
116         }
117     }
118 
119     /** Read next character in comment, skipping over double '\' characters.
120      */
scanCommentChar()121     protected void scanCommentChar() {
122         scanChar();
123         if (ch == '\\') {
124             if (peekChar() == '\\' && !isUnicode()) {
125                 skipChar();
126             } else {
127                 convertUnicode();
128             }
129         }
130     }
131 
132     /** Append a character to sbuf.
133      */
putChar(char ch, boolean scan)134     protected void putChar(char ch, boolean scan) {
135         sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
136         sbuf[sp++] = ch;
137         if (scan)
138             scanChar();
139     }
140 
putChar(char ch)141     protected void putChar(char ch) {
142         putChar(ch, false);
143     }
144 
putChar(boolean scan)145     protected void putChar(boolean scan) {
146         putChar(ch, scan);
147     }
148 
nextChar(boolean skip)149     protected void nextChar(boolean skip) {
150         if (!skip) {
151             sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
152             sbuf[sp++] = ch;
153         }
154 
155         scanChar();
156     }
157 
name()158     Name name() {
159         return names.fromChars(sbuf, 0, sp);
160     }
161 
chars()162     String chars() {
163         return new String(sbuf, 0, sp);
164     }
165 
166     /** Add 'count' copies of the character 'ch' to the string buffer.
167      */
repeat(char ch, int count)168     protected void repeat(char ch, int count) {
169         for ( ; 0 < count; count--) {
170             putChar(ch, false);
171         }
172     }
173 
174     /** Reset the scan buffer pointer to 'pos'.
175      */
reset(int pos)176     protected void reset(int pos) {
177         bp = pos - 1;
178         scanChar();
179     }
180 
181     /** Convert unicode escape; bp points to initial '\' character
182      *  (Spec 3.3).
183      */
convertUnicode()184     protected void convertUnicode() {
185         if (ch == '\\' && unicodeConversionBp != bp ) {
186             bp++; ch = buf[bp];
187             if (ch == 'u') {
188                 do {
189                     bp++; ch = buf[bp];
190                 } while (ch == 'u');
191                 int limit = bp + 3;
192                 if (limit < buflen) {
193                     int d = digit(bp, 16);
194                     int code = d;
195                     while (bp < limit && d >= 0) {
196                         bp++; ch = buf[bp];
197                         d = digit(bp, 16);
198                         code = (code << 4) + d;
199                     }
200                     if (d >= 0) {
201                         ch = (char)code;
202                         unicodeConversionBp = bp;
203                         return;
204                     }
205                 }
206                 log.error(bp, Errors.IllegalUnicodeEsc);
207             } else {
208                 bp--;
209                 ch = '\\';
210             }
211         }
212     }
213 
214     /** Are surrogates supported?
215      */
216     final static boolean surrogatesSupported = surrogatesSupported();
surrogatesSupported()217     private static boolean surrogatesSupported() {
218         try {
219             Character.isHighSurrogate('a');
220             return true;
221         } catch (NoSuchMethodError ex) {
222             return false;
223         }
224     }
225 
226     /** Scan surrogate pairs.  If 'ch' is a high surrogate and
227      *  the next character is a low surrogate, returns the code point
228      *  constructed from these surrogates. Otherwise, returns -1.
229      *  This method will not consume any of the characters.
230      */
peekSurrogates()231     protected int peekSurrogates() {
232         if (surrogatesSupported && Character.isHighSurrogate(ch)) {
233             char high = ch;
234             int prevBP = bp;
235 
236             scanChar();
237 
238             char low = ch;
239 
240             ch = high;
241             bp = prevBP;
242 
243             if (Character.isLowSurrogate(low)) {
244                 return Character.toCodePoint(high, low);
245             }
246         }
247 
248         return -1;
249     }
250 
251     /** Convert an ASCII digit from its base (8, 10, or 16)
252      *  to its value.
253      */
digit(int pos, int base)254     protected int digit(int pos, int base) {
255         char c = ch;
256         if ('0' <= c && c <= '9')
257             return Character.digit(c, base); //a fast common case
258         int codePoint = peekSurrogates();
259         int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
260         if (result >= 0 && c > 0x7f) {
261             log.error(pos + 1, Errors.IllegalNonasciiDigit);
262             if (codePoint >= 0)
263                 scanChar();
264             ch = "0123456789abcdef".charAt(result);
265         }
266         return result;
267     }
268 
isUnicode()269     protected boolean isUnicode() {
270         return unicodeConversionBp == bp;
271     }
272 
skipChar()273     protected void skipChar() {
274         bp++;
275     }
276 
peekChar()277     protected char peekChar() {
278         return buf[bp + 1];
279     }
280 
281     /**
282      * Returns a copy of the input buffer, up to its inputLength.
283      * Unicode escape sequences are not translated.
284      */
getRawCharacters()285     public char[] getRawCharacters() {
286         char[] chars = new char[buflen];
287         System.arraycopy(buf, 0, chars, 0, buflen);
288         return chars;
289     }
290 
291     /**
292      * Returns a copy of a character array subset of the input buffer.
293      * The returned array begins at the {@code beginIndex} and
294      * extends to the character at index {@code endIndex - 1}.
295      * Thus the length of the substring is {@code endIndex-beginIndex}.
296      * This behavior is like
297      * {@code String.substring(beginIndex, endIndex)}.
298      * Unicode escape sequences are not translated.
299      *
300      * @param beginIndex the beginning index, inclusive.
301      * @param endIndex the ending index, exclusive.
302      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
303      *         array bounds
304      */
getRawCharacters(int beginIndex, int endIndex)305     public char[] getRawCharacters(int beginIndex, int endIndex) {
306         int length = endIndex - beginIndex;
307         char[] chars = new char[length];
308         System.arraycopy(buf, beginIndex, chars, 0, length);
309         return chars;
310     }
311 }
312