1 /*
2  * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package com.sun.tools.javac.parser;
27 
28 import java.util.Arrays;
29 
30 import com.sun.tools.javac.resources.CompilerProperties.Errors;
31 import com.sun.tools.javac.util.Log;
32 
33 import static com.sun.tools.javac.util.LayoutCharacters.EOI;
34 import static com.sun.tools.javac.util.LayoutCharacters.tabulate;
35 
36 /**
37  * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters
38  * one by one as contained in the input stream, handling unicode escape sequences accordingly.
39  *
40  *  <p><b>This is NOT part of any supported API.
41  *  If you write code that depends on this, you do so at your own risk.
42  *  This code and its internal interfaces are subject to change or
43  *  deletion without notice.</b></p>
44  */
45 public class UnicodeReader {
46     /**
47      * Buffer containing characters from source file. May contain extraneous characters
48      * beyond this.length.
49      */
50     private final char[] buffer;
51 
52     /**
53      * Length of meaningful content in buffer.
54      */
55     private final int length;
56 
57     /**
58      * Character buffer index of character currently being observed.
59      */
60     private int position;
61 
62     /**
63      * Number of characters combined to provide character currently being observed. Typically
64      * one, but may be more when combinations of surrogate pairs and unicode escape sequences
65      * are read.
66      */
67     private int width;
68 
69     /**
70      * Character currently being observed. If a surrogate pair is read then will be the high
71      * member of the pair.
72      */
73     private char character;
74 
75     /**
76      * Codepoint of character currently being observed. Typically equivalent to the character
77      * but will have a value greater that 0xFFFF when a surrogate pair.
78      */
79     private int codepoint;
80 
81     /**
82      * true if the last character was a backslash. This is used to handle the special case
83      * when a backslash precedes an unicode escape. In that case, the second backslash
84      * is treated as a backslash and not part of an unicode escape.
85      */
86     private boolean wasBackslash;
87 
88     /**
89      * Log for error reporting.
90      */
91     private final Log log;
92 
93     /**
94      * Constructor.
95      *
96      * @param sf      scan factory.
97      * @param array   array containing contents of source.
98      * @param length  length of meaningful content in buffer.
99      */
UnicodeReader(ScannerFactory sf, char[] array, int length)100     protected UnicodeReader(ScannerFactory sf, char[] array, int length) {
101         this.buffer = array;
102         this.length = length;
103         this.position = 0;
104         this.width = 0;
105         this.character = '\0';
106         this.codepoint = 0;
107         this.wasBackslash = false;
108         this.log = sf.log;
109 
110         nextCodePoint();
111     }
112 
113     /**
114      * Returns the length of the buffer. This is length of meaningful content in buffer and
115      * not the length of the buffer array.
116      *
117      * @return length of the buffer.
118      */
length()119     protected int length() {
120         return length;
121     }
122 
123     /**
124      * Return true if current position is within the meaningful part of the buffer.
125      *
126      * @return true if current position is within the meaningful part of the buffer.
127      */
isAvailable()128     protected boolean isAvailable() {
129         return position < length;
130     }
131 
132     /**
133      * Fetches the next 16-bit character from the buffer and places it in this.character.
134      */
nextCodeUnit()135     private void nextCodeUnit() {
136         // Index of next character in buffer.
137         int index = position + width;
138 
139         // If past end of buffer.
140         if (length <= index) {
141             // End of file is marked with EOI.
142             character = EOI;
143         } else {
144             // Next character in buffer.
145             character = buffer[index];
146             // Increment length of codepoint.
147             width++;
148         }
149     }
150 
151     /**
152      * Fetches the next 16-bit character from the buffer. If an unicode escape
153      * is detected then converts the unicode escape to a character.
154      */
nextUnicodeInputCharacter()155     private void nextUnicodeInputCharacter() {
156         // Position to next codepoint.
157         position += width;
158         // Codepoint has no characters yet.
159         width = 0;
160 
161         // Fetch next character.
162         nextCodeUnit();
163 
164         // If second backslash is detected.
165         if (wasBackslash) {
166             // Treat like a normal character (not part of unicode escape.)
167             wasBackslash = false;
168         } else if (character == '\\') {
169             // May be an unicode escape.
170             wasBackslash = !unicodeEscape();
171         }
172 
173         // Codepoint and character match if not surrogate.
174         codepoint = (int)character;
175     }
176 
177     /**
178      * Fetches the nextcode point from the buffer. If an unicode escape is recognized
179      * then converts unicode escape to a character. If two characters are a surrogate pair
180      * then converts to a codepoint.
181      */
nextCodePoint()182     private void nextCodePoint() {
183         // Next unicode character.
184         nextUnicodeInputCharacter();
185 
186         // Return early if ASCII or not a surrogate pair.
187         if (isASCII() || !Character.isHighSurrogate(character)) {
188             return;
189         }
190 
191         // Capture high surrogate and position.
192         char hi = character;
193         int savePosition = position;
194         int saveWidth = width;
195 
196         // Get potential low surrogate.
197         nextUnicodeInputCharacter();
198         char lo = character;
199 
200         if (Character.isLowSurrogate(lo)) {
201             // Start codepoint at start of high surrogate.
202             position = savePosition;
203             width += saveWidth;
204             // Compute codepoint.
205             codepoint = Character.toCodePoint(hi, lo);
206         } else {
207             // Restore to treat high surrogate as just a character.
208             position = savePosition;
209             width = saveWidth;
210             character = hi;
211             codepoint = (int)hi;
212             // Could potential report an error here (old code did not.)
213         }
214     }
215 
216     /**
217      * Converts an unicode escape into a character.
218      *
219      * @return true if was an unicode escape.
220      */
unicodeEscape()221     private boolean unicodeEscape() {
222         // Start of unicode escape (past backslash.)
223         int start = position + width;
224 
225         // Default to backslash result, unless proven otherwise.
226         character = '\\';
227         width = 1;
228 
229         // Skip multiple 'u'.
230         int index;
231         for (index = start; index < length; index++) {
232             if (buffer[index] != 'u') {
233                 break;
234             }
235         }
236 
237         // Needs to have been at least one u.
238         if (index == start) {
239             return false;
240         }
241 
242         int code = 0;
243 
244         for (int i = 0; i < 4; i++) {
245             // Translate and merge digit.
246             int digit = index < length ? Character.digit(buffer[index], 16) : -1;
247             code = code << 4 | digit;
248 
249             // If invalid digit.
250             if (code < 0) {
251                 break;
252             }
253 
254             // On to next character.
255             index++;
256         }
257 
258         // Skip digits even if error.
259         width = index - position;
260 
261         // If all digits are good.
262         if (code >= 0) {
263             character = (char)code;
264         } else {
265             log.error(position, Errors.IllegalUnicodeEsc);
266         }
267 
268         // Return true even if error so that the invalid unicode escape is skipped.
269         return true;
270     }
271 
272     /**
273      * Return the current position in the character buffer.
274      *
275      * @return  current position in the character buffer.
276      */
277     protected int position() {
278         return position;
279     }
280 
281 
282     /**
283      * Reset the reader to the specified position.
284      * Warning: Do not use when previous character was an ASCII or unicode backslash.
285      * @param pos
286      */
287     protected void reset(int pos) {
288         position = pos;
289         width = 0;
290         wasBackslash = false;
291         nextCodePoint();
292     }
293 
294     /**
295      * Return the current character in at the current position.
296      *
297      * @return current character in at the current position.
298      */
299     protected char get() {
300         return character;
301     }
302 
303     /**
304      * Return the current codepoint in at the current position.
305      *
306      * @return current codepoint in at the current position.
307      */
308     protected int getCodepoint() {
309         return codepoint;
310     }
311 
312     /**
313      * Returns true if the current codepoint is a surrogate.
314      *
315      * @return true if the current codepoint is a surrogate.
316      */
317     protected boolean isSurrogate() {
318         return 0xFFFF < codepoint;
319     }
320 
321     /**
322      * Returns true if the current character is ASCII.
323      *
324      * @return true if the current character is ASCII.
325      */
326     protected boolean isASCII() {
327         return character <= 0x7F;
328     }
329 
330     /**
331      * Advances the current character to the next character.
332      *
333      * @return next character.
334      */
335     protected char next() {
336         nextCodePoint();
337 
338         return character;
339     }
340 
341     /**
342      * Compare character. Returns true if a match.
343      *
344      * @param ch  character to match.
345      *
346      * @return true if a match.
347      */
348     protected boolean is(char ch) {
349         return character == ch;
350     }
351 
352     /**
353      * Match one of the arguments. Returns true if a match.
354      */
355     protected boolean isOneOf(char ch1, char ch2) {
356         return is(ch1) || is(ch2);
357     }
358     protected boolean isOneOf(char ch1, char ch2, char ch3) {
359         return is(ch1) || is(ch2) || is(ch3);
360     }
361     protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) {
362         return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6);
363     }
364 
365     /**
366      * Tests to see if current character is in the range of lo to hi characters (inclusive).
367      *
368      * @param lo  lowest character in range.
369      * @param hi  highest character in range.
370      *
371      * @return true if the current character is in range.
372      */
373     protected boolean inRange(char lo, char hi) {
374         return lo <= character && character <= hi;
375     }
376 
377     /**
378      * Compare character and advance if a match. Returns true if a match.
379      *
380      * @param ch  character to match.
381      *
382      * @return true if a match.
383      */
384     protected boolean accept(char ch) {
385         if (is(ch)) {
386             next();
387 
388             return true;
389         }
390 
391         return false;
392     }
393 
394     /**
395      * Match one of the arguments and advance if a match. Returns true if a match.
396      */
397     protected boolean acceptOneOf(char ch1, char ch2) {
398         if (isOneOf(ch1, ch2)) {
399             next();
400 
401             return true;
402         }
403 
404         return false;
405     }
406 
407     protected boolean acceptOneOf(char ch1, char ch2, char ch3) {
408         if (isOneOf(ch1, ch2, ch3)) {
409             next();
410 
411             return true;
412         }
413 
414         return false;
415     }
416 
417     /**
418      * Skip over all occurances of character.
419      *
420      * @param ch character to accept.
421      */
422     protected void skip(char ch) {
423         while (accept(ch)) {
424             // next
425         }
426     }
427 
428     /**
429      * Skip over ASCII white space characters.
430      */
431     protected void skipWhitespace() {
432         while (acceptOneOf(' ', '\t', '\f')) {
433             // next
434         }
435     }
436 
437     /**
438      * Skip to end of line.
439      */
440     protected void skipToEOLN() {
441         while (isAvailable()) {
442             if (isOneOf('\r', '\n')) {
443                 break;
444             }
445 
446             next();
447         }
448 
449     }
450 
451     /**
452      * Compare string and advance if a match. Returns true if a match.
453      * Warning: Do not use when previous character was a backslash
454      * (confuses state of wasBackslash.)
455      *
456      * @param string string to match character for character.
457      *
458      * @return true if a match.
459      */
460     protected boolean accept(String string) {
461         // Quick test.
462         if (string.length() == 0 || !is(string.charAt(0))) {
463             return false;
464         }
465 
466         // Be prepared to retreat if not a match.
467         int savedPosition = position;
468 
469         nextCodePoint();
470 
471         // Check each character.
472         for (int i = 1; i < string.length(); i++) {
473             if (!is(string.charAt(i))) {
474                 // Restart if not a match.
475                 reset(savedPosition);
476 
477                 return false;
478             }
479 
480             nextCodePoint();
481         }
482 
483         return true;
484     }
485 
486     /**
487      * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not
488      * advance character.
489      *
490      * @param pos         starting position.
491      * @param digitRadix  base of number being converted.
492      *
493      * @return value of digit.
494      */
495     protected int digit(int pos, int digitRadix) {
496         int result;
497 
498         // Just an ASCII digit.
499         if (inRange('0', '9')) {
500             // Fast common case.
501             result = character - '0';
502 
503             return result < digitRadix ? result : -1;
504         }
505 
506         // Handle other digits.
507         result = isSurrogate() ? Character.digit(codepoint, digitRadix) :
508                                  Character.digit(character, digitRadix);
509 
510         if (result >= 0 && !isASCII()) {
511             log.error(position(), Errors.IllegalNonasciiDigit);
512             character = "0123456789abcdef".charAt(result);
513         }
514 
515         return result;
516     }
517 
518     /**
519      * Returns the input buffer. Unicode escape sequences are not translated.
520      *
521      * @return the input buffer.
522      */
523     public char[] getRawCharacters() {
524         return length == buffer.length ? buffer : Arrays.copyOf(buffer, length);
525     }
526 
527     /**
528      * Returns a copy of a character array subset of the input buffer.
529      * The returned array begins at the {@code beginIndex} and
530      * extends to the character at index {@code endIndex - 1}.
531      * Thus the length of the substring is {@code endIndex-beginIndex}.
532      * This behavior is like
533      * {@code String.substring(beginIndex, endIndex)}.
534      * Unicode escape sequences are not translated.
535      *
536      * @param  beginIndex the beginning index, inclusive.
537      * @param  endIndex the ending index, exclusive.
538      *
539      * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
540      *         array bounds
541      */
542     public char[] getRawCharacters(int beginIndex, int endIndex) {
543         return Arrays.copyOfRange(buffer, beginIndex, endIndex);
544     }
545 
546     /**
547      * This is a specialized version of UnicodeReader that keeps track of the
548      * column position within a given character stream. Used for Javadoc
549      * processing to build a table for mapping positions in the comment string
550      * to positions in the source file.
551      */
552     static class PositionTrackingReader extends UnicodeReader {
553         /**
554          * Offset from the beginning of the original reader buffer.
555          */
556         private final int offset;
557 
558         /**
559          * Current column in the comment.
560          */
561         private int column;
562 
563         /**
564          * Constructor.
565          *
566          * @param sf      Scan factory.
567          * @param array   Array containing contents of source.
568          * @param offset  Position offset in original source buffer.
569          */
570         protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) {
571             super(sf, array, array.length);
572             this.offset = offset;
573             this.column = 0;
574         }
575 
576         /**
577          * Advances the current character to the next character. Tracks column.
578          *
579          * @return next character.
580          */
581         @Override
582         protected char next() {
583             super.next();
584 
585             if (isOneOf('\n', '\r', '\f')) {
586                 column = 0;
587             } else if (is('\t')) {
588                 column = tabulate(column);
589             } else {
590                 column++;
591             }
592 
593             return get();
594         }
595 
596         /**
597          * Returns the current column.
598          *
599          * @return  the current column.
600          */
601         protected int column() {
602             return column;
603         }
604 
605         /**
606          * Returns position relative to the original source buffer.
607          *
608          * @return
609          */
610         protected int offsetPosition() {
611             return position() + offset;
612         }
613     }
614 
615 }
616