1 /* 2 * Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.tools.javac.parser; 27 28 import java.util.Arrays; 29 30 import com.sun.tools.javac.resources.CompilerProperties.Errors; 31 import com.sun.tools.javac.util.Log; 32 33 import static com.sun.tools.javac.util.LayoutCharacters.EOI; 34 import static com.sun.tools.javac.util.LayoutCharacters.tabulate; 35 36 /** 37 * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters 38 * one by one as contained in the input stream, handling unicode escape sequences accordingly. 39 * 40 * <p><b>This is NOT part of any supported API. 41 * If you write code that depends on this, you do so at your own risk. 42 * This code and its internal interfaces are subject to change or 43 * deletion without notice.</b></p> 44 */ 45 public class UnicodeReader { 46 /** 47 * Buffer containing characters from source file. May contain extraneous characters 48 * beyond this.length. 49 */ 50 private final char[] buffer; 51 52 /** 53 * Length of meaningful content in buffer. 54 */ 55 private final int length; 56 57 /** 58 * Character buffer index of character currently being observed. 59 */ 60 private int position; 61 62 /** 63 * Number of characters combined to provide character currently being observed. Typically 64 * one, but may be more when combinations of surrogate pairs and unicode escape sequences 65 * are read. 66 */ 67 private int width; 68 69 /** 70 * Character currently being observed. If a surrogate pair is read then will be the high 71 * member of the pair. 72 */ 73 private char character; 74 75 /** 76 * Codepoint of character currently being observed. Typically equivalent to the character 77 * but will have a value greater that 0xFFFF when a surrogate pair. 78 */ 79 private int codepoint; 80 81 /** 82 * true if the last character was a backslash. This is used to handle the special case 83 * when a backslash precedes an unicode escape. In that case, the second backslash 84 * is treated as a backslash and not part of an unicode escape. 85 */ 86 private boolean wasBackslash; 87 88 /** 89 * Log for error reporting. 90 */ 91 private final Log log; 92 93 /** 94 * Constructor. 95 * 96 * @param sf scan factory. 97 * @param array array containing contents of source. 98 * @param length length of meaningful content in buffer. 99 */ UnicodeReader(ScannerFactory sf, char[] array, int length)100 protected UnicodeReader(ScannerFactory sf, char[] array, int length) { 101 this.buffer = array; 102 this.length = length; 103 this.position = 0; 104 this.width = 0; 105 this.character = '\0'; 106 this.codepoint = 0; 107 this.wasBackslash = false; 108 this.log = sf.log; 109 110 nextCodePoint(); 111 } 112 113 /** 114 * Returns the length of the buffer. This is length of meaningful content in buffer and 115 * not the length of the buffer array. 116 * 117 * @return length of the buffer. 118 */ length()119 protected int length() { 120 return length; 121 } 122 123 /** 124 * Return true if current position is within the meaningful part of the buffer. 125 * 126 * @return true if current position is within the meaningful part of the buffer. 127 */ isAvailable()128 protected boolean isAvailable() { 129 return position < length; 130 } 131 132 /** 133 * Fetches the next 16-bit character from the buffer and places it in this.character. 134 */ nextCodeUnit()135 private void nextCodeUnit() { 136 // Index of next character in buffer. 137 int index = position + width; 138 139 // If past end of buffer. 140 if (length <= index) { 141 // End of file is marked with EOI. 142 character = EOI; 143 } else { 144 // Next character in buffer. 145 character = buffer[index]; 146 // Increment length of codepoint. 147 width++; 148 } 149 } 150 151 /** 152 * Fetches the next 16-bit character from the buffer. If an unicode escape 153 * is detected then converts the unicode escape to a character. 154 */ nextUnicodeInputCharacter()155 private void nextUnicodeInputCharacter() { 156 // Position to next codepoint. 157 position += width; 158 // Codepoint has no characters yet. 159 width = 0; 160 161 // Fetch next character. 162 nextCodeUnit(); 163 164 // If second backslash is detected. 165 if (wasBackslash) { 166 // Treat like a normal character (not part of unicode escape.) 167 wasBackslash = false; 168 } else if (character == '\\') { 169 // May be an unicode escape. 170 wasBackslash = !unicodeEscape(); 171 } 172 173 // Codepoint and character match if not surrogate. 174 codepoint = (int)character; 175 } 176 177 /** 178 * Fetches the nextcode point from the buffer. If an unicode escape is recognized 179 * then converts unicode escape to a character. If two characters are a surrogate pair 180 * then converts to a codepoint. 181 */ nextCodePoint()182 private void nextCodePoint() { 183 // Next unicode character. 184 nextUnicodeInputCharacter(); 185 186 // Return early if ASCII or not a surrogate pair. 187 if (isASCII() || !Character.isHighSurrogate(character)) { 188 return; 189 } 190 191 // Capture high surrogate and position. 192 char hi = character; 193 int savePosition = position; 194 int saveWidth = width; 195 196 // Get potential low surrogate. 197 nextUnicodeInputCharacter(); 198 char lo = character; 199 200 if (Character.isLowSurrogate(lo)) { 201 // Start codepoint at start of high surrogate. 202 position = savePosition; 203 width += saveWidth; 204 // Compute codepoint. 205 codepoint = Character.toCodePoint(hi, lo); 206 } else { 207 // Restore to treat high surrogate as just a character. 208 position = savePosition; 209 width = saveWidth; 210 character = hi; 211 codepoint = (int)hi; 212 // Could potential report an error here (old code did not.) 213 } 214 } 215 216 /** 217 * Converts an unicode escape into a character. 218 * 219 * @return true if was an unicode escape. 220 */ unicodeEscape()221 private boolean unicodeEscape() { 222 // Start of unicode escape (past backslash.) 223 int start = position + width; 224 225 // Default to backslash result, unless proven otherwise. 226 character = '\\'; 227 width = 1; 228 229 // Skip multiple 'u'. 230 int index; 231 for (index = start; index < length; index++) { 232 if (buffer[index] != 'u') { 233 break; 234 } 235 } 236 237 // Needs to have been at least one u. 238 if (index == start) { 239 return false; 240 } 241 242 int code = 0; 243 244 for (int i = 0; i < 4; i++) { 245 // Translate and merge digit. 246 int digit = index < length ? Character.digit(buffer[index], 16) : -1; 247 code = code << 4 | digit; 248 249 // If invalid digit. 250 if (code < 0) { 251 break; 252 } 253 254 // On to next character. 255 index++; 256 } 257 258 // Skip digits even if error. 259 width = index - position; 260 261 // If all digits are good. 262 if (code >= 0) { 263 character = (char)code; 264 } else { 265 log.error(position, Errors.IllegalUnicodeEsc); 266 } 267 268 // Return true even if error so that the invalid unicode escape is skipped. 269 return true; 270 } 271 272 /** 273 * Return the current position in the character buffer. 274 * 275 * @return current position in the character buffer. 276 */ 277 protected int position() { 278 return position; 279 } 280 281 282 /** 283 * Reset the reader to the specified position. 284 * Warning: Do not use when previous character was an ASCII or unicode backslash. 285 * @param pos 286 */ 287 protected void reset(int pos) { 288 position = pos; 289 width = 0; 290 wasBackslash = false; 291 nextCodePoint(); 292 } 293 294 /** 295 * Return the current character in at the current position. 296 * 297 * @return current character in at the current position. 298 */ 299 protected char get() { 300 return character; 301 } 302 303 /** 304 * Return the current codepoint in at the current position. 305 * 306 * @return current codepoint in at the current position. 307 */ 308 protected int getCodepoint() { 309 return codepoint; 310 } 311 312 /** 313 * Returns true if the current codepoint is a surrogate. 314 * 315 * @return true if the current codepoint is a surrogate. 316 */ 317 protected boolean isSurrogate() { 318 return 0xFFFF < codepoint; 319 } 320 321 /** 322 * Returns true if the current character is ASCII. 323 * 324 * @return true if the current character is ASCII. 325 */ 326 protected boolean isASCII() { 327 return character <= 0x7F; 328 } 329 330 /** 331 * Advances the current character to the next character. 332 * 333 * @return next character. 334 */ 335 protected char next() { 336 nextCodePoint(); 337 338 return character; 339 } 340 341 /** 342 * Compare character. Returns true if a match. 343 * 344 * @param ch character to match. 345 * 346 * @return true if a match. 347 */ 348 protected boolean is(char ch) { 349 return character == ch; 350 } 351 352 /** 353 * Match one of the arguments. Returns true if a match. 354 */ 355 protected boolean isOneOf(char ch1, char ch2) { 356 return is(ch1) || is(ch2); 357 } 358 protected boolean isOneOf(char ch1, char ch2, char ch3) { 359 return is(ch1) || is(ch2) || is(ch3); 360 } 361 protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) { 362 return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6); 363 } 364 365 /** 366 * Tests to see if current character is in the range of lo to hi characters (inclusive). 367 * 368 * @param lo lowest character in range. 369 * @param hi highest character in range. 370 * 371 * @return true if the current character is in range. 372 */ 373 protected boolean inRange(char lo, char hi) { 374 return lo <= character && character <= hi; 375 } 376 377 /** 378 * Compare character and advance if a match. Returns true if a match. 379 * 380 * @param ch character to match. 381 * 382 * @return true if a match. 383 */ 384 protected boolean accept(char ch) { 385 if (is(ch)) { 386 next(); 387 388 return true; 389 } 390 391 return false; 392 } 393 394 /** 395 * Match one of the arguments and advance if a match. Returns true if a match. 396 */ 397 protected boolean acceptOneOf(char ch1, char ch2) { 398 if (isOneOf(ch1, ch2)) { 399 next(); 400 401 return true; 402 } 403 404 return false; 405 } 406 407 protected boolean acceptOneOf(char ch1, char ch2, char ch3) { 408 if (isOneOf(ch1, ch2, ch3)) { 409 next(); 410 411 return true; 412 } 413 414 return false; 415 } 416 417 /** 418 * Skip over all occurances of character. 419 * 420 * @param ch character to accept. 421 */ 422 protected void skip(char ch) { 423 while (accept(ch)) { 424 // next 425 } 426 } 427 428 /** 429 * Skip over ASCII white space characters. 430 */ 431 protected void skipWhitespace() { 432 while (acceptOneOf(' ', '\t', '\f')) { 433 // next 434 } 435 } 436 437 /** 438 * Skip to end of line. 439 */ 440 protected void skipToEOLN() { 441 while (isAvailable()) { 442 if (isOneOf('\r', '\n')) { 443 break; 444 } 445 446 next(); 447 } 448 449 } 450 451 /** 452 * Compare string and advance if a match. Returns true if a match. 453 * Warning: Do not use when previous character was a backslash 454 * (confuses state of wasBackslash.) 455 * 456 * @param string string to match character for character. 457 * 458 * @return true if a match. 459 */ 460 protected boolean accept(String string) { 461 // Quick test. 462 if (string.length() == 0 || !is(string.charAt(0))) { 463 return false; 464 } 465 466 // Be prepared to retreat if not a match. 467 int savedPosition = position; 468 469 nextCodePoint(); 470 471 // Check each character. 472 for (int i = 1; i < string.length(); i++) { 473 if (!is(string.charAt(i))) { 474 // Restart if not a match. 475 reset(savedPosition); 476 477 return false; 478 } 479 480 nextCodePoint(); 481 } 482 483 return true; 484 } 485 486 /** 487 * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not 488 * advance character. 489 * 490 * @param pos starting position. 491 * @param digitRadix base of number being converted. 492 * 493 * @return value of digit. 494 */ 495 protected int digit(int pos, int digitRadix) { 496 int result; 497 498 // Just an ASCII digit. 499 if (inRange('0', '9')) { 500 // Fast common case. 501 result = character - '0'; 502 503 return result < digitRadix ? result : -1; 504 } 505 506 // Handle other digits. 507 result = isSurrogate() ? Character.digit(codepoint, digitRadix) : 508 Character.digit(character, digitRadix); 509 510 if (result >= 0 && !isASCII()) { 511 log.error(position(), Errors.IllegalNonasciiDigit); 512 character = "0123456789abcdef".charAt(result); 513 } 514 515 return result; 516 } 517 518 /** 519 * Returns the input buffer. Unicode escape sequences are not translated. 520 * 521 * @return the input buffer. 522 */ 523 public char[] getRawCharacters() { 524 return length == buffer.length ? buffer : Arrays.copyOf(buffer, length); 525 } 526 527 /** 528 * Returns a copy of a character array subset of the input buffer. 529 * The returned array begins at the {@code beginIndex} and 530 * extends to the character at index {@code endIndex - 1}. 531 * Thus the length of the substring is {@code endIndex-beginIndex}. 532 * This behavior is like 533 * {@code String.substring(beginIndex, endIndex)}. 534 * Unicode escape sequences are not translated. 535 * 536 * @param beginIndex the beginning index, inclusive. 537 * @param endIndex the ending index, exclusive. 538 * 539 * @throws ArrayIndexOutOfBoundsException if either offset is outside of the 540 * array bounds 541 */ 542 public char[] getRawCharacters(int beginIndex, int endIndex) { 543 return Arrays.copyOfRange(buffer, beginIndex, endIndex); 544 } 545 546 /** 547 * This is a specialized version of UnicodeReader that keeps track of the 548 * column position within a given character stream. Used for Javadoc 549 * processing to build a table for mapping positions in the comment string 550 * to positions in the source file. 551 */ 552 static class PositionTrackingReader extends UnicodeReader { 553 /** 554 * Offset from the beginning of the original reader buffer. 555 */ 556 private final int offset; 557 558 /** 559 * Current column in the comment. 560 */ 561 private int column; 562 563 /** 564 * Constructor. 565 * 566 * @param sf Scan factory. 567 * @param array Array containing contents of source. 568 * @param offset Position offset in original source buffer. 569 */ 570 protected PositionTrackingReader(ScannerFactory sf, char[] array, int offset) { 571 super(sf, array, array.length); 572 this.offset = offset; 573 this.column = 0; 574 } 575 576 /** 577 * Advances the current character to the next character. Tracks column. 578 * 579 * @return next character. 580 */ 581 @Override 582 protected char next() { 583 super.next(); 584 585 if (isOneOf('\n', '\r', '\f')) { 586 column = 0; 587 } else if (is('\t')) { 588 column = tabulate(column); 589 } else { 590 column++; 591 } 592 593 return get(); 594 } 595 596 /** 597 * Returns the current column. 598 * 599 * @return the current column. 600 */ 601 protected int column() { 602 return column; 603 } 604 605 /** 606 * Returns position relative to the original source buffer. 607 * 608 * @return 609 */ 610 protected int offsetPosition() { 611 return position() + offset; 612 } 613 } 614 615 } 616