1 // $Id: stream.cpp,v 1.85 2004/03/25 13:32:28 ericb Exp $
2 //
3 // This software is subject to the terms of the IBM Jikes Compiler
4 // License Agreement available at the following URL:
5 // http://ibm.com/developerworks/opensource/jikes.
6 // Copyright (C) 1996, 2004 IBM Corporation and others. All Rights Reserved.
7 // You must accept the terms of that agreement to use this software.
8 //
9
10 #include "stream.h"
11 #include "code.h"
12 #include "zip.h"
13 #include "symbol.h"
14 #include "control.h"
15 #include "semantic.h"
16 #include "javasym.h"
17 #include "option.h"
18 #include "tab.h"
19
20 #ifdef HAVE_JIKES_NAMESPACE
21 namespace Jikes { // Open namespace Jikes block
22 #endif
23
24 // Class StreamError
25
getSeverity()26 JikesError::JikesErrorSeverity StreamError::getSeverity()
27 {
28 // Most Lexical errors are ERRORs.
29 return kind >= StreamError::LAST_CHARACTER_NOT_NEWLINE
30 ? JikesError::JIKES_WARNING : JikesError::JIKES_ERROR;
31 }
32
getFileName()33 const char* StreamError::getFileName()
34 {
35 assert(lex_stream);
36 return lex_stream -> FileName();
37 }
38
getErrorMessage()39 const wchar_t* StreamError::getErrorMessage()
40 {
41 switch (kind)
42 {
43 case BAD_TOKEN:
44 return L"Illegal token ignored.";
45 case EMPTY_CHARACTER_CONSTANT:
46 return L"Empty character constant.";
47 case UNTERMINATED_CHARACTER_CONSTANT:
48 return L"Character constant not properly terminated.";
49 case MULTI_CHARACTER_CONSTANT:
50 return L"Character constant must be only one character.";
51 case ESCAPE_EXPECTED:
52 return L"Escape sequence required for this character constant.";
53 case UNTERMINATED_COMMENT:
54 return L"Comment not properly terminated.";
55 case UNTERMINATED_STRING_CONSTANT:
56 return L"String constant not properly terminated.";
57 case INVALID_HEX_CONSTANT:
58 return L"The hexadecimal prefix '0x' must be followed by at least one "
59 L"hex digit.";
60 case INVALID_FLOATING_HEX_EXPONENT:
61 return L"A hexadecimal floating point literal must have an exponent "
62 L"'p' designator.";
63 case INVALID_FLOATING_HEX_MANTISSA:
64 return L"A hexadecimal floating point literal must have at least one "
65 L"hex digit between the prefix '0x' and exponent 'p'.";
66 case INVALID_FLOATING_HEX_PREFIX:
67 return L"A hexadecimal floating point literal must start with the "
68 L"prefix '0x'.";
69 case INVALID_OCTAL_CONSTANT:
70 return L"The octal prefix '0' must not be followed by '8' or '9'.";
71 case INVALID_FLOATING_EXPONENT:
72 return L"A floating point exponent must have at least one digit.";
73 case INVALID_UNICODE_ESCAPE:
74 return L"Invalid unicode escape character.";
75 case INVALID_ESCAPE_SEQUENCE:
76 return L"Invalid escape sequence.";
77 case LAST_CHARACTER_NOT_NEWLINE:
78 return L"While not necessary, it is a good idea to end a file with a "
79 L"line terminator.";
80 case DEPRECATED_IDENTIFIER_ASSERT:
81 return L"The use of \"assert\" as an identifier is deprecated, "
82 L"as it is now a keyword. Use -source 1.4 if you intended "
83 L"to make use of assertions.";
84 case DEPRECATED_IDENTIFIER_ENUM:
85 return L"The use of \"enum\" as an identifier is deprecated, "
86 L"as it will be a keyword once -source 1.5 is implemented.";
87 case DOLLAR_IN_IDENTIFIER:
88 return L"The use of \"$\" in an identifier, while legal, is strongly "
89 L"discouraged, since it can conflict with compiler-generated "
90 L"names. If you are trying to access a nested type, use \".\" "
91 L"instead of \"$\".";
92 case FAVOR_CAPITAL_L_SUFFIX:
93 return L"The L suffix is preferred over the l suffix because l "
94 L"(lowercase L) is easily confused with 1 (the digit 1).";
95 default:
96 assert(false);
97 }
98
99 return L"Unknown Error";
100 }
101
102 bool StreamError::emacs_style_report = false;
103
getErrorReport()104 const wchar_t* StreamError::getErrorReport()
105 {
106 //
107 // We need to use this lazy initialization, because we can't to it in
108 // Initialize() method. Reason is that Find* methods are unusable until
109 // LexStream::CompressSpace is called, which does not happen until later
110 // after scanning is done and all errors are reported.
111 //
112 if (! initialized)
113 {
114 left_line_no = lex_stream -> FindLine(start_location);
115 left_column_no = lex_stream -> FindColumn(start_location - 1) + 1;
116 right_line_no = lex_stream -> FindLine(end_location);
117 right_column_no = lex_stream -> FindColumn(end_location);
118 initialized = true;
119 }
120
121 return emacs_style_report ? emacsErrorString() : regularErrorString();
122 }
123
emacsErrorString()124 const wchar_t* StreamError::emacsErrorString()
125 {
126 ErrorString s;
127
128 s << getFileName()
129 << ':' << left_line_no << ':' << left_column_no
130 << ':' << right_line_no << ':' << right_column_no
131 << ": Lexical " << getSeverityString() << ": " << getErrorMessage();
132
133 return s.Array();
134 }
135
136
regularErrorString()137 const wchar_t* StreamError::regularErrorString()
138 {
139 ErrorString s;
140
141 assert(lex_stream);
142 lex_stream -> OutputSource(this, s);
143
144 s << endl << "*** Lexical " << getSeverityString() << ": "
145 << getErrorMessage();
146
147 return s.Array();
148 }
149
150
Initialize(StreamErrorKind kind_,unsigned start,unsigned end,LexStream * l)151 void StreamError::Initialize(StreamErrorKind kind_, unsigned start,
152 unsigned end, LexStream* l)
153 {
154 kind = kind_;
155 start_location = start;
156 end_location = end;
157 lex_stream = l;
158 }
159
StreamError()160 StreamError::StreamError() : initialized(false)
161 {
162 }
163
164
165 // Class Stream
166
//
// A new Stream has no input buffer and, in builds with encoding support,
// no decoder. The "no decoder" value is NULL for ICU and (iconv_t) -1 for
// iconv, the same values DestroyEncoding() resets to.
//
Stream::Stream()
    : input_buffer(NULL),
      input_buffer_length(0)
#if defined(HAVE_LIBICU_UC)
    , _decoder(NULL)
#elif defined(JIKES_ICONV_ENCODING)
    , _decoder((iconv_t) - 1)
#endif
{
}
177
//
// Tear down the stream: first the input storage (DestroyInput, presumably
// freeing input_buffer), then, in encoding-aware builds, the decoder.
//
Stream::~Stream()
{
    DestroyInput();
#ifdef HAVE_ENCODING
    // Closes the ICU/iconv decoder if one is open.
    DestroyEncoding();
#endif // HAVE_ENCODING
}
185
186 #ifdef HAVE_ENCODING
187
// This method returns true if the given encoding can be supported.
// It is static because we need to be able to query encodings
// without an instance.
191
IsSupportedEncoding(char * encoding)192 bool Stream::IsSupportedEncoding(char* encoding)
193 {
194 // Create a tmp object instead of duplicating
195 // the code in SetEncoding and DestroyEncoding
196 Stream* tmp = new Stream();
197 bool supported = tmp -> SetEncoding(encoding);
198 delete tmp;
199 return supported;
200 }
201
SetEncoding(char * encoding)202 bool Stream::SetEncoding(char* encoding)
203 {
204 assert(encoding);
205 DestroyEncoding();
206
207 # if defined(HAVE_LIBICU_UC)
208 UErrorCode err = U_ZERO_ERROR;
209 _decoder = ucnv_open(encoding, &err);
210 # elif defined(JIKES_ICONV_ENCODING)
211 _decoder = iconv_open(JIKES_ICONV_ENCODING, encoding);
212 # endif
213
214 return HaveDecoder();
215 }
216
DestroyEncoding()217 void Stream::DestroyEncoding()
218 {
219 if (HaveDecoder())
220 {
221 # if defined(HAVE_LIBICU_UC)
222 ucnv_close(_decoder);
223 _decoder = NULL;
224 # elif defined(JIKES_ICONV_ENCODING)
225 iconv_close(_decoder);
226 _decoder = (iconv_t)-1;
227 # endif
228 }
229 }
230
231
232 // FIXME: We may want to inline this next method
233
234 // nah... I wanna get rid of this method instead.
235
DecodeNextCharacter()236 wchar_t Stream::DecodeNextCharacter()
237 {
238 const char* before = source_ptr;
239 wchar_t next;
240 error_decode_next_character = false;
241
242 # if defined(HAVE_LIBICU_UC)
243
244 if (!HaveDecoder())
245 return (wchar_t) *source_ptr++;
246
247 UErrorCode err = U_ZERO_ERROR;
248 next = ucnv_getNextUChar(_decoder, &source_ptr, source_tail + 1, &err);
249
250 if (U_FAILURE(err))
251 {
252 fprintf(stderr,"Conversion error: %s at byte %d\n",
253 u_errorName(err),
254 int(before - data_buffer)
255 );
256 error_decode_next_character = true;
257 return 0;
258 }
259
260 # elif defined(JIKES_ICONV_ENCODING)
261
262 if (!HaveDecoder()) {
263 // you can't just cast a char to a wchar_t, since that would
264 // sign extend the results, which if wchar_t is 4 bytes will
265 // lead the parser to segfault because it calculates a table
266 // offset based on the char.
267 return (wchar_t) ((*source_ptr++) & 0x00FF);
268 }
269
270 wchar_t* chp = &next;
271 size_t chl = sizeof(wchar_t);
272 size_t srcl = 1;
273
274 try_it_again:
275 size_t n = iconv(_decoder,
276 # ifdef HAVE_ERROR_CALL_ICONV_CONST
277 (char**)
278 # endif // HAVE_ERROR_CALL_ICONV_CONST
279 &source_ptr, &srcl,
280 (char**) &chp, &chl);
281
282 if (n == (size_t) -1)
283 {
284 if (errno == EINVAL && before + srcl + 1 <= source_tail) {
285 srcl++; //we're on a multibyte input and it didn't fit in srcl
286 goto try_it_again; //so we increase the window if there is space
287 // and try again. This is the ultimate hack. I hate it.
288 }
289 else
290 {
291 fprintf(stderr,"Charset conversion error at offset %d: ",
292 (int) (before - data_buffer));
293 perror("");
294 error_decode_next_character = true;
295 return 0;
296 }
297 }
298
299 # if JIKES_ICONV_NEEDS_BYTE_SWAP
300 char tmp;
301 char* targ = (char*) &next;
302 # if SIZEOF_WCHAR_T == 2
303 tmp = targ[0];
304 targ[0] = targ[1];
305 targ[1] = tmp;
306 # elif SIZEOF_WCHAR_T == 4
307 tmp = targ[0];
308 targ[0] = targ[3];
309 targ[3] = tmp;
310 tmp = targ[1];
311 targ[1] = targ[2];
312 targ[2] = tmp;
313 # else
314 # error sizeof(wchar_t) unworkable, this should not have passed configure
315 # endif //sizeof(wchar_t)
316
317 # endif // JIKES_ICONV_NEEDS_BYTE_SWAP
318
319 # endif // JIKES_ICONV_ENCODING
320
321 if (before == source_ptr)
322 {
323 //End of conversion
324 error_decode_next_character = true;
325 return 0;
326 }
327
328 return next;
329 }
330
331 #endif // HAVE_ENCODING
332
333
334 // Class LexStream
335
//
// Create an empty lexical stream for the given source file.
//
// The raw array views (tokens, comments, locations) start out NULL; they
// are only set by CompressSpace(). initial_reading_of_input starts true
// and is cleared at the end of ReadInput().
//
LexStream::LexStream(Control& control_, FileSymbol* file_symbol_)
    : file_symbol(file_symbol_),
#ifdef JIKES_DEBUG
      file_read(false),
#endif
      index(0),
      tokens(NULL),
      token_stream(12, 16), // tuple sizing parameters; meaning not visible here
      comments(NULL),
      comment_stream(10, 8),
      locations(NULL),
      line_location(12, 8),
      package(0),
      initial_reading_of_input(true),
      comment_buffer(NULL),
      control(control_)
{
    // Side effect on shared state: every LexStream construction rewrites
    // the report format used by all StreamErrors. The one-line form is
    // selected whenever option.errors is off.
    StreamError::emacs_style_report = ! control_.option.errors;
}
355
//
// Release the input held by this stream. Debug builds also add this file's
// line count to the running total kept in control.
//
LexStream::~LexStream()
{
#ifdef JIKES_DEBUG
    // Only count files whose input was actually processed (file_read is
    // set by ProcessInputAscii / ProcessInputUnicode).
    // NOTE(review): the "- 3" presumably discounts bookkeeping entries in
    // line_location that are not real source lines -- confirm.
    if (file_read)
        control.line_count += (line_location.Length() - 3);
#endif

    DestroyInput();
}
365
366
KeywordName(int kind)367 const wchar_t* LexStream::KeywordName(int kind)
368 {
369 switch (kind)
370 {
371 case TK_abstract: return StringConstant::US_abstract;
372 case TK_assert: return StringConstant::US_assert;
373 case TK_boolean: return StringConstant::US_boolean;
374 case TK_break: return StringConstant::US_break;
375 case TK_byte: return StringConstant::US_byte;
376 case TK_case: return StringConstant::US_case;
377 case TK_catch: return StringConstant::US_catch;
378 case TK_char: return StringConstant::US_char;
379 case TK_class: return StringConstant::US_class;
380 case TK_const: return StringConstant::US_const;
381 case TK_continue: return StringConstant::US_continue;
382 case TK_default: return StringConstant::US_default;
383 case TK_do: return StringConstant::US_do;
384 case TK_double: return StringConstant::US_double;
385 case TK_else: return StringConstant::US_else;
386 case TK_enum: return StringConstant::US_enum;
387 case TK_extends: return StringConstant::US_extends;
388 case TK_false: return StringConstant::US_false;
389 case TK_final: return StringConstant::US_final;
390 case TK_finally: return StringConstant::US_finally;
391 case TK_float: return StringConstant::US_float;
392 case TK_for: return StringConstant::US_for;
393 case TK_goto: return StringConstant::US_goto;
394 case TK_if: return StringConstant::US_if;
395 case TK_implements: return StringConstant::US_implements;
396 case TK_import: return StringConstant::US_import;
397 case TK_instanceof: return StringConstant::US_instanceof;
398 case TK_int: return StringConstant::US_int;
399 case TK_interface: return StringConstant::US_interface;
400 case TK_long: return StringConstant::US_long;
401 case TK_native: return StringConstant::US_native;
402 case TK_new: return StringConstant::US_new;
403 case TK_null: return StringConstant::US_null;
404 case TK_package: return StringConstant::US_package;
405 case TK_private: return StringConstant::US_private;
406 case TK_protected: return StringConstant::US_protected;
407 case TK_public: return StringConstant::US_public;
408 case TK_return: return StringConstant::US_return;
409 case TK_short: return StringConstant::US_short;
410 case TK_static: return StringConstant::US_static;
411 case TK_strictfp: return StringConstant::US_strictfp;
412 case TK_super: return StringConstant::US_super;
413 case TK_switch: return StringConstant::US_switch;
414 case TK_synchronized: return StringConstant::US_synchronized;
415 case TK_this: return StringConstant::US_this;
416 case TK_throw: return StringConstant::US_throw;
417 case TK_throws: return StringConstant::US_throws;
418 case TK_transient: return StringConstant::US_transient;
419 case TK_true: return StringConstant::US_true;
420 case TK_try: return StringConstant::US_try;
421 case TK_void: return StringConstant::US_void;
422 case TK_volatile: return StringConstant::US_volatile;
423 case TK_while: return StringConstant::US_while;
424
425 case TK_PLUS_PLUS: return StringConstant::US_PLUS_PLUS;
426 case TK_MINUS_MINUS: return StringConstant::US_MINUS_MINUS;
427 case TK_EQUAL_EQUAL: return StringConstant::US_EQUAL_EQUAL;
428 case TK_LESS_EQUAL: return StringConstant::US_LESS_EQUAL;
429 case TK_GREATER_EQUAL: return StringConstant::US_GREATER_EQUAL;
430 case TK_NOT_EQUAL: return StringConstant::US_NOT_EQUAL;
431 case TK_LEFT_SHIFT: return StringConstant::US_LEFT_SHIFT;
432 case TK_RIGHT_SHIFT: return StringConstant::US_RIGHT_SHIFT;
433 case TK_UNSIGNED_RIGHT_SHIFT:
434 return StringConstant::US_UNSIGNED_RIGHT_SHIFT;
435 case TK_PLUS_EQUAL: return StringConstant::US_PLUS_EQUAL;
436 case TK_MINUS_EQUAL: return StringConstant::US_MINUS_EQUAL;
437 case TK_MULTIPLY_EQUAL: return StringConstant::US_MULTIPLY_EQUAL;
438 case TK_DIVIDE_EQUAL: return StringConstant::US_DIVIDE_EQUAL;
439 case TK_AND_EQUAL: return StringConstant::US_AND_EQUAL;
440 case TK_OR_EQUAL: return StringConstant::US_OR_EQUAL;
441 case TK_XOR_EQUAL: return StringConstant::US_XOR_EQUAL;
442 case TK_REMAINDER_EQUAL: return StringConstant::US_REMAINDER_EQUAL;
443 case TK_LEFT_SHIFT_EQUAL: return StringConstant::US_LEFT_SHIFT_EQUAL;
444 case TK_RIGHT_SHIFT_EQUAL:
445 return StringConstant::US_RIGHT_SHIFT_EQUAL;
446 case TK_UNSIGNED_RIGHT_SHIFT_EQUAL:
447 return StringConstant::US_UNSIGNED_RIGHT_SHIFT_EQUAL;
448 case TK_OR_OR: return StringConstant::US_OR_OR;
449 case TK_AND_AND: return StringConstant::US_AND_AND;
450
451 case TK_PLUS: return StringConstant::US_PLUS;
452 case TK_MINUS: return StringConstant::US_MINUS;
453 case TK_NOT: return StringConstant::US_NOT;
454 case TK_REMAINDER: return StringConstant::US_REMAINDER;
455 case TK_XOR: return StringConstant::US_XOR;
456 case TK_AND: return StringConstant::US_AND;
457 case TK_MULTIPLY: return StringConstant::US_MULTIPLY;
458 case TK_OR: return StringConstant::US_OR;
459 case TK_TWIDDLE: return StringConstant::US_TWIDDLE;
460 case TK_DIVIDE: return StringConstant::US_DIVIDE;
461 case TK_GREATER: return StringConstant::US_GREATER;
462 case TK_LESS: return StringConstant::US_LESS;
463 case TK_LPAREN: return StringConstant::US_LPAREN;
464 case TK_RPAREN: return StringConstant::US_RPAREN;
465 case TK_LBRACE: return StringConstant::US_LBRACE;
466 case TK_RBRACE: return StringConstant::US_RBRACE;
467 case TK_LBRACKET: return StringConstant::US_LBRACKET;
468 case TK_RBRACKET: return StringConstant::US_RBRACKET;
469 case TK_SEMICOLON: return StringConstant::US_SEMICOLON;
470 case TK_QUESTION: return StringConstant::US_QUESTION;
471 case TK_COLON: return StringConstant::US_COLON;
472 case TK_COMMA: return StringConstant::US_COMMA;
473 case TK_DOT: return StringConstant::US_DOT;
474 case TK_ELLIPSIS: return StringConstant::US_DOT_DOT_DOT;
475 case TK_AT: return StringConstant::US_AT;
476 case TK_EQUAL: return StringConstant::US_EQUAL;
477 case TK_EOF: return StringConstant::US_EOF;
478 default: break;
479 }
480 return StringConstant::US_EMPTY;
481 }
482
483
RightColumn(TokenIndex i)484 unsigned LexStream::RightColumn(TokenIndex i)
485 {
486 if (! input_buffer)
487 return 0;
488 unsigned location = tokens[i].Location() - 1 +
489 (NameSymbol(i) || LiteralSymbol(i)
490 ? tokens[i].additional_info.symbol -> NameLength()
491 : wcslen(KeywordName(tokens[i].Kind())));
492 return FindColumn(location);
493 }
494
NameString(TokenIndex i)495 const wchar_t* LexStream::NameString(TokenIndex i)
496 {
497 return NameSymbol(i) || LiteralSymbol(i)
498 ? tokens[i].additional_info.symbol -> Name()
499 : KeywordName(tokens[i].Kind());
500 }
501
NameStringLength(TokenIndex i)502 unsigned LexStream::NameStringLength(TokenIndex i)
503 {
504 return NameSymbol(i) || LiteralSymbol(i)
505 ? tokens[i].additional_info.symbol -> NameLength()
506 : wcslen(KeywordName(tokens[i].Kind()));
507 }
508
LineLength(unsigned line_no)509 unsigned LexStream::LineLength(unsigned line_no)
510 {
511 assert(input_buffer && locations);
512 return Tab::Wcslen(input_buffer, locations[line_no],
513 locations[line_no + 1] - 2); // ignore the \n
514 }
515
LineSegmentLength(TokenIndex i)516 unsigned LexStream::LineSegmentLength(TokenIndex i)
517 {
518 return Tab::Wcslen(input_buffer, tokens[i].Location(),
519 LineEnd(Line(i)));
520 }
521
522 //
523 // If the token represents a literal, this returns the literal symbol
524 // associated with it.
525 //
LiteralSymbol(TokenIndex i)526 class LiteralSymbol* LexStream::LiteralSymbol(TokenIndex i)
527 {
528 assert(i < (unsigned) token_stream.Length());
529 Symbol* symbol = tokens[i].additional_info.symbol;
530 return (symbol && Kind(i) != TK_LBRACE)
531 ? symbol -> LiteralCast() : (class LiteralSymbol*) NULL;
532 }
533
534
//
// If the token represents a name, this returns the name symbol
// associated with it.
//
NameSymbol(TokenIndex i)539 class NameSymbol* LexStream::NameSymbol(TokenIndex i)
540 {
541 assert(i < (unsigned) token_stream.Length());
542 Symbol* symbol = tokens[i].additional_info.symbol;
543 return (symbol && Kind(i) != TK_LBRACE)
544 ? symbol -> NameCast() : (class NameSymbol*) NULL;
545 }
546
547
548 //
549 // Name of input file where the token appeared.
550 //
FileName()551 char* LexStream::FileName() { return file_symbol -> FileName(); }
FileNameLength()552 unsigned LexStream::FileNameLength()
553 {
554 return file_symbol -> FileNameLength();
555 }
556
557
558 //
559 //
560 //
void LexStream::CompressSpace()
{
    //
    // Capture a raw array view of each growable tuple. Until this has run,
    // tokens, comments and locations are still NULL from the constructor;
    // FindLine and FindColumn assert on locations, so they are only usable
    // afterwards.
    // NOTE(review): whether Array() also compacts or reallocates the
    // tuple's storage is not visible here -- confirm in the tuple class.
    //
    tokens = token_stream.Array();
    comments = comment_stream.Array();
    locations = line_location.Array();
    types = type_index.Array();
}
568
569
570 //
571 // Outputs a line of source code, flattening literal TABs into spaces for
572 // uniform output spacing.
573 //
OutputLine(unsigned line_no,ErrorString & s)574 void LexStream::OutputLine(unsigned line_no, ErrorString& s)
575 {
576 assert(line_no);
577 unsigned line_end = LineEnd(line_no);
578 bool expand = Coutput.ExpandWchar();
579 for (unsigned i = LineStart(line_no), offset = 0; i <= line_end;
580 i++, offset++)
581 {
582 wchar_t ch = input_buffer[i];
583 if (ch == U_CARRIAGE_RETURN || ch == U_LINE_FEED)
584 s << (wchar_t) U_LINE_FEED;
585 else if (ch == U_HORIZONTAL_TAB)
586 {
587 s.width(Tab::TabSize() - offset % Tab::TabSize());
588 s << (wchar_t) U_SPACE;
589 offset = Tab::TabSize() - 1;
590 }
591 else if (ch == U_NULL)
592 {
593 s << (expand ? "\\u0000" : "?");
594 }
595 else
596 {
597 if (expand && (ch < U_SPACE || ch >= 0x0ff))
598 offset += 5;
599 s << ch;
600 }
601 }
602 }
603
604
605 //
606 // Outputs the section of source code which is in error.
607 //
OutputSource(JikesError * err,ErrorString & s)608 void LexStream::OutputSource(JikesError* err, ErrorString& s)
609 {
610 int left_line_no = err -> getLeftLineNo();
611 int left_column_no = err -> getLeftColumnNo();
612 int right_line_no = err -> getRightLineNo();
613 int right_column_no = err -> getRightColumnNo();
614 if (left_line_no == 0)
615 s << endl;
616 else if (left_line_no >= right_line_no)
617 {
618 s << endl << endl;
619 s.width(6);
620 s << left_line_no << ". ";
621 OutputLine(left_line_no, s);
622
623 s.width(left_column_no + 8);
624 s << '^';
625 if (left_column_no < right_column_no)
626 {
627 s.width(right_column_no - left_column_no);
628 s.fill('-');
629 s << "^";
630 s.fill(' ');
631 }
632 }
633 else // multi-line
634 {
635 s << endl << endl;
636 s.width(left_column_no + 8);
637 s << "<";
638 s.width(LineLength(left_line_no) - left_column_no);
639 s.fill('-');
640 s << "" << endl;
641 s.fill(' ');
642
643 s.width(6);
644 s << left_line_no << ". ";
645 OutputLine(left_line_no, s);
646 if (right_line_no > left_line_no + 1)
647 s << " . . ." << endl;
648 s.width(6);
649 s << right_line_no << ". ";
650 OutputLine(right_line_no, s);
651
652 s.width(8);
653 s << "";
654 s.width(right_column_no);
655 s.fill('-');
656 s << ">";
657 s.fill(' ');
658 }
659 }
660
661
662 //
663 // Find and return the index of the first comment that immediately follows
664 // tok. Return 0 if there is not a comment that immediately follows tok.
665 //
FirstComment(TokenIndex tok)666 LexStream::CommentIndex LexStream::FirstComment(TokenIndex tok)
667 {
668 unsigned location = Location(tok);
669 int lo = 0;
670 int hi = comment_stream.Length() - 1;
671 unsigned i = 0;
672 if (lo < hi)
673 {
674 do
675 {
676 int mid = (lo + hi) / 2;
677 if (comment_stream[mid].location < location)
678 lo = mid + 1;
679 else hi = mid - 1;
680 } while (lo < hi);
681 i = comment_stream[lo].location > location ? lo : lo + 1;
682 }
683 return i < (unsigned) comment_stream.Length() &&
684 comment_stream[i].previous_token == tok ? i : 0;
685 }
686
687
FindLine(unsigned location)688 unsigned LexStream::FindLine(unsigned location)
689 {
690 int lo = 0;
691 int hi = line_location.Length() - 1;
692
693 assert(locations);
694
695 //
696 // we can place the exit test at the bottom of the loop
697 // since the line_location array will always contain at least
698 // one element.
699 //
700 do
701 {
702 int mid = (lo + hi) / 2;
703 if (locations[mid] == location)
704 return mid;
705 if (locations[mid] < location)
706 lo = mid + 1;
707 else hi = mid - 1;
708 } while (lo < hi);
709 return locations[lo] > location ? lo - 1 : lo;
710 }
711
FindColumn(unsigned loc)712 unsigned LexStream::FindColumn(unsigned loc)
713 {
714 assert(locations);
715 return input_buffer[loc] == U_LINE_FEED ? 0
716 : Tab::Wcslen(input_buffer, locations[FindLine(loc)], loc);
717 }
718
ReadInput()719 void LexStream::ReadInput()
720 {
721 if (file_symbol -> IsZip())
722 {
723 ZipFile* zipfile = new ZipFile(file_symbol);
724
725 if (zipfile -> Buffer() == NULL)
726 {
727 fprintf(stderr, "chaos: Don\'t know how to process compressed "
728 "(\".java\") source in a zip file\n");
729 assert(false);
730 }
731 else if (! file_symbol -> lex_stream)
732 {
733 // Once the zip file is loaded, it never changes. So, we only read
734 // it the first time
735 file_symbol -> lex_stream = this;
736 ProcessInput(zipfile -> Buffer(),
737 file_symbol -> uncompressed_size);
738 }
739 delete zipfile;
740 }
741 else
742 {
743 struct stat status;
744 JikesAPI::getInstance() -> stat(FileName(), &status);
745
746 file_symbol -> mtime = status.st_mtime; // actual time stamp of file read
747 file_symbol -> lex_stream = this;
748
749
750 JikesAPI::FileReader* file =
751 JikesAPI::getInstance() -> read(FileName());
752 if (file)
753 {
754 ProcessInput(file -> getBuffer(), file -> getBufferSize());
755 delete file;
756 }
757 }
758
759 initial_reading_of_input = false;
760 }
761
RereadInput()762 void LexStream::RereadInput()
763 {
764 if (input_buffer) // if input already available, do nothing
765 ;
766 else if (file_symbol -> IsZip())
767 {
768 ZipFile* zipfile = new ZipFile(file_symbol);
769
770 if (zipfile -> Buffer() == NULL)
771 {
772 fprintf(stderr, "chaos: Don\'t know how to process compressed "
773 "(\".java\") source in a zip file\n");
774 assert(false);
775 }
776 else ProcessInput(zipfile -> Buffer(),
777 file_symbol -> uncompressed_size);
778 delete zipfile;
779 }
780 else
781 {
782 struct stat status;
783 JikesAPI::getInstance() -> stat(FileName(), &status);
784
785 if (status.st_mtime == file_symbol -> mtime)
786 {
787 JikesAPI::FileReader* file =
788 JikesAPI::getInstance() -> read(FileName());
789 if (file)
790 {
791 ProcessInput(file -> getBuffer(), file -> getBufferSize());
792 delete file;
793 }
794 }
795 else
796 {
797 // TODO: File has changed !!!
798 }
799 }
800 }
801
802
hexvalue(wchar_t ch)803 int LexStream::hexvalue(wchar_t ch)
804 {
805 switch (ch)
806 {
807 case U_a: case U_A:
808 return 10;
809 case U_b: case U_B:
810 return 11;
811 case U_c: case U_C:
812 return 12;
813 case U_d: case U_D:
814 return 13;
815 case U_e: case U_E:
816 return 14;
817 case U_f: case U_F:
818 return 15;
819 default:
820 return ch - U_0;
821 }
822 }
823
824 //
825 // Store/convert filesize bytes from a file in the input_buffer.
826 //
827
828 #if defined(HAVE_ENCODING)
829
//
// Builds with encoding support pass all input to the decoding reader.
//
void LexStream::ProcessInput(const char* buffer, long filesize)
{
    LexStream::ProcessInputUnicode(buffer, filesize);
}
834
835 #else // defined(HAVE_ENCODING)
836
//
// Builds without encoding support treat each input byte as one character.
//
void LexStream::ProcessInput(const char* buffer, long filesize)
{
    LexStream::ProcessInputAscii(buffer, filesize);
}
841
ProcessInputAscii(const char * buffer,long filesize)842 void LexStream::ProcessInputAscii(const char* buffer, long filesize)
843 {
844 #ifdef JIKES_DEBUG
845 file_read = true;
846 #endif
847
848 wchar_t* input_ptr = AllocateInputBuffer(filesize);
849 *input_ptr = U_LINE_FEED; // Add an initial '\n' for correct line numbers.
850
851 if (buffer)
852 {
853 InitializeDataBuffer(buffer, filesize);
854
855 while (source_ptr <= source_tail)
856 {
857 // The (& 0x00ff) guarantees that quantity is unsigned value.
858 *(++input_ptr) = (*source_ptr++) & 0x00ff;
859
860 //
861 // During this pass, only flatten \u constructs. Even numbers of
862 // \\ are ignored; odd is a unicode escape, which may have
863 // unlimited u's (lowercase), then exactly 4 hex digits (no case).
864 //
865 if (*input_ptr == U_BACKSLASH)
866 {
867 if (source_ptr > source_tail)
868 {
869 // Oops, file ended on single \. This will cause an
870 // error later in the scanner, so do nothing.
871 }
872 else if (*source_ptr == U_u)
873 {
874 // Parse the unicode escape.
875 const char* u_ptr = source_ptr;
876 while (++source_ptr <= source_tail && *source_ptr == U_u);
877
878 *input_ptr = 0;
879 int i = 0;
880 bool bad_char = false;
881 for ( ; source_ptr <= source_tail && i < 4; i++)
882 {
883 const char ch = *source_ptr++;
884 switch (ch)
885 {
886 case U_a: case U_b: case U_c: case U_d:
887 case U_e: case U_f:
888 *input_ptr = (*input_ptr << 4) + (ch - (U_a - 10));
889 break;
890 case U_A: case U_B: case U_C: case U_D:
891 case U_E: case U_F:
892 *input_ptr = (*input_ptr << 4) + (ch - (U_A - 10));
893 break;
894 case U_0: case U_1: case U_2: case U_3:
895 case U_4: case U_5: case U_6: case U_7:
896 case U_8: case U_9:
897 *input_ptr = (*input_ptr << 4) + (ch - U_0);
898 break;
899 default:
900 bad_char = true;
901 *input_ptr <<= 4;
902 }
903 }
904 if (bad_char || i != 4)
905 {
906 if (initial_reading_of_input)
907 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
908 (unsigned) (input_ptr - input_buffer),
909 (unsigned) (input_ptr - input_buffer) + (source_ptr - u_ptr));
910
911 // Restore the input such that we just pass the bad
912 // escape through to the next scan.
913 source_ptr = u_ptr;
914 *input_ptr = U_BACKSLASH;
915 }
916 }
917 else
918 {
919 // All other escaped characters, including \, are just
920 // passed through to the next scan.
921 *(++input_ptr) = *source_ptr++;
922 }
923 }
924 //
925 // Replace \r with \n, \r\n with \n. Then the scanner only has
926 // to look for \n, and we can use \r as an early EOF flag.
927 //
928 if (*input_ptr == U_CARRIAGE_RETURN)
929 {
930 *input_ptr = U_LINE_FEED;
931 if (*source_ptr == U_LINE_FEED)
932 source_ptr++;
933 else if (*source_ptr == U_BACKSLASH)
934 {
935 //
936 // Remember, \u000a is U_LINE_FEED. Here, if we error out,
937 // do nothing, as the next pass through the outermost loop
938 // will catch it.
939 //
940 int i = 0;
941 while (source_ptr + i < source_tail &&
942 source_ptr[++i] == U_u);
943 if (i > 1 && (source_ptr + i + 3) <= source_tail &&
944 source_ptr[i] == U_0 && source_ptr[i + 1] == U_0 &&
945 source_ptr[i + 2] == U_0 &&
946 (source_ptr[i + 3] == U_a || source_ptr[i + 3] == U_A))
947 {
948 source_ptr += i + 4;
949 }
950 }
951 }
952 }
953 }
954
955 //
956 // To aid the scanner, we artificially remove any U_CTL_Z ending the file,
957 // and insert U_CARRIAGE_RETURN, U_NULL. This is because U_CTL_Z is legal
958 // inside comments, but // comments must end on a newline; and it is safe
959 // since the above pass converted all CR's to LF's.
960 //
961 if (*input_ptr == U_CTL_Z)
962 input_ptr--;
963 if (initial_reading_of_input && control.option.pedantic &&
964 *input_ptr != U_LINE_FEED)
965 {
966 ReportMessage(StreamError::LAST_CHARACTER_NOT_NEWLINE,
967 (unsigned) (input_ptr - input_buffer),
968 (unsigned) (input_ptr - input_buffer));
969 }
970 *(++input_ptr) = U_CARRIAGE_RETURN;
971 *(++input_ptr) = U_NULL;
972 input_buffer_length = input_ptr - input_buffer;
973 }
974
975 #endif // ! defined(HAVE_ENCODING)
976
977
978
979 #if defined(HAVE_ENCODING)
980
ProcessInputUnicode(const char * buffer,long filesize)981 void LexStream::ProcessInputUnicode(const char* buffer, long filesize)
982 {
983 //fprintf(stderr,"LexStream::ProcessInputUnicode called.\n");
984 #ifdef JIKES_DEBUG
985 file_read = true;
986 #endif
987
988 wchar_t* input_ptr = AllocateInputBuffer(filesize);
989 wchar_t* input_tail = input_ptr + filesize;
990 *input_ptr = U_LINE_FEED; // add an initial '\n';
991
992 if (buffer)
993 {
994 int escape_value = 0;
995 wchar_t* escape_ptr = NULL;
996 UnicodeLexerState saved_state = RAW;
997 UnicodeLexerState state = START;
998 bool oncemore = false;
999
1000 // If oncemore is true, ch holds the current character, otherwise
1001 // it is updated to the next character
1002 wchar_t ch = 0;
1003
1004 if (control.option.encoding)
1005 {
1006 // The encoding should have been validated by now
1007 bool encoding_set = SetEncoding(control.option.encoding);
1008 assert(encoding_set);
1009 }
1010
1011 // init data after setting the encoding
1012 InitializeDataBuffer(buffer, filesize);
1013
1014 while (HasMoreData() || oncemore)
1015 {
1016 // On each iteration we advance input_ptr a maximum of 2 positions.
1017 // Here we check if we are close to the end of input_buffer.
1018 if (input_ptr >= input_tail)
1019 {
1020 // If this happens, reallocate it with some more space.
1021 // This is very rare case, which could happen if
1022 // one code page character is represented by several
1023 // unicode characters. One of exaples of such
1024 // situation is unicode "surrogates".
1025 //
1026 // If such reallocation will be required, it will indeed
1027 // slow down compilation a bit.
1028 size_t cursize = input_ptr - input_buffer;
1029 size_t newsize = cursize + cursize / 10 + 4; // add 10%
1030 wchar_t* tmp = new wchar_t[newsize];
1031 memcpy(tmp, input_buffer, cursize * sizeof(wchar_t));
1032 delete [] input_buffer;
1033 input_buffer = tmp;
1034 input_tail = input_buffer + newsize - 1;
1035 input_ptr = input_buffer + cursize;
1036 }
1037
1038 if (! oncemore)
1039 {
1040 ch = DecodeNextCharacter();
1041 if (ErrorDecodeNextCharacter())
1042 break;
1043 }
1044 else oncemore = false;
1045
1046 switch (state)
1047 {
1048 case QUOTE:
1049 *(++input_ptr) = U_BACKSLASH;
1050 if (ch == U_BACKSLASH)
1051 {
1052 *(++input_ptr) = U_BACKSLASH;
1053 state = RAW;
1054 }
1055 else if (ch == U_u)
1056 {
1057 //
1058 // We transfer all the characters of the escape sequence,
1059 // in case it is invalid; but remember where it started
1060 // for error reporting, and to back up on success.
1061 //
1062 escape_ptr = input_ptr;
1063 *(++input_ptr) = U_u;
1064 state = UNICODE_ESCAPE;
1065 }
1066 else
1067 {
1068 state = RAW;
1069 oncemore = true;
1070 }
1071 break;
1072 case UNICODE_ESCAPE:
1073 *(++input_ptr) = ch;
1074 if (Code::IsHexDigit(ch))
1075 {
1076 state = UNICODE_ESCAPE_DIGIT_0;
1077 escape_value = hexvalue(ch) << 12;
1078 }
1079 else if (ch != U_u)
1080 {
1081 if (initial_reading_of_input)
1082 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
1083 (unsigned) (escape_ptr - input_buffer),
1084 ((unsigned) (input_ptr - input_buffer) -
1085 (Code::IsNewline(ch) ? 1 : 0)));
1086 state = RAW;
1087 }
1088 break;
1089 case UNICODE_ESCAPE_DIGIT_0:
1090 *(++input_ptr) = ch;
1091 if (Code::IsHexDigit(ch))
1092 {
1093 state = UNICODE_ESCAPE_DIGIT_1;
1094 escape_value += hexvalue(ch) << 8;
1095 }
1096 else
1097 {
1098 if (initial_reading_of_input)
1099 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
1100 (unsigned) (escape_ptr - input_buffer),
1101 ((unsigned) (input_ptr - input_buffer) -
1102 (Code::IsNewline(ch) ? 1 : 0)));
1103 state = RAW;
1104 }
1105 break;
1106 case UNICODE_ESCAPE_DIGIT_1:
1107 *(++input_ptr) = ch;
1108 if (Code::IsHexDigit(ch))
1109 {
1110 state = UNICODE_ESCAPE_DIGIT_2;
1111 escape_value += hexvalue(ch) << 4;
1112 }
1113 else
1114 {
1115 if (initial_reading_of_input)
1116 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
1117 (unsigned) (escape_ptr - input_buffer),
1118 ((unsigned) (input_ptr - input_buffer) -
1119 (Code::IsNewline(ch) ? 1 : 0)));
1120 state = RAW;
1121 }
1122 break;
1123 case UNICODE_ESCAPE_DIGIT_2:
1124 if (Code::IsHexDigit(ch))
1125 {
1126 ch = escape_value + hexvalue(ch);
1127 state = saved_state;
1128 input_ptr = escape_ptr - 1; // Back up - see case QUOTE.
1129 oncemore = true;
1130 }
1131 else
1132 {
1133 *(++input_ptr) = ch;
1134 if (initial_reading_of_input)
1135 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
1136 (unsigned) (escape_ptr - input_buffer),
1137 ((unsigned) (input_ptr - input_buffer) -
1138 (Code::IsNewline(ch) ? 1 : 0)));
1139 state = RAW;
1140 }
1141 saved_state = UNICODE_ESCAPE_DIGIT_2;
1142 break;
1143 case CR:
1144 if (ch == U_LINE_FEED)
1145 {
1146 // skip line feed if it comes right after a CR.
1147 state = RAW;
1148 }
1149 else if (ch == U_CARRIAGE_RETURN)
1150 {
1151 // but if CR follows CR then the second CR starts a
1152 // line feed too (and note that state=CR afterwards),
1153 // so that CR-CR-LF will be handled correctly.
1154 *(++input_ptr) = U_LINE_FEED;
1155 }
1156 else if (ch == U_BACKSLASH &&
1157 saved_state != UNICODE_ESCAPE_DIGIT_2)
1158 {
1159 state = QUOTE;
1160 }
1161 else
1162 {
1163 state = RAW;
1164 *(++input_ptr) = ch;
1165 }
1166 // clear saved_state == UNICODE_ESCAPE_DIGIT_2 status
1167 saved_state = CR;
1168 break;
1169 case START:
1170 // if for some reason converter produced or passed
1171 // byte order mark, it have to be ignored.
1172 state = RAW;
1173 if (ch == U_BOM || ch == U_REVERSE_BOM)
1174 break; //ignore
1175 // fallthrough
1176 case RAW:
1177 if (ch == U_BACKSLASH && saved_state != UNICODE_ESCAPE_DIGIT_2)
1178 {
1179 state = QUOTE;
1180 }
1181 else if (ch == U_CARRIAGE_RETURN)
1182 {
1183 state = CR;
1184 *(++input_ptr) = U_LINE_FEED;
1185 }
1186 else
1187 {
1188 *(++input_ptr) = ch;
1189 }
1190 saved_state = RAW;
1191 break;
1192 }
1193 }
1194 if (state == QUOTE)
1195 {
1196 *(++input_ptr) = U_BACKSLASH;
1197 }
1198 else if (state >= UNICODE_ESCAPE)
1199 {
1200 if (initial_reading_of_input)
1201 ReportMessage(StreamError::INVALID_UNICODE_ESCAPE,
1202 (unsigned) (escape_ptr - input_buffer),
1203 (unsigned) (input_ptr - input_buffer));
1204 }
1205 }
1206
1207 //
1208 // To aid the scanner, we artificially remove any U_CTL_Z ending the file,
1209 // and insert U_CARRIAGE_RETURN, U_NULL. This is because U_CTL_Z is legal
1210 // inside comments, but // comments must end on a newline; and it is safe
1211 // since the above pass converted all CR's to LF's.
1212 //
1213 if (*input_ptr == U_CTL_Z)
1214 input_ptr--;
1215 if (initial_reading_of_input && control.option.pedantic &&
1216 *input_ptr != U_LINE_FEED)
1217 {
1218 ReportMessage(StreamError::LAST_CHARACTER_NOT_NEWLINE,
1219 (unsigned) (input_ptr - input_buffer),
1220 (unsigned) (input_ptr - input_buffer));
1221 }
1222 *(++input_ptr) = U_CARRIAGE_RETURN;
1223 *(++input_ptr) = U_NULL;
1224 input_buffer_length = input_ptr - input_buffer;
1225 }
1226 #endif // defined(HAVE_ENCODING)
1227
ReportMessage(StreamError::StreamErrorKind kind,unsigned start_location,unsigned end_location)1228 void LexStream::ReportMessage(StreamError::StreamErrorKind kind,
1229 unsigned start_location,
1230 unsigned end_location)
1231 {
1232 if (control.option.tolerance != JikesOption::NO_WARNINGS ||
1233 kind < StreamError::DEPRECATED_IDENTIFIER_ASSERT)
1234 {
1235 bad_tokens.Next().Initialize(kind, start_location, end_location, this);
1236 }
1237 }
1238
1239 //
1240 // This procedure uses a quick sort algorithm to sort the stream ERRORS
1241 // by their locations.
1242 //
SortMessages()1243 void LexStream::SortMessages()
1244 {
1245 int lower,
1246 upper,
1247 lostack[32],
1248 histack[32];
1249
1250 int top,
1251 i,
1252 j;
1253 StreamError pivot,
1254 temp;
1255
1256 top = 0;
1257 lostack[top] = 0;
1258 histack[top] = bad_tokens.Length() - 1;
1259
1260 while (top >= 0)
1261 {
1262 lower = lostack[top];
1263 upper = histack[top];
1264 top--;
1265
1266 while (upper > lower)
1267 {
1268 //
1269 // The array is most-likely almost sorted. Therefore,
1270 // we use the middle element as the pivot element.
1271 //
1272 i = (lower + upper) / 2;
1273 pivot = bad_tokens[i];
1274 bad_tokens[i] = bad_tokens[lower];
1275
1276 //
1277 // Split the array section indicated by LOWER and UPPER
1278 // using ARRAY(LOWER) as the pivot.
1279 //
1280 i = lower;
1281 for (j = lower + 1; j <= upper; j++)
1282 {
1283 if (bad_tokens[j].start_location < pivot.start_location)
1284 {
1285 temp = bad_tokens[++i];
1286 bad_tokens[i] = bad_tokens[j];
1287 bad_tokens[j] = temp;
1288 }
1289 }
1290 bad_tokens[lower] = bad_tokens[i];
1291 bad_tokens[i] = pivot;
1292
1293 top++;
1294 if ((i - lower) < (upper - i))
1295 {
1296 lostack[top] = i + 1;
1297 histack[top] = upper;
1298 upper = i - 1;
1299 }
1300 else
1301 {
1302 histack[top] = i - 1;
1303 lostack[top] = lower;
1304 lower = i + 1;
1305 }
1306 }
1307 }
1308 }
1309
1310
1311 //
1312 //
1313 //
PrintMessages()1314 void LexStream::PrintMessages()
1315 {
1316 //
1317 // If control.option.dump_errors then the error messages have already
1318 // been printed
1319 //
1320 if (! control.option.dump_errors)
1321 {
1322 RereadInput();
1323
1324 if (control.option.errors)
1325 {
1326 char* file_name = FileName();
1327
1328 int error_count = NumBadTokens(),
1329 warning_count = NumWarnTokens();
1330 if (error_count)
1331 {
1332 Coutput << endl << "Found " << error_count << " lexical error"
1333 << (error_count == 1 ? "" : "s");
1334 }
1335 if (warning_count)
1336 {
1337 if (error_count)
1338 Coutput << "and issued ";
1339 else
1340 Coutput << endl << "Issued ";
1341 Coutput << warning_count << " lexical warning"
1342 << (warning_count == 1 ? "" : "s");
1343 }
1344 if (error_count || warning_count)
1345 Coutput << " in \"" << file_name << "\":";
1346
1347 if (! input_buffer)
1348 {
1349 int length = FileNameLength();
1350 wchar_t* name = new wchar_t[length + 1];
1351 for (int i = 0; i < length; i++)
1352 name[i] = file_name[i];
1353 name[length] = U_NULL;
1354 control.system_semantic ->
1355 ReportSemError(SemanticError::CANNOT_REOPEN_FILE,
1356 BAD_TOKEN, name);
1357 delete [] name;
1358 }
1359 else
1360 {
1361 for (unsigned i = 0; i < bad_tokens.Length(); i++)
1362 JikesAPI::getInstance() -> reportError(&bad_tokens[i]);
1363 }
1364 }
1365 else
1366 {
1367 for (unsigned i = 0; i < bad_tokens.Length(); i++)
1368 JikesAPI::getInstance() -> reportError(&bad_tokens[i]);
1369 }
1370
1371 DestroyInput();
1372 Coutput.flush();
1373 }
1374 }
1375
1376 #ifdef HAVE_JIKES_NAMESPACE
1377 } // Close namespace Jikes block
1378 #endif
1379
1380