1 // $Id: scanner.cpp,v 1.45 2004/03/25 13:32:28 ericb Exp $
2 //
3 // This software is subject to the terms of the IBM Jikes Compiler
4 // License Agreement available at the following URL:
5 // http://ibm.com/developerworks/opensource/jikes.
6 // Copyright (C) 1996, 2004 IBM Corporation and others.  All Rights Reserved.
7 // You must accept the terms of that agreement to use this software.
8 //
9 
10 #include "scanner.h"
11 #include "control.h"
12 #include "error.h"
13 #include "javadef.h"
14 #include "javasym.h"
15 #include "option.h"
16 #include "code.h"
17 
18 #ifdef HAVE_JIKES_NAMESPACE
19 namespace Jikes { // Open namespace Jikes block
20 #endif
21 
22 int (*Scanner::scan_keyword[13]) (const wchar_t* p1) =
23 {
24     ScanKeyword0,
25     ScanKeyword0,
26     ScanKeyword2,
27     ScanKeyword3,
28     ScanKeyword4,
29     ScanKeyword5,
30     ScanKeyword6,
31     ScanKeyword7,
32     ScanKeyword8,
33     ScanKeyword9,
34     ScanKeyword10,
35     ScanKeyword0,
36     ScanKeyword12
37 };
38 
39 
40 //
41 // The constructor initializes all utility variables.
42 //
Scanner(Control & control_)43 Scanner::Scanner(Control& control_)
44     : control(control_),
45       dollar_warning_given(false),
46       deprecated(false)
47 {
48     //
49     // If this assertion fails, the Token structure in stream.h must be
50     // redesigned !!!
51     //
52     assert(NUM_TERMINALS < 128);
53     //
54     // If this assertion fails, then gencode.java is at fault.
55     //
56 #ifdef JIKES_DEBUG
57     assert(Code::CodeCheck());
58 #endif // JIKES_DEBUG
59 
60     //
61     // CLASSIFY_TOKEN is a mapping from each character into a
62     // classification routine that is invoked when that character
63     // is the first character encountered in a token.
64     //
65     for (int c = 0; c < 128; c++)
66     {
67         if (Code::IsAsciiUpper(c) || Code::IsAsciiLower(c) || c == U_DOLLAR ||
68             c == U_UNDERSCORE)
69         {
70             classify_token[c] = &Scanner::ClassifyId;
71         }
72         else if (Code::IsDecimalDigit(c))
73             classify_token[c] = &Scanner::ClassifyNumericLiteral;
74         else if (Code::IsSpace(c))
75             classify_token[c] = &Scanner::SkipSpaces;
76         else classify_token[c] = &Scanner::ClassifyBadToken;
77     }
78     classify_token[128] = &Scanner::ClassifyNonAsciiUnicode;
79 
80     classify_token[U_a] = &Scanner::ClassifyIdOrKeyword;
81     classify_token[U_b] = &Scanner::ClassifyIdOrKeyword;
82     classify_token[U_c] = &Scanner::ClassifyIdOrKeyword;
83     classify_token[U_d] = &Scanner::ClassifyIdOrKeyword;
84     classify_token[U_e] = &Scanner::ClassifyIdOrKeyword;
85     classify_token[U_f] = &Scanner::ClassifyIdOrKeyword;
86     classify_token[U_g] = &Scanner::ClassifyIdOrKeyword;
87     classify_token[U_i] = &Scanner::ClassifyIdOrKeyword;
88     classify_token[U_l] = &Scanner::ClassifyIdOrKeyword;
89     classify_token[U_n] = &Scanner::ClassifyIdOrKeyword;
90     classify_token[U_p] = &Scanner::ClassifyIdOrKeyword;
91     classify_token[U_r] = &Scanner::ClassifyIdOrKeyword;
92     classify_token[U_s] = &Scanner::ClassifyIdOrKeyword;
93     classify_token[U_t] = &Scanner::ClassifyIdOrKeyword;
94     classify_token[U_v] = &Scanner::ClassifyIdOrKeyword;
95     classify_token[U_w] = &Scanner::ClassifyIdOrKeyword;
96 
97     classify_token[U_SINGLE_QUOTE] = &Scanner::ClassifyCharLiteral;
98     classify_token[U_DOUBLE_QUOTE] = &Scanner::ClassifyStringLiteral;
99 
100     classify_token[U_PLUS] = &Scanner::ClassifyPlus;
101     classify_token[U_MINUS] = &Scanner::ClassifyMinus;
102     classify_token[U_EXCLAMATION] = &Scanner::ClassifyNot;
103     classify_token[U_PERCENT] = &Scanner::ClassifyMod;
104     classify_token[U_CARET] = &Scanner::ClassifyXor;
105     classify_token[U_AMPERSAND] = &Scanner::ClassifyAnd;
106     classify_token[U_STAR] = &Scanner::ClassifyStar;
107     classify_token[U_BAR] = &Scanner::ClassifyOr;
108     classify_token[U_TILDE] = &Scanner::ClassifyComplement;
109     classify_token[U_SLASH] = &Scanner::ClassifySlash;
110     classify_token[U_GREATER] = &Scanner::ClassifyGreater;
111     classify_token[U_LESS] = &Scanner::ClassifyLess;
112     classify_token[U_LEFT_PARENTHESIS] = &Scanner::ClassifyLparen;
113     classify_token[U_RIGHT_PARENTHESIS] = &Scanner::ClassifyRparen;
114     classify_token[U_LEFT_BRACE] = &Scanner::ClassifyLbrace;
115     classify_token[U_RIGHT_BRACE] = &Scanner::ClassifyRbrace;
116     classify_token[U_LEFT_BRACKET] = &Scanner::ClassifyLbracket;
117     classify_token[U_RIGHT_BRACKET] = &Scanner::ClassifyRbracket;
118     classify_token[U_SEMICOLON] = &Scanner::ClassifySemicolon;
119     classify_token[U_QUESTION] = &Scanner::ClassifyQuestion;
120     classify_token[U_COLON] = &Scanner::ClassifyColon;
121     classify_token[U_COMMA] = &Scanner::ClassifyComma;
122     classify_token[U_DOT] = &Scanner::ClassifyPeriod;
123     classify_token[U_EQUAL] = &Scanner::ClassifyEqual;
124     classify_token[U_AT] = &Scanner::ClassifyAt;
125 }
126 
127 
128 //
129 // Associate a lexical stream with this file. Remember, we doctored the stream
130 // to start with \n so that we always start on a whitespace token, and so that
131 // the first source code line is line 1.
132 //
Initialize(FileSymbol * file_symbol)133 void Scanner::Initialize(FileSymbol* file_symbol)
134 {
135     lex = new LexStream(control, file_symbol);
136     current_token_index = lex -> GetNextToken(); // Get 0th token.
137     current_token = &(lex -> token_stream[current_token_index]);
138     current_token -> SetKind(0);
139 
140 #ifdef JIKES_DEBUG
141     if (control.option.debug_comments)
142     {
143         // Add 0th comment.
144         LexStream::Comment* current_comment = &(lex -> comment_stream.Next());
145         current_comment -> string = NULL;
146         current_comment -> length = 0;
147         current_comment -> previous_token = BAD_TOKEN;
148         current_comment -> location = 0;
149     }
150 #endif // JIKES_DEBUG
151 
152     lex -> line_location.Next() = 0; // Mark starting location of line # 0
153 }
154 
155 
156 //
157 // This is one of the main entry point for the Java lexical analyser. Its
158 // input is the name of a regular text file. Its output is a stream of tokens.
159 //
SetUp(FileSymbol * file_symbol)160 void Scanner::SetUp(FileSymbol* file_symbol)
161 {
162     Initialize(file_symbol);
163     lex -> CompressSpace();
164     file_symbol -> lex_stream = lex;
165 }
166 
167 
168 //
169 // This is one of the main entry point for the Java lexical analyser. Its
170 // input is the name of a regular text file. Its output is a stream of tokens.
171 //
Scan(FileSymbol * file_symbol)172 void Scanner::Scan(FileSymbol* file_symbol)
173 {
174     Initialize(file_symbol);
175     lex -> ReadInput();
176     cursor = lex -> InputBuffer();
177     if (cursor)
178     {
179         Scan();
180         lex -> CompressSpace();
181 
182         if (control.option.dump_errors)
183         {
184             lex -> SortMessages();
185             for (unsigned i = 0; i < lex -> bad_tokens.Length(); i++)
186                 JikesAPI::getInstance() ->
187                     reportError(&(lex -> bad_tokens[i]));
188         }
189         lex -> DestroyInput(); // get rid of input buffer
190     }
191     else
192     {
193         delete lex;
194         lex = NULL;
195     }
196     file_symbol -> lex_stream = lex;
197 }
198 
199 
200 //
201 // Scan the InputBuffer() and process all tokens and comments.
202 //
Scan()203 void Scanner::Scan()
204 {
205     input_buffer_tail = &cursor[lex -> InputBufferLength()];
206 
207     //
208     // CURSOR is assumed to point to the next character to be scanned.
209     // Using CURSOR, we jump to the proper classification function
210     // which scans and classifies the token and returns the location of
211     // the character immediately following it.
212     //
213     do
214     {
215         //
216         // Allocate space for next token and set its location.
217         //
218         if (! current_token_index || current_token -> Kind())
219         {
220             current_token_index =
221                 lex -> GetNextToken(cursor - lex -> InputBuffer());
222             current_token = &(lex -> token_stream[current_token_index]);
223         }
224         else
225         {
226             current_token -> ResetInfoAndSetLocation(cursor -
227                                                      lex -> InputBuffer());
228         }
229         if (deprecated)
230         {
231             current_token -> SetDeprecated();
232             deprecated = false;
233         }
234         (this ->* classify_token[*cursor < 128 ? *cursor : 128])();
235     } while (cursor < input_buffer_tail);
236 
237     //
238     // Add a a gate after the last line.
239     //
240     lex -> line_location.Next() = input_buffer_tail - lex -> InputBuffer();
241     current_token -> SetKind(TK_EOF);
242 
243     //
244     // If the brace_stack is not empty, then there are unmatched left
245     // braces in the input. Each unmatched left brace should point to
246     // the EOF token as a substitute for a matching right brace.
247     //
248     assert(current_token_index == lex -> token_stream.Length() - 1);
249 
250     for (TokenIndex left_brace = brace_stack.Top();
251          left_brace; left_brace = brace_stack.Top())
252     {
253         lex -> token_stream[left_brace].SetRightBrace(current_token_index);
254         brace_stack.Pop();
255     }
256 }
257 
258 
259 //
260 // CURSOR points to the first '*' in a /**/ comment.
261 //
ScanStarComment()262 void Scanner::ScanStarComment()
263 {
264     const wchar_t* start = cursor - 1;
265     current_token -> SetKind(0);
266 #ifdef JIKES_DEBUG
267     LexStream::Comment* current_comment = NULL;
268     if (control.option.debug_comments)
269     {
270         current_comment = &(lex -> comment_stream.Next());
271         current_comment -> string = NULL;
272         current_comment -> previous_token = current_token_index - 1;
273         current_comment -> location = start - lex -> InputBuffer();
274     }
275 #endif // JIKES_DEBUG
276 
277     //
278     // If this comment starts with the prefix "/**" then it is a document
279     // comment. Check whether or not it contains the deprecated tag and if so,
280     // mark the token preceeding it. The @deprecated tag must appear at the
281     // beginning of a line. According to Sun,
282     // http://java.sun.com/j2se/1.4/docs/tooldocs/win32/javadoc.html#comments,
283     // this means ignoring whitespace, *, and /** patterns. But in practice,
284     // javac doesn't quite implement it this way, completely ignoring /**
285     // separators, and rejecting \f and \t after *<space>*.
286     // This implementation also ignores /**, but treats whitespace correctly.
287     //
288     // Note that we exploit the fact that the stream is doctored to always
289     // end in U_CARRIAGE_RETURN, U_NULL; and that we changed all CR to LF
290     // within the file.
291     //
292     if (*++cursor == U_STAR)
293     {
294         enum
295         {
296             HEADER,
297             STAR,
298             REMAINDER
299         } state = HEADER;
300         while (*cursor != U_CARRIAGE_RETURN)
301         {
302             switch (*cursor++)
303             {
304             case U_LINE_FEED:
305                 // Record new line.
306                 lex -> line_location.Next() = cursor - lex -> InputBuffer();
307                 state = HEADER;
308                 break;
309             case U_SPACE:
310             case U_FORM_FEED:
311             case U_HORIZONTAL_TAB:
312                 if (state != REMAINDER)
313                     state = HEADER;
314                 break;
315             case U_STAR:
316                 if (state != REMAINDER || *cursor == U_SLASH)
317                     state = STAR;
318                 break;
319             case U_SLASH:
320                 if (state == STAR)
321                 {
322 #ifdef JIKES_DEBUG
323                     if (control.option.debug_comments)
324                         current_comment -> length = cursor - start;
325 #endif // JIKES_DEBUG
326                     return;
327                 }
328                 // fallthrough
329             default:
330                 if (state != REMAINDER)
331                 {
332                     state = REMAINDER;
333                     if (cursor[-1] == U_AT &&
334                         cursor[0] == U_d &&
335                         cursor[1] == U_e &&
336                         cursor[2] == U_p &&
337                         cursor[3] == U_r &&
338                         cursor[4] == U_e &&
339                         cursor[5] == U_c &&
340                         cursor[6] == U_a &&
341                         cursor[7] == U_t &&
342                         cursor[8] == U_e &&
343                         cursor[9] == U_d &&
344                         (Code::IsWhitespace(cursor + 10) ||
345                          cursor[10] == U_STAR))
346                     {
347                         deprecated = true;
348                         cursor += 9;
349                     }
350                 }
351             }
352         }
353     }
354     else // normal /* */ comment
355     {
356         // Normal comments do not affect deprecation.
357         if (current_token -> Deprecated())
358             deprecated = true;
359         while (*cursor != U_CARRIAGE_RETURN)
360         {
361             if (*cursor == U_STAR) // Potential comment closer.
362             {
363                 while (*++cursor == U_STAR)
364                     ;
365                 if (*cursor == U_SLASH)
366                 {
367                     cursor++;
368 #ifdef JIKES_DEBUG
369                     if (control.option.debug_comments)
370                         current_comment -> length = cursor - start;
371 #endif // JIKES_DEBUG
372                     return;
373                 }
374                 if (*cursor == U_CARRIAGE_RETURN)
375                     break;
376             }
377             if (Code::IsNewline(*cursor++)) // Record new line.
378             {
379                 lex -> line_location.Next() = cursor - lex -> InputBuffer();
380             }
381         }
382     }
383 
384     //
385     // If we got here, we are in an unterminated comment. Discard the
386     // U_CARRIAGE_RETURN that ends the stream.
387     //
388     lex -> ReportMessage(StreamError::UNTERMINATED_COMMENT,
389                          start - lex -> InputBuffer(),
390                          cursor - lex -> InputBuffer() - 1);
391 
392 #ifdef JIKES_DEBUG
393     if (control.option.debug_comments)
394         current_comment -> length = cursor - 1 - start;
395 #endif // JIKES_DEBUG
396 }
397 
398 
399 //
400 // CURSOR points to the second '/' in a // comment.
401 //
ScanSlashComment()402 void Scanner::ScanSlashComment()
403 {
404     //
405     // Note that we exploit the fact that the stream is doctored to always
406     // end in U_CARRIAGE_RETURN, U_NULL; and that we changed all CR to LF
407     // within the file. Normal comments do not affect deprecation.
408     //
409     if (current_token -> Deprecated())
410         deprecated = true;
411     current_token -> SetKind(0);
412     while (! Code::IsNewline(*++cursor));  // Skip all until \n or EOF
413 #ifdef JIKES_DEBUG
414     if (control.option.debug_comments)
415     {
416         LexStream::Comment* current_comment = &(lex -> comment_stream.Next());
417         current_comment -> string = NULL;
418         current_comment -> previous_token = current_token_index - 1;
419         current_comment -> location = current_token -> Location();
420         current_comment -> length = (cursor - lex -> InputBuffer()) -
421             current_comment -> location;
422     }
423 #endif // JIKES_DEBUG
424 }
425 
426 
427 //
428 // This procedure is invoked to skip useless spaces in the input.
429 // It assumes upon entry that CURSOR points to the next character to
430 // be scanned.  Before returning it sets CURSOR to the location of the
431 // first non-space character following its initial position.
432 //
SkipSpaces()433 inline void Scanner::SkipSpaces()
434 {
435     //
436     // We exploit the fact that the stream was doctored to end in
437     // U_CARRIAGE_RETURN, U_NULL; and that all internal CR were changed to LF.
438     // Normal comments do not affect deprecation.
439     //
440     if (current_token -> Deprecated())
441         deprecated = true;
442     current_token -> SetKind(0);
443     do
444     {
445         if (Code::IsNewline(*cursor))  // Starting a new line?
446             lex -> line_location.Next() = cursor + 1 - lex -> InputBuffer();
447     } while (Code::IsSpace(*++cursor));
448 }
449 
450 
451 //
452 // scan_keyword(i):
453 // Scan an identifier of length I and determine if it is a keyword.
454 //
ScanKeyword0(const wchar_t *)455 int Scanner::ScanKeyword0(const wchar_t*)
456 {
457     return TK_Identifier;
458 }
459 
ScanKeyword2(const wchar_t * p1)460 int Scanner::ScanKeyword2(const wchar_t* p1)
461 {
462     if (p1[0] == U_d && p1[1] == U_o)
463         return TK_do;
464     if (p1[0] == U_i && p1[1] == U_f)
465         return TK_if;
466     return TK_Identifier;
467 }
468 
ScanKeyword3(const wchar_t * p1)469 int Scanner::ScanKeyword3(const wchar_t* p1)
470 {
471     switch (*p1)
472     {
473     case U_f:
474         if (p1[1] == U_o && p1[2] == U_r)
475             return TK_for;
476         break;
477     case U_i:
478         if (p1[1] == U_n && p1[2] == U_t)
479             return TK_int;
480         break;
481     case U_n:
482         if (p1[1] == U_e && p1[2] == U_w)
483             return TK_new;
484         break;
485     case U_t:
486         if (p1[1] == U_r && p1[2] == U_y)
487             return TK_try;
488         break;
489     }
490     return TK_Identifier;
491 }
492 
ScanKeyword4(const wchar_t * p1)493 int Scanner::ScanKeyword4(const wchar_t* p1)
494 {
495     switch (*p1)
496     {
497     case U_b:
498         if (p1[1] == U_y && p1[2] == U_t && p1[3] == U_e)
499             return TK_byte;
500         break;
501     case U_c:
502         if (p1[1] == U_a && p1[2] == U_s && p1[3] == U_e)
503             return TK_case;
504         if (p1[1] == U_h && p1[2] == U_a && p1[3] == U_r)
505             return TK_char;
506         break;
507     case U_e:
508         if (p1[1] == U_l && p1[2] == U_s && p1[3] == U_e)
509             return TK_else;
510         if (p1[1] == U_n && p1[2] == U_u && p1[3] == U_m)
511             return TK_enum;
512         break;
513     case U_g:
514         if (p1[1] == U_o && p1[2] == U_t && p1[3] == U_o)
515             return TK_goto;
516         break;
517     case U_l:
518         if (p1[1] == U_o && p1[2] == U_n && p1[3] == U_g)
519             return TK_long;
520         break;
521     case U_n:
522         if (p1[1] == U_u && p1[2] == U_l && p1[3] == U_l)
523             return TK_null;
524         break;
525     case U_t:
526         if (p1[1] == U_h && p1[2] == U_i && p1[3] == U_s)
527             return TK_this;
528         if (p1[1] == U_r && p1[2] == U_u && p1[3] == U_e)
529             return TK_true;
530         break;
531     case U_v:
532         if (p1[1] == U_o && p1[2] == U_i && p1[3] == U_d)
533             return TK_void;
534         break;
535     }
536     return TK_Identifier;
537 }
538 
ScanKeyword5(const wchar_t * p1)539 int Scanner::ScanKeyword5(const wchar_t* p1)
540 {
541     switch (*p1)
542     {
543     case U_b:
544         if (p1[1] == U_r && p1[2] == U_e && p1[3] == U_a && p1[4] == U_k)
545             return TK_break;
546         break;
547     case U_c:
548         if (p1[1] == U_a && p1[2] == U_t && p1[3] == U_c && p1[4] == U_h)
549             return TK_catch;
550         if (p1[1] == U_l && p1[2] == U_a && p1[3] == U_s && p1[4] == U_s)
551             return TK_class;
552         if (p1[1] == U_o && p1[2] == U_n && p1[3] == U_s && p1[4] == U_t)
553             return TK_const;
554         break;
555     case U_f:
556         if (p1[1] == U_a && p1[2] == U_l && p1[3] == U_s && p1[4] == U_e)
557             return TK_false;
558         if (p1[1] == U_i && p1[2] == U_n && p1[3] == U_a && p1[4] == U_l)
559             return TK_final;
560         if (p1[1] == U_l && p1[2] == U_o && p1[3] == U_a && p1[4] == U_t)
561             return TK_float;
562         break;
563     case U_s:
564         if (p1[1] == U_h && p1[2] == U_o && p1[3] == U_r && p1[4] == U_t)
565             return TK_short;
566         if (p1[1] == U_u && p1[2] == U_p && p1[3] == U_e && p1[4] == U_r)
567             return TK_super;
568         break;
569     case U_t:
570         if (p1[1] == U_h && p1[2] == U_r && p1[3] == U_o && p1[4] == U_w)
571             return TK_throw;
572         break;
573     case U_w:
574         if (p1[1] == U_h && p1[2] == U_i && p1[3] == U_l && p1[4] == U_e)
575             return TK_while;
576         break;
577     }
578     return TK_Identifier;
579 }
580 
ScanKeyword6(const wchar_t * p1)581 int Scanner::ScanKeyword6(const wchar_t* p1)
582 {
583     switch (*p1)
584     {
585     case U_a:
586         if (p1[1] == U_s && p1[2] == U_s &&
587             p1[3] == U_e && p1[4] == U_r && p1[5] == U_t)
588             return TK_assert;
589         break;
590     case U_d:
591         if (p1[1] == U_o && p1[2] == U_u &&
592             p1[3] == U_b && p1[4] == U_l && p1[5] == U_e)
593             return TK_double;
594         break;
595     case U_i:
596         if (p1[1] == U_m && p1[2] == U_p &&
597             p1[3] == U_o && p1[4] == U_r && p1[5] == U_t)
598             return TK_import;
599         break;
600     case U_n:
601         if (p1[1] == U_a && p1[2] == U_t &&
602             p1[3] == U_i && p1[4] == U_v && p1[5] == U_e)
603             return TK_native;
604         break;
605     case U_p:
606         if (p1[1] == U_u && p1[2] == U_b &&
607             p1[3] == U_l && p1[4] == U_i && p1[5] == U_c)
608             return TK_public;
609         break;
610     case U_r:
611         if (p1[1] == U_e && p1[2] == U_t &&
612             p1[3] == U_u && p1[4] == U_r && p1[5] == U_n)
613             return TK_return;
614         break;
615     case U_s:
616         if (p1[1] == U_t && p1[2] == U_a &&
617             p1[3] == U_t && p1[4] == U_i && p1[5] == U_c)
618             return TK_static;
619         if (p1[1] == U_w && p1[2] == U_i &&
620             p1[3] == U_t && p1[4] == U_c && p1[5] == U_h)
621             return TK_switch;
622         break;
623     case U_t:
624         if (p1[1] == U_h && p1[2] == U_r &&
625             p1[3] == U_o && p1[4] == U_w && p1[5] == U_s)
626             return TK_throws;
627         break;
628     }
629     return TK_Identifier;
630 }
631 
ScanKeyword7(const wchar_t * p1)632 int Scanner::ScanKeyword7(const wchar_t* p1)
633 {
634     switch (*p1)
635     {
636     case U_b:
637         if (p1[1] == U_o && p1[2] == U_o && p1[3] == U_l &&
638             p1[4] == U_e && p1[5] == U_a && p1[6] == U_n)
639             return TK_boolean;
640         break;
641     case U_d:
642         if (p1[1] == U_e && p1[2] == U_f && p1[3] == U_a &&
643             p1[4] == U_u && p1[5] == U_l && p1[6] == U_t)
644             return TK_default;
645         break;
646     case U_e:
647         if (p1[1] == U_x && p1[2] == U_t && p1[3] == U_e &&
648             p1[4] == U_n && p1[5] == U_d && p1[6] == U_s)
649             return TK_extends;
650         break;
651     case U_f:
652         if (p1[1] == U_i && p1[2] == U_n && p1[3] == U_a &&
653             p1[4] == U_l && p1[5] == U_l && p1[6] == U_y)
654             return TK_finally;
655         break;
656     case U_p:
657         if (p1[1] == U_a && p1[2] == U_c && p1[3] == U_k &&
658             p1[4] == U_a && p1[5] == U_g && p1[6] == U_e)
659             return TK_package;
660         if (p1[1] == U_r && p1[2] == U_i && p1[3] == U_v &&
661             p1[4] == U_a && p1[5] == U_t && p1[6] == U_e)
662             return TK_private;
663         break;
664     }
665     return TK_Identifier;
666 }
667 
ScanKeyword8(const wchar_t * p1)668 int Scanner::ScanKeyword8(const wchar_t* p1)
669 {
670     switch (*p1)
671     {
672     case U_a:
673         if (p1[1] == U_b && p1[2] == U_s &&
674             p1[3] == U_t && p1[4] == U_r &&
675             p1[5] == U_a && p1[6] == U_c && p1[7] == U_t)
676             return TK_abstract;
677         break;
678     case U_c:
679         if (p1[1] == U_o && p1[2] == U_n &&
680             p1[3] == U_t && p1[4] == U_i &&
681             p1[5] == U_n && p1[6] == U_u && p1[7] == U_e)
682             return TK_continue;
683         break;
684     case U_s:
685         if (p1[1] == U_t && p1[2] == U_r &&
686             p1[3] == U_i && p1[4] == U_c &&
687             p1[5] == U_t && p1[6] == U_f && p1[7] == U_p)
688             return TK_strictfp;
689         break;
690     case U_v:
691         if (p1[1] == U_o && p1[2] == U_l &&
692             p1[3] == U_a && p1[4] == U_t &&
693             p1[5] == U_i && p1[6] == U_l && p1[7] == U_e)
694             return TK_volatile;
695         break;
696     }
697     return TK_Identifier;
698 }
699 
ScanKeyword9(const wchar_t * p1)700 int Scanner::ScanKeyword9(const wchar_t* p1)
701 {
702     if (p1[0] == U_i && p1[1] == U_n && p1[2] == U_t &&
703         p1[3] == U_e && p1[4] == U_r && p1[5] == U_f &&
704         p1[6] == U_a && p1[7] == U_c && p1[8] == U_e)
705         return TK_interface;
706     if (p1[0] == U_p && p1[1] == U_r && p1[2] == U_o &&
707         p1[3] == U_t && p1[4] == U_e && p1[5] == U_c &&
708         p1[6] == U_t && p1[7] == U_e && p1[8] == U_d)
709         return TK_protected;
710     if (p1[0] == U_t && p1[1] == U_r && p1[2] == U_a &&
711         p1[3] == U_n && p1[4] == U_s && p1[5] == U_i &&
712         p1[6] == U_e && p1[7] == U_n && p1[8] == U_t)
713         return TK_transient;
714     return TK_Identifier;
715 }
716 
ScanKeyword10(const wchar_t * p1)717 int Scanner::ScanKeyword10(const wchar_t* p1)
718 {
719     if (p1[0] == U_i)
720     {
721         if (p1[1] == U_m && p1[2] == U_p && p1[3] == U_l &&
722             p1[4] == U_e && p1[5] == U_m && p1[6] == U_e &&
723             p1[7] == U_n && p1[8] == U_t && p1[9] == U_s)
724             return TK_implements;
725         if (p1[1] == U_n && p1[2] == U_s && p1[3] == U_t &&
726             p1[4] == U_a && p1[5] == U_n && p1[6] == U_c &&
727             p1[7] == U_e && p1[8] == U_o && p1[9] == U_f)
728             return TK_instanceof;
729     }
730     return TK_Identifier;
731 }
732 
ScanKeyword12(const wchar_t * p1)733 int Scanner::ScanKeyword12(const wchar_t* p1)
734 {
735     if (p1[0] == U_s && p1[1] == U_y && p1[2] == U_n &&
736         p1[3] == U_c && p1[4] == U_h && p1[5] == U_r &&
737         p1[6] == U_o && p1[7] == U_n && p1[8] == U_i &&
738         p1[9] == U_z && p1[10] == U_e&& p1[11] == U_d)
739         return TK_synchronized;
740     return TK_Identifier;
741 }
742 
743 
744 //
745 // This procedure is invoked to scan a character literal. After the character
746 // literal has been scanned and classified, it is entered in the table with
747 // quotes intact.
748 //
ClassifyCharLiteral()749 void Scanner::ClassifyCharLiteral()
750 {
751     //
752     // We exploit the fact that the stream was doctored to end in
753     // U_CARRIAGE_RETURN, U_NULL; and that all internal CR were changed to LF.
754     //
755     current_token -> SetKind(TK_CharacterLiteral);
756     bool bad = false;
757     const wchar_t* ptr = cursor + 1;
758     switch (*ptr)
759     {
760     case U_SINGLE_QUOTE:
761         bad = true;
762         if (ptr[1] == U_SINGLE_QUOTE)
763         {
764             lex -> ReportMessage(StreamError::ESCAPE_EXPECTED,
765                                  current_token -> Location() + 1,
766                                  current_token -> Location() + 1);
767         }
768         else
769         {
770             lex -> ReportMessage(StreamError::EMPTY_CHARACTER_CONSTANT,
771                                  current_token -> Location(),
772                                  current_token -> Location() + 1);
773             ptr--;
774         }
775         break;
776     case U_BACKSLASH:
777         switch (*++ptr)
778         {
779         case U_b:
780         case U_f:
781         case U_n:
782         case U_r:
783         case U_t:
784         case U_DOUBLE_QUOTE:
785         case U_BACKSLASH:
786             break;
787         case U_SINGLE_QUOTE:
788             //
789             // The user may have forgotten to do '\\'.
790             //
791             if (ptr[1] != U_SINGLE_QUOTE)
792             {
793                 lex -> ReportMessage(StreamError::ESCAPE_EXPECTED,
794                                      current_token -> Location() + 1,
795                                      current_token -> Location() + 1);
796                 ptr--;
797                 bad = true;
798             }
799             break;
800         case U_0:
801         case U_1:
802         case U_2:
803         case U_3:
804             if (! Code::IsOctalDigit(ptr[1]))
805                 break;
806             ptr++;
807             // fallthrough
808         case U_4:
809         case U_5:
810         case U_6:
811         case U_7:
812             if (! Code::IsOctalDigit(ptr[1]))
813                 break;
814             ptr++;
815             break;
816         case U_CARRIAGE_RETURN:
817         case U_LINE_FEED:
818             ptr--;
819             // fallthrough
820         case U_u:
821             //
822             // By now, Unicode escapes have already been flattened; and it is
823             // illegal to try it twice (such as '\u005cu0000').
824             //
825         default:
826             lex -> ReportMessage(StreamError::INVALID_ESCAPE_SEQUENCE,
827                                  current_token -> Location() + 1,
828                                  current_token -> Location() + ptr - cursor);
829             bad = true;
830         }
831         break;
832     case U_CARRIAGE_RETURN:
833     case U_LINE_FEED:
834         // Since the source is broken into lines before tokens (JLS 3.2), this
835         // is an unterminated quote. We complain after this switch.
836         ptr--;
837         break;
838     default:
839         break;
840     }
841 
842     if (*++ptr != U_SINGLE_QUOTE)
843     {
844         //
845         // For generally better parsing and nicer error messages, see if the
846         // user tried to do a multiple character alpha-numeric string.
847         //
848         while (Code::IsAlnum(ptr))
849             ptr += Code::Codelength(ptr);
850         if (Code::IsNewline(*ptr))
851             ptr--;
852         if (! bad)
853         {
854             lex -> ReportMessage((*ptr != U_SINGLE_QUOTE || ptr == cursor
855                                   ? StreamError::UNTERMINATED_CHARACTER_CONSTANT
856                                   : StreamError::MULTI_CHARACTER_CONSTANT),
857                                  current_token -> Location(),
858                                  ptr - lex -> InputBuffer());
859         }
860     }
861 
862     ptr++;
863     current_token ->
864         SetSymbol(control.char_table.FindOrInsertLiteral(cursor,
865                                                          ptr - cursor));
866     cursor = ptr;
867 }
868 
869 
870 //
871 // This procedure is invoked to scan a string literal. After the string
872 // literal has been scanned and classified, it is entered in the table with
873 // quotes intact.
874 //
ClassifyStringLiteral()875 void Scanner::ClassifyStringLiteral()
876 {
877     //
878     // We exploit the fact that the stream was doctored to end in
879     // U_CARRIAGE_RETURN, U_NULL; and that all internal CR were changed to LF.
880     //
881     current_token -> SetKind(TK_StringLiteral);
882 
883     const wchar_t* ptr = cursor + 1;
884 
885     while (*ptr != U_DOUBLE_QUOTE && ! Code::IsNewline(*ptr))
886     {
887         if (*ptr++ == U_BACKSLASH)
888         {
889             switch (*ptr++)
890             {
891             case U_b:
892             case U_f:
893             case U_n:
894             case U_r:
895             case U_t:
896             case U_SINGLE_QUOTE:
897             case U_DOUBLE_QUOTE:
898             case U_BACKSLASH:
899             case U_0:
900             case U_1:
901             case U_2:
902             case U_3:
903             case U_4:
904             case U_5:
905             case U_6:
906             case U_7:
907                 break;
908             case U_u:
909                 //
910                 // By now, Unicode escapes have already been flattened; and it
911                 // is illegal to try it twice (such as "\u005cu0000").
912                 //
913             default:
914                 ptr--;
915                 lex -> ReportMessage(StreamError::INVALID_ESCAPE_SEQUENCE,
916                                      ptr - lex -> InputBuffer() - 1,
917                                      (ptr - lex -> InputBuffer() -
918                                       (Code::IsNewline(*ptr) ? 1 : 0)));
919             }
920         }
921     }
922 
923     if (Code::IsNewline(*ptr))
924     {
925         ptr--;
926         lex -> ReportMessage(StreamError::UNTERMINATED_STRING_CONSTANT,
927                              current_token -> Location(),
928                              ptr - lex -> InputBuffer());
929     }
930 
931     ptr++;
932     current_token ->
933         SetSymbol(control.string_table.FindOrInsertLiteral(cursor,
934                                                            ptr - cursor));
935     cursor = ptr;
936 }
937 
938 
939 //
940 // This procedure is invoked when CURSOR points to a letter which starts a
941 // keyword. It scans the identifier and checks whether or not it is a keyword.
942 // Note that the use of that check is a time-optimization that is not
943 // required for correctness.
944 //
ClassifyIdOrKeyword()945 void Scanner::ClassifyIdOrKeyword()
946 {
947     const wchar_t* ptr = cursor + 1;
948     bool has_dollar = false;
949 
950     while (Code::IsAlnum(ptr))
951     {
952         has_dollar = has_dollar || (*ptr == U_DS);
953         ptr += Code::Codelength(ptr);
954     }
955     int len = ptr - cursor;
956 
957     current_token -> SetKind(len < 13 ? (scan_keyword[len])(cursor)
958                              : TK_Identifier);
959 
960     if (current_token -> Kind() == TK_assert &&
961         control.option.source < JikesOption::SDK1_4)
962     {
963         lex -> ReportMessage(StreamError::DEPRECATED_IDENTIFIER_ASSERT,
964                              current_token -> Location(),
965                              current_token -> Location() + len - 1);
966         current_token -> SetKind(TK_Identifier);
967     }
968     if (current_token -> Kind() == TK_enum &&
969         control.option.source < JikesOption::SDK1_5)
970     {
971         lex -> ReportMessage(StreamError::DEPRECATED_IDENTIFIER_ENUM,
972                              current_token -> Location(),
973                              current_token -> Location() + len - 1);
974         current_token -> SetKind(TK_Identifier);
975     }
976     if (has_dollar && ! dollar_warning_given)
977     {
978         dollar_warning_given = true;
979         lex -> ReportMessage(StreamError::DOLLAR_IN_IDENTIFIER,
980                              current_token -> Location(),
981                              current_token -> Location() + len - 1);
982     }
983 
984     if (current_token -> Kind() == TK_Identifier)
985     {
986         current_token -> SetSymbol(control.FindOrInsertName(cursor, len));
987         for (unsigned i = 0; i < control.option.keyword_map.Length(); i++)
988         {
989             if (control.option.keyword_map[i].length == len &&
990                 wcsncmp(cursor, control.option.keyword_map[i].name, len) == 0)
991             {
992                 current_token -> SetKind(control.option.keyword_map[i].key);
993             }
994         }
995     }
996     else if (current_token -> Kind() == TK_class ||
997              current_token -> Kind() == TK_enum ||
998              current_token -> Kind() == TK_interface)
999     {
1000         //
1001         // If this is a top-level type keyword (not in braces), we keep track
1002         // of it by adding it to a list.
1003         //
1004         if (brace_stack.Size() == 0)
1005             lex -> type_index.Next() = current_token_index;
1006     }
1007     else if (current_token -> Kind() == TK_package && ! lex -> package)
1008         lex -> package = current_token_index;
1009     cursor = ptr;
1010 }
1011 
1012 //
1013 // This procedure is invoked when CURSOR points to an identifier start
1014 // which cannot start a keyword.
1015 //
ClassifyId()1016 void Scanner::ClassifyId()
1017 {
1018     const wchar_t* ptr = cursor;
1019     bool has_dollar = false;
1020 
1021     while (Code::IsAlnum(ptr))
1022     {
1023         has_dollar = has_dollar || (*ptr == U_DS);
1024         ptr += Code::Codelength(ptr);
1025     }
1026 
1027     int len = ptr - cursor;
1028 
1029     if (has_dollar && ! dollar_warning_given)
1030     {
1031         dollar_warning_given = true;
1032         lex -> ReportMessage(StreamError::DOLLAR_IN_IDENTIFIER,
1033                              current_token -> Location(),
1034                              current_token -> Location() + len - 1);
1035     }
1036 
1037     current_token -> SetKind(TK_Identifier);
1038     current_token -> SetSymbol(control.FindOrInsertName(cursor, len));
1039 
1040     for (unsigned i = 0; i < control.option.keyword_map.Length(); i++)
1041     {
1042         if (control.option.keyword_map[i].length == len &&
1043             wcsncmp(cursor, control.option.keyword_map[i].name, len) == 0)
1044         {
1045             current_token -> SetKind(control.option.keyword_map[i].key);
1046         }
1047     }
1048     cursor = ptr;
1049 }
1050 
1051 
1052 //
1053 // This procedure is invoked when CURSOR points directly to '0' - '9' or '.'.
1054 // Such a token is classified as a numeric literal: TK_LongLiteral,
1055 // TK_IntegerLiteral, TK_DoubleLiteral, or TK_FloatLiteral.
1056 //
void Scanner::ClassifyNumericLiteral()
{
    //
    // Overview: starting at CURSOR, this routine scans one numeric literal,
    // sets the kind of the current token, enters the literal text (suffix
    // included) in the matching int/long/float/double literal table, and
    // leaves CURSOR just past the literal. Malformed literals are reported
    // through lex -> ReportMessage but are still consumed as one token.
    //
    // PTR is the scan position. TMP is a scratch pointer used only to work
    // out the last location passed to ReportMessage.
    //

    //
    // Scan the initial sequence of digits, if any.
    //
    const wchar_t* ptr = cursor - 1;
    const wchar_t* tmp;
    while (Code::IsDecimalDigit(*++ptr));

    //
    // We now take an initial crack at classifying the numeric token.
    // We have three initial cases to consider, and stop parsing before any
    // exponent or type suffix:
    //
    // 1) If the initial (perhaps empty) sequence of digits is followed by
    //    '.', we have a floating-point constant. We scan the sequence of
    //    digits (if any) that follows the period. When '.' starts the number,
    //    we already checked that a digit follows before calling this method.
    // 2) If the initial sequence is "0x" or "0X", we have a hexadecimal
    //    literal, either integer or floating point.  To be floating point,
    //    the literal must contain an exponent with 'p' or 'P'; otherwise we
    //    parse the largest int literal.  There must be at least one hex
    //    digit after the prefix, and before the (possible) exponent.
    // 3) Otherwise, we have an integer literal. If the initial (non-empty)
    //    sequence of digits start with "0", we have an octal constant, and
    //    for nicer parsing, we simply complain about non-octal digits rather
    //    than strictly breaking 019 into the two tokens 01 and 9 (because
    //    it would be a guaranteed syntax error later on). However, it is
    //    still possible that 019 starts a valid floating point literal, which
    //    is checked later.
    //
    if (*ptr == U_DOT)
    {
        current_token -> SetKind(TK_DoubleLiteral);
        while (Code::IsDecimalDigit(*++ptr));
    }
    else
    {
        current_token -> SetKind(TK_IntegerLiteral);
        if (*cursor == U_0)
        {
            if (*ptr == U_x || *ptr == U_X)
            {
                // Don't use isxdigit, it's not platform independent.
                while (Code::IsHexDigit(*++ptr)); // Skip the 'x'.
                if (*ptr == U_DOT)
                {
                    // Hex digits followed by '.': hex floating point.
                    current_token -> SetKind(TK_DoubleLiteral);
                    while (Code::IsHexDigit(*++ptr));
                    if (*ptr != U_p && *ptr != U_P)
                    {
                        // Missing required 'p' exponent.
                        lex -> ReportMessage(StreamError::INVALID_FLOATING_HEX_EXPONENT,
                                             current_token -> Location(),
                                             ptr - 1 - lex -> InputBuffer());
                    }
                    else if (ptr == cursor + 3)
                    {
                        // Missing hex digits before exponent, with '.'.
                        // Here the token so far is exactly "0x." and PTR is
                        // on the exponent letter. TMP looks ahead over the
                        // exponent (and a d/D/f/F suffix) only to find the
                        // end of the error range; PTR is not moved.
                        //
                        // NOTE(review): this look-ahead skips exponent
                        // digits with IsHexDigit, whereas the exponent scan
                        // further down uses IsDecimalDigit, so the reported
                        // range may extend past the token -- confirm whether
                        // that is intended.
                        tmp = ptr;
                        if (Code::IsSign(*++tmp)) // Skip the exponent letter.
                            tmp++; // Skip the '+' or '-'.
                        if (Code::IsHexDigit(*tmp))
                            while (Code::IsHexDigit(*++tmp));
                        if (*tmp != U_d && *tmp != U_D &&
                            *tmp != U_f && *tmp != U_F)
                        {
                            tmp--;
                        }
                        lex -> ReportMessage(StreamError::INVALID_FLOATING_HEX_MANTISSA,
                                             current_token -> Location(),
                                             tmp - lex -> InputBuffer());
                    }
                }
                else if (ptr == cursor + 2) // Found a runt "0x".
                {
                    if (*ptr == U_p || *ptr == U_P)
                    {
                        // Missing hex digits before exponent, without '.'.
                        // Same look-ahead as in the "0x." case above, and
                        // the same NOTE(review) applies.
                        tmp = ptr;
                        if (Code::IsSign(*++tmp)) // Skip the exponent letter.
                            tmp++; // Skip the '+' or '-'.
                        if (Code::IsHexDigit(*tmp))
                            while (Code::IsHexDigit(*++tmp));
                        if (*tmp != U_d && *tmp != U_D &&
                            *tmp != U_f && *tmp != U_F)
                        {
                            tmp--;
                        }
                        lex -> ReportMessage(StreamError::INVALID_FLOATING_HEX_MANTISSA,
                                             current_token -> Location(),
                                             tmp - lex -> InputBuffer());
                    }
                    else
                    {
                        // Bare "0x": the error range includes a following
                        // 'l' or 'L' suffix if there is one.
                        tmp = (*ptr == U_l || *ptr == U_L) ? ptr : ptr - 1;
                        lex -> ReportMessage(StreamError::INVALID_HEX_CONSTANT,
                                             current_token -> Location(),
                                             tmp - lex -> InputBuffer());
                    }
                }
            }
            // Octal prefix. See if it will become floating point later.
            else if (*ptr != U_e && *ptr != U_E &&
                     *ptr != U_d && *ptr != U_D &&
                     *ptr != U_f && *ptr != U_F)
            {
                // TMP stops at the first non-octal digit; if that is not
                // where the decimal digit run ended (PTR), the literal
                // contains an 8 or a 9.
                tmp = cursor;
                while (Code::IsOctalDigit(*++tmp)); // Skip leading '0'.
                if (tmp != ptr)
                {
                    tmp = (*ptr == U_l || *ptr == U_L) ? ptr : ptr - 1;
                    lex -> ReportMessage(StreamError::INVALID_OCTAL_CONSTANT,
                                         current_token -> Location(),
                                         tmp - lex -> InputBuffer());
                }
            }
        }
    }

    //
    // If the initial numeric token is followed by an exponent, then it is a
    // floating-point constant. If that's the case, the literal is
    // reclassified and the exponent is scanned. Note that as 'E' and 'e' are
    // legitimate hexadecimal digits, we don't have to worry about a
    // hexadecimal constant being used as the prefix of a floating-point
    // constant. A hex floating point requires a hex prefix. An exponent
    // overrides an octal literal, as do the float and double suffixes. We
    // stop parsing before any type suffix.
    //
    // For example, 0x123e12 is tokenized as a single hexadecimal number, while
    // the string 0x123e+12 gets broken down as the hex number 0x123e, the
    // operator '+', and the decimal constant 12. Meanwhile, 019e+0 and 019d
    // are both tokenized as a single floating-point constant 19.0. Note that
    // 1e should strictly be parsed as the int 1 followed by identifier e;
    // 1e+ should be the int 1, identifier e, and operator +; and 1p0d should
    // be the int 1 and identifier p0d; however all these cases are guaranteed
    // to be syntax errors later on, so we nicely consume them as a single
    // invalid floating point token now.
    //
    if (*ptr == U_e || *ptr == U_E || *ptr == U_p || *ptr == U_P)
    {
        current_token -> SetKind(TK_DoubleLiteral);
        if ((*ptr == U_p || *ptr == U_P) &&
            ! (cursor[1] == U_x || cursor[1] == U_X))
        {
            // A 'p' exponent without a "0x" prefix. TMP looks ahead over
            // the exponent (and a d/D/f/F suffix) to find the end of the
            // error range; PTR itself is advanced by the code below.
            tmp = ptr;
            if (Code::IsSign(*++tmp)) // Skip the exponent letter.
                tmp++; // Skip the '+' or '-'.
            if (Code::IsDecimalDigit(*tmp))
                while (Code::IsDecimalDigit(*++tmp));
            if (*tmp != U_d && *tmp != U_D && *tmp != U_f && *tmp != U_F)
                tmp--;
            lex -> ReportMessage(StreamError::INVALID_FLOATING_HEX_PREFIX,
                                 current_token -> Location(),
                                 tmp - lex -> InputBuffer());
        }
        if (Code::IsSign(*++ptr)) // Skip the exponent letter.
            ptr++; // Skip the '+' or '-'.
        if (Code::IsDecimalDigit(*ptr))
            while (Code::IsDecimalDigit(*++ptr));
        else
        {
            // No digits after the exponent letter (and optional sign).
            tmp = (*ptr == U_d || *ptr == U_D || *ptr == U_f || *ptr == U_F)
                ? ptr : ptr - 1;
            lex -> ReportMessage(StreamError::INVALID_FLOATING_EXPONENT,
                                 current_token -> Location(),
                                 tmp - lex -> InputBuffer());
        }
    }

    //
    // A numeric constant may be suffixed by a letter that further qualifies
    // what kind of a constant it is. We check for these suffixes here.
    // In every branch LEN is the length of the literal including the suffix
    // (if any), and the literal text is entered in the table for its kind.
    //
    int len;
    if (*ptr == U_f || *ptr == U_F)
    {
        len = ++ptr - cursor;
        current_token ->
            SetSymbol(control.float_table.FindOrInsertLiteral(cursor, len));
        current_token -> SetKind(TK_FloatLiteral);
    }
    else if (*ptr == U_d || *ptr == U_D)
    {
        len = ++ptr - cursor;
        current_token ->
            SetSymbol(control.double_table.FindOrInsertLiteral(cursor, len));
        current_token -> SetKind(TK_DoubleLiteral);
    }
    else if (current_token -> Kind() == TK_IntegerLiteral)
    {
        if (*ptr == U_l || *ptr == U_L)
        {
            // In pedantic mode, a lowercase 'l' suffix draws a message.
            if (*ptr == U_l && control.option.pedantic)
            {
                lex -> ReportMessage(StreamError::FAVOR_CAPITAL_L_SUFFIX,
                                     current_token -> Location(),
                                     ptr - lex -> InputBuffer());
            }

            len = ++ptr - cursor;
            current_token ->
                SetSymbol(control.long_table.FindOrInsertLiteral(cursor, len));
            current_token -> SetKind(TK_LongLiteral);
        }
        else
        {
            len = ptr - cursor;
            current_token ->
                SetSymbol(control.int_table.FindOrInsertLiteral(cursor, len));
        }
    }
    else
    {
        // No suffix on a floating-point literal: it is a double.
        assert(current_token -> Kind() == TK_DoubleLiteral);
        len = ptr - cursor;
        current_token ->
            SetSymbol(control.double_table.FindOrInsertLiteral(cursor, len));
    }
    cursor = ptr;
}
1279 
1280 
ClassifyColon()1281 void Scanner::ClassifyColon()
1282 {
1283     current_token -> SetKind(TK_COLON);
1284     cursor++;
1285 }
1286 
1287 
ClassifyPlus()1288 void Scanner::ClassifyPlus()
1289 {
1290     cursor++;
1291     if (*cursor == U_PLUS)
1292     {
1293         cursor++;
1294         current_token -> SetKind(TK_PLUS_PLUS);
1295     }
1296     else if (*cursor == U_EQUAL)
1297     {
1298         cursor++;
1299         current_token -> SetKind(TK_PLUS_EQUAL);
1300     }
1301     else current_token -> SetKind(TK_PLUS);
1302 }
1303 
1304 
ClassifyMinus()1305 void Scanner::ClassifyMinus()
1306 {
1307     cursor++;
1308     if (*cursor == U_MINUS)
1309     {
1310         cursor++;
1311         current_token -> SetKind(TK_MINUS_MINUS);
1312     }
1313     else if (*cursor == U_EQUAL)
1314     {
1315         cursor++;
1316         current_token -> SetKind(TK_MINUS_EQUAL);
1317     }
1318     else current_token -> SetKind(TK_MINUS);
1319 }
1320 
1321 
ClassifyStar()1322 void Scanner::ClassifyStar()
1323 {
1324     cursor++;
1325     if (*cursor == U_EQUAL)
1326     {
1327         cursor++;
1328         current_token -> SetKind(TK_MULTIPLY_EQUAL);
1329     }
1330     else current_token -> SetKind(TK_MULTIPLY);
1331 }
1332 
1333 
ClassifySlash()1334 void Scanner::ClassifySlash()
1335 {
1336     cursor++;
1337     if (*cursor == U_EQUAL)
1338     {
1339         cursor++;
1340         current_token -> SetKind(TK_DIVIDE_EQUAL);
1341     }
1342     else if (*cursor == U_SLASH)
1343         ScanSlashComment();
1344     else if (*cursor == U_STAR)
1345         ScanStarComment();
1346     else current_token -> SetKind(TK_DIVIDE);
1347 }
1348 
1349 
ClassifyLess()1350 void Scanner::ClassifyLess()
1351 {
1352     cursor++;
1353     if (*cursor == U_EQUAL)
1354     {
1355         cursor++;
1356         current_token -> SetKind(TK_LESS_EQUAL);
1357     }
1358     else if (*cursor == U_LESS)
1359     {
1360         cursor++;
1361         if (*cursor == U_EQUAL)
1362         {
1363             cursor++;
1364             current_token -> SetKind(TK_LEFT_SHIFT_EQUAL);
1365         }
1366         else current_token -> SetKind(TK_LEFT_SHIFT);
1367     }
1368     else current_token -> SetKind(TK_LESS);
1369 }
1370 
1371 
ClassifyGreater()1372 void Scanner::ClassifyGreater()
1373 {
1374     cursor++;
1375     current_token -> SetKind(TK_GREATER);
1376     if (*cursor == U_EQUAL)
1377     {
1378         cursor++;
1379         current_token -> SetKind(TK_GREATER_EQUAL);
1380     }
1381     else if (*cursor == U_GREATER)
1382     {
1383         cursor++;
1384         if (*cursor == U_EQUAL)
1385         {
1386             cursor++;
1387             current_token -> SetKind(TK_RIGHT_SHIFT_EQUAL);
1388         }
1389         else if (*cursor == U_GREATER)
1390         {
1391             cursor++;
1392             if (*cursor == U_EQUAL)
1393             {
1394                 cursor++;
1395                 current_token -> SetKind(TK_UNSIGNED_RIGHT_SHIFT_EQUAL);
1396             }
1397             else current_token -> SetKind(TK_UNSIGNED_RIGHT_SHIFT);
1398         }
1399         else current_token -> SetKind(TK_RIGHT_SHIFT);
1400     }
1401 }
1402 
1403 
ClassifyAnd()1404 void Scanner::ClassifyAnd()
1405 {
1406     cursor++;
1407     if (*cursor == U_AMPERSAND)
1408     {
1409         cursor++;
1410         current_token -> SetKind(TK_AND_AND);
1411     }
1412     else if (*cursor == U_EQUAL)
1413     {
1414         cursor++;
1415         current_token -> SetKind(TK_AND_EQUAL);
1416     }
1417     else current_token -> SetKind(TK_AND);
1418 }
1419 
1420 
ClassifyOr()1421 void Scanner::ClassifyOr()
1422 {
1423     cursor++;
1424     if (*cursor == U_BAR)
1425     {
1426         cursor++;
1427         current_token -> SetKind(TK_OR_OR);
1428     }
1429     else if (*cursor == U_EQUAL)
1430     {
1431         cursor++;
1432         current_token -> SetKind(TK_OR_EQUAL);
1433     }
1434     else current_token -> SetKind(TK_OR);
1435 }
1436 
1437 
ClassifyXor()1438 void Scanner::ClassifyXor()
1439 {
1440     cursor++;
1441     if (*cursor == U_EQUAL)
1442     {
1443         cursor++;
1444         current_token -> SetKind(TK_XOR_EQUAL);
1445     }
1446     else current_token -> SetKind(TK_XOR);
1447 }
1448 
1449 
ClassifyNot()1450 void Scanner::ClassifyNot()
1451 {
1452     cursor++;
1453     if (*cursor == U_EQUAL)
1454     {
1455         cursor++;
1456         current_token -> SetKind(TK_NOT_EQUAL);
1457     }
1458     else current_token -> SetKind(TK_NOT);
1459 }
1460 
1461 
ClassifyEqual()1462 void Scanner::ClassifyEqual()
1463 {
1464     cursor++;
1465     if (*cursor == U_EQUAL)
1466     {
1467         cursor++;
1468         current_token -> SetKind(TK_EQUAL_EQUAL);
1469     }
1470     else current_token -> SetKind(TK_EQUAL);
1471 }
1472 
1473 
ClassifyMod()1474 void Scanner::ClassifyMod()
1475 {
1476     cursor++;
1477     if (*cursor == U_EQUAL)
1478     {
1479         cursor++;
1480         current_token -> SetKind(TK_REMAINDER_EQUAL);
1481     }
1482     else current_token -> SetKind(TK_REMAINDER);
1483 }
1484 
1485 
ClassifyPeriod()1486 void Scanner::ClassifyPeriod()
1487 {
1488     if (Code::IsDecimalDigit(cursor[1])) // Is '.' followed by digit?
1489         ClassifyNumericLiteral();
1490     else if (cursor[1] == U_DOT && cursor[2] == U_DOT)
1491     {
1492         // Added for Java 1.5, varargs, by JSR 201.
1493         current_token -> SetKind(TK_ELLIPSIS);
1494         cursor += 3;
1495     }
1496     else
1497     {
1498         current_token -> SetKind(TK_DOT);
1499         cursor++;
1500     }
1501 }
1502 
1503 
ClassifySemicolon()1504 void Scanner::ClassifySemicolon()
1505 {
1506     current_token -> SetKind(TK_SEMICOLON);
1507     cursor++;
1508 }
1509 
1510 
ClassifyComma()1511 void Scanner::ClassifyComma()
1512 {
1513     current_token -> SetKind(TK_COMMA);
1514     cursor++;
1515 }
1516 
1517 
ClassifyLbrace()1518 void Scanner::ClassifyLbrace()
1519 {
1520     //
1521     // Instead of setting the symbol for a left brace, we keep track of it.
1522     // When we encounter its matching right brace, we use the symbol field
1523     // to identify its counterpart.
1524     //
1525     brace_stack.Push(current_token_index);
1526     current_token -> SetKind(TK_LBRACE);
1527     cursor++;
1528 }
1529 
1530 
ClassifyRbrace()1531 void Scanner::ClassifyRbrace()
1532 {
1533     //
1534     // When a left brace in encountered, it is pushed into the brace_stack.
1535     // When its matching right brace in encountered, we pop the left brace
1536     // and make it point to its matching right brace.
1537     //
1538     TokenIndex left_brace = brace_stack.Top();
1539     if (left_brace) // This right brace is matched by a left one
1540     {
1541         lex -> token_stream[left_brace].SetRightBrace(current_token_index);
1542         brace_stack.Pop();
1543     }
1544     current_token -> SetKind(TK_RBRACE);
1545     cursor++;
1546 }
1547 
1548 
ClassifyLparen()1549 void Scanner::ClassifyLparen()
1550 {
1551     current_token -> SetKind(TK_LPAREN);
1552     cursor++;
1553 }
1554 
1555 
ClassifyRparen()1556 void Scanner::ClassifyRparen()
1557 {
1558     current_token -> SetKind(TK_RPAREN);
1559     cursor++;
1560 }
1561 
1562 
ClassifyLbracket()1563 void Scanner::ClassifyLbracket()
1564 {
1565     current_token -> SetKind(TK_LBRACKET);
1566     cursor++;
1567 }
1568 
1569 
ClassifyRbracket()1570 void Scanner::ClassifyRbracket()
1571 {
1572     current_token -> SetKind(TK_RBRACKET);
1573     cursor++;
1574 }
1575 
1576 
ClassifyComplement()1577 void Scanner::ClassifyComplement()
1578 {
1579     current_token -> SetKind(TK_TWIDDLE);
1580     cursor++;
1581 }
1582 
1583 
ClassifyAt()1584 void Scanner::ClassifyAt()
1585 {
1586     // Added for Java 1.5, attributes, by JSR 175.
1587     current_token -> SetKind(TK_AT);
1588     cursor++;
1589 }
1590 
1591 
ClassifyQuestion()1592 void Scanner::ClassifyQuestion()
1593 {
1594     current_token -> SetKind(TK_QUESTION);
1595     cursor++;
1596 }
1597 
1598 
ClassifyNonAsciiUnicode()1599 void Scanner::ClassifyNonAsciiUnicode()
1600 {
1601     if (Code::IsAlpha(cursor)) // Some kind of non-ascii unicode letter
1602         ClassifyId();
1603     else ClassifyBadToken();
1604 }
1605 
1606 
1607 //
1608 // Anything that doesn't fit above. Note that the lex stream already stripped
1609 // any concluding ctrl-z, so we don't need to worry about seeing that as a
1610 // bad token. For fewer error messages, we scan until the next valid
1611 // character, issue the error message, then treat this token as whitespace.
1612 //
ClassifyBadToken()1613 void Scanner::ClassifyBadToken()
1614 {
1615     while (++cursor < input_buffer_tail)
1616     {
1617         if ((*cursor < 128 &&
1618              classify_token[*cursor] != &Scanner::ClassifyBadToken) ||
1619             Code::IsAlpha(cursor))
1620         {
1621             break;
1622         }
1623     }
1624     current_token -> SetKind(0);
1625     lex -> ReportMessage(StreamError::BAD_TOKEN, current_token -> Location(),
1626                          cursor - lex -> InputBuffer() - 1);
1627 }
1628 
1629 #ifdef HAVE_JIKES_NAMESPACE
1630 } // Close namespace Jikes block
1631 #endif
1632