1 // $Id: stream.h,v 1.53 2004/03/25 13:32:28 ericb Exp $ -*- c++ -*-
2 //
3 // This software is subject to the terms of the IBM Jikes Compiler
4 // License Agreement available at the following URL:
5 // http://ibm.com/developerworks/opensource/jikes.
6 // Copyright (C) 1996, 2004 IBM Corporation and others.  All Rights Reserved.
7 // You must accept the terms of that agreement to use this software.
8 //
9 
10 #ifndef stream_INCLUDED
11 #define stream_INCLUDED
12 
13 #include "platform.h"
14 #include "tuple.h"
15 #include "jikesapi.h"
16 
17 #ifdef HAVE_JIKES_NAMESPACE
18 namespace Jikes { // Open namespace Jikes block
19 #endif
20 
21 class Control;
22 class Input_info;
23 class Scanner;
24 class Symbol;
25 class FileSymbol;
26 class ZipFile;
27 class LexStream;
28 class ErrorString;
29 
30 class StreamError : public JikesError
31 {
32     friend class LexStream;
33 
34 public:
35 
36     StreamError();
37 
38     virtual const wchar_t* getErrorMessage();
39     virtual const wchar_t* getErrorReport();
40 
41     virtual JikesErrorSeverity getSeverity();
42     virtual const char* getFileName();
43 
getLeftLineNo()44     virtual int getLeftLineNo() { return left_line_no; }
getLeftColumnNo()45     virtual int getLeftColumnNo() { return left_column_no; }
getRightLineNo()46     virtual int getRightLineNo() { return right_line_no; }
getRightColumnNo()47     virtual int getRightColumnNo() { return right_column_no; }
48 
49     enum StreamErrorKind
50     {
51         BAD_TOKEN,
52         EMPTY_CHARACTER_CONSTANT,
53         UNTERMINATED_CHARACTER_CONSTANT,
54         MULTI_CHARACTER_CONSTANT,
55         ESCAPE_EXPECTED,
56         UNTERMINATED_COMMENT,
57         UNTERMINATED_STRING_CONSTANT,
58         INVALID_HEX_CONSTANT,
59         INVALID_FLOATING_HEX_EXPONENT,
60         INVALID_FLOATING_HEX_MANTISSA,
61         INVALID_FLOATING_HEX_PREFIX,
62         INVALID_OCTAL_CONSTANT,
63         INVALID_FLOATING_EXPONENT,
64         INVALID_UNICODE_ESCAPE,
65         INVALID_ESCAPE_SEQUENCE,
66         LAST_CHARACTER_NOT_NEWLINE, // pedantic only
67         DEPRECATED_IDENTIFIER_ASSERT, // from here, these are warnings only
68         DEPRECATED_IDENTIFIER_ENUM,
69         DOLLAR_IN_IDENTIFIER,
70         FAVOR_CAPITAL_L_SUFFIX
71     };
72 
73 private:
74 
75     unsigned start_location;
76     unsigned end_location;
77     StreamErrorKind kind;
78 
79     static bool emacs_style_report;
80     LexStream* lex_stream;
81 
82     int left_line_no;
83     int left_column_no;
84     int right_line_no;
85     int right_column_no;
86 
87     const wchar_t* regularErrorString();
88     const wchar_t* emacsErrorString();
89 
90     bool initialized;
91 
92     void Initialize(StreamErrorKind, unsigned, unsigned, LexStream*);
93 };
94 
95 
96 //
97 // The stream class encapsulates details related to reading
98 // a stream of possibly encoded data from the file system.
99 //
class Stream
{
public:

    Stream();
    ~Stream();

    //
    // Release the decoded input buffer (if any) and clear the pointer, so
    // that calling this again is harmless.
    //
    void DestroyInput()
    {
        delete [] input_buffer;
        input_buffer = NULL;
    }

    inline const wchar_t* InputBuffer() { return input_buffer; }
    inline unsigned InputBufferLength() { return input_buffer_length; }

    //
    // Allocate a fresh input buffer with room for "size" characters plus
    // three extra slots, make it the current buffer and return it. Note that
    // input_buffer_length is not updated here.
    //
    inline wchar_t* AllocateInputBuffer(unsigned size)
    {
        //
        // Release a buffer that is still owned, so that allocating twice
        // without an intervening DestroyInput() does not leak it. The
        // pointer is cleared first so that it never dangles if the
        // allocation below throws.
        //
        delete [] input_buffer;
        input_buffer = NULL;

        // +3 for leading \n, trailing \r\0
        input_buffer = new wchar_t[size + 3];
        return input_buffer;
    }

#if defined(HAVE_ENCODING)
    static bool IsSupportedEncoding(char* encoding);
    bool SetEncoding(char* encoding);
#endif

protected:

    wchar_t* input_buffer;
    unsigned input_buffer_length;

    const char* source_ptr;    // Start of the data still to be decoded
    const char* source_tail;   // Last byte (inclusive) of the data to be decoded
    const char* data_buffer;   // The data to be decoded

    bool error_decode_next_character;

//private: // FIXME : Make vars private once extracted from LexStream!

#ifdef HAVE_ENCODING

#if defined(HAVE_LIBICU_UC)
    UConverter* _decoder;
#elif defined(JIKES_ICONV_ENCODING)
    iconv_t _decoder;
#endif

    void DestroyEncoding();

    // Read the next wchar_t from the stream.
    // If an error occurs the ErrorDecodeNextCharacter
    // method will return true on the next call.

    wchar_t DecodeNextCharacter();

    //
    // Report whether a decoding error is pending. The pending flag is
    // cleared by this call, so a given error is reported only once.
    //
    inline bool ErrorDecodeNextCharacter()
    {
        bool result = error_decode_next_character;
        if (result)
            error_decode_next_character = false;
        return result;
    }

    // Returns true if an encoding has been set

    inline bool HaveDecoder()
    {
#if defined(HAVE_LIBICU_UC)
        return _decoder != NULL;
#elif defined(JIKES_ICONV_ENCODING)
        return _decoder != (iconv_t) -1;
#else
        // No decoder backend was configured, so there is nothing to test.
        // Without this branch control would flow off the end of a non-void
        // function.
        return false;
#endif
    }

#endif // HAVE_ENCODING

    //
    // Point the decoder at "size" bytes starting at "buffer". source_tail is
    // inclusive: it addresses the last byte, which is why HasMoreData uses
    // "<=".
    //
    inline void InitializeDataBuffer(const char* buffer, long size)
    {
        data_buffer = buffer;
        source_ptr = data_buffer;
        source_tail = data_buffer + size - 1;
    }

    // True while source_ptr has not moved past the last byte.
    inline bool HasMoreData()
    {
        return source_ptr <= source_tail;
    }
};
189 
190 
191 //
192 // LexStream holds a stream of tokens generated from an input and
193 // provides methods to retrieve information from the stream.
194 //
195 class LexStream : public Stream
196 {
197     friend class StreamError;
198 
199 public:
200     typedef unsigned CommentIndex;
201     enum { LEX_INFINITY = INT_MAX }; // the largest value for TokenIndex
202 
203     FileSymbol* file_symbol;
204 
Next(TokenIndex i)205     inline TokenIndex Next(TokenIndex i)
206     {
207         return ++i < token_stream.Length() ? i : token_stream.Length() - 1;
208     }
Previous(TokenIndex i)209     inline TokenIndex Previous(TokenIndex i) { return i <= 0 ? 0 : i - 1; }
Peek()210     inline TokenIndex Peek() { return Next(index); }
211     inline void Reset(TokenIndex i = 1) { index = Previous(i); }
Gettoken()212     inline TokenIndex Gettoken() { return index = Next(index); }
Gettoken(TokenIndex end_token)213     inline TokenIndex Gettoken(TokenIndex end_token)
214     {
215         return index = (index < end_token ? Next(index)
216                         : token_stream.Length() - 1);
217     }
218 
Kind(TokenIndex i)219     inline unsigned Kind(TokenIndex i)
220     {
221         return tokens[i >= NumTokens() ? NumTokens() - 1 : i].Kind();
222     }
223 
Location(TokenIndex i)224     inline unsigned Location(TokenIndex i)
225     {
226         assert(i < NumTokens());
227         return tokens[i].Location();
228     }
229 
Line(TokenIndex i)230     inline unsigned Line(TokenIndex i)
231     {
232         return FindLine(tokens[i].Location());
233     }
234 
Column(TokenIndex i)235     inline unsigned Column(TokenIndex i)
236     {
237         // FindColumn grabs the right edge of an expanded character.
238         return input_buffer ? FindColumn(tokens[i].Location() - 1) + 1 : 0;
239     }
240     unsigned RightColumn(TokenIndex i);
241 
AfterEol(TokenIndex i)242     inline bool AfterEol(TokenIndex i)
243     {
244         return i < 1 ? true : Line(i - 1) < Line(i);
245     }
246 
IsDeprecated(TokenIndex i)247     inline bool IsDeprecated(TokenIndex i) { return tokens[i].Deprecated(); }
248 
MatchingBrace(TokenIndex i)249     inline TokenIndex MatchingBrace(TokenIndex i)
250     {
251         return tokens[i].additional_info.right_brace;
252     }
253 
254     const wchar_t* NameString(TokenIndex i);
255     unsigned NameStringLength(TokenIndex i);
256 
257     // TODO: Rename these methods to differ from the class name?
258     class LiteralSymbol* LiteralSymbol(TokenIndex);
259     class NameSymbol* NameSymbol(TokenIndex);
260 
261     char* FileName();
262     unsigned FileNameLength();
263 
264     unsigned LineLength(unsigned line_no);
LineStart(unsigned line_no)265     inline unsigned LineStart(unsigned line_no)
266     {
267         return locations[line_no];
268     }
LineEnd(unsigned line_no)269     inline unsigned LineEnd(unsigned line_no)
270     {
271         return locations[line_no + 1] - 1;
272     }
273 
274     unsigned LineSegmentLength(TokenIndex i);
275 
276     //
277     // For a sequence of tokens in a given range find out how many large
278     // characters they contain and compute the appropriate offset.
279     //
WcharOffset(TokenIndex start,TokenIndex end)280     inline unsigned WcharOffset(TokenIndex start, TokenIndex end)
281     {
282         unsigned offset = 0;
283         for (TokenIndex i = start; i <= end; i++)
284         {
285             for (const wchar_t* str = NameString(i); *str; str++)
286             {
287                 if (*str > 0xff)
288                     offset += 5;
289             }
290         }
291 
292         return offset;
293     }
294 
295     //
296     // When only an end token is supplied, the start token is assume to be the
297     // first one on the same line.
298     //
WcharOffset(TokenIndex end)299     inline unsigned WcharOffset(TokenIndex end)
300     {
301         TokenIndex start = end;
302         unsigned the_line = Line(end);
303         while (Line(--start) == the_line);
304         return WcharOffset(start + 1, end);
305     }
306 
307     //
308     // Used for outputting sections of source code in error messages.
309     //
310     void OutputLine(unsigned, ErrorString&);
311     void OutputSource(JikesError*, ErrorString&);
312 
313     CommentIndex FirstComment(TokenIndex);
314 
NumTypes()315     inline unsigned NumTypes() { return type_index.Length(); }
Type(unsigned i)316     inline TokenIndex Type(unsigned i) { return types[i]; }
317 
NumTokens()318     inline unsigned NumTokens() { return token_stream.Length(); }
NumComments()319     inline unsigned NumComments() { return comment_stream.Length(); }
PrecedingToken(CommentIndex i)320     inline TokenIndex PrecedingToken(CommentIndex i)
321     {
322         return comments[i].previous_token;
323     }
CommentLocation(CommentIndex i)324     inline unsigned CommentLocation(CommentIndex i)
325     {
326         return comments[i].location;
327     }
328 
CommentString(CommentIndex i)329     inline const wchar_t* CommentString(CommentIndex i)
330     {
331         return comments[i].string;
332     }
333 
CommentStringLength(CommentIndex i)334     inline unsigned CommentStringLength(CommentIndex i)
335     {
336         return comments[i].length;
337     }
338 
PackageToken()339     inline TokenIndex PackageToken()
340     {
341         return package;
342     }
343 
NumBadTokens()344     inline unsigned NumBadTokens()
345     {
346         unsigned count = 0;
347         for (unsigned i = 0; i < bad_tokens.Length(); i++)
348             if (bad_tokens[i].getSeverity() == JikesError::JIKES_ERROR)
349                 count++;
350         return count;
351     }
NumWarnTokens()352     inline unsigned NumWarnTokens()
353     {
354         return bad_tokens.Length() - NumBadTokens();
355     }
356 
357 #ifdef JIKES_DEBUG
358     bool file_read;
359 #endif
360 
361     //
362     // Constructors and Destructor.
363     //
364     LexStream(Control&, FileSymbol*);
365 
366     void RereadInput();
367     ~LexStream();
368 
DestroyInput()369     void DestroyInput()
370     {
371         Stream::DestroyInput();
372 
373         delete [] comment_buffer;
374         comment_buffer = NULL;
375     }
376 
377     void ReportMessage(StreamError::StreamErrorKind,
378                        unsigned start, unsigned end);
379     void SortMessages();
380     void PrintMessages();
381 
SetUpComments()382     void SetUpComments()
383     {
384         if (comment_buffer)
385             return;
386         RereadInput();
387         //
388         // Calculate the length of the string required to save the comments.
389         // Allocate the buffer, save the comments in the buffer and update
390         // their respective "string" pointer.
391         //
392         unsigned length = 0;
393         unsigned i;
394 
395         for (i = 1; i < comment_stream.Length(); i++)
396             length += (comments[i].length + 1);
397         comment_buffer = new wchar_t[length];
398         wchar_t* ptr = comment_buffer;
399         for (i = 1; i < comment_stream.Length(); i++)
400         {
401             memcpy(ptr, &(input_buffer[comments[i].location]),
402                    comments[i].length * sizeof(wchar_t));
403             comments[i].string = ptr;
404             ptr += comments[i].length;
405             *ptr++ = U_NULL;
406         }
407     }
408 
409 #ifdef JIKES_DEBUG
410     void Dump(); // temporary function used to dump token stream.
411 #endif
412 
413     //
414     // Return the total size of space allocated for the tokens.
415     //
TokenSpaceAllocated(void)416     size_t TokenSpaceAllocated(void)
417     {
418         return token_stream.Length() * sizeof(Token);
419     }
420 
421     //
422     // Return the total size of space allocated for the comments.
423     //
CommentSpaceAllocated(void)424     size_t CommentSpaceAllocated(void)
425     {
426         return comment_stream.Length() * sizeof(Comment);
427     }
428 
429 private:
430 
431     int hexvalue(wchar_t ch);
432 
433 #if defined(HAVE_ENCODING)
434     enum UnicodeLexerState
435     {
436         START,
437         RAW,
438         CR,
439         QUOTE,
440         UNICODE_ESCAPE,
441         UNICODE_ESCAPE_DIGIT_0,
442         UNICODE_ESCAPE_DIGIT_1,
443         UNICODE_ESCAPE_DIGIT_2
444     };
445 #endif // HAVE_ENCODING
446 
447     friend class Scanner;
448 
449     struct Comment
450     {
451         TokenIndex previous_token;
452         unsigned location;
453         unsigned length;
454         wchar_t* string;
455     };
456 
457     class Token
458     {
459         //
460         // It is expected that a location will be set for every token.
461         // Therefore, as we are setting the location, we also reset the
462         // deprecated bit to 0. If it is subsequently discovered that the
463         // token is followed by one or more deprecated tags then the bit is
464         // set to 1 by an invocation of the function SetDeprecated. Note that
465         // a better way to resetting all the bits in "info" is to use the
466         // function ResetInfoAndSetLocation defined below, instead of using
467         // SetLocation.
468         //
SetLocation(unsigned location)469         inline void SetLocation(unsigned location)
470         {
471             assert(location <= 0x00FFFFFF);
472             info = (info & 0x0000007F) | (location << 8);
473         }
474 
475     public:
476         unsigned info;
477         union
478         {
479             Symbol* symbol;
480             TokenIndex right_brace;
481         } additional_info;
482 
483         //
484         // To just reset the info, this function should be invoked with a
485         // location value of 0.
486         //
ResetInfoAndSetLocation(unsigned location)487         inline void ResetInfoAndSetLocation(unsigned location)
488         {
489             assert(location <= 0x00FFFFFF);
490             info = (location << 8);
491             additional_info.symbol = NULL;
492         }
493 
Location()494         inline unsigned Location() { return info >> 8; }
SetKind(unsigned kind)495         inline void SetKind(unsigned kind)
496         {
497             assert(kind <= 0x0000007F);
498             info = (info & 0xFFFFFF80) | kind;
499         }
Kind()500         inline unsigned Kind() { return info & 0x0000007F; }
ResetDeprecated()501         inline void ResetDeprecated() { info &= ~0x00000080; }
SetDeprecated()502         inline void SetDeprecated() { info |= 0x00000080; }
Deprecated()503         inline bool Deprecated() { return (info & 0x00000080) != 0; }
504 
SetSymbol(Symbol * symbol)505         inline void SetSymbol(Symbol* symbol)
506         {
507             additional_info.symbol = symbol;
508         }
SetRightBrace(TokenIndex rbrace)509         inline void SetRightBrace(TokenIndex rbrace)
510         {
511             additional_info.right_brace = rbrace;
512         }
513     };
514 
515     TokenIndex GetNextToken(unsigned location = 0)
516     {
517         TokenIndex index = token_stream.NextIndex();
518         token_stream[index].ResetInfoAndSetLocation(location);
519 
520         return index;
521     }
522 
523     Tuple<StreamError> bad_tokens;
524 
525     TokenIndex index;
526     Token* tokens;
527     ConvertibleArray<Token> token_stream;
528     Comment* comments;
529     ConvertibleArray<Comment> comment_stream;
530     unsigned* locations;
531     ConvertibleArray<unsigned> line_location;
532     TokenIndex* types;
533     ConvertibleArray<TokenIndex> type_index;
534     TokenIndex package;
535 
536     void CompressSpace();
537 
538     bool initial_reading_of_input;
539 
540     wchar_t* comment_buffer;
541 
542     Control& control;
543 
544     void ReadInput();
545     void ProcessInput(const char*, long);
546 #if defined(HAVE_ENCODING)
547     void ProcessInputUnicode(const char*, long);
548 #else
549     void ProcessInputAscii(const char*, long);
550 #endif // defined(HAVE_ENCODING)
551 
552     const wchar_t* KeywordName(int);
553 
554     unsigned FindLine(unsigned location);
555 
556     //
557     // Finds the column of the right edge of a character.
558     //
559     unsigned FindColumn(unsigned loc);
560 };
561 
562 #ifdef HAVE_JIKES_NAMESPACE
563 } // Close namespace Jikes block
564 #endif
565 
566 #endif // stream_INCLUDED
567 
568