1 // $Id: stream.h,v 1.53 2004/03/25 13:32:28 ericb Exp $ -*- c++ -*- 2 // 3 // This software is subject to the terms of the IBM Jikes Compiler 4 // License Agreement available at the following URL: 5 // http://ibm.com/developerworks/opensource/jikes. 6 // Copyright (C) 1996, 2004 IBM Corporation and others. All Rights Reserved. 7 // You must accept the terms of that agreement to use this software. 8 // 9 10 #ifndef stream_INCLUDED 11 #define stream_INCLUDED 12 13 #include "platform.h" 14 #include "tuple.h" 15 #include "jikesapi.h" 16 17 #ifdef HAVE_JIKES_NAMESPACE 18 namespace Jikes { // Open namespace Jikes block 19 #endif 20 21 class Control; 22 class Input_info; 23 class Scanner; 24 class Symbol; 25 class FileSymbol; 26 class ZipFile; 27 class LexStream; 28 class ErrorString; 29 30 class StreamError : public JikesError 31 { 32 friend class LexStream; 33 34 public: 35 36 StreamError(); 37 38 virtual const wchar_t* getErrorMessage(); 39 virtual const wchar_t* getErrorReport(); 40 41 virtual JikesErrorSeverity getSeverity(); 42 virtual const char* getFileName(); 43 getLeftLineNo()44 virtual int getLeftLineNo() { return left_line_no; } getLeftColumnNo()45 virtual int getLeftColumnNo() { return left_column_no; } getRightLineNo()46 virtual int getRightLineNo() { return right_line_no; } getRightColumnNo()47 virtual int getRightColumnNo() { return right_column_no; } 48 49 enum StreamErrorKind 50 { 51 BAD_TOKEN, 52 EMPTY_CHARACTER_CONSTANT, 53 UNTERMINATED_CHARACTER_CONSTANT, 54 MULTI_CHARACTER_CONSTANT, 55 ESCAPE_EXPECTED, 56 UNTERMINATED_COMMENT, 57 UNTERMINATED_STRING_CONSTANT, 58 INVALID_HEX_CONSTANT, 59 INVALID_FLOATING_HEX_EXPONENT, 60 INVALID_FLOATING_HEX_MANTISSA, 61 INVALID_FLOATING_HEX_PREFIX, 62 INVALID_OCTAL_CONSTANT, 63 INVALID_FLOATING_EXPONENT, 64 INVALID_UNICODE_ESCAPE, 65 INVALID_ESCAPE_SEQUENCE, 66 LAST_CHARACTER_NOT_NEWLINE, // pedantic only 67 DEPRECATED_IDENTIFIER_ASSERT, // from here, these are warnings only 68 DEPRECATED_IDENTIFIER_ENUM, 69 DOLLAR_IN_IDENTIFIER, 70 FAVOR_CAPITAL_L_SUFFIX 71 }; 72 73 private: 74 75 unsigned start_location; 76 unsigned end_location; 77 StreamErrorKind kind; 78 79 static bool emacs_style_report; 80 LexStream* lex_stream; 81 82 int left_line_no; 83 int left_column_no; 84 int right_line_no; 85 int right_column_no; 86 87 const wchar_t* regularErrorString(); 88 const wchar_t* emacsErrorString(); 89 90 bool initialized; 91 92 void Initialize(StreamErrorKind, unsigned, unsigned, LexStream*); 93 }; 94 95 96 // 97 // The stream class encapsulates details related to reading 98 // a stream of possibly encoded data from the file system. 99 // 100 class Stream 101 { 102 public: 103 104 Stream(); 105 ~Stream(); 106 DestroyInput()107 void DestroyInput() 108 { 109 delete [] input_buffer; 110 input_buffer = NULL; 111 } 112 InputBuffer()113 inline const wchar_t* InputBuffer() { return input_buffer; } InputBufferLength()114 inline unsigned InputBufferLength() { return input_buffer_length; } 115 AllocateInputBuffer(unsigned size)116 inline wchar_t* AllocateInputBuffer(unsigned size) 117 { 118 // +3 for leading \n, trailing \r\0 119 return input_buffer = new wchar_t[size + 3]; 120 } 121 122 #if defined(HAVE_ENCODING) 123 static bool IsSupportedEncoding(char* encoding); 124 bool SetEncoding(char* encoding); 125 #endif 126 127 protected: 128 129 wchar_t* input_buffer; 130 unsigned input_buffer_length; 131 132 const char* source_ptr; // Start of data buffer to decoded 133 const char* source_tail; // End of data buffer to be decoded 134 const char* data_buffer; // The data to be decoded 135 136 bool error_decode_next_character; 137 138 //private: // FIXME : Make vars private once extracted from LexStream! 139 140 #ifdef HAVE_ENCODING 141 142 #if defined(HAVE_LIBICU_UC) 143 UConverter* _decoder; 144 #elif defined(JIKES_ICONV_ENCODING) 145 iconv_t _decoder; 146 #endif 147 148 void DestroyEncoding(); 149 150 // Read the next wchar_t from the stream. 151 // If an error occurs the ErrorDecodeNextCharacter 152 // method will return true on the next call. 153 154 wchar_t DecodeNextCharacter(); 155 ErrorDecodeNextCharacter()156 inline bool ErrorDecodeNextCharacter() 157 { 158 bool result = error_decode_next_character; 159 if (result) 160 error_decode_next_character = false; 161 return result; 162 } 163 164 // Returns true if an encoding has been set 165 HaveDecoder()166 inline bool HaveDecoder() 167 { 168 #if defined(HAVE_LIBICU_UC) 169 return _decoder != NULL; 170 #elif defined(JIKES_ICONV_ENCODING) 171 return _decoder != (iconv_t) -1; 172 #endif 173 } 174 175 #endif // HAVE_ENCODING 176 InitializeDataBuffer(const char * buffer,long size)177 inline void InitializeDataBuffer(const char* buffer, long size) 178 { 179 data_buffer = buffer; 180 source_ptr = data_buffer; 181 source_tail = data_buffer + size - 1; 182 } 183 HasMoreData()184 inline bool HasMoreData() 185 { 186 return source_ptr <= source_tail; 187 } 188 }; 189 190 191 // 192 // LexStream holds a stream of tokens generated from an input and 193 // provides methods to retrieve information from the stream. 194 // 195 class LexStream : public Stream 196 { 197 friend class StreamError; 198 199 public: 200 typedef unsigned CommentIndex; 201 enum { LEX_INFINITY = INT_MAX }; // the largest value for TokenIndex 202 203 FileSymbol* file_symbol; 204 Next(TokenIndex i)205 inline TokenIndex Next(TokenIndex i) 206 { 207 return ++i < token_stream.Length() ? i : token_stream.Length() - 1; 208 } Previous(TokenIndex i)209 inline TokenIndex Previous(TokenIndex i) { return i <= 0 ? 0 : i - 1; } Peek()210 inline TokenIndex Peek() { return Next(index); } 211 inline void Reset(TokenIndex i = 1) { index = Previous(i); } Gettoken()212 inline TokenIndex Gettoken() { return index = Next(index); } Gettoken(TokenIndex end_token)213 inline TokenIndex Gettoken(TokenIndex end_token) 214 { 215 return index = (index < end_token ? Next(index) 216 : token_stream.Length() - 1); 217 } 218 Kind(TokenIndex i)219 inline unsigned Kind(TokenIndex i) 220 { 221 return tokens[i >= NumTokens() ? NumTokens() - 1 : i].Kind(); 222 } 223 Location(TokenIndex i)224 inline unsigned Location(TokenIndex i) 225 { 226 assert(i < NumTokens()); 227 return tokens[i].Location(); 228 } 229 Line(TokenIndex i)230 inline unsigned Line(TokenIndex i) 231 { 232 return FindLine(tokens[i].Location()); 233 } 234 Column(TokenIndex i)235 inline unsigned Column(TokenIndex i) 236 { 237 // FindColumn grabs the right edge of an expanded character. 238 return input_buffer ? FindColumn(tokens[i].Location() - 1) + 1 : 0; 239 } 240 unsigned RightColumn(TokenIndex i); 241 AfterEol(TokenIndex i)242 inline bool AfterEol(TokenIndex i) 243 { 244 return i < 1 ? true : Line(i - 1) < Line(i); 245 } 246 IsDeprecated(TokenIndex i)247 inline bool IsDeprecated(TokenIndex i) { return tokens[i].Deprecated(); } 248 MatchingBrace(TokenIndex i)249 inline TokenIndex MatchingBrace(TokenIndex i) 250 { 251 return tokens[i].additional_info.right_brace; 252 } 253 254 const wchar_t* NameString(TokenIndex i); 255 unsigned NameStringLength(TokenIndex i); 256 257 // TODO: Rename these methods to differ from the class name? 258 class LiteralSymbol* LiteralSymbol(TokenIndex); 259 class NameSymbol* NameSymbol(TokenIndex); 260 261 char* FileName(); 262 unsigned FileNameLength(); 263 264 unsigned LineLength(unsigned line_no); LineStart(unsigned line_no)265 inline unsigned LineStart(unsigned line_no) 266 { 267 return locations[line_no]; 268 } LineEnd(unsigned line_no)269 inline unsigned LineEnd(unsigned line_no) 270 { 271 return locations[line_no + 1] - 1; 272 } 273 274 unsigned LineSegmentLength(TokenIndex i); 275 276 // 277 // For a sequence of tokens in a given range find out how many large 278 // characters they contain and compute the appropriate offset. 279 // WcharOffset(TokenIndex start,TokenIndex end)280 inline unsigned WcharOffset(TokenIndex start, TokenIndex end) 281 { 282 unsigned offset = 0; 283 for (TokenIndex i = start; i <= end; i++) 284 { 285 for (const wchar_t* str = NameString(i); *str; str++) 286 { 287 if (*str > 0xff) 288 offset += 5; 289 } 290 } 291 292 return offset; 293 } 294 295 // 296 // When only an end token is supplied, the start token is assume to be the 297 // first one on the same line. 298 // WcharOffset(TokenIndex end)299 inline unsigned WcharOffset(TokenIndex end) 300 { 301 TokenIndex start = end; 302 unsigned the_line = Line(end); 303 while (Line(--start) == the_line); 304 return WcharOffset(start + 1, end); 305 } 306 307 // 308 // Used for outputting sections of source code in error messages. 309 // 310 void OutputLine(unsigned, ErrorString&); 311 void OutputSource(JikesError*, ErrorString&); 312 313 CommentIndex FirstComment(TokenIndex); 314 NumTypes()315 inline unsigned NumTypes() { return type_index.Length(); } Type(unsigned i)316 inline TokenIndex Type(unsigned i) { return types[i]; } 317 NumTokens()318 inline unsigned NumTokens() { return token_stream.Length(); } NumComments()319 inline unsigned NumComments() { return comment_stream.Length(); } PrecedingToken(CommentIndex i)320 inline TokenIndex PrecedingToken(CommentIndex i) 321 { 322 return comments[i].previous_token; 323 } CommentLocation(CommentIndex i)324 inline unsigned CommentLocation(CommentIndex i) 325 { 326 return comments[i].location; 327 } 328 CommentString(CommentIndex i)329 inline const wchar_t* CommentString(CommentIndex i) 330 { 331 return comments[i].string; 332 } 333 CommentStringLength(CommentIndex i)334 inline unsigned CommentStringLength(CommentIndex i) 335 { 336 return comments[i].length; 337 } 338 PackageToken()339 inline TokenIndex PackageToken() 340 { 341 return package; 342 } 343 NumBadTokens()344 inline unsigned NumBadTokens() 345 { 346 unsigned count = 0; 347 for (unsigned i = 0; i < bad_tokens.Length(); i++) 348 if (bad_tokens[i].getSeverity() == JikesError::JIKES_ERROR) 349 count++; 350 return count; 351 } NumWarnTokens()352 inline unsigned NumWarnTokens() 353 { 354 return bad_tokens.Length() - NumBadTokens(); 355 } 356 357 #ifdef JIKES_DEBUG 358 bool file_read; 359 #endif 360 361 // 362 // Constructors and Destructor. 363 // 364 LexStream(Control&, FileSymbol*); 365 366 void RereadInput(); 367 ~LexStream(); 368 DestroyInput()369 void DestroyInput() 370 { 371 Stream::DestroyInput(); 372 373 delete [] comment_buffer; 374 comment_buffer = NULL; 375 } 376 377 void ReportMessage(StreamError::StreamErrorKind, 378 unsigned start, unsigned end); 379 void SortMessages(); 380 void PrintMessages(); 381 SetUpComments()382 void SetUpComments() 383 { 384 if (comment_buffer) 385 return; 386 RereadInput(); 387 // 388 // Calculate the length of the string required to save the comments. 389 // Allocate the buffer, save the comments in the buffer and update 390 // their respective "string" pointer. 391 // 392 unsigned length = 0; 393 unsigned i; 394 395 for (i = 1; i < comment_stream.Length(); i++) 396 length += (comments[i].length + 1); 397 comment_buffer = new wchar_t[length]; 398 wchar_t* ptr = comment_buffer; 399 for (i = 1; i < comment_stream.Length(); i++) 400 { 401 memcpy(ptr, &(input_buffer[comments[i].location]), 402 comments[i].length * sizeof(wchar_t)); 403 comments[i].string = ptr; 404 ptr += comments[i].length; 405 *ptr++ = U_NULL; 406 } 407 } 408 409 #ifdef JIKES_DEBUG 410 void Dump(); // temporary function used to dump token stream. 411 #endif 412 413 // 414 // Return the total size of space allocated for the tokens. 415 // TokenSpaceAllocated(void)416 size_t TokenSpaceAllocated(void) 417 { 418 return token_stream.Length() * sizeof(Token); 419 } 420 421 // 422 // Return the total size of space allocated for the comments. 423 // CommentSpaceAllocated(void)424 size_t CommentSpaceAllocated(void) 425 { 426 return comment_stream.Length() * sizeof(Comment); 427 } 428 429 private: 430 431 int hexvalue(wchar_t ch); 432 433 #if defined(HAVE_ENCODING) 434 enum UnicodeLexerState 435 { 436 START, 437 RAW, 438 CR, 439 QUOTE, 440 UNICODE_ESCAPE, 441 UNICODE_ESCAPE_DIGIT_0, 442 UNICODE_ESCAPE_DIGIT_1, 443 UNICODE_ESCAPE_DIGIT_2 444 }; 445 #endif // HAVE_ENCODING 446 447 friend class Scanner; 448 449 struct Comment 450 { 451 TokenIndex previous_token; 452 unsigned location; 453 unsigned length; 454 wchar_t* string; 455 }; 456 457 class Token 458 { 459 // 460 // It is expected that a location will be set for every token. 461 // Therefore, as we are setting the location, we also reset the 462 // deprecated bit to 0. If it is subsequently discovered that the 463 // token is followed by one or more deprecated tags then the bit is 464 // set to 1 by an invocation of the function SetDeprecated. Note that 465 // a better way to resetting all the bits in "info" is to use the 466 // function ResetInfoAndSetLocation defined below, instead of using 467 // SetLocation. 468 // SetLocation(unsigned location)469 inline void SetLocation(unsigned location) 470 { 471 assert(location <= 0x00FFFFFF); 472 info = (info & 0x0000007F) | (location << 8); 473 } 474 475 public: 476 unsigned info; 477 union 478 { 479 Symbol* symbol; 480 TokenIndex right_brace; 481 } additional_info; 482 483 // 484 // To just reset the info, this function should be invoked with a 485 // location value of 0. 486 // ResetInfoAndSetLocation(unsigned location)487 inline void ResetInfoAndSetLocation(unsigned location) 488 { 489 assert(location <= 0x00FFFFFF); 490 info = (location << 8); 491 additional_info.symbol = NULL; 492 } 493 Location()494 inline unsigned Location() { return info >> 8; } SetKind(unsigned kind)495 inline void SetKind(unsigned kind) 496 { 497 assert(kind <= 0x0000007F); 498 info = (info & 0xFFFFFF80) | kind; 499 } Kind()500 inline unsigned Kind() { return info & 0x0000007F; } ResetDeprecated()501 inline void ResetDeprecated() { info &= ~0x00000080; } SetDeprecated()502 inline void SetDeprecated() { info |= 0x00000080; } Deprecated()503 inline bool Deprecated() { return (info & 0x00000080) != 0; } 504 SetSymbol(Symbol * symbol)505 inline void SetSymbol(Symbol* symbol) 506 { 507 additional_info.symbol = symbol; 508 } SetRightBrace(TokenIndex rbrace)509 inline void SetRightBrace(TokenIndex rbrace) 510 { 511 additional_info.right_brace = rbrace; 512 } 513 }; 514 515 TokenIndex GetNextToken(unsigned location = 0) 516 { 517 TokenIndex index = token_stream.NextIndex(); 518 token_stream[index].ResetInfoAndSetLocation(location); 519 520 return index; 521 } 522 523 Tuple<StreamError> bad_tokens; 524 525 TokenIndex index; 526 Token* tokens; 527 ConvertibleArray<Token> token_stream; 528 Comment* comments; 529 ConvertibleArray<Comment> comment_stream; 530 unsigned* locations; 531 ConvertibleArray<unsigned> line_location; 532 TokenIndex* types; 533 ConvertibleArray<TokenIndex> type_index; 534 TokenIndex package; 535 536 void CompressSpace(); 537 538 bool initial_reading_of_input; 539 540 wchar_t* comment_buffer; 541 542 Control& control; 543 544 void ReadInput(); 545 void ProcessInput(const char*, long); 546 #if defined(HAVE_ENCODING) 547 void ProcessInputUnicode(const char*, long); 548 #else 549 void ProcessInputAscii(const char*, long); 550 #endif // defined(HAVE_ENCODING) 551 552 const wchar_t* KeywordName(int); 553 554 unsigned FindLine(unsigned location); 555 556 // 557 // Finds the column of the right edge of a character. 558 // 559 unsigned FindColumn(unsigned loc); 560 }; 561 562 #ifdef HAVE_JIKES_NAMESPACE 563 } // Close namespace Jikes block 564 #endif 565 566 #endif // stream_INCLUDED 567 568