1 #ifndef INC_UnicodeCharScanner_hpp__ 2 #define INC_UnicodeCharScanner_hpp__ 3 4 #include <map> 5 #include <cctype> 6 7 #include <antlr/config.hpp> 8 #include <antlr/CommonToken.hpp> 9 #include <antlr/TokenStream.hpp> 10 #include <antlr/RecognitionException.hpp> 11 #include <antlr/SemanticException.hpp> 12 #include <antlr/InputBuffer.hpp> 13 #include <antlr/BitSet.hpp> 14 #include <antlr/LexerSharedInputState.hpp> 15 16 #include "MismatchedUnicodeCharException.hpp" 17 18 /** Superclass of generated lexers 19 */ 20 class UnicodeCharScanner : public antlr::TokenStream { 21 protected: 22 typedef antlr::RefToken (*factory_type)(); 23 public: 24 typedef unsigned int char_type; 25 typedef std::map<std::string,int> string_map; 26 UnicodeCharScanner(antlr::InputBuffer & cb,bool case_sensitive)27 UnicodeCharScanner( antlr::InputBuffer& cb, bool case_sensitive ) 28 : saveConsumedInput(true) 29 , caseSensitive(case_sensitive) 30 , literals() 31 , inputState(new antlr::LexerInputState(cb)) 32 , commitToPath(false) 33 , tabsize(8) 34 , traceDepth(0) 35 { 36 setTokenObjectFactory(&antlr::CommonToken::factory); 37 } UnicodeCharScanner(antlr::InputBuffer * cb,bool case_sensitive)38 UnicodeCharScanner( antlr::InputBuffer* cb, bool case_sensitive ) 39 : saveConsumedInput(true) 40 , caseSensitive(case_sensitive) 41 , literals() 42 , inputState(new antlr::LexerInputState(cb)) 43 , commitToPath(false) 44 , tabsize(8) 45 , traceDepth(0) 46 { 47 setTokenObjectFactory(&antlr::CommonToken::factory); 48 } UnicodeCharScanner(const antlr::LexerSharedInputState & state,bool case_sensitive)49 UnicodeCharScanner( const antlr::LexerSharedInputState& state, bool case_sensitive ) 50 : saveConsumedInput(true) 51 , caseSensitive(case_sensitive) 52 , literals() 53 , inputState(state) 54 , commitToPath(false) 55 , tabsize(8) 56 , traceDepth(0) 57 { 58 setTokenObjectFactory(&antlr::CommonToken::factory); 59 } 60 ~UnicodeCharScanner()61 virtual ~UnicodeCharScanner() 62 { 63 } 64 LA(char_type i)65 virtual char_type LA(char_type i) 66 { 67 char_type c = inputState->getInput().LA(i); 68 return c; 69 } 70 append(char_type c)71 virtual void append(char_type c) 72 { 73 if (saveConsumedInput) 74 { 75 size_t len = text.length(); 76 77 if( (len % 256) == 0 ) 78 text.reserve(len+256); 79 80 // This is how UTF8 is encoded 81 // +---------------------------+----------+----------+----------+----------+ 82 // | Unicode scalar | 1st | 2nd | 3th | 4th | 83 // +---------------------------+----------+----------+----------+----------+ 84 // |00000000 0xxxxxxx | 0xxxxxxx | | | | 85 // |00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | | 86 // |zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | | 87 // |000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx | 88 // +---------------------------+----------+----------+----------+----------+ 89 90 if (c < 0x80) 91 { 92 text += c; 93 return; 94 } 95 else if (c < 0x800) 96 { 97 text += ( (c >> 6) | 0xC0 ); 98 text += ( c & 0x3F | 0x80 ); 99 } 100 else if (c < 0x10000) 101 { 102 text += ( (c >> 12) | 0xE0 ); 103 text += ( ((c >> 6) & 0x3F) | 0x80 ); 104 text += ( (c & 0x3F) | 0x80 ); 105 } 106 else if (c < 0x200000) 107 { 108 text += ( (c >> 18) | 0xF0 ); // first 3 bits 109 text += ( (((c >> 16) & 0x3) << 4) | 110 ((c >> 12) & 0xF) | 0x80 ); 111 text += ( ((c >> 6) & 0x3F) | 0x80 ); 112 text += ( (c & 0x3F) | 0x80 ); 113 } 114 else 115 assert(0); 116 } 117 } 118 append(const std::string & s)119 virtual void append(const std::string& s) 120 { 121 assert(0); 122 if (saveConsumedInput) 123 text+=s; 124 } 125 commit()126 virtual void commit() 127 { 128 inputState->getInput().commit(); 129 } 130 consume()131 virtual void consume() 132 { 133 if (inputState->guessing == 0) 134 { 135 char_type c = LA(1); 136 append(c); 137 inputState->column++; 138 } 139 inputState->getInput().consume(); 140 } 141 142 /** Consume chars until one matches the given char */ consumeUntil(char_type c)143 virtual void consumeUntil(char_type c) 144 { 145 for(;;) 146 { 147 char_type la_1 = LA(1); 148 if( static_cast<char_type>(EOF_CHAR) == la_1 || la_1 == c ) 149 break; 150 consume(); 151 } 152 } 153 154 /** Consume chars until one matches the given set */ consumeUntil(const antlr::BitSet & set)155 virtual void consumeUntil(const antlr::BitSet& set) 156 { 157 for(;;) 158 { 159 char_type la_1 = LA(1); 160 if( static_cast<char_type>(EOF_CHAR) == la_1 || set.member(la_1) ) 161 break; 162 consume(); 163 } 164 } 165 166 /// Mark the current position and return a id for it mark()167 virtual unsigned int mark() 168 { 169 return inputState->getInput().mark(); 170 } 171 172 /// Rewind the scanner to a previously marked position rewind(unsigned int pos)173 virtual void rewind(unsigned int pos) 174 { 175 inputState->getInput().rewind(pos); 176 } 177 178 /// See if input contains character 'c' throw MismatchedUnicodeCharException if not match(char_type c)179 virtual void match(char_type c) 180 { 181 char_type la_1 = LA(1); 182 if ( la_1 != c ) 183 throw MismatchedUnicodeCharException(la_1, c, false, this); 184 consume(); 185 } 186 187 /** See if input contains element from bitset b 188 * throw MismatchedUnicodeCharException if not 189 */ match(const antlr::BitSet & b)190 virtual void match(const antlr::BitSet& b) 191 { 192 char_type la_1 = LA(1); 193 194 if ( !b.member(la_1) ) 195 throw MismatchedUnicodeCharException( la_1, b, false, this ); 196 consume(); 197 } 198 199 /** See if input contains string 's' throw MismatchedUnicodeCharException if not 200 * @note the string cannot match EOF 201 */ match(const char * s)202 virtual void match( const char* s ) 203 { 204 while( *s != '\0' ) 205 { 206 // the & 0xFF is here to prevent sign extension lateron 207 char_type la_1 = LA(1), c = (*s++ & 0xFF); 208 209 if ( la_1 != c ) 210 throw MismatchedUnicodeCharException(la_1, c, false, this); 211 212 consume(); 213 } 214 } 215 /** See if input contains string 's' throw MismatchedUnicodeCharException if not 216 * @note the string cannot match EOF 217 */ match(const std::string & s)218 virtual void match(const std::string& s) 219 { 220 size_t len = s.length(); 221 222 for (size_t i = 0; i < len; i++) 223 { 224 // the & 0xFF is here to prevent sign extension lateron 225 char_type la_1 = LA(1), c = (s[i] & 0xFF); 226 227 if ( la_1 != c ) 228 throw MismatchedUnicodeCharException(la_1, c, false, this); 229 230 consume(); 231 } 232 } 233 /** See if input does not contain character 'c' 234 * throw MismatchedUnicodeCharException if not 235 */ matchNot(char_type c)236 virtual void matchNot(char_type c) 237 { 238 char_type la_1 = LA(1); 239 240 if ( la_1 == c ) 241 throw MismatchedUnicodeCharException(la_1, c, true, this); 242 243 consume(); 244 } 245 /** See if input contains character in range c1-c2 246 * throw MismatchedUnicodeCharException if not 247 */ matchRange(char_type c1,char_type c2)248 virtual void matchRange(char_type c1, char_type c2) 249 { 250 char_type la_1 = LA(1); 251 252 if ( la_1 < c1 || la_1 > c2 ) 253 throw MismatchedUnicodeCharException(la_1, c1, c2, false, this); 254 255 consume(); 256 } 257 258 /// Get the line the scanner currently is in (starts at 1) getLine() const259 virtual int getLine() const 260 { 261 return inputState->line; 262 } 263 264 /// set the line number setLine(int l)265 virtual void setLine(int l) 266 { 267 inputState->line = l; 268 } 269 270 /// Get the column the scanner currently is in (starts at 1) getColumn() const271 virtual int getColumn() const 272 { 273 return inputState->column; 274 } 275 /// set the column number setColumn(int c)276 virtual void setColumn(int c) 277 { 278 inputState->column = c; 279 } 280 281 /// get the filename for the file currently used getFilename() const282 virtual const std::string& getFilename() const 283 { 284 return inputState->filename; 285 } 286 /// Set the filename the scanner is using (used in error messages) setFilename(const std::string & f)287 virtual void setFilename(const std::string& f) 288 { 289 inputState->filename = f; 290 } 291 getCommitToPath() const292 virtual bool getCommitToPath() const 293 { 294 return commitToPath; 295 } 296 setCommitToPath(bool commit)297 virtual void setCommitToPath(bool commit) 298 { 299 commitToPath = commit; 300 } 301 302 /** return a copy of the current text buffer */ getText() const303 virtual const std::string& getText() const 304 { 305 return text; 306 } 307 setText(const std::string & s)308 virtual void setText(const std::string& s) 309 { 310 text = s; 311 } 312 resetText()313 virtual void resetText() 314 { 315 text = ""; 316 inputState->tokenStartColumn = inputState->column; 317 inputState->tokenStartLine = inputState->line; 318 } 319 getTokenObject() const320 virtual antlr::RefToken getTokenObject() const 321 { 322 return _returnToken; 323 } 324 325 ///{ These need different handling in unicode case 326 327 virtual bool getCaseSensitiveLiterals() const=0; 328 getCaseSensitive() const329 virtual bool getCaseSensitive() const 330 { 331 return caseSensitive; 332 } 333 setCaseSensitive(bool t)334 virtual void setCaseSensitive(bool t) 335 { 336 caseSensitive = t; 337 } 338 339 /** Override this method to get more specific case handling 340 * @note some platforms probably require setting the right locale for 341 * correct functioning. 342 */ toLower(char_type c) const343 virtual char_type toLower(char_type c) const 344 { 345 return std::tolower(c); 346 } 347 348 /** Used to keep track of line breaks, needs to be called from 349 * within generated lexers when a \n \r is encountered. 350 */ newline()351 virtual void newline() 352 { 353 ++inputState->line; 354 inputState->column = 1; 355 } 356 357 /** Advance the current column number by an appropriate amount according 358 * to the tabsize. This method needs to be explicitly called from the 359 * lexer rules encountering tabs. 360 */ tab()361 virtual void tab() 362 { 363 int c = getColumn(); 364 int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop 365 setColumn( nc ); 366 } 367 /// set the tabsize. Returns the old tabsize setTabsize(int size)368 int setTabsize( int size ) 369 { 370 int oldsize = tabsize; 371 tabsize = size; 372 return oldsize; 373 } 374 /// Return the tabsize used by the scanner getTabSize() const375 int getTabSize() const 376 { 377 return tabsize; 378 } 379 ///} 380 381 /** Report exception errors caught in nextToken() */ reportError(const antlr::RecognitionException & ex)382 virtual void reportError(const antlr::RecognitionException& ex) 383 { 384 std::cerr << ex.toString().c_str() << std::endl; 385 } 386 387 /** Parser error-reporting function can be overridden in subclass */ reportError(const std::string & s)388 virtual void reportError(const std::string& s) 389 { 390 if (getFilename() == "") 391 std::cerr << "error: " << s.c_str() << std::endl; 392 else 393 std::cerr << getFilename().c_str() << ": error: " << s.c_str() << std::endl; 394 } 395 396 /** Parser warning-reporting function can be overridden in subclass */ reportWarning(const std::string & s)397 virtual void reportWarning(const std::string& s) 398 { 399 if (getFilename() == "") 400 std::cerr << "warning: " << s.c_str() << std::endl; 401 else 402 std::cerr << getFilename().c_str() << ": warning: " << s.c_str() << std::endl; 403 } 404 getInputBuffer()405 virtual antlr::InputBuffer& getInputBuffer() 406 { 407 return inputState->getInput(); 408 } 409 getInputState()410 virtual antlr::LexerSharedInputState getInputState() 411 { 412 return inputState; 413 } 414 415 /** set the input state for the lexer. 416 * @note state is a reference counted object, hence no reference */ setInputState(antlr::LexerSharedInputState state)417 virtual void setInputState(antlr::LexerSharedInputState state) 418 { 419 inputState = state; 420 } 421 422 /// Set the factory for created tokens setTokenObjectFactory(factory_type factory)423 virtual void setTokenObjectFactory(factory_type factory) 424 { 425 tokenFactory = factory; 426 } 427 428 /** Test the token text against the literals table 429 * Override this method to perform a different literals test 430 */ testLiteralsTable(int ttype) const431 virtual int testLiteralsTable(int ttype) const 432 { 433 string_map::const_iterator i = literals.find(text); 434 if (i != literals.end()) 435 ttype = (*i).second; 436 return ttype; 437 } 438 439 /** Test the text passed in against the literals table 440 * Override this method to perform a different literals test 441 * This is used primarily when you want to test a portion of 442 * a token 443 */ testLiteralsTable(const std::string & text,int ttype) const444 virtual int testLiteralsTable(const std::string& text, int ttype) const 445 { 446 string_map::const_iterator i = literals.find(text); 447 if (i != literals.end()) 448 ttype = (*i).second; 449 return ttype; 450 } 451 452 /** This method is called by YourLexer::nextToken() when the lexer has 453 * hit EOF condition. EOF is NOT a character. 454 * This method is not called if EOF is reached during 455 * syntactic predicate evaluation or during evaluation 456 * of normal lexical rules, which presumably would be 457 * an IOException. This traps the "normal" EOF condition. 458 * 459 * uponEOF() is called after the complete evaluation of 460 * the previous token and only if your parser asks 461 * for another token beyond that last non-EOF token. 462 * 463 * You might want to throw token or char stream exceptions 464 * like: "Heh, premature eof" or a retry stream exception 465 * ("I found the end of this file, go back to referencing file"). 466 */ uponEOF()467 virtual void uponEOF() 468 { 469 } 470 471 /// Methods used to change tracing behavior traceIndent()472 void traceIndent() 473 { 474 for( int i = 0; i < traceDepth; i++ ) 475 std::cout << " "; 476 } 477 traceIn(const char * rname)478 void traceIn(const char* rname) 479 { 480 traceDepth++; 481 traceIndent(); 482 std::cout << "> lexer " << rname 483 << "; c==" << LA(1) << std::endl; 484 } 485 traceOut(const char * rname)486 void traceOut(const char* rname) 487 { 488 traceIndent(); 489 std::cout << "< lexer " << rname 490 << "; c==" << LA(1) << std::endl; 491 traceDepth--; 492 } 493 494 #ifndef NO_STATIC_CONSTS 495 static const int EOF_CHAR = EOF; 496 #else 497 enum { 498 EOF_CHAR = EOF 499 }; 500 #endif 501 protected: 502 std::string text; ///< Text of current token 503 /// flag indicating wether consume saves characters 504 bool saveConsumedInput; 505 factory_type tokenFactory; ///< Factory for tokens 506 bool caseSensitive; ///< Is this lexer case sensitive 507 string_map literals; // set by subclass 508 509 antlr::RefToken _returnToken; ///< used to return tokens w/o using return val 510 511 /// Input state, gives access to input stream, shared among different lexers 512 antlr::LexerSharedInputState inputState; 513 514 /** Used during filter mode to indicate that path is desired. 515 * A subsequent scan error will report an error as usual 516 * if acceptPath=true; 517 */ 518 bool commitToPath; 519 520 unsigned int tabsize; ///< tab size the scanner uses. 521 522 /// Create a new RefToken of type t makeToken(int t)523 virtual antlr::RefToken makeToken(int t) 524 { 525 antlr::RefToken tok = tokenFactory(); 526 // actually at this point you want to convert the stored lexeme text 527 // into the format you want to have it in in the backend... 528 tok->setType(t); 529 tok->setColumn(inputState->tokenStartColumn); 530 tok->setLine(inputState->tokenStartLine); 531 return tok; 532 } 533 534 /** Tracer class, used when -traceLexer is passed to antlr 535 */ 536 class Tracer { 537 private: 538 UnicodeCharScanner* parser; 539 const char* text; 540 541 Tracer(const Tracer& other); // undefined 542 Tracer& operator=(const Tracer& other); // undefined 543 public: Tracer(UnicodeCharScanner * p,const char * t)544 Tracer( UnicodeCharScanner* p, const char* t ) 545 : parser(p), text(t) 546 { 547 parser->traceIn(text); 548 } ~Tracer()549 ~Tracer() 550 { 551 parser->traceOut(text); 552 } 553 }; 554 555 int traceDepth; 556 private: 557 UnicodeCharScanner( const UnicodeCharScanner& other ); // undefined 558 UnicodeCharScanner& operator=( const UnicodeCharScanner& other ); // undefined 559 }; 560 561 #endif //INC_UnicodeCharScanner_hpp__ 562