1 #ifndef INC_CharScanner_hpp__
2 #define INC_CharScanner_hpp__
3
4 /* ANTLR Translator Generator
5 * Project led by Terence Parr at http://www.jGuru.com
6 * Software rights: http://www.antlr.org/license.html
7 *
8 * $Id: CharScanner.hpp,v 1.3 2009-10-24 16:20:01 medericboquien Exp $
9 */
10
11 #include <antlr/config.hpp>
12
13 // g++-4.3 needs this
14 #include <cstring>
15 #include <cstdlib>
16
17 // g++-4.4 needs this
18 #include <cstdio>
19
20 #include <map>
21
22 #ifdef HAS_NOT_CCTYPE_H
23 #include <ctype.h>
24 #else
25 #include <cctype>
26 #endif
27
28 #if ( _MSC_VER == 1200 )
29 // VC6 seems to need this
30 // note that this is not a standard C++ include file.
31 # include <stdio.h>
32 #endif
33
34 #include <antlr/TokenStream.hpp>
35 #include <antlr/RecognitionException.hpp>
36 #include <antlr/SemanticException.hpp>
37 #include <antlr/MismatchedCharException.hpp>
38 #include <antlr/InputBuffer.hpp>
39 #include <antlr/BitSet.hpp>
40 #include <antlr/LexerSharedInputState.hpp>
41
42 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
43 namespace antlr {
44 #endif
45
46 class ANTLR_API CharScanner;
47
ANTLR_C_USING(tolower)48 ANTLR_C_USING(tolower)
49
50 #ifdef ANTLR_REALLY_NO_STRCASECMP
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
// on the mac has neither...
/** Fallback case-insensitive comparison of two NUL-terminated strings.
 *
 * Both strings are walked in lock step; each character is lower-cased with
 * tolower() before being compared.
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 orders before s2, 1 if it orders after, 0 if both are
 *         equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	for (;;)
	{
		// Convert through unsigned char: handing tolower() a negative value
		// (other than EOF) is undefined, and comparing the results as int
		// orders bytes >= 0x80 after ASCII regardless of char's signedness.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		// c1 == c2 here; a NUL means both strings ended together
		if (c1 == 0) return 0;
	}
}
64 #else
65 #ifdef NO_STRCASECMP
66 ANTLR_C_USING(stricmp)
67 #else
68 ANTLR_C_USING(strcasecmp)
69 #endif
70 #endif
71
72 /** Functor for the literals map
73 */
74 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
75 private:
76 const CharScanner* scanner;
77 public:
78 #ifdef NO_TEMPLATE_PARTS
CharScannerLiteralsLess()79 CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
80 #endif
CharScannerLiteralsLess(const CharScanner * theScanner)81 CharScannerLiteralsLess(const CharScanner* theScanner)
82 : scanner(theScanner)
83 {
84 }
85 bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
86 // defaults are good enough..
87 // CharScannerLiteralsLess(const CharScannerLiteralsLess&);
88 // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
89 };
90
91 /** Superclass of generated lexers
92 */
93 class ANTLR_API CharScanner : public TokenStream {
94 protected:
95 typedef RefToken (*factory_type)();
96 public:
97 CharScanner(InputBuffer& cb, bool case_sensitive );
98 CharScanner(InputBuffer* cb, bool case_sensitive );
99 CharScanner(const LexerSharedInputState& state, bool case_sensitive );
100
~CharScanner()101 virtual ~CharScanner()
102 {
103 }
104
105 virtual int LA(unsigned int i);
106
append(char c)107 virtual void append(char c)
108 {
109 if (saveConsumedInput) {
110 int l = text.length();
111 if ((l%256) == 0)
112 text.reserve(l+256);
113 text.replace(l,0,&c,1);
114 }
115 }
116
append(const ANTLR_USE_NAMESPACE (std)string & s)117 virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
118 {
119 if (saveConsumedInput)
120 text+=s;
121 }
122
commit()123 virtual void commit()
124 {
125 inputState->getInput().commit();
126 }
127
128 virtual void consume();
129
130 /** Consume chars until one matches the given char */
consumeUntil(int c)131 virtual void consumeUntil(int c)
132 {
133 for(;;)
134 {
135 int la_1 = LA(1);
136 if( la_1 == EOF_CHAR || la_1 == c )
137 break;
138 consume();
139 }
140 }
141
142 /** Consume chars until one matches the given set */
consumeUntil(const BitSet & set)143 virtual void consumeUntil(const BitSet& set)
144 {
145 for(;;)
146 {
147 int la_1 = LA(1);
148 if( la_1 == EOF_CHAR || set.member(la_1) )
149 break;
150 consume();
151 }
152 }
153
154 /// Mark the current position and return a id for it
mark()155 virtual unsigned int mark()
156 {
157 return inputState->getInput().mark();
158 }
159 /// Rewind the scanner to a previously marked position
rewind(unsigned int pos)160 virtual void rewind(unsigned int pos)
161 {
162 inputState->getInput().rewind(pos);
163 }
164
165 /// See if input contains character 'c' throw MismatchedCharException if not
match(int c)166 virtual void match(int c)
167 {
168 int la_1 = LA(1);
169 if ( la_1 != c )
170 throw MismatchedCharException(la_1, c, false, this);
171 consume();
172 }
173
174 /** See if input contains element from bitset b
175 * throw MismatchedCharException if not
176 */
match(const BitSet & b)177 virtual void match(const BitSet& b)
178 {
179 int la_1 = LA(1);
180
181 if ( !b.member(la_1) )
182 throw MismatchedCharException( la_1, b, false, this );
183 consume();
184 }
185
186 /** See if input contains string 's' throw MismatchedCharException if not
187 * @note the string cannot match EOF
188 */
match(const char * s)189 virtual void match( const char* s )
190 {
191 while( *s != '\0' )
192 {
193 // the & 0xFF is here to prevent sign extension lateron
194 int la_1 = LA(1), c = (*s++ & 0xFF);
195
196 if ( la_1 != c )
197 throw MismatchedCharException(la_1, c, false, this);
198
199 consume();
200 }
201 }
202 /** See if input contains string 's' throw MismatchedCharException if not
203 * @note the string cannot match EOF
204 */
match(const ANTLR_USE_NAMESPACE (std)string & s)205 virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
206 {
207 size_t len = s.length();
208
209 for (size_t i = 0; i < len; i++)
210 {
211 // the & 0xFF is here to prevent sign extension lateron
212 int la_1 = LA(1), c = (s[i] & 0xFF);
213
214 if ( la_1 != c )
215 throw MismatchedCharException(la_1, c, false, this);
216
217 consume();
218 }
219 }
220 /** See if input does not contain character 'c'
221 * throw MismatchedCharException if not
222 */
matchNot(int c)223 virtual void matchNot(int c)
224 {
225 int la_1 = LA(1);
226
227 if ( la_1 == c )
228 throw MismatchedCharException(la_1, c, true, this);
229
230 consume();
231 }
232 /** See if input contains character in range c1-c2
233 * throw MismatchedCharException if not
234 */
matchRange(int c1,int c2)235 virtual void matchRange(int c1, int c2)
236 {
237 int la_1 = LA(1);
238
239 if ( la_1 < c1 || la_1 > c2 )
240 throw MismatchedCharException(la_1, c1, c2, false, this);
241
242 consume();
243 }
244
getCaseSensitive() const245 virtual bool getCaseSensitive() const
246 {
247 return caseSensitive;
248 }
249
setCaseSensitive(bool t)250 virtual void setCaseSensitive(bool t)
251 {
252 caseSensitive = t;
253 }
254
255 virtual bool getCaseSensitiveLiterals() const=0;
256
257 /// Get the line the scanner currently is in (starts at 1)
getLine() const258 virtual int getLine() const
259 {
260 return inputState->line;
261 }
262
263 /// set the line number
setLine(int l)264 virtual void setLine(int l)
265 {
266 inputState->line = l;
267 }
268
269 /// Get the column the scanner currently is in (starts at 1)
getColumn() const270 virtual int getColumn() const
271 {
272 return inputState->column;
273 }
274 /// set the column number
setColumn(int c)275 virtual void setColumn(int c)
276 {
277 inputState->column = c;
278 }
279
280 /// get the filename for the file currently used
ANTLR_USE_NAMESPACE(std)281 virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
282 {
283 return inputState->filename;
284 }
285 /// Set the filename the scanner is using (used in error messages)
setFilename(const ANTLR_USE_NAMESPACE (std)string & f)286 virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
287 {
288 inputState->filename = f;
289 }
290
getCommitToPath() const291 virtual bool getCommitToPath() const
292 {
293 return commitToPath;
294 }
295
setCommitToPath(bool commit)296 virtual void setCommitToPath(bool commit)
297 {
298 commitToPath = commit;
299 }
300
301 /** return a copy of the current text buffer */
ANTLR_USE_NAMESPACE(std)302 virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
303 {
304 return text;
305 }
306
setText(const ANTLR_USE_NAMESPACE (std)string & s)307 virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
308 {
309 text = s;
310 }
311
resetText()312 virtual void resetText()
313 {
314 text = "";
315 inputState->tokenStartColumn = inputState->column;
316 inputState->tokenStartLine = inputState->line;
317 }
318
getTokenObject() const319 virtual RefToken getTokenObject() const
320 {
321 return _returnToken;
322 }
323
324 /** Used to keep track of line breaks, needs to be called from
325 * within generated lexers when a \n \r is encountered.
326 */
newline()327 virtual void newline()
328 {
329 ++inputState->line;
330 inputState->column = 1;
331 }
332
333 /** Advance the current column number by an appropriate amount according
334 * to the tabsize. This method needs to be explicitly called from the
335 * lexer rules encountering tabs.
336 */
tab()337 virtual void tab()
338 {
339 int c = getColumn();
340 int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop
341 setColumn( nc );
342 }
343 /// set the tabsize. Returns the old tabsize
setTabsize(int size)344 int setTabsize( int size )
345 {
346 int oldsize = tabsize;
347 tabsize = size;
348 return oldsize;
349 }
350 /// Return the tabsize used by the scanner
getTabSize() const351 int getTabSize() const
352 {
353 return tabsize;
354 }
355
356 /** Terminate program using exit()
357 * @deprecated will be removed in the next release. It is not used.
358 */
359 virtual void panic();
360 /** Terminate program using exit()
361 * @deprecated will be removed in the next release. It is not used.
362 */
363 virtual void panic(const ANTLR_USE_NAMESPACE(std)string& s);
364
365 /** Report exception errors caught in nextToken() */
366 virtual void reportError(const RecognitionException& e);
367
368 /** Parser error-reporting function can be overridden in subclass */
369 virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
370
371 /** Parser warning-reporting function can be overridden in subclass */
372 virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
373
getInputBuffer()374 virtual InputBuffer& getInputBuffer()
375 {
376 return inputState->getInput();
377 }
378
getInputState()379 virtual LexerSharedInputState getInputState()
380 {
381 return inputState;
382 }
383
384 /** set the input state for the lexer.
385 * @note state is a reference counted object, hence no reference */
setInputState(LexerSharedInputState state)386 virtual void setInputState(LexerSharedInputState state)
387 {
388 inputState = state;
389 }
390
391 /// Set the factory for created tokens
setTokenObjectFactory(factory_type factory)392 virtual void setTokenObjectFactory(factory_type factory)
393 {
394 tokenFactory = factory;
395 }
396
397 /** Test the token text against the literals table
398 * Override this method to perform a different literals test
399 */
testLiteralsTable(int ttype) const400 virtual int testLiteralsTable(int ttype) const
401 {
402 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
403 if (i != literals.end())
404 ttype = (*i).second;
405 return ttype;
406 }
407
408 /** Test the text passed in against the literals table
409 * Override this method to perform a different literals test
410 * This is used primarily when you want to test a portion of
411 * a token
412 */
testLiteralsTable(const ANTLR_USE_NAMESPACE (std)string & txt,int ttype) const413 virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
414 {
415 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
416 if (i != literals.end())
417 ttype = (*i).second;
418 return ttype;
419 }
420
421 /// Override this method to get more specific case handling
toLower(int c) const422 virtual int toLower(int c) const
423 {
424 // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
425 // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
426 // this one is more structural. Maybe make this configurable.
427 return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
428 }
429
430 /** This method is called by YourLexer::nextToken() when the lexer has
431 * hit EOF condition. EOF is NOT a character.
432 * This method is not called if EOF is reached during
433 * syntactic predicate evaluation or during evaluation
434 * of normal lexical rules, which presumably would be
435 * an IOException. This traps the "normal" EOF condition.
436 *
437 * uponEOF() is called after the complete evaluation of
438 * the previous token and only if your parser asks
439 * for another token beyond that last non-EOF token.
440 *
441 * You might want to throw token or char stream exceptions
442 * like: "Heh, premature eof" or a retry stream exception
443 * ("I found the end of this file, go back to referencing file").
444 */
uponEOF()445 virtual void uponEOF()
446 {
447 }
448
449 /// Methods used to change tracing behavior
450 virtual void traceIndent();
451 virtual void traceIn(const char* rname);
452 virtual void traceOut(const char* rname);
453
454 #ifndef NO_STATIC_CONSTS
455 static const int EOF_CHAR = EOF;
456 #else
457 enum {
458 EOF_CHAR = EOF
459 };
460 #endif
461 protected:
462 ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
463 /// flag indicating wether consume saves characters
464 bool saveConsumedInput;
465 factory_type tokenFactory; ///< Factory for tokens
466 bool caseSensitive; ///< Is this lexer case sensitive
467 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
468
469 RefToken _returnToken; ///< used to return tokens w/o using return val
470
471 /// Input state, gives access to input stream, shared among different lexers
472 LexerSharedInputState inputState;
473
474 /** Used during filter mode to indicate that path is desired.
475 * A subsequent scan error will report an error as usual
476 * if acceptPath=true;
477 */
478 bool commitToPath;
479
480 int tabsize; ///< tab size the scanner uses.
481
482 /// Create a new RefToken of type t
makeToken(int t)483 virtual RefToken makeToken(int t)
484 {
485 RefToken tok = tokenFactory();
486 tok->setType(t);
487 tok->setColumn(inputState->tokenStartColumn);
488 tok->setLine(inputState->tokenStartLine);
489 return tok;
490 }
491
492 /** Tracer class, used when -traceLexer is passed to antlr
493 */
494 class Tracer {
495 private:
496 CharScanner* parser;
497 const char* text;
498
499 Tracer(const Tracer& other); // undefined
500 Tracer& operator=(const Tracer& other); // undefined
501 public:
Tracer(CharScanner * p,const char * t)502 Tracer( CharScanner* p,const char* t )
503 : parser(p), text(t)
504 {
505 parser->traceIn(text);
506 }
~Tracer()507 ~Tracer()
508 {
509 parser->traceOut(text);
510 }
511 };
512
513 int traceDepth;
514 private:
515 CharScanner( const CharScanner& other ); // undefined
516 CharScanner& operator=( const CharScanner& other ); // undefined
517
518 #ifndef NO_STATIC_CONSTS
519 static const int NO_CHAR = 0;
520 #else
521 enum {
522 NO_CHAR = 0
523 };
524 #endif
525 };
526
LA(unsigned int i)527 inline int CharScanner::LA(unsigned int i)
528 {
529 int c = inputState->getInput().LA(i);
530
531 if ( caseSensitive )
532 return c;
533 else
534 return toLower(c); // VC 6 tolower bug caught in toLower.
535 }
536
operator ()(const ANTLR_USE_NAMESPACE (std)string & x,const ANTLR_USE_NAMESPACE (std)string & y) const537 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
538 {
539 if (scanner->getCaseSensitiveLiterals())
540 return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
541 else
542 {
543 #ifdef NO_STRCASECMP
544 return (stricmp(x.c_str(),y.c_str())<0);
545 #else
546 return (strcasecmp(x.c_str(),y.c_str())<0);
547 #endif
548 }
549 }
550
551 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
552 }
553 #endif
554
555 #endif //INC_CharScanner_hpp__
556