1 #ifndef INC_CharScanner_hpp__
2 #define INC_CharScanner_hpp__
3
4 /* ANTLR Translator Generator
5 * Project led by Terence Parr at http://www.jGuru.com
6 * Software rights: http://www.antlr.org/license.html
7 *
8 * $Id: //depot/code/org.antlr/release/antlr-2.7.7/lib/cpp/antlr/CharScanner.hpp#2 $
9 */
10
11 #include <antlr/config.hpp>
12
13 #include <map>
14
15 #ifdef HAS_NOT_CCTYPE_H
16 #include <ctype.h>
17 #else
18 #include <cctype>
19 #endif
20
21 #include <cstdio> // for EOF
22 extern "C" {
23 #include <strings.h> // for strcasecmp
24 }
25
26 #if ( _MSC_VER == 1200 )
27 // VC6 seems to need this
28 // note that this is not a standard C++ include file.
29 # include <stdio.h>
30 #endif
31
32 #include <antlr/TokenStream.hpp>
33 #include <antlr/RecognitionException.hpp>
34 #include <antlr/SemanticException.hpp>
35 #include <antlr/MismatchedCharException.hpp>
36 #include <antlr/InputBuffer.hpp>
37 #include <antlr/BitSet.hpp>
38 #include <antlr/LexerSharedInputState.hpp>
39 #include <strings.h>
40
41 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
42 namespace antlr {
43 #endif
44
45 class ANTLR_API CharScanner;
46
ANTLR_C_USING(tolower)47 ANTLR_C_USING(tolower)
48
49 #ifdef ANTLR_REALLY_NO_STRCASECMP
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
// on the mac has neither...
/** Fallback case-insensitive comparison of two NUL-terminated strings.
 *
 * Both strings are walked in lock-step and every character is folded with
 * tolower() before being compared.
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 orders before s2, 1 if it orders after, 0 if both are
 *         equal when case is ignored
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	for (;;)
	{
		// Convert through unsigned char first: handing tolower() a negative
		// value (a plain char >= 0x80 where char is signed) is undefined, and
		// comparing signed chars would order such bytes before ASCII.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));

		if (c1 != c2)
			return (c1 < c2) ? -1 : 1;
		if (c1 == 0)		// both strings ended at the same position
			return 0;
	}
}
63 #else
64 #ifdef NO_STRCASECMP
65 ANTLR_C_USING(stricmp)
66 #else
67 ANTLR_C_USING(strcasecmp)
68 #endif
69 #endif
70
71 /** Functor for the literals map
72 */
73 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
74 private:
75 const CharScanner* scanner;
76 public:
77 #ifdef NO_TEMPLATE_PARTS
CharScannerLiteralsLess()78 CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
79 #endif
CharScannerLiteralsLess(const CharScanner * theScanner)80 CharScannerLiteralsLess(const CharScanner* theScanner)
81 : scanner(theScanner)
82 {
83 }
84 bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
85 // defaults are good enough..
86 // CharScannerLiteralsLess(const CharScannerLiteralsLess&);
87 // CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
88 };
89
90 /** Superclass of generated lexers
91 */
92 class ANTLR_API CharScanner : public TokenStream {
93 protected:
94 typedef RefToken (*factory_type)();
95 public:
96 CharScanner(InputBuffer& cb, bool case_sensitive );
97 CharScanner(InputBuffer* cb, bool case_sensitive );
98 CharScanner(const LexerSharedInputState& state, bool case_sensitive );
99
~CharScanner()100 virtual ~CharScanner()
101 {
102 }
103
104 virtual int LA(unsigned int i);
105
append(char c)106 virtual void append(char c)
107 {
108 if (saveConsumedInput)
109 {
110 size_t l = text.length();
111
112 if ((l%256) == 0)
113 text.reserve(l+256);
114
115 text.replace(l,0,&c,1);
116 }
117 }
118
append(const ANTLR_USE_NAMESPACE (std)string & s)119 virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
120 {
121 if( saveConsumedInput )
122 text += s;
123 }
124
commit()125 virtual void commit()
126 {
127 inputState->getInput().commit();
128 }
129
130 /** called by the generated lexer to do error recovery, override to
131 * customize the behaviour.
132 */
recover(const RecognitionException & ex,const BitSet & tokenSet)133 virtual void recover(const RecognitionException& ex, const BitSet& tokenSet)
134 {
135 consume();
136 consumeUntil(tokenSet);
137 }
138
consume()139 virtual void consume()
140 {
141 if (inputState->guessing == 0)
142 {
143 int c = LA(1);
144 if (caseSensitive)
145 {
146 append(c);
147 }
148 else
149 {
150 // use input.LA(), not LA(), to get original case
151 // CharScanner.LA() would toLower it.
152 append(inputState->getInput().LA(1));
153 }
154
155 // RK: in a sense I don't like this automatic handling.
156 if (c == '\t')
157 tab();
158 else
159 inputState->column++;
160 }
161 inputState->getInput().consume();
162 }
163
164 /** Consume chars until one matches the given char */
consumeUntil(int c)165 virtual void consumeUntil(int c)
166 {
167 for(;;)
168 {
169 int la_1 = LA(1);
170 if( la_1 == EOF_CHAR || la_1 == c )
171 break;
172 consume();
173 }
174 }
175
176 /** Consume chars until one matches the given set */
consumeUntil(const BitSet & set)177 virtual void consumeUntil(const BitSet& set)
178 {
179 for(;;)
180 {
181 int la_1 = LA(1);
182 if( la_1 == EOF_CHAR || set.member(la_1) )
183 break;
184 consume();
185 }
186 }
187
188 /// Mark the current position and return a id for it
mark()189 virtual unsigned int mark()
190 {
191 return inputState->getInput().mark();
192 }
193 /// Rewind the scanner to a previously marked position
rewind(unsigned int pos)194 virtual void rewind(unsigned int pos)
195 {
196 inputState->getInput().rewind(pos);
197 }
198
199 /// See if input contains character 'c' throw MismatchedCharException if not
match(int c)200 virtual void match(int c)
201 {
202 int la_1 = LA(1);
203 if ( la_1 != c )
204 throw MismatchedCharException(la_1, c, false, this);
205 consume();
206 }
207
208 /** See if input contains element from bitset b
209 * throw MismatchedCharException if not
210 */
match(const BitSet & b)211 virtual void match(const BitSet& b)
212 {
213 int la_1 = LA(1);
214
215 if ( !b.member(la_1) )
216 throw MismatchedCharException( la_1, b, false, this );
217 consume();
218 }
219
220 /** See if input contains string 's' throw MismatchedCharException if not
221 * @note the string cannot match EOF
222 */
match(const char * s)223 virtual void match( const char* s )
224 {
225 while( *s != '\0' )
226 {
227 // the & 0xFF is here to prevent sign extension lateron
228 int la_1 = LA(1), c = (*s++ & 0xFF);
229
230 if ( la_1 != c )
231 throw MismatchedCharException(la_1, c, false, this);
232
233 consume();
234 }
235 }
236 /** See if input contains string 's' throw MismatchedCharException if not
237 * @note the string cannot match EOF
238 */
match(const ANTLR_USE_NAMESPACE (std)string & s)239 virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
240 {
241 size_t len = s.length();
242
243 for (size_t i = 0; i < len; i++)
244 {
245 // the & 0xFF is here to prevent sign extension lateron
246 int la_1 = LA(1), c = (s[i] & 0xFF);
247
248 if ( la_1 != c )
249 throw MismatchedCharException(la_1, c, false, this);
250
251 consume();
252 }
253 }
254 /** See if input does not contain character 'c'
255 * throw MismatchedCharException if not
256 */
matchNot(int c)257 virtual void matchNot(int c)
258 {
259 int la_1 = LA(1);
260
261 if ( la_1 == c )
262 throw MismatchedCharException(la_1, c, true, this);
263
264 consume();
265 }
266 /** See if input contains character in range c1-c2
267 * throw MismatchedCharException if not
268 */
matchRange(int c1,int c2)269 virtual void matchRange(int c1, int c2)
270 {
271 int la_1 = LA(1);
272
273 if ( la_1 < c1 || la_1 > c2 )
274 throw MismatchedCharException(la_1, c1, c2, false, this);
275
276 consume();
277 }
278
getCaseSensitive() const279 virtual bool getCaseSensitive() const
280 {
281 return caseSensitive;
282 }
283
setCaseSensitive(bool t)284 virtual void setCaseSensitive(bool t)
285 {
286 caseSensitive = t;
287 }
288
289 virtual bool getCaseSensitiveLiterals() const=0;
290
291 /// Get the line the scanner currently is in (starts at 1)
getLine() const292 virtual int getLine() const
293 {
294 return inputState->line;
295 }
296
297 /// set the line number
setLine(int l)298 virtual void setLine(int l)
299 {
300 inputState->line = l;
301 }
302
303 /// Get the column the scanner currently is in (starts at 1)
getColumn() const304 virtual int getColumn() const
305 {
306 return inputState->column;
307 }
308 /// set the column number
setColumn(int c)309 virtual void setColumn(int c)
310 {
311 inputState->column = c;
312 }
313
314 /// get the filename for the file currently used
ANTLR_USE_NAMESPACE(std)315 virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
316 {
317 return inputState->filename;
318 }
319 /// Set the filename the scanner is using (used in error messages)
setFilename(const ANTLR_USE_NAMESPACE (std)string & f)320 virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
321 {
322 inputState->filename = f;
323 }
324
getCommitToPath() const325 virtual bool getCommitToPath() const
326 {
327 return commitToPath;
328 }
329
setCommitToPath(bool commit)330 virtual void setCommitToPath(bool commit)
331 {
332 commitToPath = commit;
333 }
334
335 /** return a copy of the current text buffer */
ANTLR_USE_NAMESPACE(std)336 virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
337 {
338 return text;
339 }
340
setText(const ANTLR_USE_NAMESPACE (std)string & s)341 virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
342 {
343 text = s;
344 }
345
resetText()346 virtual void resetText()
347 {
348 text = "";
349 inputState->tokenStartColumn = inputState->column;
350 inputState->tokenStartLine = inputState->line;
351 }
352
getTokenObject() const353 virtual RefToken getTokenObject() const
354 {
355 return _returnToken;
356 }
357
358 /** Used to keep track of line breaks, needs to be called from
359 * within generated lexers when a \n \r is encountered.
360 */
newline()361 virtual void newline()
362 {
363 ++inputState->line;
364 inputState->column = 1;
365 }
366
367 /** Advance the current column number by an appropriate amount according
368 * to the tabsize. This method needs to be explicitly called from the
369 * lexer rules encountering tabs.
370 */
tab()371 virtual void tab()
372 {
373 int c = getColumn();
374 int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop
375 setColumn( nc );
376 }
377 /// set the tabsize. Returns the old tabsize
setTabsize(int size)378 int setTabsize( int size )
379 {
380 int oldsize = tabsize;
381 tabsize = size;
382 return oldsize;
383 }
384 /// Return the tabsize used by the scanner
getTabSize() const385 int getTabSize() const
386 {
387 return tabsize;
388 }
389
390 /** Report exception errors caught in nextToken() */
391 virtual void reportError(const RecognitionException& e);
392
393 /** Parser error-reporting function can be overridden in subclass */
394 virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
395
396 /** Parser warning-reporting function can be overridden in subclass */
397 virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
398
getInputBuffer()399 virtual InputBuffer& getInputBuffer()
400 {
401 return inputState->getInput();
402 }
403
getInputState()404 virtual LexerSharedInputState getInputState()
405 {
406 return inputState;
407 }
408
409 /** set the input state for the lexer.
410 * @note state is a reference counted object, hence no reference */
setInputState(LexerSharedInputState state)411 virtual void setInputState(LexerSharedInputState state)
412 {
413 inputState = state;
414 }
415
416 /// Set the factory for created tokens
setTokenObjectFactory(factory_type factory)417 virtual void setTokenObjectFactory(factory_type factory)
418 {
419 tokenFactory = factory;
420 }
421
422 /** Test the token text against the literals table
423 * Override this method to perform a different literals test
424 */
testLiteralsTable(int ttype) const425 virtual int testLiteralsTable(int ttype) const
426 {
427 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
428 if (i != literals.end())
429 ttype = (*i).second;
430 return ttype;
431 }
432
433 /** Test the text passed in against the literals table
434 * Override this method to perform a different literals test
435 * This is used primarily when you want to test a portion of
436 * a token
437 */
testLiteralsTable(const ANTLR_USE_NAMESPACE (std)string & txt,int ttype) const438 virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
439 {
440 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
441 if (i != literals.end())
442 ttype = (*i).second;
443 return ttype;
444 }
445
446 /// Override this method to get more specific case handling
toLower(int c) const447 virtual int toLower(int c) const
448 {
449 // test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
450 // also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
451 // this one is more structural. Maybe make this configurable.
452 return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
453 }
454
455 /** This method is called by YourLexer::nextToken() when the lexer has
456 * hit EOF condition. EOF is NOT a character.
457 * This method is not called if EOF is reached during
458 * syntactic predicate evaluation or during evaluation
459 * of normal lexical rules, which presumably would be
460 * an IOException. This traps the "normal" EOF condition.
461 *
462 * uponEOF() is called after the complete evaluation of
463 * the previous token and only if your parser asks
464 * for another token beyond that last non-EOF token.
465 *
466 * You might want to throw token or char stream exceptions
467 * like: "Heh, premature eof" or a retry stream exception
468 * ("I found the end of this file, go back to referencing file").
469 */
uponEOF()470 virtual void uponEOF()
471 {
472 }
473
474 /// Methods used to change tracing behavior
475 virtual void traceIndent();
476 virtual void traceIn(const char* rname);
477 virtual void traceOut(const char* rname);
478
479 #ifndef NO_STATIC_CONSTS
480 static const int EOF_CHAR = EOF;
481 #else
482 enum {
483 EOF_CHAR = EOF
484 };
485 #endif
486 protected:
487 ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
488 /// flag indicating wether consume saves characters
489 bool saveConsumedInput;
490 factory_type tokenFactory; ///< Factory for tokens
491 bool caseSensitive; ///< Is this lexer case sensitive
492 ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
493
494 RefToken _returnToken; ///< used to return tokens w/o using return val
495
496 /// Input state, gives access to input stream, shared among different lexers
497 LexerSharedInputState inputState;
498
499 /** Used during filter mode to indicate that path is desired.
500 * A subsequent scan error will report an error as usual
501 * if acceptPath=true;
502 */
503 bool commitToPath;
504
505 int tabsize; ///< tab size the scanner uses.
506
507 /// Create a new RefToken of type t
makeToken(int t)508 virtual RefToken makeToken(int t)
509 {
510 RefToken tok = tokenFactory();
511 tok->setType(t);
512 tok->setColumn(inputState->tokenStartColumn);
513 tok->setLine(inputState->tokenStartLine);
514 return tok;
515 }
516
517 /** Tracer class, used when -traceLexer is passed to antlr
518 */
519 class Tracer {
520 private:
521 CharScanner* parser;
522 const char* text;
523
524 Tracer(const Tracer& other); // undefined
525 Tracer& operator=(const Tracer& other); // undefined
526 public:
Tracer(CharScanner * p,const char * t)527 Tracer( CharScanner* p,const char* t )
528 : parser(p), text(t)
529 {
530 parser->traceIn(text);
531 }
~Tracer()532 ~Tracer()
533 {
534 parser->traceOut(text);
535 }
536 };
537
538 int traceDepth;
539 private:
540 CharScanner( const CharScanner& other ); // undefined
541 CharScanner& operator=( const CharScanner& other ); // undefined
542
543 #ifndef NO_STATIC_CONSTS
544 static const int NO_CHAR = 0;
545 #else
546 enum {
547 NO_CHAR = 0
548 };
549 #endif
550 };
551
LA(unsigned int i)552 inline int CharScanner::LA(unsigned int i)
553 {
554 int c = inputState->getInput().LA(i);
555
556 if ( caseSensitive )
557 return c;
558 else
559 return toLower(c); // VC 6 tolower bug caught in toLower.
560 }
561
operator ()(const ANTLR_USE_NAMESPACE (std)string & x,const ANTLR_USE_NAMESPACE (std)string & y) const562 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
563 {
564 if (scanner->getCaseSensitiveLiterals())
565 return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
566 else
567 {
568 #ifdef NO_STRCASECMP
569 return (stricmp(x.c_str(),y.c_str())<0);
570 #else
571 return (strcasecmp(x.c_str(),y.c_str())<0);
572 #endif
573 }
574 }
575
576 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
577 }
578 #endif
579
580 #endif //INC_CharScanner_hpp__
581