1 #ifndef INC_CharScanner_hpp__
2 #define INC_CharScanner_hpp__
3 
4 /* ANTLR Translator Generator
5  * Project led by Terence Parr at http://www.jGuru.com
6  * Software rights: http://www.antlr.org/license.html
7  *
8  * $Id: //depot/code/org.antlr/release/antlr-2.7.7/lib/cpp/antlr/CharScanner.hpp#2 $
9  */
10 
11 #include <antlr/config.hpp>
12 
13 #include <map>
14 
15 #ifdef HAS_NOT_CCTYPE_H
16 #include <ctype.h>
17 #else
18 #include <cctype>
19 #endif
20 
21 #include <cstdio>    // for EOF
22 extern "C" {
23 #include <strings.h> // for strcasecmp
24 }
25 
26 #if ( _MSC_VER == 1200 )
27 // VC6 seems to need this
28 // note that this is not a standard C++ include file.
29 # include <stdio.h>
30 #endif
31 
32 #include <antlr/TokenStream.hpp>
33 #include <antlr/RecognitionException.hpp>
34 #include <antlr/SemanticException.hpp>
35 #include <antlr/MismatchedCharException.hpp>
36 #include <antlr/InputBuffer.hpp>
37 #include <antlr/BitSet.hpp>
38 #include <antlr/LexerSharedInputState.hpp>
39 #include <strings.h>
40 
41 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
42 namespace antlr {
43 #endif
44 
45 class ANTLR_API CharScanner;
46 
ANTLR_C_USING(tolower)47 ANTLR_C_USING(tolower)
48 
49 #ifdef ANTLR_REALLY_NO_STRCASECMP
// Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
// on the mac has neither...
/** Fallback case-insensitive comparison of two NUL-terminated strings.
 *
 * Both strings are walked in lock step and compared after folding each
 * character with tolower().
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 orders before s2, 1 if it orders after, 0 if both
 *         strings are equal ignoring case
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	for (;;)
	{
		// Go through unsigned char: handing a negative char value to
		// tolower() is undefined behaviour, and comparing the folded values
		// as int makes bytes >= 0x80 order after ASCII.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));

		if (c1 != c2)
			return (c1 < c2) ? -1 : 1;
		if (c1 == 0)	// both strings ended at the same position
			return 0;
	}
}
63 #else
64 #ifdef NO_STRCASECMP
65 ANTLR_C_USING(stricmp)
66 #else
67 ANTLR_C_USING(strcasecmp)
68 #endif
69 #endif
70 
71 /** Functor for the literals map
72  */
73 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
74 private:
75 	const CharScanner* scanner;
76 public:
77 #ifdef NO_TEMPLATE_PARTS
CharScannerLiteralsLess()78 	CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
79 #endif
CharScannerLiteralsLess(const CharScanner * theScanner)80 	CharScannerLiteralsLess(const CharScanner* theScanner)
81 	: scanner(theScanner)
82 	{
83 	}
84 	bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
85 // defaults are good enough..
86 	//	CharScannerLiteralsLess(const CharScannerLiteralsLess&);
87 	//	CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
88 };
89 
90 /** Superclass of generated lexers
91  */
92 class ANTLR_API CharScanner : public TokenStream {
93 protected:
94 	typedef RefToken (*factory_type)();
95 public:
96 	CharScanner(InputBuffer& cb, bool case_sensitive );
97 	CharScanner(InputBuffer* cb, bool case_sensitive );
98 	CharScanner(const LexerSharedInputState& state, bool case_sensitive );
99 
~CharScanner()100 	virtual ~CharScanner()
101 	{
102 	}
103 
104 	virtual int LA(unsigned int i);
105 
append(char c)106 	virtual void append(char c)
107 	{
108 		if (saveConsumedInput)
109 		{
110 			size_t l = text.length();
111 
112 			if ((l%256) == 0)
113 				text.reserve(l+256);
114 
115 			text.replace(l,0,&c,1);
116 		}
117 	}
118 
append(const ANTLR_USE_NAMESPACE (std)string & s)119 	virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
120 	{
121 		if( saveConsumedInput )
122 			text += s;
123 	}
124 
commit()125 	virtual void commit()
126 	{
127 		inputState->getInput().commit();
128 	}
129 
130 	/** called by the generated lexer to do error recovery, override to
131 	 * customize the behaviour.
132 	 */
recover(const RecognitionException & ex,const BitSet & tokenSet)133 	virtual void recover(const RecognitionException& ex, const BitSet& tokenSet)
134 	{
135 		consume();
136 		consumeUntil(tokenSet);
137 	}
138 
consume()139 	virtual void consume()
140 	{
141 		if (inputState->guessing == 0)
142 		{
143 			int c = LA(1);
144 			if (caseSensitive)
145 			{
146 				append(c);
147 			}
148 			else
149 			{
150 				// use input.LA(), not LA(), to get original case
151 				// CharScanner.LA() would toLower it.
152 				append(inputState->getInput().LA(1));
153 			}
154 
155 			// RK: in a sense I don't like this automatic handling.
156 			if (c == '\t')
157 				tab();
158 			else
159 				inputState->column++;
160 		}
161 		inputState->getInput().consume();
162 	}
163 
164 	/** Consume chars until one matches the given char */
consumeUntil(int c)165 	virtual void consumeUntil(int c)
166 	{
167 		for(;;)
168 		{
169 			int la_1 = LA(1);
170 			if( la_1 == EOF_CHAR || la_1 == c )
171 				break;
172 			consume();
173 		}
174 	}
175 
176 	/** Consume chars until one matches the given set */
consumeUntil(const BitSet & set)177 	virtual void consumeUntil(const BitSet& set)
178 	{
179 		for(;;)
180 		{
181 			int la_1 = LA(1);
182 			if( la_1 == EOF_CHAR || set.member(la_1) )
183 				break;
184 			consume();
185 		}
186 	}
187 
188 	/// Mark the current position and return a id for it
mark()189 	virtual unsigned int mark()
190 	{
191 		return inputState->getInput().mark();
192 	}
193 	/// Rewind the scanner to a previously marked position
rewind(unsigned int pos)194 	virtual void rewind(unsigned int pos)
195 	{
196 		inputState->getInput().rewind(pos);
197 	}
198 
199 	/// See if input contains character 'c' throw MismatchedCharException if not
match(int c)200 	virtual void match(int c)
201 	{
202 		int la_1 = LA(1);
203 		if ( la_1 != c )
204 			throw MismatchedCharException(la_1, c, false, this);
205 		consume();
206 	}
207 
208 	/** See if input contains element from bitset b
209 	 * throw MismatchedCharException if not
210 	 */
match(const BitSet & b)211 	virtual void match(const BitSet& b)
212 	{
213 		int la_1 = LA(1);
214 
215 		if ( !b.member(la_1) )
216 			throw MismatchedCharException( la_1, b, false, this );
217 		consume();
218 	}
219 
220 	/** See if input contains string 's' throw MismatchedCharException if not
221 	 * @note the string cannot match EOF
222 	 */
match(const char * s)223 	virtual void match( const char* s )
224 	{
225 		while( *s != '\0' )
226 		{
227 			// the & 0xFF is here to prevent sign extension lateron
228 			int la_1 = LA(1), c = (*s++ & 0xFF);
229 
230 			if ( la_1 != c )
231 				throw MismatchedCharException(la_1, c, false, this);
232 
233 			consume();
234 		}
235 	}
236 	/** See if input contains string 's' throw MismatchedCharException if not
237 	 * @note the string cannot match EOF
238 	 */
match(const ANTLR_USE_NAMESPACE (std)string & s)239 	virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
240 	{
241 		size_t len = s.length();
242 
243 		for (size_t i = 0; i < len; i++)
244 		{
245 			// the & 0xFF is here to prevent sign extension lateron
246 			int la_1 = LA(1), c = (s[i] & 0xFF);
247 
248 			if ( la_1 != c )
249 				throw MismatchedCharException(la_1, c, false, this);
250 
251 			consume();
252 		}
253 	}
254 	/** See if input does not contain character 'c'
255 	 * throw MismatchedCharException if not
256 	 */
matchNot(int c)257 	virtual void matchNot(int c)
258 	{
259 		int la_1 = LA(1);
260 
261 		if ( la_1 == c )
262 			throw MismatchedCharException(la_1, c, true, this);
263 
264 		consume();
265 	}
266 	/** See if input contains character in range c1-c2
267 	 * throw MismatchedCharException if not
268 	 */
matchRange(int c1,int c2)269 	virtual void matchRange(int c1, int c2)
270 	{
271 		int la_1 = LA(1);
272 
273 		if ( la_1 < c1 || la_1 > c2 )
274 			throw MismatchedCharException(la_1, c1, c2, false, this);
275 
276 		consume();
277 	}
278 
getCaseSensitive() const279 	virtual bool getCaseSensitive() const
280 	{
281 		return caseSensitive;
282 	}
283 
setCaseSensitive(bool t)284 	virtual void setCaseSensitive(bool t)
285 	{
286 		caseSensitive = t;
287 	}
288 
289 	virtual bool getCaseSensitiveLiterals() const=0;
290 
291 	/// Get the line the scanner currently is in (starts at 1)
getLine() const292 	virtual int getLine() const
293 	{
294 		return inputState->line;
295 	}
296 
297 	/// set the line number
setLine(int l)298 	virtual void setLine(int l)
299 	{
300 		inputState->line = l;
301 	}
302 
303 	/// Get the column the scanner currently is in (starts at 1)
getColumn() const304 	virtual int getColumn() const
305 	{
306 		return inputState->column;
307 	}
308 	/// set the column number
setColumn(int c)309 	virtual void setColumn(int c)
310 	{
311 		inputState->column = c;
312 	}
313 
314 	/// get the filename for the file currently used
ANTLR_USE_NAMESPACE(std)315 	virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
316 	{
317 		return inputState->filename;
318 	}
319 	/// Set the filename the scanner is using (used in error messages)
setFilename(const ANTLR_USE_NAMESPACE (std)string & f)320 	virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
321 	{
322 		inputState->filename = f;
323 	}
324 
getCommitToPath() const325 	virtual bool getCommitToPath() const
326 	{
327 		return commitToPath;
328 	}
329 
setCommitToPath(bool commit)330 	virtual void setCommitToPath(bool commit)
331 	{
332 		commitToPath = commit;
333 	}
334 
335 	/** return a copy of the current text buffer */
ANTLR_USE_NAMESPACE(std)336 	virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
337 	{
338 		return text;
339 	}
340 
setText(const ANTLR_USE_NAMESPACE (std)string & s)341 	virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
342 	{
343 		text = s;
344 	}
345 
resetText()346 	virtual void resetText()
347 	{
348 		text = "";
349 		inputState->tokenStartColumn = inputState->column;
350 		inputState->tokenStartLine = inputState->line;
351 	}
352 
getTokenObject() const353 	virtual RefToken getTokenObject() const
354 	{
355 		return _returnToken;
356 	}
357 
358 	/** Used to keep track of line breaks, needs to be called from
359 	 * within generated lexers when a \n \r is encountered.
360 	 */
newline()361 	virtual void newline()
362 	{
363 		++inputState->line;
364 		inputState->column = 1;
365 	}
366 
367 	/** Advance the current column number by an appropriate amount according
368 	 * to the tabsize. This method needs to be explicitly called from the
369 	 * lexer rules encountering tabs.
370 	 */
tab()371 	virtual void tab()
372 	{
373 		int c = getColumn();
374 		int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop
375 		setColumn( nc );
376 	}
377 	/// set the tabsize. Returns the old tabsize
setTabsize(int size)378 	int setTabsize( int size )
379 	{
380 		int oldsize = tabsize;
381 		tabsize = size;
382 		return oldsize;
383 	}
384 	/// Return the tabsize used by the scanner
getTabSize() const385 	int getTabSize() const
386 	{
387 		return tabsize;
388 	}
389 
390 	/** Report exception errors caught in nextToken() */
391 	virtual void reportError(const RecognitionException& e);
392 
393 	/** Parser error-reporting function can be overridden in subclass */
394 	virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
395 
396 	/** Parser warning-reporting function can be overridden in subclass */
397 	virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
398 
getInputBuffer()399 	virtual InputBuffer& getInputBuffer()
400 	{
401 		return inputState->getInput();
402 	}
403 
getInputState()404 	virtual LexerSharedInputState getInputState()
405 	{
406 		return inputState;
407 	}
408 
409 	/** set the input state for the lexer.
410 	 * @note state is a reference counted object, hence no reference */
setInputState(LexerSharedInputState state)411 	virtual void setInputState(LexerSharedInputState state)
412 	{
413 		inputState = state;
414 	}
415 
416 	/// Set the factory for created tokens
setTokenObjectFactory(factory_type factory)417 	virtual void setTokenObjectFactory(factory_type factory)
418 	{
419 		tokenFactory = factory;
420 	}
421 
422 	/** Test the token text against the literals table
423 	 * Override this method to perform a different literals test
424 	 */
testLiteralsTable(int ttype) const425 	virtual int testLiteralsTable(int ttype) const
426 	{
427 		ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
428 		if (i != literals.end())
429 			ttype = (*i).second;
430 		return ttype;
431 	}
432 
433 	/** Test the text passed in against the literals table
434 	 * Override this method to perform a different literals test
435 	 * This is used primarily when you want to test a portion of
436 	 * a token
437 	 */
testLiteralsTable(const ANTLR_USE_NAMESPACE (std)string & txt,int ttype) const438 	virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
439 	{
440 		ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
441 		if (i != literals.end())
442 			ttype = (*i).second;
443 		return ttype;
444 	}
445 
446 	/// Override this method to get more specific case handling
toLower(int c) const447 	virtual int toLower(int c) const
448 	{
449 		// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
450 		// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
451 		// this one is more structural. Maybe make this configurable.
452 		return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
453 	}
454 
455 	/** This method is called by YourLexer::nextToken() when the lexer has
456 	 *  hit EOF condition.  EOF is NOT a character.
457 	 *  This method is not called if EOF is reached during
458 	 *  syntactic predicate evaluation or during evaluation
459 	 *  of normal lexical rules, which presumably would be
460 	 *  an IOException.  This traps the "normal" EOF condition.
461 	 *
462 	 *  uponEOF() is called after the complete evaluation of
463 	 *  the previous token and only if your parser asks
464 	 *  for another token beyond that last non-EOF token.
465 	 *
466 	 *  You might want to throw token or char stream exceptions
467 	 *  like: "Heh, premature eof" or a retry stream exception
468 	 *  ("I found the end of this file, go back to referencing file").
469 	 */
uponEOF()470 	virtual void uponEOF()
471 	{
472 	}
473 
474 	/// Methods used to change tracing behavior
475 	virtual void traceIndent();
476 	virtual void traceIn(const char* rname);
477 	virtual void traceOut(const char* rname);
478 
479 #ifndef NO_STATIC_CONSTS
480 	static const int EOF_CHAR = EOF;
481 #else
482 	enum {
483 		EOF_CHAR = EOF
484 	};
485 #endif
486 protected:
487 	ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
488  	/// flag indicating wether consume saves characters
489 	bool saveConsumedInput;
490 	factory_type tokenFactory;				///< Factory for tokens
491 	bool caseSensitive; 						///< Is this lexer case sensitive
492 	ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
493 
494 	RefToken _returnToken;		///< used to return tokens w/o using return val
495 
496 	/// Input state, gives access to input stream, shared among different lexers
497 	LexerSharedInputState inputState;
498 
499 	/** Used during filter mode to indicate that path is desired.
500 	 * A subsequent scan error will report an error as usual
501 	 * if acceptPath=true;
502 	 */
503 	bool commitToPath;
504 
505 	int tabsize; 	///< tab size the scanner uses.
506 
507 	/// Create a new RefToken of type t
makeToken(int t)508 	virtual RefToken makeToken(int t)
509 	{
510 		RefToken tok = tokenFactory();
511 		tok->setType(t);
512 		tok->setColumn(inputState->tokenStartColumn);
513 		tok->setLine(inputState->tokenStartLine);
514 		return tok;
515 	}
516 
517 	/** Tracer class, used when -traceLexer is passed to antlr
518 	 */
519 	class Tracer {
520 	private:
521 		CharScanner* parser;
522 		const char* text;
523 
524 		Tracer(const Tracer& other); 					// undefined
525 		Tracer& operator=(const Tracer& other); 	// undefined
526 	public:
Tracer(CharScanner * p,const char * t)527 		Tracer( CharScanner* p,const char* t )
528 		: parser(p), text(t)
529 		{
530 			parser->traceIn(text);
531 		}
~Tracer()532 		~Tracer()
533 		{
534 			parser->traceOut(text);
535 		}
536 	};
537 
538 	int traceDepth;
539 private:
540 	CharScanner( const CharScanner& other ); 			  		// undefined
541 	CharScanner& operator=( const CharScanner& other );	// undefined
542 
543 #ifndef NO_STATIC_CONSTS
544 	static const int NO_CHAR = 0;
545 #else
546 	enum {
547 		NO_CHAR = 0
548 	};
549 #endif
550 };
551 
LA(unsigned int i)552 inline int CharScanner::LA(unsigned int i)
553 {
554 	int c = inputState->getInput().LA(i);
555 
556 	if ( caseSensitive )
557 		return c;
558 	else
559 		return toLower(c);	// VC 6 tolower bug caught in toLower.
560 }
561 
operator ()(const ANTLR_USE_NAMESPACE (std)string & x,const ANTLR_USE_NAMESPACE (std)string & y) const562 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
563 {
564 	if (scanner->getCaseSensitiveLiterals())
565 		return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
566 	else
567 	{
568 #ifdef NO_STRCASECMP
569 		return (stricmp(x.c_str(),y.c_str())<0);
570 #else
571 		return (strcasecmp(x.c_str(),y.c_str())<0);
572 #endif
573 	}
574 }
575 
576 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
577 }
578 #endif
579 
580 #endif //INC_CharScanner_hpp__
581