1 #ifndef INC_CharScanner_hpp__
2 #define INC_CharScanner_hpp__
3 
4 /* ANTLR Translator Generator
5  * Project led by Terence Parr at http://www.jGuru.com
6  * Software rights: http://www.antlr.org/license.html
7  *
8  * $Id: CharScanner.hpp,v 1.3 2009-10-24 16:20:01 medericboquien Exp $
9  */
10 
11 #include <antlr/config.hpp>
12 
13 // g++-4.3 needs this
14 #include <cstring>
15 #include <cstdlib>
16 
17 // g++-4.4 needs this
18 #include <cstdio>
19 
20 #include <map>
21 
22 #ifdef HAS_NOT_CCTYPE_H
23 #include <ctype.h>
24 #else
25 #include <cctype>
26 #endif
27 
28 #if ( _MSC_VER == 1200 )
29 // VC6 seems to need this
30 // note that this is not a standard C++ include file.
31 # include <stdio.h>
32 #endif
33 
34 #include <antlr/TokenStream.hpp>
35 #include <antlr/RecognitionException.hpp>
36 #include <antlr/SemanticException.hpp>
37 #include <antlr/MismatchedCharException.hpp>
38 #include <antlr/InputBuffer.hpp>
39 #include <antlr/BitSet.hpp>
40 #include <antlr/LexerSharedInputState.hpp>
41 
42 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
43 namespace antlr {
44 #endif
45 
46 class ANTLR_API CharScanner;
47 
ANTLR_C_USING(tolower)48 ANTLR_C_USING(tolower)
49 
50 #ifdef ANTLR_REALLY_NO_STRCASECMP
51 // Apparently, neither strcasecmp nor stricmp is standard, and Codewarrior
52 // on the mac has neither...
/** Fallback case-insensitive C-string comparison for platforms that provide
 * neither strcasecmp nor stricmp.
 *
 * Both strings are compared byte by byte after folding each byte with
 * tolower().
 *
 * @param s1 first NUL-terminated string
 * @param s2 second NUL-terminated string
 * @return -1 if s1 sorts before s2, 1 if it sorts after, 0 if they are equal
 *         ignoring case
 */
inline int strcasecmp(const char *s1, const char *s2)
{
	while (true)
	{
		// Go through unsigned char: handing tolower() a negative value other
		// than EOF is undefined, and comparing plain (possibly signed) chars
		// would order bytes >= 0x80 before ASCII.
		const int c1 = tolower(static_cast<unsigned char>(*s1++));
		const int c2 = tolower(static_cast<unsigned char>(*s2++));
		if (c1 < c2) return -1;
		if (c1 > c2) return 1;
		if (c1 == 0) return 0;	// both strings ended at the same position
	}
}
64 #else
65 #ifdef NO_STRCASECMP
66 ANTLR_C_USING(stricmp)
67 #else
68 ANTLR_C_USING(strcasecmp)
69 #endif
70 #endif
71 
72 /** Functor for the literals map
73  */
74 class ANTLR_API CharScannerLiteralsLess : public ANTLR_USE_NAMESPACE(std)binary_function<ANTLR_USE_NAMESPACE(std)string,ANTLR_USE_NAMESPACE(std)string,bool> {
75 private:
76 	const CharScanner* scanner;
77 public:
78 #ifdef NO_TEMPLATE_PARTS
CharScannerLiteralsLess()79 	CharScannerLiteralsLess() {} // not really used, definition to appease MSVC
80 #endif
CharScannerLiteralsLess(const CharScanner * theScanner)81 	CharScannerLiteralsLess(const CharScanner* theScanner)
82 	: scanner(theScanner)
83 	{
84 	}
85 	bool operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const;
86 // defaults are good enough..
87 	//	CharScannerLiteralsLess(const CharScannerLiteralsLess&);
88 	//	CharScannerLiteralsLess& operator=(const CharScannerLiteralsLess&);
89 };
90 
91 /** Superclass of generated lexers
92  */
93 class ANTLR_API CharScanner : public TokenStream {
94 protected:
95 	typedef RefToken (*factory_type)();
96 public:
97 	CharScanner(InputBuffer& cb, bool case_sensitive );
98 	CharScanner(InputBuffer* cb, bool case_sensitive );
99 	CharScanner(const LexerSharedInputState& state, bool case_sensitive );
100 
~CharScanner()101 	virtual ~CharScanner()
102 	{
103 	}
104 
105 	virtual int LA(unsigned int i);
106 
append(char c)107 	virtual void append(char c)
108 	{
109 		if (saveConsumedInput) {
110 			int l = text.length();
111 			if ((l%256) == 0)
112 				text.reserve(l+256);
113 			text.replace(l,0,&c,1);
114 		}
115 	}
116 
append(const ANTLR_USE_NAMESPACE (std)string & s)117 	virtual void append(const ANTLR_USE_NAMESPACE(std)string& s)
118 	{
119 		if (saveConsumedInput)
120 			text+=s;
121 	}
122 
commit()123 	virtual void commit()
124 	{
125 		inputState->getInput().commit();
126 	}
127 
128 	virtual void consume();
129 
130 	/** Consume chars until one matches the given char */
consumeUntil(int c)131 	virtual void consumeUntil(int c)
132 	{
133 		for(;;)
134 		{
135 			int la_1 = LA(1);
136 			if( la_1 == EOF_CHAR || la_1 == c )
137 				break;
138 			consume();
139 		}
140 	}
141 
142 	/** Consume chars until one matches the given set */
consumeUntil(const BitSet & set)143 	virtual void consumeUntil(const BitSet& set)
144 	{
145 		for(;;)
146 		{
147 			int la_1 = LA(1);
148 			if( la_1 == EOF_CHAR || set.member(la_1) )
149 				break;
150 			consume();
151 		}
152 	}
153 
154 	/// Mark the current position and return a id for it
mark()155 	virtual unsigned int mark()
156 	{
157 		return inputState->getInput().mark();
158 	}
159 	/// Rewind the scanner to a previously marked position
rewind(unsigned int pos)160 	virtual void rewind(unsigned int pos)
161 	{
162 		inputState->getInput().rewind(pos);
163 	}
164 
165 	/// See if input contains character 'c' throw MismatchedCharException if not
match(int c)166 	virtual void match(int c)
167 	{
168 		int la_1 = LA(1);
169 		if ( la_1 != c )
170 			throw MismatchedCharException(la_1, c, false, this);
171 		consume();
172 	}
173 
174 	/** See if input contains element from bitset b
175 	 * throw MismatchedCharException if not
176 	 */
match(const BitSet & b)177 	virtual void match(const BitSet& b)
178 	{
179 		int la_1 = LA(1);
180 
181 		if ( !b.member(la_1) )
182 			throw MismatchedCharException( la_1, b, false, this );
183 		consume();
184 	}
185 
186 	/** See if input contains string 's' throw MismatchedCharException if not
187 	 * @note the string cannot match EOF
188 	 */
match(const char * s)189 	virtual void match( const char* s )
190 	{
191 		while( *s != '\0' )
192 		{
193 			// the & 0xFF is here to prevent sign extension lateron
194 			int la_1 = LA(1), c = (*s++ & 0xFF);
195 
196 			if ( la_1 != c )
197 				throw MismatchedCharException(la_1, c, false, this);
198 
199 			consume();
200 		}
201 	}
202 	/** See if input contains string 's' throw MismatchedCharException if not
203 	 * @note the string cannot match EOF
204 	 */
match(const ANTLR_USE_NAMESPACE (std)string & s)205 	virtual void match(const ANTLR_USE_NAMESPACE(std)string& s)
206 	{
207 		size_t len = s.length();
208 
209 		for (size_t i = 0; i < len; i++)
210 		{
211 			// the & 0xFF is here to prevent sign extension lateron
212 			int la_1 = LA(1), c = (s[i] & 0xFF);
213 
214 			if ( la_1 != c )
215 				throw MismatchedCharException(la_1, c, false, this);
216 
217 			consume();
218 		}
219 	}
220 	/** See if input does not contain character 'c'
221 	 * throw MismatchedCharException if not
222 	 */
matchNot(int c)223 	virtual void matchNot(int c)
224 	{
225 		int la_1 = LA(1);
226 
227 		if ( la_1 == c )
228 			throw MismatchedCharException(la_1, c, true, this);
229 
230 		consume();
231 	}
232 	/** See if input contains character in range c1-c2
233 	 * throw MismatchedCharException if not
234 	 */
matchRange(int c1,int c2)235 	virtual void matchRange(int c1, int c2)
236 	{
237 		int la_1 = LA(1);
238 
239 		if ( la_1 < c1 || la_1 > c2 )
240 			throw MismatchedCharException(la_1, c1, c2, false, this);
241 
242 		consume();
243 	}
244 
getCaseSensitive() const245 	virtual bool getCaseSensitive() const
246 	{
247 		return caseSensitive;
248 	}
249 
setCaseSensitive(bool t)250 	virtual void setCaseSensitive(bool t)
251 	{
252 		caseSensitive = t;
253 	}
254 
255 	virtual bool getCaseSensitiveLiterals() const=0;
256 
257 	/// Get the line the scanner currently is in (starts at 1)
getLine() const258 	virtual int getLine() const
259 	{
260 		return inputState->line;
261 	}
262 
263 	/// set the line number
setLine(int l)264 	virtual void setLine(int l)
265 	{
266 		inputState->line = l;
267 	}
268 
269 	/// Get the column the scanner currently is in (starts at 1)
getColumn() const270 	virtual int getColumn() const
271 	{
272 		return inputState->column;
273 	}
274 	/// set the column number
setColumn(int c)275 	virtual void setColumn(int c)
276 	{
277 		inputState->column = c;
278 	}
279 
280 	/// get the filename for the file currently used
ANTLR_USE_NAMESPACE(std)281 	virtual const ANTLR_USE_NAMESPACE(std)string& getFilename() const
282 	{
283 		return inputState->filename;
284 	}
285 	/// Set the filename the scanner is using (used in error messages)
setFilename(const ANTLR_USE_NAMESPACE (std)string & f)286 	virtual void setFilename(const ANTLR_USE_NAMESPACE(std)string& f)
287 	{
288 		inputState->filename = f;
289 	}
290 
getCommitToPath() const291 	virtual bool getCommitToPath() const
292 	{
293 		return commitToPath;
294 	}
295 
setCommitToPath(bool commit)296 	virtual void setCommitToPath(bool commit)
297 	{
298 		commitToPath = commit;
299 	}
300 
301 	/** return a copy of the current text buffer */
ANTLR_USE_NAMESPACE(std)302 	virtual const ANTLR_USE_NAMESPACE(std)string& getText() const
303 	{
304 		return text;
305 	}
306 
setText(const ANTLR_USE_NAMESPACE (std)string & s)307 	virtual void setText(const ANTLR_USE_NAMESPACE(std)string& s)
308 	{
309 		text = s;
310 	}
311 
resetText()312 	virtual void resetText()
313 	{
314 		text = "";
315 		inputState->tokenStartColumn = inputState->column;
316 		inputState->tokenStartLine = inputState->line;
317 	}
318 
getTokenObject() const319 	virtual RefToken getTokenObject() const
320 	{
321 		return _returnToken;
322 	}
323 
324 	/** Used to keep track of line breaks, needs to be called from
325 	 * within generated lexers when a \n \r is encountered.
326 	 */
newline()327 	virtual void newline()
328 	{
329 		++inputState->line;
330 		inputState->column = 1;
331 	}
332 
333 	/** Advance the current column number by an appropriate amount according
334 	 * to the tabsize. This method needs to be explicitly called from the
335 	 * lexer rules encountering tabs.
336 	 */
tab()337 	virtual void tab()
338 	{
339 		int c = getColumn();
340 		int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop
341 		setColumn( nc );
342 	}
343 	/// set the tabsize. Returns the old tabsize
setTabsize(int size)344 	int setTabsize( int size )
345 	{
346 		int oldsize = tabsize;
347 		tabsize = size;
348 		return oldsize;
349 	}
350 	/// Return the tabsize used by the scanner
getTabSize() const351 	int getTabSize() const
352 	{
353 		return tabsize;
354 	}
355 
356 	/** Terminate program using exit()
357 	 * @deprecated will be removed in the next release. It is not used.
358 	 */
359 	virtual void panic();
360 	/** Terminate program using exit()
361 	 * @deprecated will be removed in the next release. It is not used.
362 	 */
363 	virtual void panic(const ANTLR_USE_NAMESPACE(std)string& s);
364 
365 	/** Report exception errors caught in nextToken() */
366 	virtual void reportError(const RecognitionException& e);
367 
368 	/** Parser error-reporting function can be overridden in subclass */
369 	virtual void reportError(const ANTLR_USE_NAMESPACE(std)string& s);
370 
371 	/** Parser warning-reporting function can be overridden in subclass */
372 	virtual void reportWarning(const ANTLR_USE_NAMESPACE(std)string& s);
373 
getInputBuffer()374 	virtual InputBuffer& getInputBuffer()
375 	{
376 		return inputState->getInput();
377 	}
378 
getInputState()379 	virtual LexerSharedInputState getInputState()
380 	{
381 		return inputState;
382 	}
383 
384 	/** set the input state for the lexer.
385 	 * @note state is a reference counted object, hence no reference */
setInputState(LexerSharedInputState state)386 	virtual void setInputState(LexerSharedInputState state)
387 	{
388 		inputState = state;
389 	}
390 
391 	/// Set the factory for created tokens
setTokenObjectFactory(factory_type factory)392 	virtual void setTokenObjectFactory(factory_type factory)
393 	{
394 		tokenFactory = factory;
395 	}
396 
397 	/** Test the token text against the literals table
398 	 * Override this method to perform a different literals test
399 	 */
testLiteralsTable(int ttype) const400 	virtual int testLiteralsTable(int ttype) const
401 	{
402 		ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(text);
403 		if (i != literals.end())
404 			ttype = (*i).second;
405 		return ttype;
406 	}
407 
408 	/** Test the text passed in against the literals table
409 	 * Override this method to perform a different literals test
410 	 * This is used primarily when you want to test a portion of
411 	 * a token
412 	 */
testLiteralsTable(const ANTLR_USE_NAMESPACE (std)string & txt,int ttype) const413 	virtual int testLiteralsTable(const ANTLR_USE_NAMESPACE(std)string& txt,int ttype) const
414 	{
415 		ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess>::const_iterator i = literals.find(txt);
416 		if (i != literals.end())
417 			ttype = (*i).second;
418 		return ttype;
419 	}
420 
421 	/// Override this method to get more specific case handling
toLower(int c) const422 	virtual int toLower(int c) const
423 	{
424 		// test on EOF_CHAR for buggy (?) STLPort tolower (or HPUX tolower?)
425 		// also VC++ 6.0 does this. (see fix 422 (is reverted by this fix)
426 		// this one is more structural. Maybe make this configurable.
427 		return (c == EOF_CHAR ? EOF_CHAR : tolower(c));
428 	}
429 
430 	/** This method is called by YourLexer::nextToken() when the lexer has
431 	 *  hit EOF condition.  EOF is NOT a character.
432 	 *  This method is not called if EOF is reached during
433 	 *  syntactic predicate evaluation or during evaluation
434 	 *  of normal lexical rules, which presumably would be
435 	 *  an IOException.  This traps the "normal" EOF condition.
436 	 *
437 	 *  uponEOF() is called after the complete evaluation of
438 	 *  the previous token and only if your parser asks
439 	 *  for another token beyond that last non-EOF token.
440 	 *
441 	 *  You might want to throw token or char stream exceptions
442 	 *  like: "Heh, premature eof" or a retry stream exception
443 	 *  ("I found the end of this file, go back to referencing file").
444 	 */
uponEOF()445 	virtual void uponEOF()
446 	{
447 	}
448 
449 	/// Methods used to change tracing behavior
450 	virtual void traceIndent();
451 	virtual void traceIn(const char* rname);
452 	virtual void traceOut(const char* rname);
453 
454 #ifndef NO_STATIC_CONSTS
455 	static const int EOF_CHAR = EOF;
456 #else
457 	enum {
458 		EOF_CHAR = EOF
459 	};
460 #endif
461 protected:
462 	ANTLR_USE_NAMESPACE(std)string text; ///< Text of current token
463  	/// flag indicating wether consume saves characters
464 	bool saveConsumedInput;
465 	factory_type tokenFactory;				///< Factory for tokens
466 	bool caseSensitive; 						///< Is this lexer case sensitive
467 	ANTLR_USE_NAMESPACE(std)map<ANTLR_USE_NAMESPACE(std)string,int,CharScannerLiteralsLess> literals; // set by subclass
468 
469 	RefToken _returnToken;		///< used to return tokens w/o using return val
470 
471 	/// Input state, gives access to input stream, shared among different lexers
472 	LexerSharedInputState inputState;
473 
474 	/** Used during filter mode to indicate that path is desired.
475 	 * A subsequent scan error will report an error as usual
476 	 * if acceptPath=true;
477 	 */
478 	bool commitToPath;
479 
480 	int tabsize; 	///< tab size the scanner uses.
481 
482 	/// Create a new RefToken of type t
makeToken(int t)483 	virtual RefToken makeToken(int t)
484 	{
485 		RefToken tok = tokenFactory();
486 		tok->setType(t);
487 		tok->setColumn(inputState->tokenStartColumn);
488 		tok->setLine(inputState->tokenStartLine);
489 		return tok;
490 	}
491 
492 	/** Tracer class, used when -traceLexer is passed to antlr
493 	 */
494 	class Tracer {
495 	private:
496 		CharScanner* parser;
497 		const char* text;
498 
499 		Tracer(const Tracer& other); 					// undefined
500 		Tracer& operator=(const Tracer& other); 	// undefined
501 	public:
Tracer(CharScanner * p,const char * t)502 		Tracer( CharScanner* p,const char* t )
503 		: parser(p), text(t)
504 		{
505 			parser->traceIn(text);
506 		}
~Tracer()507 		~Tracer()
508 		{
509 			parser->traceOut(text);
510 		}
511 	};
512 
513 	int traceDepth;
514 private:
515 	CharScanner( const CharScanner& other ); 			  		// undefined
516 	CharScanner& operator=( const CharScanner& other );	// undefined
517 
518 #ifndef NO_STATIC_CONSTS
519 	static const int NO_CHAR = 0;
520 #else
521 	enum {
522 		NO_CHAR = 0
523 	};
524 #endif
525 };
526 
LA(unsigned int i)527 inline int CharScanner::LA(unsigned int i)
528 {
529 	int c = inputState->getInput().LA(i);
530 
531 	if ( caseSensitive )
532 		return c;
533 	else
534 		return toLower(c);	// VC 6 tolower bug caught in toLower.
535 }
536 
operator ()(const ANTLR_USE_NAMESPACE (std)string & x,const ANTLR_USE_NAMESPACE (std)string & y) const537 inline bool CharScannerLiteralsLess::operator() (const ANTLR_USE_NAMESPACE(std)string& x,const ANTLR_USE_NAMESPACE(std)string& y) const
538 {
539 	if (scanner->getCaseSensitiveLiterals())
540 		return ANTLR_USE_NAMESPACE(std)less<ANTLR_USE_NAMESPACE(std)string>()(x,y);
541 	else
542 	{
543 #ifdef NO_STRCASECMP
544 		return (stricmp(x.c_str(),y.c_str())<0);
545 #else
546 		return (strcasecmp(x.c_str(),y.c_str())<0);
547 #endif
548 	}
549 }
550 
551 #ifdef ANTLR_CXX_SUPPORTS_NAMESPACE
552 }
553 #endif
554 
555 #endif //INC_CharScanner_hpp__
556