1 #ifndef INC_UnicodeCharScanner_hpp__
2 #define INC_UnicodeCharScanner_hpp__
3 
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <map>
#include <string>

#include <antlr/config.hpp>
#include <antlr/CommonToken.hpp>
#include <antlr/TokenStream.hpp>
#include <antlr/RecognitionException.hpp>
#include <antlr/SemanticException.hpp>
#include <antlr/InputBuffer.hpp>
#include <antlr/BitSet.hpp>
#include <antlr/LexerSharedInputState.hpp>

#include "MismatchedUnicodeCharException.hpp"
17 
18 /** Superclass of generated lexers
19  */
20 class UnicodeCharScanner : public antlr::TokenStream {
21 protected:
22 	typedef antlr::RefToken (*factory_type)();
23 public:
24 	typedef unsigned int char_type;
25 	typedef std::map<std::string,int> string_map;
26 
UnicodeCharScanner(antlr::InputBuffer & cb,bool case_sensitive)27 	UnicodeCharScanner( antlr::InputBuffer& cb, bool case_sensitive )
28 	: saveConsumedInput(true)
29 	, caseSensitive(case_sensitive)
30 	, literals()
31 	, inputState(new antlr::LexerInputState(cb))
32 	, commitToPath(false)
33 	, tabsize(8)
34 	, traceDepth(0)
35 	{
36 		setTokenObjectFactory(&antlr::CommonToken::factory);
37 	}
UnicodeCharScanner(antlr::InputBuffer * cb,bool case_sensitive)38 	UnicodeCharScanner( antlr::InputBuffer* cb, bool case_sensitive )
39 	: saveConsumedInput(true)
40 	, caseSensitive(case_sensitive)
41 	, literals()
42 	, inputState(new antlr::LexerInputState(cb))
43 	, commitToPath(false)
44 	, tabsize(8)
45 	, traceDepth(0)
46 	{
47 		setTokenObjectFactory(&antlr::CommonToken::factory);
48 	}
UnicodeCharScanner(const antlr::LexerSharedInputState & state,bool case_sensitive)49 	UnicodeCharScanner( const antlr::LexerSharedInputState& state, bool case_sensitive )
50 	: saveConsumedInput(true)
51 	, caseSensitive(case_sensitive)
52 	, literals()
53 	, inputState(state)
54 	, commitToPath(false)
55 	, tabsize(8)
56 	, traceDepth(0)
57 	{
58 		setTokenObjectFactory(&antlr::CommonToken::factory);
59 	}
60 
~UnicodeCharScanner()61 	virtual ~UnicodeCharScanner()
62 	{
63 	}
64 
LA(char_type i)65 	virtual char_type LA(char_type i)
66 	{
67 		char_type c = inputState->getInput().LA(i);
68 		return c;
69 	}
70 
append(char_type c)71 	virtual void append(char_type c)
72 	{
73 		if (saveConsumedInput)
74 		{
75 			size_t len = text.length();
76 
77 			if( (len % 256) == 0 )
78 				text.reserve(len+256);
79 
80 // This is how UTF8 is encoded
81 // +---------------------------+----------+----------+----------+----------+
82 // | Unicode scalar            | 1st      | 2nd      | 3th      | 4th      |
83 // +---------------------------+----------+----------+----------+----------+
84 // |00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
85 // |00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
86 // |zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
87 // |000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
88 // +---------------------------+----------+----------+----------+----------+
89 
90 			if (c < 0x80)
91 			{
92 				text += c;
93 				return;
94 			}
95 			else if (c < 0x800)
96 			{
97 				text += ( (c >> 6) | 0xC0 );
98 				text += ( c & 0x3F | 0x80 );
99 			}
100 			else if (c < 0x10000)
101 			{
102 				text += ( (c >> 12) | 0xE0 );
103 				text += ( ((c >> 6) & 0x3F) | 0x80 );
104 				text += ( (c & 0x3F) | 0x80 );
105 			}
106 			else if (c < 0x200000)
107 			{
108 				text += ( (c >> 18) | 0xF0 );				// first 3 bits
109 				text += ( (((c >> 16) & 0x3) << 4) |
110 								 ((c >> 12) & 0xF) | 0x80 );
111 				text += ( ((c >> 6) & 0x3F) | 0x80 );
112 				text += ( (c & 0x3F) | 0x80 );
113 			}
114 			else
115 				assert(0);
116 		}
117 	}
118 
append(const std::string & s)119 	virtual void append(const std::string& s)
120 	{
121 		assert(0);
122 		if (saveConsumedInput)
123 			text+=s;
124 	}
125 
commit()126 	virtual void commit()
127 	{
128 		inputState->getInput().commit();
129 	}
130 
consume()131 	virtual void consume()
132 	{
133 		if (inputState->guessing == 0)
134 		{
135 			char_type c = LA(1);
136 			append(c);
137 			inputState->column++;
138 		}
139 		inputState->getInput().consume();
140 	}
141 
142 	/** Consume chars until one matches the given char */
consumeUntil(char_type c)143 	virtual void consumeUntil(char_type c)
144 	{
145 		for(;;)
146 		{
147 			char_type la_1 = LA(1);
148 			if( static_cast<char_type>(EOF_CHAR) == la_1 || la_1 == c )
149 				break;
150 			consume();
151 		}
152 	}
153 
154 	/** Consume chars until one matches the given set */
consumeUntil(const antlr::BitSet & set)155 	virtual void consumeUntil(const antlr::BitSet& set)
156 	{
157 		for(;;)
158 		{
159 			char_type la_1 = LA(1);
160 			if( static_cast<char_type>(EOF_CHAR) == la_1 || set.member(la_1) )
161 				break;
162 			consume();
163 		}
164 	}
165 
166 	/// Mark the current position and return a id for it
mark()167 	virtual unsigned int mark()
168 	{
169 		return inputState->getInput().mark();
170 	}
171 
172 	/// Rewind the scanner to a previously marked position
rewind(unsigned int pos)173 	virtual void rewind(unsigned int pos)
174 	{
175 		inputState->getInput().rewind(pos);
176 	}
177 
178 	/// See if input contains character 'c' throw MismatchedUnicodeCharException if not
match(char_type c)179 	virtual void match(char_type c)
180 	{
181 		char_type la_1 = LA(1);
182 		if ( la_1 != c )
183 			throw MismatchedUnicodeCharException(la_1, c, false, this);
184 		consume();
185 	}
186 
187 	/** See if input contains element from bitset b
188 	 * throw MismatchedUnicodeCharException if not
189 	 */
match(const antlr::BitSet & b)190 	virtual void match(const antlr::BitSet& b)
191 	{
192 		char_type la_1 = LA(1);
193 
194 		if ( !b.member(la_1) )
195 			throw MismatchedUnicodeCharException( la_1, b, false, this );
196 		consume();
197 	}
198 
199 	/** See if input contains string 's' throw MismatchedUnicodeCharException if not
200 	 * @note the string cannot match EOF
201 	 */
match(const char * s)202 	virtual void match( const char* s )
203 	{
204 		while( *s != '\0' )
205 		{
206 			// the & 0xFF is here to prevent sign extension lateron
207 			char_type la_1 = LA(1), c = (*s++ & 0xFF);
208 
209 			if ( la_1 != c )
210 				throw MismatchedUnicodeCharException(la_1, c, false, this);
211 
212 			consume();
213 		}
214 	}
215 	/** See if input contains string 's' throw MismatchedUnicodeCharException if not
216 	 * @note the string cannot match EOF
217 	 */
match(const std::string & s)218 	virtual void match(const std::string& s)
219 	{
220 		size_t len = s.length();
221 
222 		for (size_t i = 0; i < len; i++)
223 		{
224 			// the & 0xFF is here to prevent sign extension lateron
225 			char_type la_1 = LA(1), c = (s[i] & 0xFF);
226 
227 			if ( la_1 != c )
228 				throw MismatchedUnicodeCharException(la_1, c, false, this);
229 
230 			consume();
231 		}
232 	}
233 	/** See if input does not contain character 'c'
234 	 * throw MismatchedUnicodeCharException if not
235 	 */
matchNot(char_type c)236 	virtual void matchNot(char_type c)
237 	{
238 		char_type la_1 = LA(1);
239 
240 		if ( la_1 == c )
241 			throw MismatchedUnicodeCharException(la_1, c, true, this);
242 
243 		consume();
244 	}
245 	/** See if input contains character in range c1-c2
246 	 * throw MismatchedUnicodeCharException if not
247 	 */
matchRange(char_type c1,char_type c2)248 	virtual void matchRange(char_type c1, char_type c2)
249 	{
250 		char_type la_1 = LA(1);
251 
252 		if ( la_1 < c1 || la_1 > c2 )
253 			throw MismatchedUnicodeCharException(la_1, c1, c2, false, this);
254 
255 		consume();
256 	}
257 
258 	/// Get the line the scanner currently is in (starts at 1)
getLine() const259 	virtual int getLine() const
260 	{
261 		return inputState->line;
262 	}
263 
264 	/// set the line number
setLine(int l)265 	virtual void setLine(int l)
266 	{
267 		inputState->line = l;
268 	}
269 
270 	/// Get the column the scanner currently is in (starts at 1)
getColumn() const271 	virtual int getColumn() const
272 	{
273 		return inputState->column;
274 	}
275 	/// set the column number
setColumn(int c)276 	virtual void setColumn(int c)
277 	{
278 		inputState->column = c;
279 	}
280 
281 	/// get the filename for the file currently used
getFilename() const282 	virtual const std::string& getFilename() const
283 	{
284 		return inputState->filename;
285 	}
286 	/// Set the filename the scanner is using (used in error messages)
setFilename(const std::string & f)287 	virtual void setFilename(const std::string& f)
288 	{
289 		inputState->filename = f;
290 	}
291 
getCommitToPath() const292 	virtual bool getCommitToPath() const
293 	{
294 		return commitToPath;
295 	}
296 
setCommitToPath(bool commit)297 	virtual void setCommitToPath(bool commit)
298 	{
299 		commitToPath = commit;
300 	}
301 
302 	/** return a copy of the current text buffer */
getText() const303 	virtual const std::string& getText() const
304 	{
305 		return text;
306 	}
307 
setText(const std::string & s)308 	virtual void setText(const std::string& s)
309 	{
310 		text = s;
311 	}
312 
resetText()313 	virtual void resetText()
314 	{
315 		text = "";
316 		inputState->tokenStartColumn = inputState->column;
317 		inputState->tokenStartLine = inputState->line;
318 	}
319 
getTokenObject() const320 	virtual antlr::RefToken getTokenObject() const
321 	{
322 		return _returnToken;
323 	}
324 
325 	///{ These need different handling in unicode case
326 
327 	virtual bool getCaseSensitiveLiterals() const=0;
328 
getCaseSensitive() const329 	virtual bool getCaseSensitive() const
330 	{
331 		return caseSensitive;
332 	}
333 
setCaseSensitive(bool t)334 	virtual void setCaseSensitive(bool t)
335 	{
336 		caseSensitive = t;
337 	}
338 
339 	/** Override this method to get more specific case handling
340 	 * @note some platforms probably require setting the right locale for
341 	 * correct functioning.
342 	 */
toLower(char_type c) const343 	virtual char_type toLower(char_type c) const
344 	{
345 		return std::tolower(c);
346 	}
347 
348 	/** Used to keep track of line breaks, needs to be called from
349 	 * within generated lexers when a \n \r is encountered.
350 	 */
newline()351 	virtual void newline()
352 	{
353 		++inputState->line;
354 		inputState->column = 1;
355 	}
356 
357 	/** Advance the current column number by an appropriate amount according
358 	 * to the tabsize. This method needs to be explicitly called from the
359 	 * lexer rules encountering tabs.
360 	 */
tab()361 	virtual void tab()
362 	{
363 		int c = getColumn();
364 		int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1;      // calculate tab stop
365 		setColumn( nc );
366 	}
367 	/// set the tabsize. Returns the old tabsize
setTabsize(int size)368 	int setTabsize( int size )
369 	{
370 		int oldsize = tabsize;
371 		tabsize = size;
372 		return oldsize;
373 	}
374 	/// Return the tabsize used by the scanner
getTabSize() const375 	int getTabSize() const
376 	{
377 		return tabsize;
378 	}
379 	///}
380 
381 	/** Report exception errors caught in nextToken() */
reportError(const antlr::RecognitionException & ex)382 	virtual void reportError(const antlr::RecognitionException& ex)
383 	{
384 		std::cerr << ex.toString().c_str() << std::endl;
385 	}
386 
387 	/** Parser error-reporting function can be overridden in subclass */
reportError(const std::string & s)388 	virtual void reportError(const std::string& s)
389 	{
390 		if (getFilename() == "")
391 			std::cerr << "error: " << s.c_str() << std::endl;
392 		else
393 			std::cerr << getFilename().c_str() << ": error: " << s.c_str() << std::endl;
394 	}
395 
396 	/** Parser warning-reporting function can be overridden in subclass */
reportWarning(const std::string & s)397 	virtual void reportWarning(const std::string& s)
398 	{
399 		if (getFilename() == "")
400 			std::cerr << "warning: " << s.c_str() << std::endl;
401 		else
402 			std::cerr << getFilename().c_str() << ": warning: " << s.c_str() << std::endl;
403 	}
404 
getInputBuffer()405 	virtual antlr::InputBuffer& getInputBuffer()
406 	{
407 		return inputState->getInput();
408 	}
409 
getInputState()410 	virtual antlr::LexerSharedInputState getInputState()
411 	{
412 		return inputState;
413 	}
414 
415 	/** set the input state for the lexer.
416 	 * @note state is a reference counted object, hence no reference */
setInputState(antlr::LexerSharedInputState state)417 	virtual void setInputState(antlr::LexerSharedInputState state)
418 	{
419 		inputState = state;
420 	}
421 
422 	/// Set the factory for created tokens
setTokenObjectFactory(factory_type factory)423 	virtual void setTokenObjectFactory(factory_type factory)
424 	{
425 		tokenFactory = factory;
426 	}
427 
428 	/** Test the token text against the literals table
429 	 * Override this method to perform a different literals test
430 	 */
testLiteralsTable(int ttype) const431 	virtual int testLiteralsTable(int ttype) const
432 	{
433 		string_map::const_iterator i = literals.find(text);
434 		if (i != literals.end())
435 			ttype = (*i).second;
436 		return ttype;
437 	}
438 
439 	/** Test the text passed in against the literals table
440 	 * Override this method to perform a different literals test
441 	 * This is used primarily when you want to test a portion of
442 	 * a token
443 	 */
testLiteralsTable(const std::string & text,int ttype) const444 	virtual int testLiteralsTable(const std::string& text, int ttype) const
445 	{
446 		string_map::const_iterator i = literals.find(text);
447 		if (i != literals.end())
448 			ttype = (*i).second;
449 		return ttype;
450 	}
451 
452 	/** This method is called by YourLexer::nextToken() when the lexer has
453 	 *  hit EOF condition.  EOF is NOT a character.
454 	 *  This method is not called if EOF is reached during
455 	 *  syntactic predicate evaluation or during evaluation
456 	 *  of normal lexical rules, which presumably would be
457 	 *  an IOException.  This traps the "normal" EOF condition.
458 	 *
459 	 *  uponEOF() is called after the complete evaluation of
460 	 *  the previous token and only if your parser asks
461 	 *  for another token beyond that last non-EOF token.
462 	 *
463 	 *  You might want to throw token or char stream exceptions
464 	 *  like: "Heh, premature eof" or a retry stream exception
465 	 *  ("I found the end of this file, go back to referencing file").
466 	 */
uponEOF()467 	virtual void uponEOF()
468 	{
469 	}
470 
471 	/// Methods used to change tracing behavior
traceIndent()472 	void traceIndent()
473 	{
474 		for( int i = 0; i < traceDepth; i++ )
475 			std::cout << " ";
476 	}
477 
traceIn(const char * rname)478 	void traceIn(const char* rname)
479 	{
480 		traceDepth++;
481 		traceIndent();
482 		std::cout << "> lexer " << rname
483 			<< "; c==" << LA(1) << std::endl;
484 	}
485 
traceOut(const char * rname)486 	void traceOut(const char* rname)
487 	{
488 		traceIndent();
489 		std::cout << "< lexer " << rname
490 			<< "; c==" << LA(1) << std::endl;
491 		traceDepth--;
492 	}
493 
494 #ifndef NO_STATIC_CONSTS
495 	static const int EOF_CHAR = EOF;
496 #else
497 	enum {
498 		EOF_CHAR = EOF
499 	};
500 #endif
501 protected:
502 	std::string text; ///< Text of current token
503  	/// flag indicating wether consume saves characters
504 	bool saveConsumedInput;
505 	factory_type tokenFactory;				///< Factory for tokens
506 	bool caseSensitive; 						///< Is this lexer case sensitive
507 	string_map literals;						 // set by subclass
508 
509 	antlr::RefToken _returnToken;		///< used to return tokens w/o using return val
510 
511 	/// Input state, gives access to input stream, shared among different lexers
512 	antlr::LexerSharedInputState inputState;
513 
514 	/** Used during filter mode to indicate that path is desired.
515 	 * A subsequent scan error will report an error as usual
516 	 * if acceptPath=true;
517 	 */
518 	bool commitToPath;
519 
520 	unsigned int tabsize; 	///< tab size the scanner uses.
521 
522 	/// Create a new RefToken of type t
makeToken(int t)523 	virtual antlr::RefToken makeToken(int t)
524 	{
525 		antlr::RefToken tok = tokenFactory();
526 		// actually at this point you want to convert the stored lexeme text
527 		// into the format you want to have it in in the backend...
528 		tok->setType(t);
529 		tok->setColumn(inputState->tokenStartColumn);
530 		tok->setLine(inputState->tokenStartLine);
531 		return tok;
532 	}
533 
534 	/** Tracer class, used when -traceLexer is passed to antlr
535 	 */
536 	class Tracer {
537 	private:
538 		UnicodeCharScanner* parser;
539 		const char* text;
540 
541 		Tracer(const Tracer& other); 					// undefined
542 		Tracer& operator=(const Tracer& other); 	// undefined
543 	public:
Tracer(UnicodeCharScanner * p,const char * t)544 		Tracer( UnicodeCharScanner* p, const char* t )
545 		: parser(p), text(t)
546 		{
547 			parser->traceIn(text);
548 		}
~Tracer()549 		~Tracer()
550 		{
551 			parser->traceOut(text);
552 		}
553 	};
554 
555 	int traceDepth;
556 private:
557 	UnicodeCharScanner( const UnicodeCharScanner& other ); 		  		// undefined
558 	UnicodeCharScanner& operator=( const UnicodeCharScanner& other );	// undefined
559 };
560 
561 #endif //INC_UnicodeCharScanner_hpp__
562