1%{
2/*--   Copyright (C) 2009 Jonathan Schmidt-Dominé <devel@the-user.org>
3--   Derived from the KDevelop-Java-Lexer
4--
5--   This library is free software; you can redistribute it and/or
6--   modify it under the terms of the GNU Library General Public
7--   License as published by the Free Software Foundation; either
8--   version 2 of the License, or (at your option) any later version.
9--
10--   This library is distributed in the hope that it will be useful,
11--   but WITHOUT ANY WARRANTY; without even the implied warranty of
12--   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13--   Library General Public License for more details.
14--
15--   You should have received a copy of the GNU Library General Public License
16--   along with this library; see the file COPYING.LIB.  If not, write to
17--   the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18--   Boston, MA 02111-1307, USA.*/
19%}
20
21%option c++
22%option yyclass="cc::Lexer"
23%option noyywrap
24
25
26%{
27
28#define DONT_INCLUDE_FLEXLEXER
29#include "lexer.h"
30#include <QDebug>
31
32%}
33
34
35 /* UTF-8 sequences, generated with the Unicode.hs script from
36  * http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
37
38 /* \u0024, \u0041-\u005a, \u005f, \u0061-\u007a: one byte in UTF-8 */
Letter1         [A-Za-z_$]
 /* U+00C0-U+00D6, U+00D8-U+00F6, U+00F8-U+00FF: two bytes, lead byte C3
  * (U+00D7 and U+00F7 are not included) */
Letter2         \xC3[\x80-\x96\x98-\xB6\xB8-\xBF]
 /* U+0100-U+1FFF: two-byte forms C4..DF, three-byte forms with lead E0/E1 */
Letter3         [\xC4-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xE1[\x80-\xBF])[\x80-\xBF]
 /* U+3040-U+318F */
Letter4         \xE3(\x86[\x80-\x8F]|[\x81-\x85][\x80-\xBF])
 /* U+3300-U+337F */
Letter5         \xE3[\x8C-\x8D][\x80-\xBF]
 /* U+3400-U+3D2D */
Letter6         \xE3(\xB4[\x80-\xAD]|[\x90-\xB3][\x80-\xBF])
 /* U+4E00-U+9FFF */
Letter7         (\xE4[\xB8-\xBF]|[\xE5-\xE9][\x80-\xBF])[\x80-\xBF]
 /* U+F900-U+FAFF */
Letter8         \xEF[\xA4-\xAB][\x80-\xBF]

 /* Any character that may start or continue an identifier. */
Letter          {Letter1}|{Letter2}|{Letter3}|{Letter4}|{Letter5}|{Letter6}|{Letter7}|{Letter8}
56
57 /* \u0030-\u0039: ISO-LATIN-1 digits */
Digit1          [0-9]
 /* U+0660-U+0669 and U+06F0-U+06F9: Arabic-Indic and Extended Arabic-Indic digits */
Digit2          \xD9[\xA0-\xA9]|\xDB[\xB0-\xB9]
 /* U+0966-U+096F and U+09E6-U+09EF: Devanagari and Bengali digits */
Digit3          \xE0[\xA5\xA7][\xA6-\xAF]
 /* U+0A66-U+0A6F and U+0AE6-U+0AEF: Gurmukhi and Gujarati digits */
Digit4          \xE0[\xA9\xAB][\xA6-\xAF]
 /* U+0B66-U+0B6F and U+0BE7-U+0BEF: Oriya digits, Tamil digits one to nine */
Digit5          \xE0(\xAD[\xA6-\xAF]|\xAF[\xA7-\xAF])
 /* U+0C66-U+0C6F, U+0CE6-U+0CEF, U+0D66-U+0D6F: Telugu, Kannada, Malayalam digits */
Digit6          \xE0[\xB1\xB3\xB5][\xA6-\xAF]
 /* U+0E50-U+0E59 and U+0ED0-U+0ED9: Thai and Lao digits */
Digit7          \xE0[\xB9\xBB][\x90-\x99]
 /* U+1040-U+1049: Myanmar digits */
Digit8          \xE1\x81[\x80-\x89]
 /* U+FF10-U+FF19: fullwidth digits */
Digit9          \xEF\xBC[\x90-\x99]
75
76 /* \u0080-\uffff */
Multibyte1      ([\xC2-\xDF]|[\xE0][\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]
 /* U+10000-U+1FFFFF: four bytes */
Multibyte2      ([\xF0][\x90-\xBF]|[\xF1-\xF7][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
 /* U+200000-U+3FFFFFF: five bytes (legacy UTF-8 form, beyond U+10FFFF) */
Multibyte3      ([\xF8][\x88-\xBF]|[\xF9-\xFB][\x80-\xBF])[\x80-\xBF][\x80-\xBF][\x80-\xBF]
 /* U+4000000-U+7FFFFFFF: six bytes (legacy UTF-8 form): the lead byte, a
  * range-restricted second byte and then FOUR plain continuation bytes.
  * The previous pattern listed only three, so it matched five bytes and
  * left the last byte of such a sequence behind as a stray character. */
Multibyte4      ([\xFC][\x84-\xBF]|[\xFD][\x80-\xBF])[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
 /* Any multi-byte Unicode character. Single-byte ones are just . in lex. */
Multibyte       {Multibyte1}|{Multibyte2}|{Multibyte3}|{Multibyte4}
86
87
88 /* non-Unicode stuff */
89
HexDigit        [0-9a-fA-F]
Digit           {Digit1}|{Digit2}|{Digit3}|{Digit4}|{Digit5}|{Digit6}|{Digit7}|{Digit8}|{Digit9}
OctalDigit      [0-7]
NonZeroDigit    [1-9]

 /* Escape sequences inside character and string literals.
  *  - UnicodeEscape: \uXXXX (repeated 'u' is still tolerated, as before)
  *    and the C form \UXXXXXXXX.
  *  - HexEscape: \x followed by one or more hex digits.
  *  - OctalEscape: one to three OCTAL digits; the follow-up digits used to
  *    be {Digit}, which wrongly accepted 8, 9 and non-ASCII digits.
  *  - SimpleEscape: now also covers the C escapes \a, \v and \?. */
UnicodeEscape   [\\]([u]+{HexDigit}{HexDigit}{HexDigit}{HexDigit}|[U]{HexDigit}{HexDigit}{HexDigit}{HexDigit}{HexDigit}{HexDigit}{HexDigit}{HexDigit})
HexEscape       [\\][x]{HexDigit}+
OctalEscape     [\\]{OctalDigit}({OctalDigit}({OctalDigit})?)?
SimpleEscape    [\\]([']|["]|[?]|[\\]|[abfnrtv])
Escape          {SimpleEscape}|{UnicodeEscape}|{HexEscape}|{OctalEscape}

 /* Integer literals. The suffix accepts the C combinations of an unsigned
  * marker (u/U) with a long or long-long marker (l/L/ll/LL), in either
  * order, instead of only a single l/L. */
IntSuffix       [uU]([lL]|ll|LL)?|([lL]|ll|LL)[uU]?
DecimalNum      ([0]|{NonZeroDigit}{Digit}*){IntSuffix}?
OctalNum        [0]{OctalDigit}+{IntSuffix}?
HexNum          [0][xX]{HexDigit}+{IntSuffix}?
IntegerLiteral  {DecimalNum}|{OctalNum}|{HexNum}

 /* Floating point literals. f/F and l/L are the C suffixes; d/D is not C
  * and is only kept so that input accepted so far keeps lexing the same
  * way (this lexer is derived from a Java lexer). */
Sign            [+-]
FloatSuffix     [fFlL]|[dD]
SignedInt       {Sign}?{Digit}+
DecimalExponent [eE]{SignedInt}?
BinaryExponent  [pP]{SignedInt}?
Float1          {Digit}+[\.]{Digit}*{DecimalExponent}?{FloatSuffix}?
Float2          [\.]{Digit}+{DecimalExponent}?{FloatSuffix}?
Float3          {Digit}+{DecimalExponent}{FloatSuffix}?
Float4          {Digit}+{DecimalExponent}?{FloatSuffix}
HexFloatNum     [0][xX]{HexDigit}*[\.]{HexDigit}+
HexFloat1       {HexNum}[\.]?{BinaryExponent}{FloatSuffix}?
HexFloat2       {HexFloatNum}{BinaryExponent}{FloatSuffix}?
FloatingPoint   {Float1}|{Float2}|{Float3}|{Float4}|{HexFloat1}|{HexFloat2}
119
120%x IN_BLOCKCOMMENT
121
122%%
123
124 /* whitespace, newlines, preprocessor-statements and comments */
125
[ \f\t\v]+      /* skip blanks, form feeds, horizontal and vertical tabs */ ;
[\n]            /* skip */ ;

"//"[^\n]*      /* line comments, skip */ ;

 /* Preprocessor statement, skipped as a whole. A backslash escapes the
  * character after it, so a backslash-newline pair continues the statement
  * on the next line instead of ending it; previously the continuation lines
  * of a multi-line macro were lexed as ordinary code. The optional trailing
  * backslash covers a statement that ends with a lone backslash at the end
  * of the input. */
"#"([^\\\n]|[\\](.|[\n]))*[\\]?   /* preprocessor statement, skip */ ;

"/*"            BEGIN(IN_BLOCKCOMMENT);
<IN_BLOCKCOMMENT>{
[^*\n]*         /* eat anything that's not a '*' */ ;
"*"+[^*/\n]*    /* eat up '*'s that are not followed by slashes or newlines */;
[\n]            /* skip */ ;
"*"+"/"         BEGIN(INITIAL);
<<EOF>> {
    /* The comment was never closed: report it and end the token stream. */
    qWarning() << "Encountered end of file in an unclosed block comment";
    return Parser::Token_EOF;
}
}
143
144
145 /* separators */
146
"("             { return Parser::Token_LPAREN; }
")"             { return Parser::Token_RPAREN; }
"["             { return Parser::Token_LBRACKET; }
"]"             { return Parser::Token_RBRACKET; }
"{"             { return Parser::Token_LBRACE; }
"}"             { return Parser::Token_RBRACE; }

 /* punctuation and member access */
";"             { return Parser::Token_SEMICOLON; }
":"             { return Parser::Token_COLON; }
","             { return Parser::Token_COMMA; }
"."             { return Parser::Token_DOT; }
"->"            { return Parser::Token_ARROW; }
158
159
160 /* operators */
161
"?"             { return Parser::Token_QUESTION; }
"!"             { return Parser::Token_NOT; }
"~"             { return Parser::Token_TILDE; }
"&&"            { return Parser::Token_AND_AND; }
"||"            { return Parser::Token_OR_OR; }

 /* comparison */
"=="            { return Parser::Token_EQUAL_EQUAL; }
"!="            { return Parser::Token_NOT_EQUAL; }
"<"             { return Parser::Token_LESS; }
"<="            { return Parser::Token_LESS_EQUAL; }
">"             { return Parser::Token_GREATER; }
">="            { return Parser::Token_GREATER_EQUAL; }

 /* increment and decrement */
"++"            { return Parser::Token_PLUS_PLUS; }
"--"            { return Parser::Token_MINUS_MINUS; }

 /* arithmetic */
"+"             { return Parser::Token_PLUS; }
"-"             { return Parser::Token_MINUS; }
"*"             { return Parser::Token_STAR; }
"/"             { return Parser::Token_DIVIDE; }
"%"             { return Parser::Token_REMAINDER; }

 /* bitwise and shift */
"&"             { return Parser::Token_AND; }
"|"             { return Parser::Token_OR; }
"^"             { return Parser::Token_XOR; }
"<<"            { return Parser::Token_LSHIFT; }
">>"            { return Parser::Token_RSHIFT; }

 /* assignment and compound assignment */
"="             { return Parser::Token_EQUAL; }
"+="            { return Parser::Token_PLUS_EQUAL; }
"-="            { return Parser::Token_MINUS_EQUAL; }
"*="            { return Parser::Token_STAR_EQUAL; }
"/="            { return Parser::Token_DIVIDE_EQUAL; }
"%="            { return Parser::Token_REMAINDER_EQUAL; }
"&="            { return Parser::Token_AND_EQUAL; }
"|="            { return Parser::Token_OR_EQUAL; }
"^="            { return Parser::Token_XOR_EQUAL; }
"<<="           { return Parser::Token_LSHIFT_EQUAL; }
">>="           { return Parser::Token_RSHIFT_EQUAL; }

 /* variadic parameter marker */
"..."           { return Parser::Token_ELLIPSIS; }
197
198 /* keywords */
199
"break"         return Parser::Token_BREAK;
"case"          return Parser::Token_CASE;
"continue"      return Parser::Token_CONTINUE;
"default"       return Parser::Token_DEFAULT;
"do"            return Parser::Token_DO;
"else"          return Parser::Token_ELSE;
"for"           return Parser::Token_FOR;
"goto"          return Parser::Token_GOTO;
"if"            return Parser::Token_IF;
"return"        return Parser::Token_RETURN;
"switch"        return Parser::Token_SWITCH;
"while"         return Parser::Token_WHILE;

 /* storage classes and qualifiers */
"static"        return Parser::Token_STATIC;
"extern"        return Parser::Token_EXTERN;
"auto"          return Parser::Token_AUTO;
"register"      return Parser::Token_REGISTER;
"typedef"       return Parser::Token_TYPEDEF;
"const"         return Parser::Token_CONST;
"volatile"      return Parser::Token_VOLATILE;
"inline"        return Parser::Token_INLINE;

 /* types */
"void"          return Parser::Token_VOID;
"char"          return Parser::Token_CHAR;
"short"         return Parser::Token_SHORT;
"int"           return Parser::Token_INT;
"long"          return Parser::Token_LONG;
"signed"        return Parser::Token_SIGNED;
"unsigned"      return Parser::Token_UNSIGNED;
"float"         return Parser::Token_FLOAT;
"double"        return Parser::Token_DOUBLE;
"enum"          return Parser::Token_ENUM;
"union"         return Parser::Token_UNION;

 /* GCC spellings. Each keyword that already had one underscore variant now
  * accepts both the "__kw" and the "__kw__" form, so that e.g. __inline__
  * is no longer lexed as a plain identifier while __inline is a keyword. */
"asm"           return Parser::Token_ASM;
"__asm"         return Parser::Token_ASM;
"__asm__"       return Parser::Token_ASM;
"__inline"      return Parser::Token_INLINE;
"__inline__"    return Parser::Token_INLINE;
"__volatile"    return Parser::Token_VOLATILE;
"__volatile__"  return Parser::Token_VOLATILE;
"__extension__" return Parser::Token_EXTENSION;
236
237 /* characters and strings (C with unicode-support) */
238
\'({Escape}|{Multibyte}|[^\\\n\'])\'   { return Parser::Token_X_CONSTANT; }
\'({Escape}|{Multibyte}|\\[^\\\n\']|[^\\\n\'])*(\\?\n|\') {
    /* Anything else that starts with a quote and runs up to the closing
     * quote or the end of the line: warn, but still hand out a constant. */
    const QString message = QString("Invalid character literal: %1").arg(yytext);
    qWarning() << message;
    return Parser::Token_X_CONSTANT;
}

\"({Escape}|{Multibyte}|[^\\\n\"])*\"  { return Parser::Token_STRING_LITERAL; }
\"({Escape}|{Multibyte}|\\[^\\\n\"]|[^\\\n\"])*(\\?\n|\") {
    /* Same fallback for strings: unknown escapes or a missing closing
     * quote before the end of the line. */
    const QString message = QString("Invalid string literal: %1").arg(yytext);
    qWarning() << message;
    return Parser::Token_STRING_LITERAL;
}
250
251
252 /* identifiers and number literals */
253
{Letter}({Letter}|{Digit})*  { return Parser::Token_IDENTIFIER; }

 /* Integer and floating point literals share one token kind, so both
  * patterns use the same action. */
{IntegerLiteral}   |
{FloatingPoint}    { return Parser::Token_X_CONSTANT; }
258
259
260 /* everything else is not a valid lexeme */
261
.                  {
                        /* Any single byte that no other rule accepts ends up
                         * here. The offending text is now part of the message,
                         * like in the literal warnings, so the fatal exit can
                         * be traced back to the input.
                         * NOTE(review): exit() terminates the whole process;
                         * consider returning an error token instead if the
                         * parser provides one. */
                        qWarning() << "INVALID TOKEN" << yytext;
                        exit(-1);
                   }
266
267%%
268
269namespace cc
270{
271
272Lexer::Lexer( Parser *parser, char *contents )
273{
274    restart( parser, contents );
275}
276
277void Lexer::restart( Parser *parser, char *contents )
278{
279    m_parser = parser;
280    m_locationTable = parser->tokenStream->locationTable();
281    m_contents = contents;
282    m_tokenBegin = m_tokenEnd = 0;
283    m_currentOffset = 0;
284
285    // check for and ignore the UTF-8 byte order mark
286    unsigned char *ucontents = (unsigned char *) m_contents;
287    if ( ucontents[0] == 0xEF && ucontents[1] == 0xBB && ucontents[2] == 0xBF )
288    {
289        m_tokenBegin = m_tokenEnd = 3;
290        m_currentOffset = 3;
291    }
292
293    yyrestart(NULL);
294    BEGIN(INITIAL); // is not set automatically by yyrestart()
295}
296
297// reads a character, and returns 1 as the number of characters read
298// (or 0 when the end of the string is reached)
299int Lexer::LexerInput( char *buf, int /*max_size*/ )
300{
301    int c = m_contents[ m_currentOffset++ ];
302
303    switch(c)
304    {
305    case '\r':
306        c = '\n'; // only have one single line break character: '\n'
307        if ( m_contents[m_currentOffset + 1] == '\n' )
308        {
309            m_currentOffset++;
310            m_tokenEnd++;
311        }
312
313        // fall through
314    case '\n':
315        m_locationTable->newline( m_currentOffset - 1 );
316        break;
317
318    default:
319        break;
320    }
321
322    return (c == 0) ? 0 : (buf[0] = c, 1);
323}
324
325} // end of namespace cc
326