%{
/*-- Copyright (C) 2009 Jonathan Schmidt-Dominé <devel@the-user.org>
-- Derived from the KDevelop-Java-Lexer
--
-- This library is free software; you can redistribute it and/or
-- modify it under the terms of the GNU Library General Public
-- License as published by the Free Software Foundation; either
-- version 2 of the License, or (at your option) any later version.
--
-- This library is distributed in the hope that it will be useful,
-- but WITHOUT ANY WARRANTY; without even the implied warranty of
-- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-- Library General Public License for more details.
--
-- You should have received a copy of the GNU Library General Public License
-- along with this library; see the file COPYING.LIB. If not, write to
-- the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-- Boston, MA 02111-1307, USA.*/
%}

%option c++
%option yyclass="cc::Lexer"
%option noyywrap


%{

#define DONT_INCLUDE_FLEXLEXER
#include "lexer.h"
#include <QDebug>

%}


 /* Identifier letters, spelled out as the UTF-8 byte sequences of their
  * code points. The byte patterns were generated with the Unicode.hs
  * script from
  * http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */

 /* One-byte letters: U+0024, U+0041-U+005A, U+005F, U+0061-U+007A */
Letter1         [A-Za-z_$]
 /* U+00C0-U+00D6, U+00D8-U+00F6, U+00F8-U+00FF (lead byte C3) */
Letter2         \xC3[\x80-\x96\x98-\xB6\xB8-\xBF]
 /* U+0100-U+1FFF: every two-byte sequence from C4 upwards, plus the
  * three-byte sequences led by E0 and E1 */
Letter3         [\xC4-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xE1[\x80-\xBF])[\x80-\xBF]
 /* U+3040-U+318F */
Letter4         \xE3(\x86[\x80-\x8F]|[\x81-\x85][\x80-\xBF])
 /* U+3300-U+337F */
Letter5         \xE3[\x8C-\x8D][\x80-\xBF]
 /* U+3400-U+3D2D */
Letter6         \xE3(\xB4[\x80-\xAD]|[\x90-\xB3][\x80-\xBF])
 /* U+4E00-U+9FFF */
Letter7         (\xE4[\xB8-\xBF]|[\xE5-\xE9][\x80-\xBF])[\x80-\xBF]
 /* U+F900-U+FAFF */
Letter8         \xEF[\xA4-\xAB][\x80-\xBF]

 /* Any character that may appear in an identifier (digits are handled
  * separately by the identifier rule). */
Letter          {Letter1}|{Letter2}|{Letter3}|{Letter4}|{Letter5}|{Letter6}|{Letter7}|{Letter8}

57 /* \u0030-\u0039: ISO-LATIN-1 digits */ 58Digit1 [0-9] 59 /* \u0660-\u0669, \u06f0-\u06f9: Arabic-Indic and extended Ar.-Indic digits */ 60Digit2 [\xD9][\xA0-\xA9]|[\xDB][\xB0-\xB9] 61 /* \u0966-\u096f, \u09e6-\u09ef: Devanagari digits */ 62Digit3 [\xE0]([\xA5]|[\xA7])[\xA6-\xAF] 63 /* \u0a66-\u0a6f, \u0ae6-\u0aef */ 64Digit4 [\xE0]([\xA9]|[\xAB])[\xA6-\xAF] 65 /* \u0b66-\u0b6f, \u0be7-\u0bef */ 66Digit5 [\xE0]([\xAD][\xA6-\xAF]|[\xAF][\xA7-\xAF]) 67 /* \u0c66-\u0c6f, \u0ce6-\u0cef, \u0d66-\u0d6f */ 68Digit6 [\xE0]([\xB1]|[\xB3]|[\xB5])[\xA6-\xAF] 69 /* \u0e50-\u0e59, \u0ed0-\u0ed9 */ 70Digit7 [\xE0]([\xB9]|[\xBB])[\x90-\x99] 71 /* \u1040-\u1049 */ 72Digit8 [\xE1][\x81][\x80-\x89] 73 /* \uff10-\uff19: Fullwidth digits */ 74Digit9 [\xEF][\xBC][\x90-\x99] 75 76 /* \u0080-\uffff */ 77Multibyte1 ([\xC2-\xDF]|[\xE0][\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF] 78 /* \u10000-\u1fffff */ 79Multibyte2 ([\xF0][\x90-\xBF]|[\xF1-\xF7][\x80-\xBF])[\x80-\xBF][\x80-\xBF] 80 /* \u200000-\u3ffffff */ 81Multibyte3 ([\xF8][\x88-\xBF]|[\xF9-\xFB][\x80-\xBF])[\x80-\xBF][\x80-\xBF][\x80-\xBF] 82 /* \u4000000-\u7fffffff */ 83Multibyte4 ([\xFC][\x84-\xBF]|[\xFD][\x80-\xBF])[\x80-\xBF][\x80-\xBF][\x80-\xBF] 84 /* Any multi-byte Unicode character. Single-byte ones are just . in lex. */ 85Multibyte {Multibyte1}|{Multibyte2}|{Multibyte3}|{Multibyte4} 86 87 88 /* non-Unicode stuff */ 89 90HexDigit [0-9a-fA-F] 91Digit {Digit1}|{Digit2}|{Digit3}|{Digit4}|{Digit5}|{Digit6}|{Digit7}|{Digit8}|{Digit9} 92OctalDigit [0-7] 93NonZeroDigit [1-9] 94 95UnicodeEscape [\\][u]+{HexDigit}{HexDigit}{HexDigit}{HexDigit} 96OctalEscape [\\]{OctalDigit}({Digit}({Digit})?)? 97SimpleEscape [\\]([']|["]|[\\]|[rnbft]) 98Escape {SimpleEscape}|{UnicodeEscape}|{OctalEscape} 99 100IntSuffix [Ll] 101DecimalNum ([0]|{NonZeroDigit}{Digit}*){IntSuffix}? 102OctalNum [0]{OctalDigit}+{IntSuffix}? 103HexNum [0][xX]{HexDigit}+{IntSuffix}? 
IntegerLiteral  {DecimalNum}|{OctalNum}|{HexNum}

 /* Floating point literals: decimal forms with an optional exponent and
  * suffix, and hexadecimal forms with a mandatory binary ('p') exponent. */
Sign            [+-]
FloatSuffix     [fF]|[dD]
SignedInt       {Sign}?{Digit}+
DecimalExponent [eE]{SignedInt}?
BinaryExponent  [pP]{SignedInt}?
Float1          {Digit}+[\.]{Digit}*{DecimalExponent}?{FloatSuffix}?
Float2          [\.]{Digit}+{DecimalExponent}?{FloatSuffix}?
Float3          {Digit}+{DecimalExponent}{FloatSuffix}?
Float4          {Digit}+{DecimalExponent}?{FloatSuffix}
HexFloatNum     [0][xX]{HexDigit}*[\.]{HexDigit}+
HexFloat1       {HexNum}[\.]?{BinaryExponent}{FloatSuffix}?
HexFloat2       {HexFloatNum}{BinaryExponent}{FloatSuffix}?
FloatingPoint   {Float1}|{Float2}|{Float3}|{Float4}|{HexFloat1}|{HexFloat2}

 /* Exclusive start condition that is active between the opening and the
  * closing delimiter of a block comment. */
%x IN_BLOCKCOMMENT

%%

 /* whitespace, newlines, preprocessor-statements and comments */

[ \f\t]+        /* skip */ ;
[\n]            /* skip */ ;

"//"[^\n]*      /* line comments, skip */ ;
"#"[^\n]*       /* preprocessor statement, skip */ ;

"/*"            BEGIN(IN_BLOCKCOMMENT);
<IN_BLOCKCOMMENT>{
[^*\n]*         /* eat anything that's not a '*' */ ;
"*"+[^*/\n]*    /* eat up '*'s that are not followed by slashes or newlines */;
[\n]            /* skip */ ;
"*"+"/"         BEGIN(INITIAL);
<<EOF>> {
    qWarning() << "Encountered end of file in an unclosed block comment";
    return Parser::Token_EOF;
}
}


 /* separators */

"("             return Parser::Token_LPAREN;
")"             return Parser::Token_RPAREN;
"{"             return Parser::Token_LBRACE;
"}"             return Parser::Token_RBRACE;
"["             return Parser::Token_LBRACKET;
"]"             return Parser::Token_RBRACKET;
","             return Parser::Token_COMMA;
";"             return Parser::Token_SEMICOLON;
"."             return Parser::Token_DOT;
"->"            return Parser::Token_ARROW;
":"             return Parser::Token_COLON;


 /* operators */

"?"             return Parser::Token_QUESTION;
"!"             return Parser::Token_NOT;
"~"             return Parser::Token_TILDE;
"=="            return Parser::Token_EQUAL_EQUAL;
"<"             return Parser::Token_LESS;
"<="            return Parser::Token_LESS_EQUAL;
">"             return Parser::Token_GREATER;
">="            return Parser::Token_GREATER_EQUAL;
"!="            return Parser::Token_NOT_EQUAL;
"&&"            return Parser::Token_AND_AND;
"||"            return Parser::Token_OR_OR;
"++"            return Parser::Token_PLUS_PLUS;
"--"            return Parser::Token_MINUS_MINUS;
"="             return Parser::Token_EQUAL;
"+"             return Parser::Token_PLUS;
"+="            return Parser::Token_PLUS_EQUAL;
"-"             return Parser::Token_MINUS;
"-="            return Parser::Token_MINUS_EQUAL;
"*"             return Parser::Token_STAR;
"*="            return Parser::Token_STAR_EQUAL;
"/"             return Parser::Token_DIVIDE;
"/="            return Parser::Token_DIVIDE_EQUAL;
"&"             return Parser::Token_AND;
"&="            return Parser::Token_AND_EQUAL;
"|"             return Parser::Token_OR;
"|="            return Parser::Token_OR_EQUAL;
"^"             return Parser::Token_XOR;
"^="            return Parser::Token_XOR_EQUAL;
"%"             return Parser::Token_REMAINDER;
"%="            return Parser::Token_REMAINDER_EQUAL;
"<<"            return Parser::Token_LSHIFT;
"<<="           return Parser::Token_LSHIFT_EQUAL;
">>"            return Parser::Token_RSHIFT;
">>="           return Parser::Token_RSHIFT_EQUAL;
"..."           return Parser::Token_ELLIPSIS;

 /* keywords */
 /* NOTE(review): "struct" and "sizeof" have no rule in this list and are
  * therefore tokenized as identifiers - confirm that this is intended. */

"break"         return Parser::Token_BREAK;
"case"          return Parser::Token_CASE;
"continue"      return Parser::Token_CONTINUE;
"default"       return Parser::Token_DEFAULT;
"do"            return Parser::Token_DO;
"else"          return Parser::Token_ELSE;
"enum"          return Parser::Token_ENUM;
"for"           return Parser::Token_FOR;
"goto"          return Parser::Token_GOTO;
"if"            return Parser::Token_IF;
"return"        return Parser::Token_RETURN;
"switch"        return Parser::Token_SWITCH;
"while"         return Parser::Token_WHILE;
"static"        return Parser::Token_STATIC;
"volatile"      return Parser::Token_VOLATILE;
"__volatile__"  return Parser::Token_VOLATILE;
"const"         return Parser::Token_CONST;
"typedef"       return Parser::Token_TYPEDEF;
"extern"        return Parser::Token_EXTERN;
"auto"          return Parser::Token_AUTO;
"register"      return Parser::Token_REGISTER;
"void"          return Parser::Token_VOID;
"int"           return Parser::Token_INT;
"char"          return Parser::Token_CHAR;
"short"         return Parser::Token_SHORT;
"long"          return Parser::Token_LONG;
"signed"        return Parser::Token_SIGNED;
"unsigned"      return Parser::Token_UNSIGNED;
"float"         return Parser::Token_FLOAT;
"double"        return Parser::Token_DOUBLE;
"union"         return Parser::Token_UNION;
"asm"           return Parser::Token_ASM;
"__asm__"       return Parser::Token_ASM;
"__extension__" return Parser::Token_EXTENSION;
"__inline"      return Parser::Token_INLINE;
"inline"        return Parser::Token_INLINE;

 /* characters and strings (C with unicode-support) */
 /* For each literal kind the first rule matches well-formed literals. The
  * second, more permissive rule also matches unknown escapes and literals
  * cut off by a newline; it warns but still returns the same token. */

[']({Escape}|{Multibyte}|[^\\\n\'])[']  return Parser::Token_X_CONSTANT;
[']({Escape}|{Multibyte}|[\\][^\\\n\']|[^\\\n\'])*([\\]?[\n]|[']) {
    qWarning() << QString("Invalid character literal: %1").arg(yytext);
    return Parser::Token_X_CONSTANT;
}

["]({Escape}|{Multibyte}|[^\\\n\"])*["]  return Parser::Token_STRING_LITERAL;
["]({Escape}|{Multibyte}|[\\][^\\\n\"]|[^\\\n\"])*([\\]?[\n]|["]) {
    qWarning() << QString("Invalid string literal: %1").arg(yytext);
    return Parser::Token_STRING_LITERAL;
}


 /* identifiers and number literals */

{Letter}({Letter}|{Digit})*  return Parser::Token_IDENTIFIER;

{IntegerLiteral}  return Parser::Token_X_CONSTANT;
{FloatingPoint}   return Parser::Token_X_CONSTANT;


 /* everything else is not a valid lexeme */

.               {
                    qWarning() << "INVALID TOKEN";
                    exit(-1);
                }

%%

namespace cc
{

// Creates a lexer for the given parser and immediately positions it at the
// start of `contents` (see restart()).
Lexer::Lexer( Parser *parser, char *contents )
{
    restart( parser, contents );
}

// Re-targets the lexer at a new parser / input buffer and resets all
// scanning state. `contents` is read byte by byte until a 0 byte is found
// (see LexerInput()), so it has to be NUL-terminated.
void Lexer::restart( Parser *parser, char *contents )
{
    m_parser = parser;
    m_locationTable = parser->tokenStream->locationTable();
    m_contents = contents;
    m_tokenBegin = m_tokenEnd = 0;
    m_currentOffset = 0;

    // check for and ignore the UTF-8 byte order mark; the && chain stops at
    // the first mismatch, so nothing behind the terminating 0 byte is read
    unsigned char *ucontents = (unsigned char *) m_contents;
    if ( ucontents[0] == 0xEF && ucontents[1] == 0xBB && ucontents[2] == 0xBF )
    {
        m_tokenBegin = m_tokenEnd = 3;
        m_currentOffset = 3;
    }

    yyrestart(NULL);
    BEGIN(INITIAL); // is not set automatically by yyrestart()
}

// Input callback used by the generated scanner: hands out the buffer one
// character at a time.
//
// Returns 1 after storing the next character in buf[0], or 0 when the
// terminating 0 byte of the buffer is reached. Line breaks are normalized:
// "\r\n" and a lone '\r' are both delivered as a single '\n', and every line
// break is reported to the location table.
int Lexer::LexerInput( char *buf, int /*max_size*/ )
{
    int c = m_contents[ m_currentOffset ];

    // End of input: do not step over the terminator, so that a repeated
    // call keeps returning 0 instead of reading behind the buffer.
    if ( c == 0 )
        return 0;

    m_currentOffset++;

    switch ( c )
    {
    case '\r':
        c = '\n'; // only have one single line break character: '\n'

        // m_currentOffset already refers to the character after the '\r'.
        // (The previous check looked at m_currentOffset + 1, i.e. one
        // character too far: "\r\n" was reported as two line breaks and a
        // trailing '\r' caused a read behind the terminating 0 byte.)
        if ( m_contents[m_currentOffset] == '\n' )
        {
            m_currentOffset++;
            m_tokenEnd++;
        }

        // fall through
    case '\n':
        m_locationTable->newline( m_currentOffset - 1 );
        break;

    default:
        break;
    }

    buf[0] = c;
    return 1;
}

} // end of namespace cc