1 // lex.h -- Go frontend lexer. -*- C++ -*- 2 3 // Copyright 2009 The Go Authors. All rights reserved. 4 // Use of this source code is governed by a BSD-style 5 // license that can be found in the LICENSE file. 6 7 #ifndef GO_LEX_H 8 #define GO_LEX_H 9 10 #include <mpfr.h> 11 12 #include "operator.h" 13 #include "go-linemap.h" 14 15 struct Unicode_range; 16 17 // The keywords. These must be in sorted order, other than 18 // KEYWORD_INVALID. They must match the Keywords::mapping_ array in 19 // lex.cc. 20 21 enum Keyword 22 { 23 KEYWORD_INVALID, // Not a keyword. 24 KEYWORD_ASM, 25 KEYWORD_BREAK, 26 KEYWORD_CASE, 27 KEYWORD_CHAN, 28 KEYWORD_CONST, 29 KEYWORD_CONTINUE, 30 KEYWORD_DEFAULT, 31 KEYWORD_DEFER, 32 KEYWORD_ELSE, 33 KEYWORD_FALLTHROUGH, 34 KEYWORD_FOR, 35 KEYWORD_FUNC, 36 KEYWORD_GO, 37 KEYWORD_GOTO, 38 KEYWORD_IF, 39 KEYWORD_IMPORT, 40 KEYWORD_INTERFACE, 41 KEYWORD_MAP, 42 KEYWORD_PACKAGE, 43 KEYWORD_RANGE, 44 KEYWORD_RETURN, 45 KEYWORD_SELECT, 46 KEYWORD_STRUCT, 47 KEYWORD_SWITCH, 48 KEYWORD_TYPE, 49 KEYWORD_VAR 50 }; 51 52 // A token returned from the lexer. 53 54 class Token 55 { 56 public: 57 // Token classification. 58 enum Classification 59 { 60 // Token is invalid. 61 TOKEN_INVALID, 62 // Token indicates end of input. 63 TOKEN_EOF, 64 // Token is a keyword. 65 TOKEN_KEYWORD, 66 // Token is an identifier. 67 TOKEN_IDENTIFIER, 68 // Token is a string of characters. 69 TOKEN_STRING, 70 // Token is an operator. 71 TOKEN_OPERATOR, 72 // Token is a character constant. 73 TOKEN_CHARACTER, 74 // Token is an integer. 75 TOKEN_INTEGER, 76 // Token is a floating point number. 77 TOKEN_FLOAT, 78 // Token is an imaginary number. 79 TOKEN_IMAGINARY 80 }; 81 82 ~Token(); 83 Token(const Token&); 84 Token& operator=(const Token&); 85 86 // Get token classification. 87 Classification classification()88 classification() const 89 { return this->classification_; } 90 91 // Make a token for an invalid value. 92 static Token make_invalid_token(Location location)93 make_invalid_token(Location location) 94 { return Token(TOKEN_INVALID, location); } 95 96 // Make a token representing end of file. 97 static Token make_eof_token(Location location)98 make_eof_token(Location location) 99 { return Token(TOKEN_EOF, location); } 100 101 // Make a keyword token. 102 static Token make_keyword_token(Keyword keyword,Location location)103 make_keyword_token(Keyword keyword, Location location) 104 { 105 Token tok(TOKEN_KEYWORD, location); 106 tok.u_.keyword = keyword; 107 return tok; 108 } 109 110 // Make an identifier token. 111 static Token make_identifier_token(const std::string & value,bool is_exported,Location location)112 make_identifier_token(const std::string& value, bool is_exported, 113 Location location) 114 { 115 Token tok(TOKEN_IDENTIFIER, location); 116 tok.u_.identifier_value.name = new std::string(value); 117 tok.u_.identifier_value.is_exported = is_exported; 118 return tok; 119 } 120 121 // Make a quoted string token. 122 static Token make_string_token(const std::string & value,Location location)123 make_string_token(const std::string& value, Location location) 124 { 125 Token tok(TOKEN_STRING, location); 126 tok.u_.string_value = new std::string(value); 127 return tok; 128 } 129 130 // Make an operator token. 131 static Token make_operator_token(Operator op,Location location)132 make_operator_token(Operator op, Location location) 133 { 134 Token tok(TOKEN_OPERATOR, location); 135 tok.u_.op = op; 136 return tok; 137 } 138 139 // Make a character constant token. 140 static Token make_character_token(mpz_t val,Location location)141 make_character_token(mpz_t val, Location location) 142 { 143 Token tok(TOKEN_CHARACTER, location); 144 mpz_init(tok.u_.integer_value); 145 mpz_swap(tok.u_.integer_value, val); 146 return tok; 147 } 148 149 // Make an integer token. 150 static Token make_integer_token(mpz_t val,Location location)151 make_integer_token(mpz_t val, Location location) 152 { 153 Token tok(TOKEN_INTEGER, location); 154 mpz_init(tok.u_.integer_value); 155 mpz_swap(tok.u_.integer_value, val); 156 return tok; 157 } 158 159 // Make a float token. 160 static Token make_float_token(mpfr_t val,Location location)161 make_float_token(mpfr_t val, Location location) 162 { 163 Token tok(TOKEN_FLOAT, location); 164 mpfr_init(tok.u_.float_value); 165 mpfr_swap(tok.u_.float_value, val); 166 return tok; 167 } 168 169 // Make a token for an imaginary number. 170 static Token make_imaginary_token(mpfr_t val,Location location)171 make_imaginary_token(mpfr_t val, Location location) 172 { 173 Token tok(TOKEN_IMAGINARY, location); 174 mpfr_init(tok.u_.float_value); 175 mpfr_swap(tok.u_.float_value, val); 176 return tok; 177 } 178 179 // Get the location of the token. 180 Location location()181 location() const 182 { return this->location_; } 183 184 // Return whether this is an invalid token. 185 bool is_invalid()186 is_invalid() const 187 { return this->classification_ == TOKEN_INVALID; } 188 189 // Return whether this is the EOF token. 190 bool is_eof()191 is_eof() const 192 { return this->classification_ == TOKEN_EOF; } 193 194 // Return the keyword value for a keyword token. 195 Keyword keyword()196 keyword() const 197 { 198 go_assert(this->classification_ == TOKEN_KEYWORD); 199 return this->u_.keyword; 200 } 201 202 // Return whether this is an identifier. 203 bool is_identifier()204 is_identifier() const 205 { return this->classification_ == TOKEN_IDENTIFIER; } 206 207 // Return the identifier. 208 const std::string& identifier()209 identifier() const 210 { 211 go_assert(this->classification_ == TOKEN_IDENTIFIER); 212 return *this->u_.identifier_value.name; 213 } 214 215 // Return whether the identifier is exported. 216 bool is_identifier_exported()217 is_identifier_exported() const 218 { 219 go_assert(this->classification_ == TOKEN_IDENTIFIER); 220 return this->u_.identifier_value.is_exported; 221 } 222 223 // Return whether this is a string. 224 bool is_string()225 is_string() const 226 { 227 return this->classification_ == TOKEN_STRING; 228 } 229 230 // Return the value of a string. The returned value is a string of 231 // UTF-8 characters. 232 std::string string_value()233 string_value() const 234 { 235 go_assert(this->classification_ == TOKEN_STRING); 236 return *this->u_.string_value; 237 } 238 239 // Return the value of a character constant. 240 const mpz_t* character_value()241 character_value() const 242 { 243 go_assert(this->classification_ == TOKEN_CHARACTER); 244 return &this->u_.integer_value; 245 } 246 247 // Return the value of an integer. 248 const mpz_t* integer_value()249 integer_value() const 250 { 251 go_assert(this->classification_ == TOKEN_INTEGER); 252 return &this->u_.integer_value; 253 } 254 255 // Return the value of a float. 256 const mpfr_t* float_value()257 float_value() const 258 { 259 go_assert(this->classification_ == TOKEN_FLOAT); 260 return &this->u_.float_value; 261 } 262 263 // Return the value of an imaginary number. 264 const mpfr_t* imaginary_value()265 imaginary_value() const 266 { 267 go_assert(this->classification_ == TOKEN_IMAGINARY); 268 return &this->u_.float_value; 269 } 270 271 // Return the operator value for an operator token. 272 Operator op()273 op() const 274 { 275 go_assert(this->classification_ == TOKEN_OPERATOR); 276 return this->u_.op; 277 } 278 279 // Return whether this token is KEYWORD. 280 bool is_keyword(Keyword keyword)281 is_keyword(Keyword keyword) const 282 { 283 return (this->classification_ == TOKEN_KEYWORD 284 && this->u_.keyword == keyword); 285 } 286 287 // Return whether this token is OP. 288 bool is_op(Operator op)289 is_op(Operator op) const 290 { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; } 291 292 // Print the token for debugging. 293 void 294 print(FILE*) const; 295 296 private: 297 // Private constructor used by make_..._token functions above. 298 Token(Classification, Location); 299 300 // Clear the token. 301 void 302 clear(); 303 304 // The token classification. 305 Classification classification_; 306 union 307 { 308 // The keyword value for TOKEN_KEYWORD. 309 Keyword keyword; 310 // The token value for TOKEN_IDENTIFIER. 311 struct 312 { 313 // The name of the identifier. This has been mangled to only 314 // include ASCII characters. 315 std::string* name; 316 // Whether this name should be exported. This is true if the 317 // first letter in the name is upper case. 318 bool is_exported; 319 } identifier_value; 320 // The string value for TOKEN_STRING. 321 std::string* string_value; 322 // The token value for TOKEN_CHARACTER or TOKEN_INTEGER. 323 mpz_t integer_value; 324 // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY. 325 mpfr_t float_value; 326 // The token value for TOKEN_OPERATOR or the keyword value 327 Operator op; 328 } u_; 329 // The source location. 330 Location location_; 331 }; 332 333 // The lexer itself. 334 335 class Lex 336 { 337 public: 338 Lex(const char* input_file_name, FILE* input_file, Linemap *linemap); 339 340 ~Lex(); 341 342 // Return the next token. 343 Token 344 next_token(); 345 346 // Return the contents of any current //extern comment. 347 const std::string& extern_name()348 extern_name() const 349 { return this->extern_; } 350 351 // Return whether we have seen a //go:nointerface comment, clearing 352 // the flag. 353 bool get_and_clear_nointerface()354 get_and_clear_nointerface() 355 { 356 bool ret = this->saw_nointerface_; 357 this->saw_nointerface_ = false; 358 return ret; 359 } 360 361 // Return whether the identifier NAME should be exported. NAME is a 362 // mangled name which includes only ASCII characters. 363 static bool 364 is_exported_name(const std::string& name); 365 366 // Return whether the identifier NAME is invalid. When we see an 367 // invalid character we still build an identifier, but we use a 368 // magic string to indicate that the identifier is invalid. We then 369 // use this to avoid knockon errors. 370 static bool 371 is_invalid_identifier(const std::string& name); 372 373 // A helper function. Append V to STR. IS_CHARACTER is true if V 374 // is a Unicode character which should be converted into UTF-8, 375 // false if it is a byte value to be appended directly. The 376 // location is used to warn about an out of range character. 377 static void 378 append_char(unsigned int v, bool is_charater, std::string* str, 379 Location); 380 381 // A helper function. Fetch a UTF-8 character from STR and store it 382 // in *VALUE. Return the number of bytes read from STR. Return 0 383 // if STR does not point to a valid UTF-8 character. 384 static int 385 fetch_char(const char* str, unsigned int *value); 386 387 // Return whether C is a Unicode or "C" locale space character. 388 static bool 389 is_unicode_space(unsigned int c); 390 391 private: 392 ssize_t 393 get_line(); 394 395 bool 396 require_line(); 397 398 // The current location. 399 Location 400 location() const; 401 402 // A position CHARS column positions before the current location. 403 Location 404 earlier_location(int chars) const; 405 406 static bool 407 is_hex_digit(char); 408 409 static unsigned char octal_value(char c)410 octal_value(char c) 411 { return c - '0'; } 412 413 Token make_invalid_token()414 make_invalid_token() 415 { return Token::make_invalid_token(this->location()); } 416 417 Token make_eof_token()418 make_eof_token() 419 { return Token::make_eof_token(this->location()); } 420 421 Token make_operator(Operator op,int chars)422 make_operator(Operator op, int chars) 423 { return Token::make_operator_token(op, this->earlier_location(chars)); } 424 425 Token 426 gather_identifier(); 427 428 static bool 429 could_be_exponent(const char*, const char*); 430 431 Token 432 gather_number(); 433 434 Token 435 gather_character(); 436 437 Token 438 gather_string(); 439 440 Token 441 gather_raw_string(); 442 443 const char* 444 advance_one_utf8_char(const char*, unsigned int*, bool*); 445 446 const char* 447 advance_one_char(const char*, bool, unsigned int*, bool*); 448 449 static bool 450 is_unicode_digit(unsigned int c); 451 452 static bool 453 is_unicode_letter(unsigned int c); 454 455 static bool 456 is_unicode_uppercase(unsigned int c); 457 458 static bool 459 is_in_unicode_range(unsigned int C, const Unicode_range* ranges, 460 size_t range_size); 461 462 Operator 463 three_character_operator(char, char, char); 464 465 Operator 466 two_character_operator(char, char); 467 468 Operator 469 one_character_operator(char); 470 471 bool 472 skip_c_comment(); 473 474 void 475 skip_cpp_comment(); 476 477 // The input file name. 478 const char* input_file_name_; 479 // The input file. 480 FILE* input_file_; 481 // The object used to keep track of file names and line numbers. 482 Linemap* linemap_; 483 // The line buffer. This holds the current line. 484 char* linebuf_; 485 // The size of the line buffer. 486 size_t linebufsize_; 487 // The nmber of characters in the current line. 488 size_t linesize_; 489 // The current offset in linebuf_. 490 size_t lineoff_; 491 // The current line number. 492 size_t lineno_; 493 // Whether to add a semicolon if we see a newline now. 494 bool add_semi_at_eol_; 495 // Whether we just saw a magic go:nointerface comment. 496 bool saw_nointerface_; 497 // The external name to use for a function declaration, from a magic 498 // //extern comment. 499 std::string extern_; 500 }; 501 502 #endif // !defined(GO_LEX_H) 503