1 // lex.h -- Go frontend lexer.     -*- C++ -*-
2 
3 // Copyright 2009 The Go Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6 
7 #ifndef GO_LEX_H
8 #define GO_LEX_H
9 
10 #include <mpfr.h>
11 
12 #include "operator.h"
13 #include "go-linemap.h"
14 
15 struct Unicode_range;
16 
// The keywords known to the lexer.  KEYWORD_INVALID comes first and
// stands for "not a keyword"; every enumerator after it must be kept
// in sorted order.  The list must also stay in step with the
// Keywords::mapping_ array in lex.cc, so add or remove entries in
// both places together.

enum Keyword
{
  // Not a keyword.
  KEYWORD_INVALID = 0,
  KEYWORD_ASM,
  KEYWORD_BREAK,
  KEYWORD_CASE,
  KEYWORD_CHAN,
  KEYWORD_CONST,
  KEYWORD_CONTINUE,
  KEYWORD_DEFAULT,
  KEYWORD_DEFER,
  KEYWORD_ELSE,
  KEYWORD_FALLTHROUGH,
  KEYWORD_FOR,
  KEYWORD_FUNC,
  KEYWORD_GO,
  KEYWORD_GOTO,
  KEYWORD_IF,
  KEYWORD_IMPORT,
  KEYWORD_INTERFACE,
  KEYWORD_MAP,
  KEYWORD_PACKAGE,
  KEYWORD_RANGE,
  KEYWORD_RETURN,
  KEYWORD_SELECT,
  KEYWORD_STRUCT,
  KEYWORD_SWITCH,
  KEYWORD_TYPE,
  KEYWORD_VAR
};
51 
52 // A token returned from the lexer.
53 
54 class Token
55 {
56  public:
57   // Token classification.
58   enum Classification
59   {
60     // Token is invalid.
61     TOKEN_INVALID,
62     // Token indicates end of input.
63     TOKEN_EOF,
64     // Token is a keyword.
65     TOKEN_KEYWORD,
66     // Token is an identifier.
67     TOKEN_IDENTIFIER,
68     // Token is a string of characters.
69     TOKEN_STRING,
70     // Token is an operator.
71     TOKEN_OPERATOR,
72     // Token is a character constant.
73     TOKEN_CHARACTER,
74     // Token is an integer.
75     TOKEN_INTEGER,
76     // Token is a floating point number.
77     TOKEN_FLOAT,
78     // Token is an imaginary number.
79     TOKEN_IMAGINARY
80   };
81 
82   ~Token();
83   Token(const Token&);
84   Token& operator=(const Token&);
85 
86   // Get token classification.
87   Classification
classification()88   classification() const
89   { return this->classification_; }
90 
91   // Make a token for an invalid value.
92   static Token
make_invalid_token(Location location)93   make_invalid_token(Location location)
94   { return Token(TOKEN_INVALID, location); }
95 
96   // Make a token representing end of file.
97   static Token
make_eof_token(Location location)98   make_eof_token(Location location)
99   { return Token(TOKEN_EOF, location); }
100 
101   // Make a keyword token.
102   static Token
make_keyword_token(Keyword keyword,Location location)103   make_keyword_token(Keyword keyword, Location location)
104   {
105     Token tok(TOKEN_KEYWORD, location);
106     tok.u_.keyword = keyword;
107     return tok;
108   }
109 
110   // Make an identifier token.
111   static Token
make_identifier_token(const std::string & value,bool is_exported,Location location)112   make_identifier_token(const std::string& value, bool is_exported,
113 			Location location)
114   {
115     Token tok(TOKEN_IDENTIFIER, location);
116     tok.u_.identifier_value.name = new std::string(value);
117     tok.u_.identifier_value.is_exported = is_exported;
118     return tok;
119   }
120 
121   // Make a quoted string token.
122   static Token
make_string_token(const std::string & value,Location location)123   make_string_token(const std::string& value, Location location)
124   {
125     Token tok(TOKEN_STRING, location);
126     tok.u_.string_value = new std::string(value);
127     return tok;
128   }
129 
130   // Make an operator token.
131   static Token
make_operator_token(Operator op,Location location)132   make_operator_token(Operator op, Location location)
133   {
134     Token tok(TOKEN_OPERATOR, location);
135     tok.u_.op = op;
136     return tok;
137   }
138 
139   // Make a character constant token.
140   static Token
make_character_token(mpz_t val,Location location)141   make_character_token(mpz_t val, Location location)
142   {
143     Token tok(TOKEN_CHARACTER, location);
144     mpz_init(tok.u_.integer_value);
145     mpz_swap(tok.u_.integer_value, val);
146     return tok;
147   }
148 
149   // Make an integer token.
150   static Token
make_integer_token(mpz_t val,Location location)151   make_integer_token(mpz_t val, Location location)
152   {
153     Token tok(TOKEN_INTEGER, location);
154     mpz_init(tok.u_.integer_value);
155     mpz_swap(tok.u_.integer_value, val);
156     return tok;
157   }
158 
159   // Make a float token.
160   static Token
make_float_token(mpfr_t val,Location location)161   make_float_token(mpfr_t val, Location location)
162   {
163     Token tok(TOKEN_FLOAT, location);
164     mpfr_init(tok.u_.float_value);
165     mpfr_swap(tok.u_.float_value, val);
166     return tok;
167   }
168 
169   // Make a token for an imaginary number.
170   static Token
make_imaginary_token(mpfr_t val,Location location)171   make_imaginary_token(mpfr_t val, Location location)
172   {
173     Token tok(TOKEN_IMAGINARY, location);
174     mpfr_init(tok.u_.float_value);
175     mpfr_swap(tok.u_.float_value, val);
176     return tok;
177   }
178 
179   // Get the location of the token.
180   Location
location()181   location() const
182   { return this->location_; }
183 
184   // Return whether this is an invalid token.
185   bool
is_invalid()186   is_invalid() const
187   { return this->classification_ == TOKEN_INVALID; }
188 
189   // Return whether this is the EOF token.
190   bool
is_eof()191   is_eof() const
192   { return this->classification_ == TOKEN_EOF; }
193 
194   // Return the keyword value for a keyword token.
195   Keyword
keyword()196   keyword() const
197   {
198     go_assert(this->classification_ == TOKEN_KEYWORD);
199     return this->u_.keyword;
200   }
201 
202   // Return whether this is an identifier.
203   bool
is_identifier()204   is_identifier() const
205   { return this->classification_ == TOKEN_IDENTIFIER; }
206 
207   // Return the identifier.
208   const std::string&
identifier()209   identifier() const
210   {
211     go_assert(this->classification_ == TOKEN_IDENTIFIER);
212     return *this->u_.identifier_value.name;
213   }
214 
215   // Return whether the identifier is exported.
216   bool
is_identifier_exported()217   is_identifier_exported() const
218   {
219     go_assert(this->classification_ == TOKEN_IDENTIFIER);
220     return this->u_.identifier_value.is_exported;
221   }
222 
223   // Return whether this is a string.
224   bool
is_string()225   is_string() const
226   {
227     return this->classification_ == TOKEN_STRING;
228   }
229 
230   // Return the value of a string.  The returned value is a string of
231   // UTF-8 characters.
232   std::string
string_value()233   string_value() const
234   {
235     go_assert(this->classification_ == TOKEN_STRING);
236     return *this->u_.string_value;
237   }
238 
239   // Return the value of a character constant.
240   const mpz_t*
character_value()241   character_value() const
242   {
243     go_assert(this->classification_ == TOKEN_CHARACTER);
244     return &this->u_.integer_value;
245   }
246 
247   // Return the value of an integer.
248   const mpz_t*
integer_value()249   integer_value() const
250   {
251     go_assert(this->classification_ == TOKEN_INTEGER);
252     return &this->u_.integer_value;
253   }
254 
255   // Return the value of a float.
256   const mpfr_t*
float_value()257   float_value() const
258   {
259     go_assert(this->classification_ == TOKEN_FLOAT);
260     return &this->u_.float_value;
261   }
262 
263   // Return the value of an imaginary number.
264   const mpfr_t*
imaginary_value()265   imaginary_value() const
266   {
267     go_assert(this->classification_ == TOKEN_IMAGINARY);
268     return &this->u_.float_value;
269   }
270 
271   // Return the operator value for an operator token.
272   Operator
op()273   op() const
274   {
275     go_assert(this->classification_ == TOKEN_OPERATOR);
276     return this->u_.op;
277   }
278 
279   // Return whether this token is KEYWORD.
280   bool
is_keyword(Keyword keyword)281   is_keyword(Keyword keyword) const
282   {
283     return (this->classification_ == TOKEN_KEYWORD
284 	    && this->u_.keyword == keyword);
285   }
286 
287   // Return whether this token is OP.
288   bool
is_op(Operator op)289   is_op(Operator op) const
290   { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
291 
292   // Print the token for debugging.
293   void
294   print(FILE*) const;
295 
296  private:
297   // Private constructor used by make_..._token functions above.
298   Token(Classification, Location);
299 
300   // Clear the token.
301   void
302   clear();
303 
304   // The token classification.
305   Classification classification_;
306   union
307   {
308     // The keyword value for TOKEN_KEYWORD.
309     Keyword keyword;
310     // The token value for TOKEN_IDENTIFIER.
311     struct
312     {
313       // The name of the identifier.  This has been mangled to only
314       // include ASCII characters.
315       std::string* name;
316       // Whether this name should be exported.  This is true if the
317       // first letter in the name is upper case.
318       bool is_exported;
319     } identifier_value;
320     // The string value for TOKEN_STRING.
321     std::string* string_value;
322     // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
323     mpz_t integer_value;
324     // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
325     mpfr_t float_value;
326     // The token value for TOKEN_OPERATOR or the keyword value
327     Operator op;
328   } u_;
329   // The source location.
330   Location location_;
331 };
332 
333 // The lexer itself.
334 
335 class Lex
336 {
337  public:
338   Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
339 
340   ~Lex();
341 
342   // Return the next token.
343   Token
344   next_token();
345 
346   // Return the contents of any current //extern comment.
347   const std::string&
extern_name()348   extern_name() const
349   { return this->extern_; }
350 
351   // Return whether we have seen a //go:nointerface comment, clearing
352   // the flag.
353   bool
get_and_clear_nointerface()354   get_and_clear_nointerface()
355   {
356     bool ret = this->saw_nointerface_;
357     this->saw_nointerface_ = false;
358     return ret;
359   }
360 
361   // Return whether the identifier NAME should be exported.  NAME is a
362   // mangled name which includes only ASCII characters.
363   static bool
364   is_exported_name(const std::string& name);
365 
366   // Return whether the identifier NAME is invalid.  When we see an
367   // invalid character we still build an identifier, but we use a
368   // magic string to indicate that the identifier is invalid.  We then
369   // use this to avoid knockon errors.
370   static bool
371   is_invalid_identifier(const std::string& name);
372 
373   // A helper function.  Append V to STR.  IS_CHARACTER is true if V
374   // is a Unicode character which should be converted into UTF-8,
375   // false if it is a byte value to be appended directly.  The
376   // location is used to warn about an out of range character.
377   static void
378   append_char(unsigned int v, bool is_charater, std::string* str,
379 	      Location);
380 
381   // A helper function.  Fetch a UTF-8 character from STR and store it
382   // in *VALUE.  Return the number of bytes read from STR.  Return 0
383   // if STR does not point to a valid UTF-8 character.
384   static int
385   fetch_char(const char* str, unsigned int *value);
386 
387   // Return whether C is a Unicode or "C" locale space character.
388   static bool
389   is_unicode_space(unsigned int c);
390 
391  private:
392   ssize_t
393   get_line();
394 
395   bool
396   require_line();
397 
398   // The current location.
399   Location
400   location() const;
401 
402   // A position CHARS column positions before the current location.
403   Location
404   earlier_location(int chars) const;
405 
406   static bool
407   is_hex_digit(char);
408 
409   static unsigned char
octal_value(char c)410   octal_value(char c)
411   { return c - '0'; }
412 
413   Token
make_invalid_token()414   make_invalid_token()
415   { return Token::make_invalid_token(this->location()); }
416 
417   Token
make_eof_token()418   make_eof_token()
419   { return Token::make_eof_token(this->location()); }
420 
421   Token
make_operator(Operator op,int chars)422   make_operator(Operator op, int chars)
423   { return Token::make_operator_token(op, this->earlier_location(chars)); }
424 
425   Token
426   gather_identifier();
427 
428   static bool
429   could_be_exponent(const char*, const char*);
430 
431   Token
432   gather_number();
433 
434   Token
435   gather_character();
436 
437   Token
438   gather_string();
439 
440   Token
441   gather_raw_string();
442 
443   const char*
444   advance_one_utf8_char(const char*, unsigned int*, bool*);
445 
446   const char*
447   advance_one_char(const char*, bool, unsigned int*, bool*);
448 
449   static bool
450   is_unicode_digit(unsigned int c);
451 
452   static bool
453   is_unicode_letter(unsigned int c);
454 
455   static bool
456   is_unicode_uppercase(unsigned int c);
457 
458   static bool
459   is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
460 		      size_t range_size);
461 
462   Operator
463   three_character_operator(char, char, char);
464 
465   Operator
466   two_character_operator(char, char);
467 
468   Operator
469   one_character_operator(char);
470 
471   bool
472   skip_c_comment();
473 
474   void
475   skip_cpp_comment();
476 
477   // The input file name.
478   const char* input_file_name_;
479   // The input file.
480   FILE* input_file_;
481   // The object used to keep track of file names and line numbers.
482   Linemap* linemap_;
483   // The line buffer.  This holds the current line.
484   char* linebuf_;
485   // The size of the line buffer.
486   size_t linebufsize_;
487   // The nmber of characters in the current line.
488   size_t linesize_;
489   // The current offset in linebuf_.
490   size_t lineoff_;
491   // The current line number.
492   size_t lineno_;
493   // Whether to add a semicolon if we see a newline now.
494   bool add_semi_at_eol_;
495   // Whether we just saw a magic go:nointerface comment.
496   bool saw_nointerface_;
497   // The external name to use for a function declaration, from a magic
498   // //extern comment.
499   std::string extern_;
500 };
501 
502 #endif // !defined(GO_LEX_H)
503