1 // lex.h -- Go frontend lexer.     -*- C++ -*-
2 
3 // Copyright 2009 The Go Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style
5 // license that can be found in the LICENSE file.
6 
7 #ifndef GO_LEX_H
8 #define GO_LEX_H
9 
10 #include <mpfr.h>
11 
12 #include "operator.h"
13 #include "go-linemap.h"
14 
15 struct Unicode_range;
16 
17 // The keywords.  These must be in sorted order, other than
18 // KEYWORD_INVALID.  They must match the Keywords::mapping_ array in
19 // lex.cc.
20 
enum Keyword
{
  // Sentinel for "not a keyword".  It is deliberately first and is the
  // only enumerator exempt from the alphabetical ordering below.
  KEYWORD_INVALID = 0,

  // The real keywords, kept in sorted order.  Do not reorder or insert
  // out of order: the numbering is positional.
  KEYWORD_ASM, KEYWORD_BREAK,
  KEYWORD_CASE, KEYWORD_CHAN, KEYWORD_CONST, KEYWORD_CONTINUE,
  KEYWORD_DEFAULT, KEYWORD_DEFER,
  KEYWORD_ELSE,
  KEYWORD_FALLTHROUGH, KEYWORD_FOR, KEYWORD_FUNC,
  KEYWORD_GO, KEYWORD_GOTO,
  KEYWORD_IF, KEYWORD_IMPORT, KEYWORD_INTERFACE,
  KEYWORD_MAP,
  KEYWORD_PACKAGE,
  KEYWORD_RANGE, KEYWORD_RETURN,
  KEYWORD_SELECT, KEYWORD_STRUCT, KEYWORD_SWITCH,
  KEYWORD_TYPE,
  KEYWORD_VAR
};
51 
52 // Pragmas built from magic comments and recorded for functions.
53 // These are used as bits in a bitmask.
54 // The set of values is intended to be the same as the gc compiler.
55 
enum GoPragma
{
  // Each enumerator is a distinct single-bit mask so that several
  // pragmas can be OR'ed together into one unsigned value.
  GOPRAGMA_NOINTERFACE       = 0x001,	// Method not in type descriptor.
  GOPRAGMA_NOESCAPE          = 0x002,	// Args do not escape.
  GOPRAGMA_NORACE            = 0x004,	// No race detector.
  GOPRAGMA_NOSPLIT           = 0x008,	// Do not split stack.
  GOPRAGMA_NOINLINE          = 0x010,	// Do not inline.
  GOPRAGMA_SYSTEMSTACK       = 0x020,	// Must run on system stack.
  GOPRAGMA_NOWRITEBARRIER    = 0x040,	// No write barriers.
  GOPRAGMA_NOWRITEBARRIERREC = 0x080,	// No write barriers here or callees.
  GOPRAGMA_CGOUNSAFEARGS     = 0x100,	// Pointer to arg is pointer to all.
  GOPRAGMA_UINTPTRESCAPES    = 0x200,	// uintptr(p) escapes.
  GOPRAGMA_NOTINHEAP         = 0x400	// Type is not in heap.
};
70 
71 // A token returned from the lexer.
72 
73 class Token
74 {
75  public:
76   // Token classification.
77   enum Classification
78   {
79     // Token is invalid.
80     TOKEN_INVALID,
81     // Token indicates end of input.
82     TOKEN_EOF,
83     // Token is a keyword.
84     TOKEN_KEYWORD,
85     // Token is an identifier.
86     TOKEN_IDENTIFIER,
87     // Token is a string of characters.
88     TOKEN_STRING,
89     // Token is an operator.
90     TOKEN_OPERATOR,
91     // Token is a character constant.
92     TOKEN_CHARACTER,
93     // Token is an integer.
94     TOKEN_INTEGER,
95     // Token is a floating point number.
96     TOKEN_FLOAT,
97     // Token is an imaginary number.
98     TOKEN_IMAGINARY
99   };
100 
101   ~Token();
102   Token(const Token&);
103   Token& operator=(const Token&);
104 
105   // Get token classification.
106   Classification
classification()107   classification() const
108   { return this->classification_; }
109 
110   // Make a token for an invalid value.
111   static Token
make_invalid_token(Location location)112   make_invalid_token(Location location)
113   { return Token(TOKEN_INVALID, location); }
114 
115   // Make a token representing end of file.
116   static Token
make_eof_token(Location location)117   make_eof_token(Location location)
118   { return Token(TOKEN_EOF, location); }
119 
120   // Make a keyword token.
121   static Token
make_keyword_token(Keyword keyword,Location location)122   make_keyword_token(Keyword keyword, Location location)
123   {
124     Token tok(TOKEN_KEYWORD, location);
125     tok.u_.keyword = keyword;
126     return tok;
127   }
128 
129   // Make an identifier token.
130   static Token
make_identifier_token(const std::string & value,bool is_exported,Location location)131   make_identifier_token(const std::string& value, bool is_exported,
132 			Location location)
133   {
134     Token tok(TOKEN_IDENTIFIER, location);
135     tok.u_.identifier_value.name = new std::string(value);
136     tok.u_.identifier_value.is_exported = is_exported;
137     return tok;
138   }
139 
140   // Make a quoted string token.
141   static Token
make_string_token(const std::string & value,Location location)142   make_string_token(const std::string& value, Location location)
143   {
144     Token tok(TOKEN_STRING, location);
145     tok.u_.string_value = new std::string(value);
146     return tok;
147   }
148 
149   // Make an operator token.
150   static Token
make_operator_token(Operator op,Location location)151   make_operator_token(Operator op, Location location)
152   {
153     Token tok(TOKEN_OPERATOR, location);
154     tok.u_.op = op;
155     return tok;
156   }
157 
158   // Make a character constant token.
159   static Token
make_character_token(mpz_t val,Location location)160   make_character_token(mpz_t val, Location location)
161   {
162     Token tok(TOKEN_CHARACTER, location);
163     mpz_init(tok.u_.integer_value);
164     mpz_swap(tok.u_.integer_value, val);
165     return tok;
166   }
167 
168   // Make an integer token.
169   static Token
make_integer_token(mpz_t val,Location location)170   make_integer_token(mpz_t val, Location location)
171   {
172     Token tok(TOKEN_INTEGER, location);
173     mpz_init(tok.u_.integer_value);
174     mpz_swap(tok.u_.integer_value, val);
175     return tok;
176   }
177 
178   // Make a float token.
179   static Token
make_float_token(mpfr_t val,Location location)180   make_float_token(mpfr_t val, Location location)
181   {
182     Token tok(TOKEN_FLOAT, location);
183     mpfr_init(tok.u_.float_value);
184     mpfr_swap(tok.u_.float_value, val);
185     return tok;
186   }
187 
188   // Make a token for an imaginary number.
189   static Token
make_imaginary_token(mpfr_t val,Location location)190   make_imaginary_token(mpfr_t val, Location location)
191   {
192     Token tok(TOKEN_IMAGINARY, location);
193     mpfr_init(tok.u_.float_value);
194     mpfr_swap(tok.u_.float_value, val);
195     return tok;
196   }
197 
198   // Get the location of the token.
199   Location
location()200   location() const
201   { return this->location_; }
202 
203   // Return whether this is an invalid token.
204   bool
is_invalid()205   is_invalid() const
206   { return this->classification_ == TOKEN_INVALID; }
207 
208   // Return whether this is the EOF token.
209   bool
is_eof()210   is_eof() const
211   { return this->classification_ == TOKEN_EOF; }
212 
213   // Return the keyword value for a keyword token.
214   Keyword
keyword()215   keyword() const
216   {
217     go_assert(this->classification_ == TOKEN_KEYWORD);
218     return this->u_.keyword;
219   }
220 
221   // Return whether this is an identifier.
222   bool
is_identifier()223   is_identifier() const
224   { return this->classification_ == TOKEN_IDENTIFIER; }
225 
226   // Return the identifier.
227   const std::string&
identifier()228   identifier() const
229   {
230     go_assert(this->classification_ == TOKEN_IDENTIFIER);
231     return *this->u_.identifier_value.name;
232   }
233 
234   // Return whether the identifier is exported.
235   bool
is_identifier_exported()236   is_identifier_exported() const
237   {
238     go_assert(this->classification_ == TOKEN_IDENTIFIER);
239     return this->u_.identifier_value.is_exported;
240   }
241 
242   // Return whether this is a string.
243   bool
is_string()244   is_string() const
245   {
246     return this->classification_ == TOKEN_STRING;
247   }
248 
249   // Return the value of a string.  The returned value is a string of
250   // UTF-8 characters.
251   std::string
string_value()252   string_value() const
253   {
254     go_assert(this->classification_ == TOKEN_STRING);
255     return *this->u_.string_value;
256   }
257 
258   // Return the value of a character constant.
259   const mpz_t*
character_value()260   character_value() const
261   {
262     go_assert(this->classification_ == TOKEN_CHARACTER);
263     return &this->u_.integer_value;
264   }
265 
266   // Return the value of an integer.
267   const mpz_t*
integer_value()268   integer_value() const
269   {
270     go_assert(this->classification_ == TOKEN_INTEGER);
271     return &this->u_.integer_value;
272   }
273 
274   // Return the value of a float.
275   const mpfr_t*
float_value()276   float_value() const
277   {
278     go_assert(this->classification_ == TOKEN_FLOAT);
279     return &this->u_.float_value;
280   }
281 
282   // Return the value of an imaginary number.
283   const mpfr_t*
imaginary_value()284   imaginary_value() const
285   {
286     go_assert(this->classification_ == TOKEN_IMAGINARY);
287     return &this->u_.float_value;
288   }
289 
290   // Return the operator value for an operator token.
291   Operator
op()292   op() const
293   {
294     go_assert(this->classification_ == TOKEN_OPERATOR);
295     return this->u_.op;
296   }
297 
298   // Return whether this token is KEYWORD.
299   bool
is_keyword(Keyword keyword)300   is_keyword(Keyword keyword) const
301   {
302     return (this->classification_ == TOKEN_KEYWORD
303 	    && this->u_.keyword == keyword);
304   }
305 
306   // Return whether this token is OP.
307   bool
is_op(Operator op)308   is_op(Operator op) const
309   { return this->classification_ == TOKEN_OPERATOR && this->u_.op == op; }
310 
311   // Print the token for debugging.
312   void
313   print(FILE*) const;
314 
315  private:
316   // Private constructor used by make_..._token functions above.
317   Token(Classification, Location);
318 
319   // Clear the token.
320   void
321   clear();
322 
323   // The token classification.
324   Classification classification_;
325   union
326   {
327     // The keyword value for TOKEN_KEYWORD.
328     Keyword keyword;
329     // The token value for TOKEN_IDENTIFIER.
330     struct
331     {
332       // The name of the identifier.  This has been mangled to only
333       // include ASCII characters.
334       std::string* name;
335       // Whether this name should be exported.  This is true if the
336       // first letter in the name is upper case.
337       bool is_exported;
338     } identifier_value;
339     // The string value for TOKEN_STRING.
340     std::string* string_value;
341     // The token value for TOKEN_CHARACTER or TOKEN_INTEGER.
342     mpz_t integer_value;
343     // The token value for TOKEN_FLOAT or TOKEN_IMAGINARY.
344     mpfr_t float_value;
345     // The token value for TOKEN_OPERATOR or the keyword value
346     Operator op;
347   } u_;
348   // The source location.
349   Location location_;
350 };
351 
352 // The lexer itself.
353 
354 class Lex
355 {
356  public:
357   Lex(const char* input_file_name, FILE* input_file, Linemap *linemap);
358 
359   ~Lex();
360 
361   // Return the next token.
362   Token
363   next_token();
364 
365   // Return the contents of any current //extern comment.
366   const std::string&
extern_name()367   extern_name() const
368   { return this->extern_; }
369 
370   // Return the current set of pragmas, and clear them.
371   unsigned int
get_and_clear_pragmas()372   get_and_clear_pragmas()
373   {
374     unsigned int ret = this->pragmas_;
375     this->pragmas_ = 0;
376     return ret;
377   }
378 
379   struct Linkname
380   {
381     std::string ext_name;	// External name.
382     bool is_exported;		// Whether the internal name is exported.
383     Location loc;		// Location of go:linkname directive.
384 
LinknameLinkname385     Linkname()
386       : ext_name(), is_exported(false), loc()
387     { }
388 
LinknameLinkname389     Linkname(const std::string& ext_name_a, bool is_exported_a, Location loc_a)
390       : ext_name(ext_name_a), is_exported(is_exported_a), loc(loc_a)
391     { }
392   };
393 
394   typedef std::map<std::string, Linkname> Linknames;
395 
396   // Return the linknames seen so far, or NULL if none, and clear the
397   // set.  These are from go:linkname compiler directives.
398   Linknames*
get_and_clear_linknames()399   get_and_clear_linknames()
400   {
401     Linknames* ret = this->linknames_;
402     this->linknames_ = NULL;
403     return ret;
404   }
405 
406   // Return whether the identifier NAME should be exported.  NAME is a
407   // mangled name which includes only ASCII characters.
408   static bool
409   is_exported_name(const std::string& name);
410 
411   // Return whether the identifier NAME is invalid.  When we see an
412   // invalid character we still build an identifier, but we use a
413   // magic string to indicate that the identifier is invalid.  We then
414   // use this to avoid knockon errors.
415   static bool
416   is_invalid_identifier(const std::string& name);
417 
418   // A helper function.  Append V to STR.  IS_CHARACTER is true if V
419   // is a Unicode character which should be converted into UTF-8,
420   // false if it is a byte value to be appended directly.  The
421   // location is used to warn about an out of range character.
422   static void
423   append_char(unsigned int v, bool is_charater, std::string* str,
424 	      Location);
425 
426   // A helper function.  Fetch a UTF-8 character from STR and store it
427   // in *VALUE.  Return the number of bytes read from STR.  Return 0
428   // if STR does not point to a valid UTF-8 character.
429   static int
430   fetch_char(const char* str, unsigned int *value);
431 
432   // Return whether C is a Unicode or "C" locale space character.
433   static bool
434   is_unicode_space(unsigned int c);
435 
436  private:
437   ssize_t
438   get_line();
439 
440   bool
441   require_line();
442 
443   // The current location.
444   Location
445   location() const;
446 
447   // A position CHARS column positions before the current location.
448   Location
449   earlier_location(int chars) const;
450 
451   static bool
452   is_hex_digit(char);
453 
454   static unsigned char
octal_value(char c)455   octal_value(char c)
456   { return c - '0'; }
457 
458   static unsigned
459   hex_val(char c);
460 
461   Token
make_invalid_token()462   make_invalid_token()
463   { return Token::make_invalid_token(this->location()); }
464 
465   Token
make_eof_token()466   make_eof_token()
467   { return Token::make_eof_token(this->location()); }
468 
469   Token
make_operator(Operator op,int chars)470   make_operator(Operator op, int chars)
471   { return Token::make_operator_token(op, this->earlier_location(chars)); }
472 
473   Token
474   gather_identifier();
475 
476   static bool
477   could_be_exponent(const char*, const char*);
478 
479   Token
480   gather_number();
481 
482   Token
483   gather_character();
484 
485   Token
486   gather_string();
487 
488   Token
489   gather_raw_string();
490 
491   const char*
492   advance_one_utf8_char(const char*, unsigned int*, bool*);
493 
494   const char*
495   advance_one_char(const char*, bool, unsigned int*, bool*);
496 
497   static bool
498   is_unicode_digit(unsigned int c);
499 
500   static bool
501   is_unicode_letter(unsigned int c);
502 
503   static bool
504   is_unicode_uppercase(unsigned int c);
505 
506   static bool
507   is_in_unicode_range(unsigned int C, const Unicode_range* ranges,
508 		      size_t range_size);
509 
510   Operator
511   three_character_operator(char, char, char);
512 
513   Operator
514   two_character_operator(char, char);
515 
516   Operator
517   one_character_operator(char);
518 
519   bool
520   skip_c_comment(bool* found_newline);
521 
522   void
523   skip_cpp_comment();
524 
525   // The input file name.
526   const char* input_file_name_;
527   // The input file.
528   FILE* input_file_;
529   // The object used to keep track of file names and line numbers.
530   Linemap* linemap_;
531   // The line buffer.  This holds the current line.
532   char* linebuf_;
533   // The size of the line buffer.
534   size_t linebufsize_;
535   // The nmber of characters in the current line.
536   size_t linesize_;
537   // The current offset in linebuf_.
538   size_t lineoff_;
539   // The current line number.
540   size_t lineno_;
541   // Whether to add a semicolon if we see a newline now.
542   bool add_semi_at_eol_;
543   // Pragmas for the next function, from magic comments.
544   unsigned int pragmas_;
545   // The external name to use for a function declaration, from a magic
546   // //extern comment.
547   std::string extern_;
548   // The list of //go:linkname comments, if any.
549   Linknames* linknames_;
550 };
551 
552 #endif // !defined(GO_LEX_H)
553