xref: /freebsd/contrib/lua/src/llex.c (revision 1f474190)
1 /*
2 ** $Id: llex.c,v 2.96.1.1 2017/04/19 17:20:42 roberto Exp $
3 ** Lexical Analyzer
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define llex_c
8 #define LUA_CORE
9 
10 #include "lprefix.h"
11 
12 
13 #include <locale.h>
14 #include <string.h>
15 
16 #include "lua.h"
17 
18 #include "lctype.h"
19 #include "ldebug.h"
20 #include "ldo.h"
21 #include "lgc.h"
22 #include "llex.h"
23 #include "lobject.h"
24 #include "lparser.h"
25 #include "lstate.h"
26 #include "lstring.h"
27 #include "ltable.h"
28 #include "lzio.h"
29 
30 
31 
/*
** Advance the scanner: store the result of 'zgetc' on the input
** stream 'ls->z' into 'ls->current'. The macro is an assignment
** expression, so its value is the character just stored.
** 'ls' is evaluated twice; pass an expression without side effects.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))


/*
** True when the current character is '\n' or '\r'.
** 'ls' is evaluated twice; pass an expression without side effects.
*/
#define currIsNewline(ls) \
	((ls)->current == '\n' || (ls)->current == '\r')
37 
38 
/*
** Printable text of every token that is not a single-byte symbol.
** 'luaX_token2str' indexes this table with (token - FIRST_RESERVED),
** so the entries must stay in the same order as the token codes.
** ORDER RESERVED
*/
static const char *const luaX_tokens [] = {
  /* keywords */
  "and", "break", "do", "else", "elseif", "end", "false", "for",
  "function", "goto", "if", "in", "local", "nil", "not", "or",
  "repeat", "return", "then", "true", "until", "while",
  /* multi-character operators */
  "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::",
  /* end of stream */
  "<eof>",
  /* tokens whose text varies (numerals, names, strings) */
  "<number>", "<integer>", "<name>", "<string>"
};
49 
50 
/*
** Append the current character to the token buffer (through 'save')
** and then advance to the next input character (through 'next').
** 'ls' is evaluated more than once; pass an expression without side
** effects.
*/
#define save_and_next(ls)	(save((ls), (ls)->current), next(ls))
52 
53 
54 static l_noret lexerror (LexState *ls, const char *msg, int token);
55 
56 
57 static void save (LexState *ls, int c) {
58   Mbuffer *b = ls->buff;
59   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
60     size_t newsize;
61     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
62       lexerror(ls, "lexical element too long", 0);
63     newsize = luaZ_sizebuffer(b) * 2;
64     luaZ_resizebuffer(ls->L, b, newsize);
65   }
66   b->buffer[luaZ_bufflen(b)++] = cast(char, c);
67 }
68 
69 
70 void luaX_init (lua_State *L) {
71   int i;
72   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
73   luaC_fix(L, obj2gco(e));  /* never collect this name */
74   for (i=0; i<NUM_RESERVED; i++) {
75     TString *ts = luaS_new(L, luaX_tokens[i]);
76     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
77     ts->extra = cast_byte(i+1);  /* reserved word */
78   }
79 }
80 
81 
82 const char *luaX_token2str (LexState *ls, int token) {
83   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
84     lua_assert(token == cast_uchar(token));
85     return luaO_pushfstring(ls->L, "'%c'", token);
86   }
87   else {
88     const char *s = luaX_tokens[token - FIRST_RESERVED];
89     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
90       return luaO_pushfstring(ls->L, "'%s'", s);
91     else  /* names, strings, and numerals */
92       return s;
93   }
94 }
95 
96 
97 static const char *txtToken (LexState *ls, int token) {
98   switch (token) {
99     case TK_NAME: case TK_STRING:
100     case TK_FLT: case TK_INT:
101       save(ls, '\0');
102       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
103     default:
104       return luaX_token2str(ls, token);
105   }
106 }
107 
108 
109 static l_noret lexerror (LexState *ls, const char *msg, int token) {
110   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
111   if (token)
112     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
113   luaD_throw(ls->L, LUA_ERRSYNTAX);
114 }
115 
116 
117 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
118   lexerror(ls, msg, ls->t.token);
119 }
120 
121 
122 /*
123 ** creates a new string and anchors it in scanner's table so that
124 ** it will not be collected until the end of the compilation
125 ** (by that time it should be anchored somewhere)
126 */
127 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
128   lua_State *L = ls->L;
129   TValue *o;  /* entry for 'str' */
130   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
131   setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
132   o = luaH_set(L, ls->h, L->top - 1);
133   if (ttisnil(o)) {  /* not in use yet? */
134     /* boolean value does not need GC barrier;
135        table has no metatable, so it does not need to invalidate cache */
136     setbvalue(o, 1);  /* t[string] = true */
137     luaC_checkGC(L);
138   }
139   else {  /* string already present */
140     ts = tsvalue(keyfromval(o));  /* re-use value previously stored */
141   }
142   L->top--;  /* remove string from stack */
143   return ts;
144 }
145 
146 
147 /*
148 ** increment line number and skips newline sequence (any of
149 ** \n, \r, \n\r, or \r\n)
150 */
151 static void inclinenumber (LexState *ls) {
152   int old = ls->current;
153   lua_assert(currIsNewline(ls));
154   next(ls);  /* skip '\n' or '\r' */
155   if (currIsNewline(ls) && ls->current != old)
156     next(ls);  /* skip '\n\r' or '\r\n' */
157   if (++ls->linenumber >= MAX_INT)
158     lexerror(ls, "chunk has too many lines", 0);
159 }
160 
161 
162 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
163                     int firstchar) {
164   ls->t.token = 0;
165   ls->L = L;
166   ls->current = firstchar;
167   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
168   ls->z = z;
169   ls->fs = NULL;
170   ls->linenumber = 1;
171   ls->lastline = 1;
172   ls->source = source;
173   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
174   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
175 }
176 
177 
178 
179 /*
180 ** =======================================================
181 ** LEXICAL ANALYZER
182 ** =======================================================
183 */
184 
185 
186 static int check_next1 (LexState *ls, int c) {
187   if (ls->current == c) {
188     next(ls);
189     return 1;
190   }
191   else return 0;
192 }
193 
194 
195 /*
196 ** Check whether current char is in set 'set' (with two chars) and
197 ** saves it
198 */
199 static int check_next2 (LexState *ls, const char *set) {
200   lua_assert(set[2] == '\0');
201   if (ls->current == set[0] || ls->current == set[1]) {
202     save_and_next(ls);
203     return 1;
204   }
205   else return 0;
206 }
207 
208 
209 /* LUA_NUMBER */
210 /*
211 ** this function is quite liberal in what it accepts, as 'luaO_str2num'
212 ** will reject ill-formed numerals.
213 */
214 static int read_numeral (LexState *ls, SemInfo *seminfo) {
215   TValue obj;
216   const char *expo = "Ee";
217   int first = ls->current;
218   lua_assert(lisdigit(ls->current));
219   save_and_next(ls);
220   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
221     expo = "Pp";
222   for (;;) {
223     if (check_next2(ls, expo))  /* exponent part? */
224       check_next2(ls, "-+");  /* optional exponent sign */
225     if (lisxdigit(ls->current))
226       save_and_next(ls);
227     else if (ls->current == '.')
228       save_and_next(ls);
229     else break;
230   }
231   save(ls, '\0');
232   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
233     lexerror(ls, "malformed number", TK_FLT);
234   if (ttisinteger(&obj)) {
235     seminfo->i = ivalue(&obj);
236     return TK_INT;
237   }
238   else {
239     lua_assert(ttisfloat(&obj));
240     seminfo->r = fltvalue(&obj);
241     return TK_FLT;
242   }
243 }
244 
245 
246 /*
247 ** reads a sequence '[=*[' or ']=*]', leaving the last bracket.
248 ** If sequence is well formed, return its number of '='s + 2; otherwise,
249 ** return 1 if there is no '='s or 0 otherwise (an unfinished '[==...').
250 */
251 static size_t skip_sep (LexState *ls) {
252   size_t count = 0;
253   int s = ls->current;
254   lua_assert(s == '[' || s == ']');
255   save_and_next(ls);
256   while (ls->current == '=') {
257     save_and_next(ls);
258     count++;
259   }
260   return (ls->current == s) ? count + 2
261          : (count == 0) ? 1
262          : 0;
263 
264 }
265 
266 
267 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
268   int line = ls->linenumber;  /* initial line (for error message) */
269   save_and_next(ls);  /* skip 2nd '[' */
270   if (currIsNewline(ls))  /* string starts with a newline? */
271     inclinenumber(ls);  /* skip it */
272   for (;;) {
273     switch (ls->current) {
274       case EOZ: {  /* error */
275         const char *what = (seminfo ? "string" : "comment");
276         const char *msg = luaO_pushfstring(ls->L,
277                      "unfinished long %s (starting at line %d)", what, line);
278         lexerror(ls, msg, TK_EOS);
279         break;  /* to avoid warnings */
280       }
281       case ']': {
282         if (skip_sep(ls) == sep) {
283           save_and_next(ls);  /* skip 2nd ']' */
284           goto endloop;
285         }
286         break;
287       }
288       case '\n': case '\r': {
289         save(ls, '\n');
290         inclinenumber(ls);
291         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
292         break;
293       }
294       default: {
295         if (seminfo) save_and_next(ls);
296         else next(ls);
297       }
298     }
299   } endloop:
300   if (seminfo)
301     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
302                                      luaZ_bufflen(ls->buff) - 2 * sep);
303 }
304 
305 
306 static void esccheck (LexState *ls, int c, const char *msg) {
307   if (!c) {
308     if (ls->current != EOZ)
309       save_and_next(ls);  /* add current to buffer for error message */
310     lexerror(ls, msg, TK_STRING);
311   }
312 }
313 
314 
315 static int gethexa (LexState *ls) {
316   save_and_next(ls);
317   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
318   return luaO_hexavalue(ls->current);
319 }
320 
321 
322 static int readhexaesc (LexState *ls) {
323   int r = gethexa(ls);
324   r = (r << 4) + gethexa(ls);
325   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
326   return r;
327 }
328 
329 
330 static unsigned long readutf8esc (LexState *ls) {
331   unsigned long r;
332   int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
333   save_and_next(ls);  /* skip 'u' */
334   esccheck(ls, ls->current == '{', "missing '{'");
335   r = gethexa(ls);  /* must have at least one digit */
336   while ((save_and_next(ls), lisxdigit(ls->current))) {
337     i++;
338     r = (r << 4) + luaO_hexavalue(ls->current);
339     esccheck(ls, r <= 0x10FFFF, "UTF-8 value too large");
340   }
341   esccheck(ls, ls->current == '}', "missing '}'");
342   next(ls);  /* skip '}' */
343   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
344   return r;
345 }
346 
347 
348 static void utf8esc (LexState *ls) {
349   char buff[UTF8BUFFSZ];
350   int n = luaO_utf8esc(buff, readutf8esc(ls));
351   for (; n > 0; n--)  /* add 'buff' to string */
352     save(ls, buff[UTF8BUFFSZ - n]);
353 }
354 
355 
356 static int readdecesc (LexState *ls) {
357   int i;
358   int r = 0;  /* result accumulator */
359   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
360     r = 10*r + ls->current - '0';
361     save_and_next(ls);
362   }
363   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
364   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
365   return r;
366 }
367 
368 
369 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
370   save_and_next(ls);  /* keep delimiter (for error messages) */
371   while (ls->current != del) {
372     switch (ls->current) {
373       case EOZ:
374         lexerror(ls, "unfinished string", TK_EOS);
375         break;  /* to avoid warnings */
376       case '\n':
377       case '\r':
378         lexerror(ls, "unfinished string", TK_STRING);
379         break;  /* to avoid warnings */
380       case '\\': {  /* escape sequences */
381         int c;  /* final character to be saved */
382         save_and_next(ls);  /* keep '\\' for error messages */
383         switch (ls->current) {
384           case 'a': c = '\a'; goto read_save;
385           case 'b': c = '\b'; goto read_save;
386           case 'f': c = '\f'; goto read_save;
387           case 'n': c = '\n'; goto read_save;
388           case 'r': c = '\r'; goto read_save;
389           case 't': c = '\t'; goto read_save;
390           case 'v': c = '\v'; goto read_save;
391           case 'x': c = readhexaesc(ls); goto read_save;
392           case 'u': utf8esc(ls);  goto no_save;
393           case '\n': case '\r':
394             inclinenumber(ls); c = '\n'; goto only_save;
395           case '\\': case '\"': case '\'':
396             c = ls->current; goto read_save;
397           case EOZ: goto no_save;  /* will raise an error next loop */
398           case 'z': {  /* zap following span of spaces */
399             luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
400             next(ls);  /* skip the 'z' */
401             while (lisspace(ls->current)) {
402               if (currIsNewline(ls)) inclinenumber(ls);
403               else next(ls);
404             }
405             goto no_save;
406           }
407           default: {
408             esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
409             c = readdecesc(ls);  /* digital escape '\ddd' */
410             goto only_save;
411           }
412         }
413        read_save:
414          next(ls);
415          /* go through */
416        only_save:
417          luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
418          save(ls, c);
419          /* go through */
420        no_save: break;
421       }
422       default:
423         save_and_next(ls);
424     }
425   }
426   save_and_next(ls);  /* skip delimiter */
427   seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
428                                    luaZ_bufflen(ls->buff) - 2);
429 }
430 
431 
432 static int llex (LexState *ls, SemInfo *seminfo) {
433   luaZ_resetbuffer(ls->buff);
434   for (;;) {
435     switch (ls->current) {
436       case '\n': case '\r': {  /* line breaks */
437         inclinenumber(ls);
438         break;
439       }
440       case ' ': case '\f': case '\t': case '\v': {  /* spaces */
441         next(ls);
442         break;
443       }
444       case '-': {  /* '-' or '--' (comment) */
445         next(ls);
446         if (ls->current != '-') return '-';
447         /* else is a comment */
448         next(ls);
449         if (ls->current == '[') {  /* long comment? */
450           size_t sep = skip_sep(ls);
451           luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
452           if (sep >= 2) {
453             read_long_string(ls, NULL, sep);  /* skip long comment */
454             luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
455             break;
456           }
457         }
458         /* else short comment */
459         while (!currIsNewline(ls) && ls->current != EOZ)
460           next(ls);  /* skip until end of line (or end of file) */
461         break;
462       }
463       case '[': {  /* long string or simply '[' */
464         size_t sep = skip_sep(ls);
465         if (sep >= 2) {
466           read_long_string(ls, seminfo, sep);
467           return TK_STRING;
468         }
469         else if (sep == 0)  /* '[=...' missing second bracket */
470           lexerror(ls, "invalid long string delimiter", TK_STRING);
471         return '[';
472       }
473       case '=': {
474         next(ls);
475         if (check_next1(ls, '=')) return TK_EQ;
476         else return '=';
477       }
478       case '<': {
479         next(ls);
480         if (check_next1(ls, '=')) return TK_LE;
481         else if (check_next1(ls, '<')) return TK_SHL;
482         else return '<';
483       }
484       case '>': {
485         next(ls);
486         if (check_next1(ls, '=')) return TK_GE;
487         else if (check_next1(ls, '>')) return TK_SHR;
488         else return '>';
489       }
490       case '/': {
491         next(ls);
492         if (check_next1(ls, '/')) return TK_IDIV;
493         else return '/';
494       }
495       case '~': {
496         next(ls);
497         if (check_next1(ls, '=')) return TK_NE;
498         else return '~';
499       }
500       case ':': {
501         next(ls);
502         if (check_next1(ls, ':')) return TK_DBCOLON;
503         else return ':';
504       }
505       case '"': case '\'': {  /* short literal strings */
506         read_string(ls, ls->current, seminfo);
507         return TK_STRING;
508       }
509       case '.': {  /* '.', '..', '...', or number */
510         save_and_next(ls);
511         if (check_next1(ls, '.')) {
512           if (check_next1(ls, '.'))
513             return TK_DOTS;   /* '...' */
514           else return TK_CONCAT;   /* '..' */
515         }
516         else if (!lisdigit(ls->current)) return '.';
517         else return read_numeral(ls, seminfo);
518       }
519       case '0': case '1': case '2': case '3': case '4':
520       case '5': case '6': case '7': case '8': case '9': {
521         return read_numeral(ls, seminfo);
522       }
523       case EOZ: {
524         return TK_EOS;
525       }
526       default: {
527         if (lislalpha(ls->current)) {  /* identifier or reserved word? */
528           TString *ts;
529           do {
530             save_and_next(ls);
531           } while (lislalnum(ls->current));
532           ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
533                                   luaZ_bufflen(ls->buff));
534           seminfo->ts = ts;
535           if (isreserved(ts))  /* reserved word? */
536             return ts->extra - 1 + FIRST_RESERVED;
537           else {
538             return TK_NAME;
539           }
540         }
541         else {  /* single-char tokens (+ - / ...) */
542           int c = ls->current;
543           next(ls);
544           return c;
545         }
546       }
547     }
548   }
549 }
550 
551 
552 void luaX_next (LexState *ls) {
553   ls->lastline = ls->linenumber;
554   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
555     ls->t = ls->lookahead;  /* use this one */
556     ls->lookahead.token = TK_EOS;  /* and discharge it */
557   }
558   else
559     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
560 }
561 
562 
563 int luaX_lookahead (LexState *ls) {
564   lua_assert(ls->lookahead.token == TK_EOS);
565   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
566   return ls->lookahead.token;
567 }
568 
569