xref: /freebsd/contrib/lua/src/llex.c (revision 4d846d26)
1 /*
2 ** $Id: llex.c $
3 ** Lexical Analyzer
4 ** See Copyright Notice in lua.h
5 */
6 
7 #define llex_c
8 #define LUA_CORE
9 
10 #include "lprefix.h"
11 
12 
13 #include <locale.h>
14 #include <string.h>
15 
16 #include "lua.h"
17 
18 #include "lctype.h"
19 #include "ldebug.h"
20 #include "ldo.h"
21 #include "lgc.h"
22 #include "llex.h"
23 #include "lobject.h"
24 #include "lparser.h"
25 #include "lstate.h"
26 #include "lstring.h"
27 #include "ltable.h"
28 #include "lzio.h"
29 
30 
31 
/*
** Read the next character from the input stream of 'ls' into
** 'ls->current'; the expression yields that character. The argument
** is parenthesized so that any pointer expression can be passed.
*/
#define next(ls)	((ls)->current = zgetc((ls)->z))
33 
34 
35 
/*
** True when the current character of 'ls' is '\n' or '\r'. The
** argument is parenthesized so that any pointer expression can be
** passed.
*/
#define currIsNewline(ls)	\
	((ls)->current == '\n' || (ls)->current == '\r')
37 
38 
/*
** Printable text of every token, indexed by (token - FIRST_RESERVED).
** ORDER RESERVED
*/
static const char *const luaX_tokens [] = {
    /* keywords */
    "and", "break", "do", "else", "elseif", "end", "false", "for",
    "function", "goto", "if", "in", "local", "nil", "not", "or",
    "repeat", "return", "then", "true", "until", "while",
    /* multi-character symbols */
    "//", "..", "...", "==", ">=", "<=", "~=", "<<", ">>", "::",
    /* end of stream */
    "<eof>",
    /* numerals, names, and strings */
    "<number>", "<integer>", "<name>", "<string>"
};
49 
50 
/*
** Append the current character of 'ls' to its token buffer and then
** advance to the next input character; yields the new current
** character. The argument is parenthesized so that any pointer
** expression can be passed.
*/
#define save_and_next(ls) (save(ls, (ls)->current), next(ls))
52 
53 
54 static l_noret lexerror (LexState *ls, const char *msg, int token);
55 
56 
57 static void save (LexState *ls, int c) {
58   Mbuffer *b = ls->buff;
59   if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
60     size_t newsize;
61     if (luaZ_sizebuffer(b) >= MAX_SIZE/2)
62       lexerror(ls, "lexical element too long", 0);
63     newsize = luaZ_sizebuffer(b) * 2;
64     luaZ_resizebuffer(ls->L, b, newsize);
65   }
66   b->buffer[luaZ_bufflen(b)++] = cast_char(c);
67 }
68 
69 
70 void luaX_init (lua_State *L) {
71   int i;
72   TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
73   luaC_fix(L, obj2gco(e));  /* never collect this name */
74   for (i=0; i<NUM_RESERVED; i++) {
75     TString *ts = luaS_new(L, luaX_tokens[i]);
76     luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
77     ts->extra = cast_byte(i+1);  /* reserved word */
78   }
79 }
80 
81 
82 const char *luaX_token2str (LexState *ls, int token) {
83   if (token < FIRST_RESERVED) {  /* single-byte symbols? */
84     if (lisprint(token))
85       return luaO_pushfstring(ls->L, "'%c'", token);
86     else  /* control character */
87       return luaO_pushfstring(ls->L, "'<\\%d>'", token);
88   }
89   else {
90     const char *s = luaX_tokens[token - FIRST_RESERVED];
91     if (token < TK_EOS)  /* fixed format (symbols and reserved words)? */
92       return luaO_pushfstring(ls->L, "'%s'", s);
93     else  /* names, strings, and numerals */
94       return s;
95   }
96 }
97 
98 
99 static const char *txtToken (LexState *ls, int token) {
100   switch (token) {
101     case TK_NAME: case TK_STRING:
102     case TK_FLT: case TK_INT:
103       save(ls, '\0');
104       return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
105     default:
106       return luaX_token2str(ls, token);
107   }
108 }
109 
110 
111 static l_noret lexerror (LexState *ls, const char *msg, int token) {
112   msg = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
113   if (token)
114     luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
115   luaD_throw(ls->L, LUA_ERRSYNTAX);
116 }
117 
118 
119 l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
120   lexerror(ls, msg, ls->t.token);
121 }
122 
123 
124 /*
125 ** Creates a new string and anchors it in scanner's table so that it
126 ** will not be collected until the end of the compilation; by that time
127 ** it should be anchored somewhere. It also internalizes long strings,
128 ** ensuring there is only one copy of each unique string.  The table
129 ** here is used as a set: the string enters as the key, while its value
130 ** is irrelevant. We use the string itself as the value only because it
131 ** is a TValue readly available. Later, the code generation can change
132 ** this value.
133 */
134 TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
135   lua_State *L = ls->L;
136   TString *ts = luaS_newlstr(L, str, l);  /* create new string */
137   const TValue *o = luaH_getstr(ls->h, ts);
138   if (!ttisnil(o))  /* string already present? */
139     ts = keystrval(nodefromval(o));  /* get saved copy */
140   else {  /* not in use yet */
141     TValue *stv = s2v(L->top++);  /* reserve stack space for string */
142     setsvalue(L, stv, ts);  /* temporarily anchor the string */
143     luaH_finishset(L, ls->h, stv, o, stv);  /* t[string] = string */
144     /* table is not a metatable, so it does not need to invalidate cache */
145     luaC_checkGC(L);
146     L->top--;  /* remove string from stack */
147   }
148   return ts;
149 }
150 
151 
152 /*
153 ** increment line number and skips newline sequence (any of
154 ** \n, \r, \n\r, or \r\n)
155 */
156 static void inclinenumber (LexState *ls) {
157   int old = ls->current;
158   lua_assert(currIsNewline(ls));
159   next(ls);  /* skip '\n' or '\r' */
160   if (currIsNewline(ls) && ls->current != old)
161     next(ls);  /* skip '\n\r' or '\r\n' */
162   if (++ls->linenumber >= MAX_INT)
163     lexerror(ls, "chunk has too many lines", 0);
164 }
165 
166 
167 void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
168                     int firstchar) {
169   ls->t.token = 0;
170   ls->L = L;
171   ls->current = firstchar;
172   ls->lookahead.token = TK_EOS;  /* no look-ahead token */
173   ls->z = z;
174   ls->fs = NULL;
175   ls->linenumber = 1;
176   ls->lastline = 1;
177   ls->source = source;
178   ls->envn = luaS_newliteral(L, LUA_ENV);  /* get env name */
179   luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
180 }
181 
182 
183 
184 /*
185 ** =======================================================
186 ** LEXICAL ANALYZER
187 ** =======================================================
188 */
189 
190 
191 static int check_next1 (LexState *ls, int c) {
192   if (ls->current == c) {
193     next(ls);
194     return 1;
195   }
196   else return 0;
197 }
198 
199 
200 /*
201 ** Check whether current char is in set 'set' (with two chars) and
202 ** saves it
203 */
204 static int check_next2 (LexState *ls, const char *set) {
205   lua_assert(set[2] == '\0');
206   if (ls->current == set[0] || ls->current == set[1]) {
207     save_and_next(ls);
208     return 1;
209   }
210   else return 0;
211 }
212 
213 
214 /* LUA_NUMBER */
215 /*
216 ** This function is quite liberal in what it accepts, as 'luaO_str2num'
217 ** will reject ill-formed numerals. Roughly, it accepts the following
218 ** pattern:
219 **
220 **   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
221 **
222 ** The only tricky part is to accept [+-] only after a valid exponent
223 ** mark, to avoid reading '3-4' or '0xe+1' as a single number.
224 **
225 ** The caller might have already read an initial dot.
226 */
227 static int read_numeral (LexState *ls, SemInfo *seminfo) {
228   TValue obj;
229   const char *expo = "Ee";
230   int first = ls->current;
231   lua_assert(lisdigit(ls->current));
232   save_and_next(ls);
233   if (first == '0' && check_next2(ls, "xX"))  /* hexadecimal? */
234     expo = "Pp";
235   for (;;) {
236     if (check_next2(ls, expo))  /* exponent mark? */
237       check_next2(ls, "-+");  /* optional exponent sign */
238     else if (lisxdigit(ls->current) || ls->current == '.')  /* '%x|%.' */
239       save_and_next(ls);
240     else break;
241   }
242   if (lislalpha(ls->current))  /* is numeral touching a letter? */
243     save_and_next(ls);  /* force an error */
244   save(ls, '\0');
245   if (luaO_str2num(luaZ_buffer(ls->buff), &obj) == 0)  /* format error? */
246     lexerror(ls, "malformed number", TK_FLT);
247   if (ttisinteger(&obj)) {
248     seminfo->i = ivalue(&obj);
249     return TK_INT;
250   }
251   else {
252     lua_assert(ttisfloat(&obj));
253     seminfo->r = fltvalue(&obj);
254     return TK_FLT;
255   }
256 }
257 
258 
259 /*
260 ** read a sequence '[=*[' or ']=*]', leaving the last bracket. If
261 ** sequence is well formed, return its number of '='s + 2; otherwise,
262 ** return 1 if it is a single bracket (no '='s and no 2nd bracket);
263 ** otherwise (an unfinished '[==...') return 0.
264 */
265 static size_t skip_sep (LexState *ls) {
266   size_t count = 0;
267   int s = ls->current;
268   lua_assert(s == '[' || s == ']');
269   save_and_next(ls);
270   while (ls->current == '=') {
271     save_and_next(ls);
272     count++;
273   }
274   return (ls->current == s) ? count + 2
275          : (count == 0) ? 1
276          : 0;
277 }
278 
279 
280 static void read_long_string (LexState *ls, SemInfo *seminfo, size_t sep) {
281   int line = ls->linenumber;  /* initial line (for error message) */
282   save_and_next(ls);  /* skip 2nd '[' */
283   if (currIsNewline(ls))  /* string starts with a newline? */
284     inclinenumber(ls);  /* skip it */
285   for (;;) {
286     switch (ls->current) {
287       case EOZ: {  /* error */
288         const char *what = (seminfo ? "string" : "comment");
289         const char *msg = luaO_pushfstring(ls->L,
290                      "unfinished long %s (starting at line %d)", what, line);
291         lexerror(ls, msg, TK_EOS);
292         break;  /* to avoid warnings */
293       }
294       case ']': {
295         if (skip_sep(ls) == sep) {
296           save_and_next(ls);  /* skip 2nd ']' */
297           goto endloop;
298         }
299         break;
300       }
301       case '\n': case '\r': {
302         save(ls, '\n');
303         inclinenumber(ls);
304         if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
305         break;
306       }
307       default: {
308         if (seminfo) save_and_next(ls);
309         else next(ls);
310       }
311     }
312   } endloop:
313   if (seminfo)
314     seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
315                                      luaZ_bufflen(ls->buff) - 2 * sep);
316 }
317 
318 
319 static void esccheck (LexState *ls, int c, const char *msg) {
320   if (!c) {
321     if (ls->current != EOZ)
322       save_and_next(ls);  /* add current to buffer for error message */
323     lexerror(ls, msg, TK_STRING);
324   }
325 }
326 
327 
328 static int gethexa (LexState *ls) {
329   save_and_next(ls);
330   esccheck (ls, lisxdigit(ls->current), "hexadecimal digit expected");
331   return luaO_hexavalue(ls->current);
332 }
333 
334 
335 static int readhexaesc (LexState *ls) {
336   int r = gethexa(ls);
337   r = (r << 4) + gethexa(ls);
338   luaZ_buffremove(ls->buff, 2);  /* remove saved chars from buffer */
339   return r;
340 }
341 
342 
343 static unsigned long readutf8esc (LexState *ls) {
344   unsigned long r;
345   int i = 4;  /* chars to be removed: '\', 'u', '{', and first digit */
346   save_and_next(ls);  /* skip 'u' */
347   esccheck(ls, ls->current == '{', "missing '{'");
348   r = gethexa(ls);  /* must have at least one digit */
349   while (cast_void(save_and_next(ls)), lisxdigit(ls->current)) {
350     i++;
351     esccheck(ls, r <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
352     r = (r << 4) + luaO_hexavalue(ls->current);
353   }
354   esccheck(ls, ls->current == '}', "missing '}'");
355   next(ls);  /* skip '}' */
356   luaZ_buffremove(ls->buff, i);  /* remove saved chars from buffer */
357   return r;
358 }
359 
360 
361 static void utf8esc (LexState *ls) {
362   char buff[UTF8BUFFSZ];
363   int n = luaO_utf8esc(buff, readutf8esc(ls));
364   for (; n > 0; n--)  /* add 'buff' to string */
365     save(ls, buff[UTF8BUFFSZ - n]);
366 }
367 
368 
369 static int readdecesc (LexState *ls) {
370   int i;
371   int r = 0;  /* result accumulator */
372   for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
373     r = 10*r + ls->current - '0';
374     save_and_next(ls);
375   }
376   esccheck(ls, r <= UCHAR_MAX, "decimal escape too large");
377   luaZ_buffremove(ls->buff, i);  /* remove read digits from buffer */
378   return r;
379 }
380 
381 
382 static void read_string (LexState *ls, int del, SemInfo *seminfo) {
383   save_and_next(ls);  /* keep delimiter (for error messages) */
384   while (ls->current != del) {
385     switch (ls->current) {
386       case EOZ:
387         lexerror(ls, "unfinished string", TK_EOS);
388         break;  /* to avoid warnings */
389       case '\n':
390       case '\r':
391         lexerror(ls, "unfinished string", TK_STRING);
392         break;  /* to avoid warnings */
393       case '\\': {  /* escape sequences */
394         int c;  /* final character to be saved */
395         save_and_next(ls);  /* keep '\\' for error messages */
396         switch (ls->current) {
397           case 'a': c = '\a'; goto read_save;
398           case 'b': c = '\b'; goto read_save;
399           case 'f': c = '\f'; goto read_save;
400           case 'n': c = '\n'; goto read_save;
401           case 'r': c = '\r'; goto read_save;
402           case 't': c = '\t'; goto read_save;
403           case 'v': c = '\v'; goto read_save;
404           case 'x': c = readhexaesc(ls); goto read_save;
405           case 'u': utf8esc(ls);  goto no_save;
406           case '\n': case '\r':
407             inclinenumber(ls); c = '\n'; goto only_save;
408           case '\\': case '\"': case '\'':
409             c = ls->current; goto read_save;
410           case EOZ: goto no_save;  /* will raise an error next loop */
411           case 'z': {  /* zap following span of spaces */
412             luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
413             next(ls);  /* skip the 'z' */
414             while (lisspace(ls->current)) {
415               if (currIsNewline(ls)) inclinenumber(ls);
416               else next(ls);
417             }
418             goto no_save;
419           }
420           default: {
421             esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
422             c = readdecesc(ls);  /* digital escape '\ddd' */
423             goto only_save;
424           }
425         }
426        read_save:
427          next(ls);
428          /* go through */
429        only_save:
430          luaZ_buffremove(ls->buff, 1);  /* remove '\\' */
431          save(ls, c);
432          /* go through */
433        no_save: break;
434       }
435       default:
436         save_and_next(ls);
437     }
438   }
439   save_and_next(ls);  /* skip delimiter */
440   seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
441                                    luaZ_bufflen(ls->buff) - 2);
442 }
443 
444 
445 static int llex (LexState *ls, SemInfo *seminfo) {
446   luaZ_resetbuffer(ls->buff);
447   for (;;) {
448     switch (ls->current) {
449       case '\n': case '\r': {  /* line breaks */
450         inclinenumber(ls);
451         break;
452       }
453       case ' ': case '\f': case '\t': case '\v': {  /* spaces */
454         next(ls);
455         break;
456       }
457       case '-': {  /* '-' or '--' (comment) */
458         next(ls);
459         if (ls->current != '-') return '-';
460         /* else is a comment */
461         next(ls);
462         if (ls->current == '[') {  /* long comment? */
463           size_t sep = skip_sep(ls);
464           luaZ_resetbuffer(ls->buff);  /* 'skip_sep' may dirty the buffer */
465           if (sep >= 2) {
466             read_long_string(ls, NULL, sep);  /* skip long comment */
467             luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
468             break;
469           }
470         }
471         /* else short comment */
472         while (!currIsNewline(ls) && ls->current != EOZ)
473           next(ls);  /* skip until end of line (or end of file) */
474         break;
475       }
476       case '[': {  /* long string or simply '[' */
477         size_t sep = skip_sep(ls);
478         if (sep >= 2) {
479           read_long_string(ls, seminfo, sep);
480           return TK_STRING;
481         }
482         else if (sep == 0)  /* '[=...' missing second bracket? */
483           lexerror(ls, "invalid long string delimiter", TK_STRING);
484         return '[';
485       }
486       case '=': {
487         next(ls);
488         if (check_next1(ls, '=')) return TK_EQ;  /* '==' */
489         else return '=';
490       }
491       case '<': {
492         next(ls);
493         if (check_next1(ls, '=')) return TK_LE;  /* '<=' */
494         else if (check_next1(ls, '<')) return TK_SHL;  /* '<<' */
495         else return '<';
496       }
497       case '>': {
498         next(ls);
499         if (check_next1(ls, '=')) return TK_GE;  /* '>=' */
500         else if (check_next1(ls, '>')) return TK_SHR;  /* '>>' */
501         else return '>';
502       }
503       case '/': {
504         next(ls);
505         if (check_next1(ls, '/')) return TK_IDIV;  /* '//' */
506         else return '/';
507       }
508       case '~': {
509         next(ls);
510         if (check_next1(ls, '=')) return TK_NE;  /* '~=' */
511         else return '~';
512       }
513       case ':': {
514         next(ls);
515         if (check_next1(ls, ':')) return TK_DBCOLON;  /* '::' */
516         else return ':';
517       }
518       case '"': case '\'': {  /* short literal strings */
519         read_string(ls, ls->current, seminfo);
520         return TK_STRING;
521       }
522       case '.': {  /* '.', '..', '...', or number */
523         save_and_next(ls);
524         if (check_next1(ls, '.')) {
525           if (check_next1(ls, '.'))
526             return TK_DOTS;   /* '...' */
527           else return TK_CONCAT;   /* '..' */
528         }
529         else if (!lisdigit(ls->current)) return '.';
530         else return read_numeral(ls, seminfo);
531       }
532       case '0': case '1': case '2': case '3': case '4':
533       case '5': case '6': case '7': case '8': case '9': {
534         return read_numeral(ls, seminfo);
535       }
536       case EOZ: {
537         return TK_EOS;
538       }
539       default: {
540         if (lislalpha(ls->current)) {  /* identifier or reserved word? */
541           TString *ts;
542           do {
543             save_and_next(ls);
544           } while (lislalnum(ls->current));
545           ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
546                                   luaZ_bufflen(ls->buff));
547           seminfo->ts = ts;
548           if (isreserved(ts))  /* reserved word? */
549             return ts->extra - 1 + FIRST_RESERVED;
550           else {
551             return TK_NAME;
552           }
553         }
554         else {  /* single-char tokens ('+', '*', '%', '{', '}', ...) */
555           int c = ls->current;
556           next(ls);
557           return c;
558         }
559       }
560     }
561   }
562 }
563 
564 
565 void luaX_next (LexState *ls) {
566   ls->lastline = ls->linenumber;
567   if (ls->lookahead.token != TK_EOS) {  /* is there a look-ahead token? */
568     ls->t = ls->lookahead;  /* use this one */
569     ls->lookahead.token = TK_EOS;  /* and discharge it */
570   }
571   else
572     ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
573 }
574 
575 
576 int luaX_lookahead (LexState *ls) {
577   lua_assert(ls->lookahead.token == TK_EOS);
578   ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
579   return ls->lookahead.token;
580 }
581 
582