1 /*
2 * Copyright (c) 2003
3 * David Leonard. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of David Leonard nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #if HAVE_CONFIG_H
32 # include <config.h>
33 #endif
34
35 #if STDC_HEADERS
36 # include <stdio.h>
37 # include <stdlib.h>
38 #endif
39
40 #if HAVE_STRING_H
41 # include <string.h>
42 #endif
43
44 #include <see/type.h>
45 #include <see/string.h>
46 #include <see/value.h>
47 #include <see/object.h>
48 #include <see/input.h>
49 #include <see/try.h>
50 #include <see/intern.h>
51 #include <see/error.h>
52 #include <see/debug.h>
53 #include <see/interpreter.h>
54 #include <see/system.h>
55 #include "tokens.h"
56 #include "lex.h"
57 #include "stringdefs.h"
58 #include "unicode.h"
59 #include "dtoa.h"
60 #include "dprint.h"
61 #include "nmath.h"
62
#ifndef NDEBUG
/*
 * Debug switch (only compiled in when NDEBUG is not defined).
 * When non-zero, SEE_lex_next() prints every token it scans.
 */
int SEE_lex_debug = 0;
#endif
66
/*
 * Lexical analyser.
 *
 * This is a lexical analyser for ECMAScript. It uses a 6-character
 * lookahead input 'filter' (to detect '\u####') and provides an
 * interface that reveals if the returned token was immediately
 * preceded by a line terminator.
 *
 * Whether the lexical analyser treats a slash '/' as a division or
 * as the start of a regular expression is determined by a flag.
 * The parser is expected to set it.
 *
 * NOTE: Although all strings generated for ECMAScript are UTF-16,
 * this lexer requires UTF-32 (UCS-4) input.
 */
82
/*
 * Scanner helper macros. Every macro here assumes that a local
 * variable named 'lex' (a struct lex *) is in scope.
 */

/* The current, not yet consumed, input character */
#define NEXT            lex->input->lookahead
/*
 * Consumes the current character, and then also consumes any
 * format-control characters that immediately follow it.
 */
#define SKIP            do { SEE_INPUT_NEXT(lex->input);        \
                        } while (!ATEOF && is_FormatControl(NEXT))
/*
 * NOTE(review): UNGET is not used by any code visible in this file and
 * refers to lex->la / lex->lalen - confirm those members still exist.
 */
#define UNGET(c)        do { lex->la[++lex->lalen]=(c); } while (0)
/* True once the input has been exhausted */
#define ATEOF           (lex->input->eof)
/* Copies up to len upcoming characters into buf; evaluates to the count copied */
#define LOOKAHEAD(buf, len) SEE_input_lookahead_copy(lex->input, buf, len)
/*
 * Consumes the character ch. Throws a SyntaxError if the input is at
 * EOF or if the current character is not ch. Note that ch is evaluated
 * more than once.
 */
#define CONSUME(ch)                                             \
    do {                                                        \
        if (ATEOF)                                              \
            SYNTAX_ERROR(STR(unexpected_eof));                  \
        if (NEXT != (ch))                                       \
            SYNTAX_ERROR(SEE_string_sprintf(                    \
                lex->input->interpreter, "expected '%c'", (ch))); \
        SKIP;                                                   \
    } while (0)

/* Throws a SyntaxError whose message is s prefixed with the current line number */
#define SYNTAX_ERROR(s)                                         \
    SEE_error_throw_string(lex->input->interpreter,             \
        lex->input->interpreter->SyntaxError,                   \
        prefix_msg(s, lex))

/* Sign constants (used by SEE_lex_number) */
#define NEGATIVE        (-1)
#define POSITIVE        (1)
108
/* Prototypes for the file-local helpers defined below */

/* Error message helper */
static struct SEE_string *prefix_msg(struct SEE_string *s, struct lex *lex);

/* Character classification (section 7.1 - 7.3) and hex digit helpers */
static int is_FormatControl(SEE_unicode_t c);
static int is_WhiteSpace(SEE_unicode_t c);
static int is_LineTerminator(SEE_unicode_t c);
static int is_HexDigit(SEE_unicode_t c);
static int HexValue(SEE_unicode_t c);

/* Lookahead predicates: these inspect the input without consuming it */
static int is_HexEscape(struct lex *lex);
static int is_UnicodeEscape(struct lex *lex);
static int is_IdentifierStart(struct lex *lex);
static int is_IdentifierPart(struct lex *lex);

/* Scanners: these consume input */
static SEE_unicode_t HexEscape(struct lex *lex);
static SEE_unicode_t UnicodeEscape(struct lex *lex);
static int DivPunctuator(struct lex *lex);
static int LineTerminator(struct lex *lex);
static int SkipToEndOfLine(struct lex *lex);
static int SGMLComment(struct lex *lex);
static int SGMLCommentEnd(struct lex *lex);
static int Punctuator(struct lex *lex);
static int StringLiteral(struct lex *lex);
static int RegularExpressionLiteral(struct lex *lex, int prev);
static int NumericLiteral(struct lex *lex);
static int CommentDiv(struct lex *lex);
static int Token(struct lex *lex);
static int lex0(struct lex *lex);
134
135 /* Returns ("line " + next_lineno + ": " + s) */
136 static struct SEE_string *
prefix_msg(s,lex)137 prefix_msg(s, lex)
138 struct SEE_string *s;
139 struct lex *lex;
140 {
141 struct SEE_string *t;
142 struct SEE_interpreter *interp = lex->input->interpreter;
143
144 t = SEE_string_sprintf(interp, "line %d: ", lex->next_lineno);
145 SEE_string_append(t, s);
146 return t;
147 }
148
149 static int
is_FormatControl(c)150 is_FormatControl(c)
151 SEE_unicode_t c; /* 7.1 */
152 {
153 return UNICODE_IS_Cf(c); /* category Cf or L or R */
154 }
155
156 static int
is_WhiteSpace(c)157 is_WhiteSpace(c)
158 SEE_unicode_t c; /* 7.2 */
159 {
160 return (c == 0x0009 || c == 0x000B || c == 0x000C || c == 0x0020
161 || c == 0x00A0 || UNICODE_IS_Zs(c));
162 }
163
164 static int
is_LineTerminator(c)165 is_LineTerminator(c)
166 SEE_unicode_t c; /* 7.3 */
167 {
168 return (c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029);
169 }
170
171 static int
is_HexDigit(c)172 is_HexDigit(c)
173 SEE_unicode_t c;
174 {
175 return ((c >= '0' && c <= '9') ||
176 (c >= 'A' && c <= 'F') ||
177 (c >= 'a' && c <= 'f'));
178 }
179
180 /* Returns the hexadecimal value of a character. Assumes char is a hex digit */
181 static int
HexValue(c)182 HexValue(c)
183 SEE_unicode_t c;
184 {
185 if (c >= '0' && c <= '9') return c - '0';
186 else if (c >= 'a' && c <= 'f') return c - 'a' + 10;
187 else /* (c >= 'A' && c <= 'F') */ return c - 'A' + 10;
188 }
189
190 static int
is_HexEscape(lex)191 is_HexEscape(lex)
192 struct lex *lex; /* 7.6 */
193 {
194 SEE_unicode_t lookahead[4];
195 int lookahead_len;
196
197 lookahead_len = LOOKAHEAD(lookahead, 4);
198 return (lookahead_len >= 4 &&
199 lookahead[0] == '\\' &&
200 lookahead[1] == 'x' &&
201 is_HexDigit(lookahead[2]) &&
202 is_HexDigit(lookahead[3]));
203 }
204
205 static int
is_UnicodeEscape(lex)206 is_UnicodeEscape(lex)
207 struct lex *lex; /* 7.6 */
208 {
209 SEE_unicode_t lookahead[6];
210 int lookahead_len;
211
212 lookahead_len = LOOKAHEAD(lookahead, 6);
213 return (lookahead_len >= 6 &&
214 lookahead[0] == '\\' &&
215 lookahead[1] == 'u' &&
216 is_HexDigit(lookahead[2]) &&
217 is_HexDigit(lookahead[3]) &&
218 is_HexDigit(lookahead[4]) &&
219 is_HexDigit(lookahead[5]));
220 }
221
222 static int
is_IdentifierStart(lex)223 is_IdentifierStart(lex)
224 struct lex *lex; /* 7.6 */
225 {
226 SEE_unicode_t c;
227
228 if (ATEOF)
229 return 0;
230 if (is_UnicodeEscape(lex))
231 return 1;
232 c = NEXT;
233 return UNICODE_IS_IS(c);
234 }
235
236 static int
is_IdentifierPart(lex)237 is_IdentifierPart(lex)
238 struct lex *lex; /* 7.6 */
239 {
240 SEE_unicode_t c;
241
242 if (ATEOF)
243 return 0;
244 if (is_UnicodeEscape(lex))
245 return 1;
246 c = NEXT;
247 return UNICODE_IS_IP(c);
248 }
249
250 static SEE_unicode_t
HexEscape(lex)251 HexEscape(lex)
252 struct lex *lex; /* 7.6 la \x */
253 {
254 int i;
255 SEE_unicode_t r = 0;
256 CONSUME('\\'); CONSUME('x');
257 for (i = 0; i < 2; i++) {
258 if (ATEOF) SYNTAX_ERROR(STR(unexpected_eof));
259 r = (r << 4) | HexValue(NEXT);
260 SKIP;
261 }
262 return r;
263 }
264
265 static SEE_unicode_t
UnicodeEscape(lex)266 UnicodeEscape(lex)
267 struct lex *lex; /* 7.6 la \u */
268 {
269 int i;
270 SEE_unicode_t r = 0;
271 CONSUME('\\'); CONSUME('u');
272 for (i = 0; i < 4; i++) {
273 if (ATEOF) SYNTAX_ERROR(STR(unexpected_eof));
274 r = (r << 4) | HexValue(NEXT);
275 SKIP;
276 }
277 return r;
278
279 /*
280 * XXX NOTE: the \uxxxx escape can only encode characters
281 * up to 0xffff. To express unicode characters above this
282 * codepoint, you would have to use a UTF-16 surrogate, but
283 * this is problematic. Better would be to augment ECMA-262
284 * with a \Uxxxxxxxx escape, such as Python provides.
285 * (spec bug?)
286 */
287 }
288
289 static int
DivPunctuator(lex)290 DivPunctuator(lex)
291 struct lex *lex; /* 7.7 la / */
292 {
293 CONSUME('/');
294 if (!ATEOF && NEXT == '=') {
295 SKIP;
296 return tDIVEQ;
297 }
298 return tDIV;
299 }
300
301 static int
LineTerminator(lex)302 LineTerminator(lex)
303 struct lex *lex; /* line terminator */
304 {
305 SEE_unicode_t lookahead[2];
306 int lookahead_len;
307
308 lookahead_len = LOOKAHEAD(lookahead, 2);
309 SEE_ASSERT(lex->input->interpreter, is_LineTerminator(lookahead[0]));
310 SKIP;
311 if (lookahead_len == 2 &&
312 lookahead[0] == '\r' &&
313 lookahead[1] == '\n')
314 {} /* Don't count the \r in a CRLF pair */
315 else
316 lex->next_lineno++;
317 return tLINETERMINATOR;
318 }
319
320 /* Skips all characters up to and including a line terminator (or EOF) */
321 static int
SkipToEndOfLine(lex)322 SkipToEndOfLine(lex)
323 struct lex *lex;
324 {
325 while (!ATEOF && !is_LineTerminator(NEXT))
326 SKIP;
327 if (ATEOF)
328 return tEND;
329 return LineTerminator(lex);
330 }
331
/*
 * Handles an SGML comment introducer; the lookahead is at "<!--".
 * The introducer is treated exactly like '//': the rest of the
 * line is discarded.
 */
static int
SGMLComment(struct lex *lex)
{
    return SkipToEndOfLine(lex);
}
342
/*
 * Handles an SGML comment terminator; the lookahead is at a "-->"
 * found at the beginning of a line.
 *
 * The closing '-->' is supposed to be protected by an actual '//'
 * comment leader (refer to Chapter 9 of 'Client-Side JavaScript
 * Guide', by Netscape), but for compatibility it is itself treated
 * like '//': the rest of the line is discarded.
 */
static int
SGMLCommentEnd(struct lex *lex)
{
    return SkipToEndOfLine(lex);
}
355
356 static int
Punctuator(lex)357 Punctuator(lex)
358 struct lex *lex; /* 7.7 */
359 {
360 SEE_unicode_t op[4]; /* ">>>=" is the longest punctuator */
361 struct token *t;
362 int j, len, oplen;
363 struct SEE_interpreter *interp = lex->input->interpreter;
364
365 if (ATEOF)
366 return tEND;
367 oplen = LOOKAHEAD(op, 4);
368 len = SEE_tok_noperators - 1;
369 if (len > oplen)
370 len = oplen;
371 for (; len > 0; len--)
372 for (t = SEE_tok_operators[len]; t->token; t++) {
373 for (j = 0; j < len; j++)
374 if (t->identifier[j] != op[j])
375 goto out;
376 if (t->token == tSGMLCOMMENT) {
377 if (interp->compatibility & SEE_COMPAT_SGMLCOM)
378 return SGMLComment(lex);
379 else
380 goto out;
381 }
382 if (t->token == tSGMLCOMMENTEND && lex->next_at_bol) {
383 if (interp->compatibility & SEE_COMPAT_SGMLCOM)
384 return SGMLCommentEnd(lex);
385 else
386 goto out;
387 }
388 for (j = 0; j < len; j++)
389 SKIP;
390 return t->token;
391 out:
392 /* continue */ ;
393 }
394
395 /*
396 * Throw a descriptive error message
397 */
398 if (op[0] == SEE_INPUT_BADCHAR)
399 SYNTAX_ERROR(SEE_string_sprintf(interp,
400 "malformed unicode input"));
401 else if (op[0] >= ' ' && op[0] <= '~')
402 SYNTAX_ERROR(SEE_string_sprintf(interp,
403 "unexpected character '%c'", op[0]));
404 else
405 SYNTAX_ERROR(SEE_string_sprintf(interp,
406 "unexpected character '\\u%04x'", op[0]));
407 /* NOTREACHED */
408 }
409
410 static int
StringLiteral(lex)411 StringLiteral(lex)
412 struct lex *lex; /* 7.8.4 la ' " */
413 {
414 SEE_unicode_t quote;
415 SEE_unicode_t c = 0;
416 struct SEE_string *s;
417 struct SEE_interpreter *interp = lex->input->interpreter;
418
419 s = SEE_string_new(interp, 0);
420 quote = NEXT;
421 SKIP;
422 while (!ATEOF && NEXT != quote) {
423 if (is_LineTerminator(NEXT))
424 SYNTAX_ERROR(STR(broken_literal));
425 else if (is_UnicodeEscape(lex))
426 c = UnicodeEscape(lex);
427 else if (is_HexEscape(lex))
428 c = HexEscape(lex);
429 else if (NEXT == '\\') {
430 SKIP;
431 if (is_LineTerminator(NEXT)) {
432 if (SEE_GET_JS_COMPAT(interp)) {
433 /* Ignore escaped LineTerminator */
434 SKIP;
435 continue;
436 }
437 SYNTAX_ERROR(STR(escaped_lit_nl));
438 }
439 else if (ATEOF)
440 SYNTAX_ERROR(STR(escaped_lit_nl));
441 switch (NEXT) {
442 case 'b': c = 0x0008; SKIP; break;
443 case 't': c = 0x0009; SKIP; break;
444 case 'n': c = 0x000a; SKIP; break;
445 case 'v': c = 0x000b; SKIP; break;
446 case 'f': c = 0x000c; SKIP; break;
447 case 'r': c = 0x000d; SKIP; break;
448 case '0': case '1': case '2': case '3':
449 c = NEXT - '0'; SKIP;
450 if (!ATEOF && NEXT >= '0' && NEXT <= '7')
451 { c = (c << 3) | (NEXT - '0'); SKIP; }
452 if (!ATEOF && NEXT >= '0' && NEXT <= '7')
453 { c = (c << 3) | (NEXT - '0'); SKIP; }
454 break;
455 case '4': case '5': case '6': case '7':
456 c = NEXT - '0'; SKIP;
457 if (!ATEOF && NEXT >= '0' && NEXT <= '7')
458 { c = (c << 3) | (NEXT - '0'); SKIP; }
459 break;
460 case 'x':
461 case 'u':
462 if (SEE_GET_JS_COMPAT(interp))
463 goto literal;
464 /* Strict ECMA: */
465 if (NEXT == 'x')
466 SYNTAX_ERROR(STR(invalid_esc_x));
467 else
468 SYNTAX_ERROR(STR(invalid_esc_u));
469 /* NOTREACHED */
470 default:
471 literal:
472 c = NEXT; SKIP; break;
473 }
474 } else {
475 c = NEXT;
476 SKIP;
477 }
478 SEE_string_append_unicode(s, c);
479 }
480 CONSUME(quote);
481 SEE_SET_STRING(&lex->value, s);
482 return tSTRING;
483 }
484
485 /*
486 * 7.8.5 Scans for a regular expression token.
487 * Assumes prev (immediately previous token) is either tDIV or tDIVEQ.
488 * Returns tREGEX on success or throws an exception on failure.
489 * The string in lex->value is of the form "/regex/flags"
490 */
491 static int
RegularExpressionLiteral(lex,prev)492 RegularExpressionLiteral(lex, prev)
493 struct lex *lex;
494 int prev;
495 {
496 struct SEE_string *s;
497 int incc = 0;
498 struct SEE_interpreter *interp = lex->input->interpreter;
499
500 s = SEE_string_new(interp, 0);
501 SEE_string_addch(s, '/');
502 if (prev == tDIVEQ)
503 SEE_string_addch(s, '=');
504 while (!ATEOF) {
505 if (NEXT == '/' &&
506 (!incc || !(SEE_GET_JS_COMPAT(interp)))) /* EXT:15 */
507 break;
508 if (NEXT == '\\') {
509 SEE_string_addch(s, '\\');
510 SKIP;
511 if (ATEOF) break;
512 } else {
513 /* Track charclasses for JS_COMPAT */
514 if (NEXT == '[') incc = 1;
515 if (NEXT == ']') incc = 0;
516 }
517 if (is_LineTerminator(NEXT))
518 SYNTAX_ERROR(STR(broken_regex));
519 SEE_string_append_unicode(s, NEXT);
520 SKIP;
521 }
522 if (ATEOF)
523 SYNTAX_ERROR(STR(eof_in_regex));
524 CONSUME('/');
525
526 SEE_string_addch(s, '/');
527 while (!ATEOF && is_IdentifierPart(lex)) {
528 SEE_string_append_unicode(s, NEXT);
529 SKIP;
530 }
531
532 SEE_SET_STRING(&lex->value, s);
533 return tREGEX;
534 }
535
536 static int
NumericLiteral(lex)537 NumericLiteral(lex)
538 struct lex *lex; /* 7.8.3 la [.0-9] */
539 {
540 SEE_number_t n, e;
541 int seendigit;
542 unsigned int i;
543 struct SEE_string *s;
544 char *numbuf, *endstr;
545 struct SEE_interpreter *interp = lex->input->interpreter;
546
547 seendigit = 0;
548 n = 0;
549 s = SEE_string_new(interp, 0);
550
551 if (NEXT == '0') {
552 SKIP;
553 if (!ATEOF && (NEXT == 'x' || NEXT == 'X')) {
554 SKIP;
555 if (ATEOF || !is_HexDigit(NEXT))
556 SYNTAX_ERROR(STR(hex_literal_detritus));
557 while (!ATEOF && is_HexDigit(NEXT)) {
558 SEE_string_addch(s, (SEE_char_t)NEXT);
559 SKIP;
560 }
561 if (!ATEOF && is_IdentifierStart(lex))
562 SYNTAX_ERROR(STR(hex_literal_detritus));
563 e = 1;
564 for (i = 0; i < s->length; i++) {
565 n += e * HexValue(s->data[s->length - i - 1]);
566 e *= 16;
567 }
568 SEE_SET_NUMBER(&lex->value, n);
569 return tNUMBER;
570 }
571 SEE_string_addch(s, '0');
572 seendigit = 1;
573 }
574
575 while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
576 SEE_string_addch(s, (SEE_char_t)NEXT);
577 seendigit = 1;
578 SKIP;
579 }
580
581 /* Octal integers */
582 if (SEE_GET_JS_COMPAT(interp)
583 && seendigit
584 && (ATEOF || (NEXT != '.' && NEXT != 'e' && NEXT != 'E'))
585 && s->length > 1
586 && s->data[0] == '0')
587 {
588 /* Octal integers start with 0 and dont follow with . or e */
589 n = 0;
590 for (i = 1; i < s->length; i++) {
591 if (s->data[i] > '7')
592 goto not_octal;
593 n = n * 8 + s->data[i] - '0';
594 }
595 if (!ATEOF && is_IdentifierStart(lex))
596 goto not_octal;
597 SEE_SET_NUMBER(&lex->value, n);
598 return tNUMBER;
599 }
600 not_octal:
601
602 if (!ATEOF && NEXT == '.') {
603 SEE_string_addch(s, '.');
604 SKIP;
605 while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
606 seendigit = 1;
607 SEE_string_addch(s, (SEE_char_t)NEXT);
608 SKIP;
609 }
610 }
611 if (!seendigit) {
612 /* free(s) */
613 return '.'; /* Actually matched Punctuator! */
614 }
615
616 if (!ATEOF && (NEXT == 'e' || NEXT == 'E')) {
617 SEE_string_addch(s, (SEE_char_t)NEXT);
618 SKIP;
619 seendigit = 0;
620 if (!ATEOF && NEXT == '-') {
621 SEE_string_addch(s, '-');
622 SKIP;
623 } else if (!ATEOF && NEXT == '+') {
624 SEE_string_addch(s, '+');
625 SKIP;
626 }
627 e = 0;
628 while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
629 seendigit = 1;
630 SEE_string_addch(s, (SEE_char_t)NEXT);
631 SKIP;
632 }
633 if (!seendigit)
634 SYNTAX_ERROR(STR(dec_literal_detritus));
635 }
636
637 numbuf = SEE_STRING_ALLOCA(interp, char, s->length + 1);
638 for (i = 0; i < s->length; i++)
639 numbuf[i] = s->data[i] & 0x7f;
640 numbuf[i] = '\0';
641 endstr = NULL;
642 n = SEE_strtod(numbuf, &endstr);
643 if (!endstr || *endstr) /* impossible condition? */
644 SYNTAX_ERROR(STR(dec_literal_detritus));
645 SEE_SET_NUMBER(&lex->value, n);
646 return tNUMBER;
647 }
648
649 static int
CommentDiv(lex)650 CommentDiv(lex)
651 struct lex *lex; /* 7.4 la / */
652 {
653 SEE_unicode_t lookahead[2];
654 int lookahead_len;
655
656 lookahead_len = LOOKAHEAD(lookahead, 2);
657
658 if (lookahead_len >= 2 && lookahead[0] == '/' && lookahead[1] == '*') {
659 int starprev = 0, contains_newline = 0;
660 SKIP;
661 SKIP;
662 while (!ATEOF) {
663 if (starprev && NEXT == '/') {
664 CONSUME('/');
665 return contains_newline
666 ? tLINETERMINATOR
667 : tCOMMENT;
668 }
669 if (is_LineTerminator(NEXT)) {
670 (void)LineTerminator(lex);
671 contains_newline = 1;
672 starprev = 0;
673 } else {
674 starprev = (NEXT == '*');
675 SKIP;
676 }
677 }
678 SYNTAX_ERROR(STR(eof_in_c_comment));
679 }
680 if (lookahead_len >= 2 && lookahead[0] == '/' && lookahead[1] == '/')
681 return SkipToEndOfLine(lex);
682
683 /*
684 * NB: This assumes regular expressions not wanted,
685 * and that the rest of the regex can be scanned later
686 * if the parser wants it.
687 */
688 return DivPunctuator(lex);
689 }
690
691 static int
Token(lex)692 Token(lex)
693 struct lex *lex; /* 7.5 */
694 {
695 struct SEE_interpreter *interp = lex->input->interpreter;
696
697 if (ATEOF)
698 return tEND;
699
700 if (NEXT == '\'' || NEXT == '\"')
701 return StringLiteral(lex);
702
703 if ((NEXT >= '0' && NEXT <= '9') || NEXT == '.')
704 return NumericLiteral(lex);
705
706 if (is_IdentifierStart(lex)) {
707 int hasescape = 0, i;
708 struct SEE_string *s;
709 SEE_unicode_t c;
710
711 s = SEE_string_new(interp, 0);
712 do {
713 if (is_UnicodeEscape(lex)) {
714 c = UnicodeEscape(lex);
715 if (s->length == 0) {
716 if (!UNICODE_IS_IS(c))
717 SYNTAX_ERROR(STR(bad_unicode_ident));
718 } else
719 if (!UNICODE_IS_IP(c))
720 SYNTAX_ERROR(STR(bad_unicode_ident));
721 hasescape = 1;
722 } else {
723 c = NEXT;
724 SKIP;
725 }
726 SEE_string_append_unicode(s, c);
727 } while (is_IdentifierPart(lex));
728
729 /* match keywords */
730 if (!hasescape)
731 for (i = 0; i < SEE_tok_nkeywords; i++) {
732 const struct SEE_string *keyword;
733
734 keyword = STRn(SEE_tok_keywords[i].index);
735 if (keyword->length == s->length &&
736 SEE_string_cmp(keyword, s) == 0)
737 {
738 int token = SEE_tok_keywords[i].token;
739 if (token == tRESERVED &&
740 /* EXT:3 */ SEE_COMPAT_JS(interp, >=, JS11))
741 {
742 #ifndef NDEBUG
743 dprintf("Warning: line %d: reserved token '",
744 lex->next_lineno);
745 dprints(s);
746 dprintf("' treated as identifier\n");
747 #endif
748 break;
749 }
750 return token;
751 }
752 }
753
754 SEE_intern_and_free(interp, &s);
755 SEE_SET_STRING(&lex->value, s);
756 return tIDENT;
757 }
758
759 return Punctuator(lex);
760 }
761
762
763
764 /*
765 * Scanner grammar goal. Scans lex->input for a token, and returns it.
766 *
767 * May return multiple tLINETERMINATORs, but will never return tCOMMENT.
768 * Scans the InputElementDiv production (never InputElementRegex).
769 * If this function returns tDIV or tDIVEQ, and a regular expression is wanted,
770 * then SEE_lex_regex() should be called immediately.
771 */
772 static int
lex0(lex)773 lex0(lex)
774 struct lex *lex;
775 {
776 int ret;
777
778 again:
779
780 while (!ATEOF && is_WhiteSpace(NEXT) && !is_LineTerminator(NEXT))
781 SKIP; /* skip non-newline whitespace */
782 if (ATEOF)
783 return tEND;
784 if (is_LineTerminator(NEXT))
785 return LineTerminator(lex);
786
787 switch (NEXT) {
788 case '/':
789 ret = CommentDiv(lex);
790 if (ret == tCOMMENT)
791 goto again; /* Discard tCOMMENTs */
792 return ret;
793 case '\"':
794 case '\'':
795 return StringLiteral(lex);
796 case '0': case '1': case '2': case '3': case '4':
797 case '5': case '6': case '7': case '8': case '9':
798 return NumericLiteral(lex);
799 case '.':
800 {
801 SEE_unicode_t lookahead[2];
802 int lookahead_len;
803
804 lookahead_len = LOOKAHEAD(lookahead, 2);
805 if (lookahead_len >= 2
806 && lookahead[1] >= '0'
807 && lookahead[1] <= '9')
808 return NumericLiteral(lex);
809 SKIP;
810 return '.';
811 }
812 default:
813 return Token(lex);
814 }
815 }
816
817 /*------------------------------------------------------------
818 * Public API
819 */
820
821 /*
822 * Initialises a tokenizer structure
823 */
824 void
SEE_lex_init(lex,inp)825 SEE_lex_init(lex, inp)
826 struct lex *lex;
827 struct SEE_input *inp;
828 {
829 lex->input = inp;
830 SEE_SET_UNDEFINED(&lex->value);
831 lex->next_lineno = inp->first_lineno;
832 lex->next_filename = SEE_intern(inp->interpreter, inp->filename);
833 lex->next_at_bol = 1;
834 (void)SEE_lex_next(lex);
835 }
836
837 /*
838 * Main interface to the lexical anaylser.
839 *
840 * We keep a one-token lookahead.
841 * Each call to this function generates a new lookahead token
842 * (in lex->next) and returns the previous one, so
843 * the lex flags apply to the scanning of the NEXT token,
844 * and NOT to the token being returned. (ie The caller should
845 * generally refer to the resulting lex->next to make
846 * decisions. The value returned is merely a convenience.)
847 *
848 * On return, this function also sets (or clears) the
849 * lex->next_follows_nl flag when a newline is seen immediately
850 * before lex->next. The parser should use this information to
851 * perform automatic semicolon insertion. Note that the defined
852 * tLINETERMINATOR token is an internal scanner pseudo-token and
853 * is never returned by this function. Use the next_follows_nl flag.
854 *
855 * As a special case, if end-of-file (tEND) does not follow
856 * a line terminator, then this function pretends that it does.
857 *
858 * The lex->next_lineno field reflects the line number of
859 * lex->next.
860 */
861 int
SEE_lex_next(lex)862 SEE_lex_next(lex)
863 struct lex *lex;
864 {
865 int next, token;
866
867 lex->next_follows_nl = 0;
868 next = lex->next;
869
870 token = lex0(lex);
871 while (token == tLINETERMINATOR) {
872 #ifndef NDEBUG
873 if (SEE_lex_debug && !lex->next_follows_nl)
874 dprintf("lex: [LINETERMINATOR]\n");
875
876 #endif
877 lex->next_follows_nl = 1;
878 lex->next_at_bol = 1;
879 token = lex0(lex);
880 }
881 lex->next_at_bol = 0;
882
883 if (token == tEND)
884 lex->next_follows_nl = 1;
885 lex->next = token;
886
887 #ifndef NDEBUG
888 if (SEE_lex_debug)
889 switch (lex->next) {
890 case tIDENT:
891 dprintf("lex: tIDENT ");
892 dprintv(lex->input->interpreter, &lex->value);
893 dprintf("\n"); break;
894 case tSTRING:
895 dprintf("lex: tSTRING ");
896 dprintv(lex->input->interpreter, &lex->value);
897 dprintf("\n"); break;
898 case tNUMBER:
899 dprintf("lex: tNUMBER ");
900 dprintv(lex->input->interpreter, &lex->value);
901 dprintf("\n"); break;
902 default:
903 dprintf("lex: %s\n", SEE_tokenname(lex->next));
904 }
905 #endif
906
907 return next;
908 }
909
910 /*
911 * Converts the next token (just scanned) into a regular expression,
912 * if possible.
913 */
914 void
SEE_lex_regex(lex)915 SEE_lex_regex(lex)
916 struct lex *lex;
917 {
918 if (lex->next == tDIV || lex->next == tDIVEQ)
919 lex->next = RegularExpressionLiteral(lex, lex->next);
920 }
921
922 /*
923 * 9.3.1
924 * Scans a SEE_string to convert it into a number.
925 * On success, sets res to the resulting number and returns non-zero.
926 *
927 * This function is called by SEE_ToNumber().
928 */
929 int
SEE_lex_number(interp,s,res)930 SEE_lex_number(interp, s, res)
931 struct SEE_interpreter *interp;
932 struct SEE_string *s;
933 struct SEE_value *res;
934 {
935 SEE_number_t n, sign;
936 int seendig, hexok;
937 int len = s->length;
938 int i, pos;
939 int start;
940 char *numbuf, *endstr;
941
942 /* These work becuase we expect no Unicode surrogates in numbers */
943 #undef ATEOF
944 #undef NEXT
945 #undef SKIP
946 #define ATEOF (pos >= len)
947 #define NEXT (s->data[pos])
948 #define SKIP pos++
949
950 pos = 0;
951
952 /* StrWhiteSpace */
953 while (!ATEOF && (is_WhiteSpace(NEXT) || is_LineTerminator(NEXT)))
954 SKIP;
955
956 if (ATEOF) {
957 SEE_SET_NUMBER(res, 0); /* +0 */
958 return 1;
959 }
960
961 sign = 0;
962 if (NEXT == '-') {
963 sign = NEGATIVE;
964 SKIP;
965 } else if (NEXT == '+') {
966 sign = POSITIVE;
967 SKIP;
968 }
969
970 /* Strict ECMA262-3 hex strings require no sign. Netscape relaxes this. */
971 hexok = !sign || SEE_GET_JS_COMPAT(interp);
972
973 if (ATEOF) goto fail;
974 if (NEXT == 'I') {
975 SKIP; if (ATEOF || NEXT != 'n') goto fail;
976 SKIP; if (ATEOF || NEXT != 'f') goto fail;
977 SKIP; if (ATEOF || NEXT != 'i') goto fail;
978 SKIP; if (ATEOF || NEXT != 'n') goto fail;
979 SKIP; if (ATEOF || NEXT != 'i') goto fail;
980 SKIP; if (ATEOF || NEXT != 't') goto fail;
981 SKIP; if (ATEOF || NEXT != 'y') goto fail;
982 SKIP; n = SEE_Infinity;
983 } else {
984 n = 0;
985 start = pos;
986
987 /* Hexadecimal */
988 if (hexok && pos + 1 < len && s->data[pos] == '0' &&
989 (s->data[pos+1] == 'x' || s->data[pos+1] == 'X'))
990 {
991 SKIP;
992 SKIP;
993 seendig = 0;
994 while (!ATEOF && is_HexDigit(NEXT)) {
995 seendig = 1;
996 n = 16 * n + HexValue(NEXT);
997 SKIP;
998 }
999 if (!seendig) goto fail;
1000 goto out;
1001 }
1002
1003 #if 0
1004 /* Octal */
1005 if (SEE_COMPAT_JS(interp, >=, JS11) && /* EXT:4 */
1006 !ATEOF && NEXT == '0' &&
1007 !(pos + 1 < len && (s->data[pos+1] == '.' ||
1008 s->data[pos+1] == 'e' || s->data[pos+1] == 'E')))
1009 {
1010 SKIP;
1011 n = 0;
1012 while (!ATEOF && NEXT >= '0' && NEXT <= '7') {
1013 n = 8 * n + NEXT - '0';
1014 SKIP;
1015 }
1016 goto out;
1017 }
1018 #endif
1019
1020 /*
1021 * After this point, we expect to use strtod, so we
1022 * just check for character validity, rather than computing n.
1023 */
1024 seendig = 0;
1025 while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
1026 seendig = 1;
1027 SKIP;
1028 }
1029 if (!ATEOF && NEXT == '.') {
1030 SKIP; /* '.' */
1031 while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
1032 seendig = 1;
1033 SKIP;
1034 }
1035 }
1036 if (!seendig) goto fail; /* a lone dot is illegal */
1037 if (!ATEOF && (NEXT == 'e' || NEXT == 'E')) {
1038 SKIP;
1039 if (!ATEOF && NEXT == '-') {
1040 SKIP;
1041 } else if (!ATEOF && NEXT == '+')
1042 SKIP;
1043 seendig = 0;
1044 while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
1045 seendig = 1;
1046 SKIP;
1047 }
1048 if (!seendig) goto fail;
1049 }
1050 numbuf = SEE_STRING_ALLOCA(interp, char, pos - start + 1);
1051 for (i = 0; i < pos - start; i++)
1052 numbuf[i] = s->data[i + start] & 0x7f;
1053 numbuf[i] = '\0';
1054 endstr = NULL;
1055 n = SEE_strtod(numbuf, &endstr);
1056 if (!endstr || *endstr != '\0')
1057 goto fail;
1058 }
1059
1060 out:
1061 if (!sign) sign = POSITIVE;
1062
1063 /* trailing StrWhiteSpace */
1064 while (!ATEOF && (is_WhiteSpace(NEXT) || is_LineTerminator(NEXT)))
1065 SKIP;
1066 if (ATEOF) {
1067 SEE_SET_NUMBER(res, SEE_COPYSIGN(n, sign));
1068 return 1;
1069 }
1070
1071 fail:
1072 return 0;
1073 }
1074
1075