1 /*
2  * Copyright (c) 2003
3  *      David Leonard.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of David Leonard nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #if HAVE_CONFIG_H
32 # include <config.h>
33 #endif
34 
35 #if STDC_HEADERS
36 # include <stdio.h>
37 # include <stdlib.h>
38 #endif
39 
40 #if HAVE_STRING_H
41 # include <string.h>
42 #endif
43 
44 #include <see/type.h>
45 #include <see/string.h>
46 #include <see/value.h>
47 #include <see/object.h>
48 #include <see/input.h>
49 #include <see/try.h>
50 #include <see/intern.h>
51 #include <see/error.h>
52 #include <see/debug.h>
53 #include <see/interpreter.h>
54 #include <see/system.h>
55 #include "tokens.h"
56 #include "lex.h"
57 #include "stringdefs.h"
58 #include "unicode.h"
59 #include "dtoa.h"
60 #include "dprint.h"
61 #include "nmath.h"
62 
/*
 * Debug switch, only compiled in when NDEBUG is not defined.
 * When non-zero, SEE_lex_next() prints every token it scans.
 */
#ifndef NDEBUG
int SEE_lex_debug = 0;
#endif
66 
67 /*
68  * Lexical analyser.
69  *
 * This is a lexical analyser for ECMAScript. It uses a 6-character
 * lookahead input 'filter' (to detect '\u####') and provides an
 * interface that reveals if the returned token was immediately
 * preceded by a line terminator.
74  *
 * The lexical analyser's behaviour when deciding whether a slash '/'
 * is a division or the start of a regular expression is determined by
 * a flag. The parser is expected to set it.
78  *
 * NOTE: Although all strings generated for ECMAScript are UTF-16,
 * this lexer requires UTF-32 (UCS-4) input.
81  */
82 
/*
 * Macros that assume a local variable named 'lex' (a struct lex *).
 * Note that SEE_lex_number(), further down, #undefs ATEOF, NEXT and SKIP
 * and redefines them to walk a SEE_string instead of an input stream.
 */

/* NEXT: the input's current lookahead character (not consumed) */
#define NEXT		lex->input->lookahead
/*
 * SKIP: advance the input by one character, then keep advancing while
 * not at EOF and the new lookahead is a format-control character.
 */
#define SKIP		do { SEE_INPUT_NEXT(lex->input);		\
			} while (!ATEOF && is_FormatControl(NEXT))
/*
 * UNGET: NOTE(review): refers to lex->la and lex->lalen, which nothing
 * else in this file uses; confirm that struct lex still has these
 * members before relying on this macro.
 */
#define UNGET(c)	do { lex->la[++lex->lalen]=(c); } while (0)
/* ATEOF: the input's end-of-file flag */
#define ATEOF		(lex->input->eof)
/*
 * LOOKAHEAD: copy up to len upcoming characters into buf without
 * consuming them. Callers in this file treat the result as the number
 * of characters that were available.
 */
#define LOOKAHEAD(buf, len) SEE_input_lookahead_copy(lex->input, buf, len)
/*
 * CONSUME: require that the next character is ch and step over it.
 * Throws a SyntaxError at end of input or on any other character.
 */
#define CONSUME(ch)							\
    do {								\
	if (ATEOF)							\
	    SYNTAX_ERROR(STR(unexpected_eof));				\
	if (NEXT != (ch))						\
	    SYNTAX_ERROR(SEE_string_sprintf(				\
		lex->input->interpreter, "expected '%c'", (ch)));	\
	SKIP;								\
    } while (0)

/*
 * SYNTAX_ERROR: throw the interpreter's SyntaxError with message s,
 * prefixed with the current line number by prefix_msg().
 */
#define SYNTAX_ERROR(s)							\
	SEE_error_throw_string(lex->input->interpreter,			\
	    lex->input->interpreter->SyntaxError,			\
	    prefix_msg(s, lex))

/* Sign constants (used by SEE_lex_number; zero there means "no sign seen") */
#define NEGATIVE	(-1)
#define POSITIVE	(1)
108 
/*
 * Prototypes for the file-local helpers defined below.
 * The is_* functions are predicates; the functions named after grammar
 * productions scan input and return a token code.
 */
static struct SEE_string *prefix_msg(struct SEE_string *s, struct lex *lex);
static int is_FormatControl(SEE_unicode_t c);
static int is_WhiteSpace(SEE_unicode_t c);
static int is_LineTerminator(SEE_unicode_t c);
static int is_HexDigit(SEE_unicode_t c);
static int HexValue(SEE_unicode_t c);
static int is_HexEscape(struct lex *lex);
static int is_UnicodeEscape(struct lex *lex);
static int is_IdentifierStart(struct lex *lex);
static int is_IdentifierPart(struct lex *lex);
static SEE_unicode_t HexEscape(struct lex *lex);
static SEE_unicode_t UnicodeEscape(struct lex *lex);
static int DivPunctuator(struct lex *lex);
static int LineTerminator(struct lex *lex);
static int SkipToEndOfLine(struct lex *lex);
static int SGMLComment(struct lex *lex);
static int SGMLCommentEnd(struct lex *lex);
static int Punctuator(struct lex *lex);
static int StringLiteral(struct lex *lex);
static int RegularExpressionLiteral(struct lex *lex, int prev);
static int NumericLiteral(struct lex *lex);
static int CommentDiv(struct lex *lex);
static int Token(struct lex *lex);
static int lex0(struct lex *lex);
134 
135 /* Returns ("line " + next_lineno + ": " + s) */
136 static struct SEE_string *
prefix_msg(s,lex)137 prefix_msg(s, lex)
138 	struct SEE_string *s;
139 	struct lex *lex;
140 {
141 	struct SEE_string *t;
142 	struct SEE_interpreter *interp = lex->input->interpreter;
143 
144 	t = SEE_string_sprintf(interp, "line %d: ", lex->next_lineno);
145 	SEE_string_append(t, s);
146 	return t;
147 }
148 
149 static int
is_FormatControl(c)150 is_FormatControl(c)
151 	SEE_unicode_t c;			/* 7.1 */
152 {
153 	return UNICODE_IS_Cf(c);	/* category Cf or L or R */
154 }
155 
156 static int
is_WhiteSpace(c)157 is_WhiteSpace(c)
158 	SEE_unicode_t c;			/* 7.2 */
159 {
160 	return (c == 0x0009 || c == 0x000B || c == 0x000C || c == 0x0020
161 		|| c == 0x00A0 || UNICODE_IS_Zs(c));
162 }
163 
164 static int
is_LineTerminator(c)165 is_LineTerminator(c)
166 	SEE_unicode_t c;			/* 7.3 */
167 {
168 	return (c == 0x000A || c == 0x000D || c == 0x2028 || c == 0x2029);
169 }
170 
171 static int
is_HexDigit(c)172 is_HexDigit(c)
173 	SEE_unicode_t c;
174 {
175 	return ((c >= '0' && c <= '9') ||
176 		(c >= 'A' && c <= 'F') ||
177 		(c >= 'a' && c <= 'f'));
178 }
179 
180 /* Returns the hexadecimal value of a character. Assumes char is a hex digit */
181 static int
HexValue(c)182 HexValue(c)
183 	SEE_unicode_t c;
184 {
185 	if (c >= '0' && c <= '9')		return c - '0';
186 	else if (c >= 'a' && c <= 'f')		return c - 'a' + 10;
187 	else /* (c >= 'A' && c <= 'F') */	return c - 'A' + 10;
188 }
189 
190 static int
is_HexEscape(lex)191 is_HexEscape(lex)
192 	struct lex *lex;			/* 7.6 */
193 {
194 	SEE_unicode_t lookahead[4];
195 	int lookahead_len;
196 
197 	lookahead_len = LOOKAHEAD(lookahead, 4);
198 	return (lookahead_len >= 4 &&
199 		lookahead[0] == '\\' &&
200 		lookahead[1] == 'x' &&
201 		is_HexDigit(lookahead[2]) &&
202 		is_HexDigit(lookahead[3]));
203 }
204 
205 static int
is_UnicodeEscape(lex)206 is_UnicodeEscape(lex)
207 	struct lex *lex;			/* 7.6 */
208 {
209 	SEE_unicode_t lookahead[6];
210 	int lookahead_len;
211 
212 	lookahead_len = LOOKAHEAD(lookahead, 6);
213 	return (lookahead_len >= 6 &&
214 		lookahead[0] == '\\' &&
215 		lookahead[1] == 'u' &&
216 		is_HexDigit(lookahead[2]) &&
217 		is_HexDigit(lookahead[3]) &&
218 		is_HexDigit(lookahead[4]) &&
219 		is_HexDigit(lookahead[5]));
220 }
221 
222 static int
is_IdentifierStart(lex)223 is_IdentifierStart(lex)
224 	struct lex *lex;			/* 7.6 */
225 {
226 	SEE_unicode_t c;
227 
228 	if (ATEOF)
229 		return 0;
230 	if (is_UnicodeEscape(lex))
231 		return 1;
232 	c = NEXT;
233 	return UNICODE_IS_IS(c);
234 }
235 
236 static int
is_IdentifierPart(lex)237 is_IdentifierPart(lex)
238 	struct lex *lex;			/* 7.6 */
239 {
240 	SEE_unicode_t c;
241 
242 	if (ATEOF)
243 		return 0;
244 	if (is_UnicodeEscape(lex))
245 		return 1;
246 	c = NEXT;
247 	return UNICODE_IS_IP(c);
248 }
249 
250 static SEE_unicode_t
HexEscape(lex)251 HexEscape(lex)
252 	struct lex *lex;			/* 7.6 la \x */
253 {
254 	int i;
255 	SEE_unicode_t r = 0;
256 	CONSUME('\\'); CONSUME('x');
257 	for (i = 0; i < 2; i++) {
258 		if (ATEOF) SYNTAX_ERROR(STR(unexpected_eof));
259 		r = (r << 4) | HexValue(NEXT);
260 		SKIP;
261 	}
262 	return r;
263 }
264 
265 static SEE_unicode_t
UnicodeEscape(lex)266 UnicodeEscape(lex)
267 	struct lex *lex;			/* 7.6 la \u */
268 {
269 	int i;
270 	SEE_unicode_t r = 0;
271 	CONSUME('\\'); CONSUME('u');
272 	for (i = 0; i < 4; i++) {
273 		if (ATEOF) SYNTAX_ERROR(STR(unexpected_eof));
274 		r = (r << 4) | HexValue(NEXT);
275 		SKIP;
276 	}
277 	return r;
278 
279 	/*
280 	 * XXX NOTE: the \uxxxx escape can only encode characters
281 	 * up to 0xffff. To express unicode characters above this
282 	 * codepoint, you would have to use a UTF-16 surrogate, but
283 	 * this is problematic. Better would be to augment ECMA-262
284 	 * with a \Uxxxxxxxx escape, such as Python provides.
285 	 * (spec bug?)
286 	 */
287 }
288 
289 static int
DivPunctuator(lex)290 DivPunctuator(lex)
291 	struct lex *lex;			/* 7.7 la / */
292 {
293 	CONSUME('/');
294 	if (!ATEOF && NEXT == '=') {
295 		SKIP;
296 		return tDIVEQ;
297 	}
298 	return tDIV;
299 }
300 
301 static int
LineTerminator(lex)302 LineTerminator(lex)
303 	struct lex *lex;			/* line terminator */
304 {
305 	SEE_unicode_t lookahead[2];
306 	int lookahead_len;
307 
308 	lookahead_len = LOOKAHEAD(lookahead, 2);
309 	SEE_ASSERT(lex->input->interpreter, is_LineTerminator(lookahead[0]));
310 	SKIP;
311 	if (lookahead_len == 2 &&
312 	    lookahead[0] == '\r' &&
313 	    lookahead[1] == '\n')
314 	    {} /* Don't count the \r in a CRLF pair */
315 	else
316 	    lex->next_lineno++;
317 	return tLINETERMINATOR;
318 }
319 
320 /* Skips all characters up to and including a line terminator (or EOF) */
321 static int
SkipToEndOfLine(lex)322 SkipToEndOfLine(lex)
323 	struct lex *lex;
324 {
325 	while (!ATEOF && !is_LineTerminator(NEXT))
326 		SKIP;
327 	if (ATEOF)
328 		return tEND;
329 	return LineTerminator(lex);
330 }
331 
/*
 * Handles an SGML comment introducer. It is treated the same as
 * '//', i.e. everything up to the end of the line is ignored.
 */
static int
SGMLComment(lex)
	struct lex *lex;			/* la <!-- */
{
	int tok;

	tok = SkipToEndOfLine(lex);
	return tok;
}
342 
/*
 * Handles a closing SGML comment marker at the start of a line.
 *
 * The closing '-->' is supposed to be protected by an actual '//'
 * comment leader (refer to Chapter 9 of 'Client-Side JavaScript
 * Guide', by Netscape), but for compatibility it is treated as if it
 * were '//' itself: the rest of the line is ignored.
 */
static int
SGMLCommentEnd(lex)
	struct lex *lex;			/* la (^) --> */
{
	int tok;

	tok = SkipToEndOfLine(lex);
	return tok;
}
355 
356 static int
Punctuator(lex)357 Punctuator(lex)
358 	struct lex *lex;			/* 7.7 */
359 {
360 	SEE_unicode_t op[4];	/* ">>>=" is the longest punctuator */
361 	struct token *t;
362 	int j, len, oplen;
363 	struct SEE_interpreter *interp = lex->input->interpreter;
364 
365 	if (ATEOF)
366 		return tEND;
367 	oplen = LOOKAHEAD(op, 4);
368 	len = SEE_tok_noperators - 1;
369 	if (len > oplen)
370 		len = oplen;
371 	for (; len > 0; len--)
372 		for (t = SEE_tok_operators[len]; t->token; t++) {
373 			for (j = 0; j < len; j++)
374 			    if (t->identifier[j] != op[j])
375 				goto out;
376 			if (t->token == tSGMLCOMMENT) {
377 			    if (interp->compatibility & SEE_COMPAT_SGMLCOM)
378 				return SGMLComment(lex);
379 			    else
380 				goto out;
381 			}
382 			if (t->token == tSGMLCOMMENTEND && lex->next_at_bol) {
383 			    if (interp->compatibility & SEE_COMPAT_SGMLCOM)
384 				return SGMLCommentEnd(lex);
385 			    else
386 				goto out;
387 			}
388 			for (j = 0; j < len; j++)
389 			    SKIP;
390 			return t->token;
391 	   out:
392 			/* continue */ ;
393 		}
394 
395 	/*
396 	 * Throw a descriptive error message
397 	 */
398 	if (op[0] == SEE_INPUT_BADCHAR)
399 		SYNTAX_ERROR(SEE_string_sprintf(interp,
400 			"malformed unicode input"));
401 	else if (op[0] >= ' ' && op[0] <= '~')
402 		SYNTAX_ERROR(SEE_string_sprintf(interp,
403 			"unexpected character '%c'", op[0]));
404 	else
405 		SYNTAX_ERROR(SEE_string_sprintf(interp,
406 			"unexpected character '\\u%04x'", op[0]));
407 	/* NOTREACHED */
408 }
409 
410 static int
StringLiteral(lex)411 StringLiteral(lex)
412 	struct lex *lex;			/* 7.8.4 la ' " */
413 {
414 	SEE_unicode_t quote;
415 	SEE_unicode_t c = 0;
416 	struct SEE_string *s;
417 	struct SEE_interpreter *interp = lex->input->interpreter;
418 
419 	s = SEE_string_new(interp, 0);
420 	quote = NEXT;
421 	SKIP;
422 	while (!ATEOF && NEXT != quote) {
423 		if (is_LineTerminator(NEXT))
424 			SYNTAX_ERROR(STR(broken_literal));
425 		else if (is_UnicodeEscape(lex))
426 			c = UnicodeEscape(lex);
427 		else if (is_HexEscape(lex))
428 			c = HexEscape(lex);
429 		else if (NEXT == '\\') {
430 			SKIP;
431 			if (is_LineTerminator(NEXT)) {
432 			    if (SEE_GET_JS_COMPAT(interp)) {
433 				/* Ignore escaped LineTerminator */
434 				SKIP;
435 				continue;
436 			    }
437 			    SYNTAX_ERROR(STR(escaped_lit_nl));
438 			}
439 			else if (ATEOF)
440 			    SYNTAX_ERROR(STR(escaped_lit_nl));
441 			switch (NEXT) {
442 			case 'b':	c = 0x0008; SKIP; break;
443 			case 't':	c = 0x0009; SKIP; break;
444 			case 'n':	c = 0x000a; SKIP; break;
445 			case 'v':	c = 0x000b; SKIP; break;
446 			case 'f':	c = 0x000c; SKIP; break;
447 			case 'r':	c = 0x000d; SKIP; break;
448 			case '0': case '1': case '2': case '3':
449 				c = NEXT - '0'; SKIP;
450 				if (!ATEOF && NEXT >= '0' && NEXT <= '7')
451 					{ c = (c << 3) | (NEXT - '0'); SKIP; }
452 				if (!ATEOF && NEXT >= '0' && NEXT <= '7')
453 					{ c = (c << 3) | (NEXT - '0'); SKIP; }
454 				break;
455 			case '4': case '5': case '6': case '7':
456 				c = NEXT - '0'; SKIP;
457 				if (!ATEOF && NEXT >= '0' && NEXT <= '7')
458 					{ c = (c << 3) | (NEXT - '0'); SKIP; }
459 				break;
460 			case 'x':
461 			case 'u':
462 				if (SEE_GET_JS_COMPAT(interp))
463 				    goto literal;
464 				/* Strict ECMA: */
465 				if (NEXT == 'x')
466 				     SYNTAX_ERROR(STR(invalid_esc_x));
467 				else
468 				     SYNTAX_ERROR(STR(invalid_esc_u));
469 				/* NOTREACHED */
470 			default:
471 	literal:
472 				c = NEXT; SKIP; break;
473 			}
474 		} else {
475 			c = NEXT;
476 			SKIP;
477 		}
478 		SEE_string_append_unicode(s, c);
479 	}
480 	CONSUME(quote);
481 	SEE_SET_STRING(&lex->value, s);
482 	return tSTRING;
483 }
484 
485 /*
486  * 7.8.5 Scans for a regular expression token.
487  * Assumes prev (immediately previous token) is either tDIV or tDIVEQ.
488  * Returns tREGEX on success or throws an exception on failure.
489  * The string in lex->value is of the form "/regex/flags"
490  */
491 static int
RegularExpressionLiteral(lex,prev)492 RegularExpressionLiteral(lex, prev)
493 	struct lex *lex;
494 	int prev;
495 {
496 	struct SEE_string *s;
497 	int incc = 0;
498 	struct SEE_interpreter *interp = lex->input->interpreter;
499 
500 	s = SEE_string_new(interp, 0);
501 	SEE_string_addch(s, '/');
502 	if (prev == tDIVEQ)
503 		SEE_string_addch(s, '=');
504 	while (!ATEOF) {
505 		if (NEXT == '/' &&
506 		    (!incc || !(SEE_GET_JS_COMPAT(interp))))	/* EXT:15 */
507 			break;
508 		if (NEXT == '\\') {
509 			SEE_string_addch(s, '\\');
510 			SKIP;
511 			if (ATEOF) break;
512 		} else {
513 			/* Track charclasses for JS_COMPAT */
514 			if (NEXT == '[') incc = 1;
515 			if (NEXT == ']') incc = 0;
516 		}
517 		if (is_LineTerminator(NEXT))
518 			SYNTAX_ERROR(STR(broken_regex));
519 		SEE_string_append_unicode(s, NEXT);
520 		SKIP;
521 	}
522 	if (ATEOF)
523 		SYNTAX_ERROR(STR(eof_in_regex));
524 	CONSUME('/');
525 
526 	SEE_string_addch(s, '/');
527 	while (!ATEOF && is_IdentifierPart(lex)) {
528 		SEE_string_append_unicode(s, NEXT);
529 		SKIP;
530 	}
531 
532 	SEE_SET_STRING(&lex->value, s);
533 	return tREGEX;
534 }
535 
/*
 * 7.8.3: Scans a numeric literal; the lookahead is a digit or '.'.
 *
 * On success the value is stored in lex->value and tNUMBER is
 * returned. If the input turns out to be a '.' with no digits on
 * either side, the dot has been consumed and '.' is returned instead
 * (it was really a punctuator).
 *
 * Three forms are recognised:
 *  - hexadecimal integers introduced by "0x" or "0X";
 *  - in JS compatibility mode, octal integers: a leading '0' followed
 *    only by digits 0-7, with no '.', 'e' or 'E' after them;
 *  - decimal literals with optional fraction and exponent, whose text
 *    is collected and converted with SEE_strtod().
 *
 * Throws a SyntaxError for a hex literal with no digits or followed
 * directly by an identifier start, and for an exponent with no digits.
 */
static int
NumericLiteral(lex)
	struct lex *lex;			/* 7.8.3 la [.0-9] */
{
	SEE_number_t n, e;
	int seendigit;
	unsigned int i;
	struct SEE_string *s;
	char *numbuf, *endstr;
	struct SEE_interpreter *interp = lex->input->interpreter;

	seendigit = 0;
	n = 0;
	s = SEE_string_new(interp, 0);	/* collects the literal's text */

	if (NEXT == '0') {
	    SKIP;
	    /* Hexadecimal integers */
	    if (!ATEOF && (NEXT == 'x' || NEXT == 'X')) {
		SKIP;
		if (ATEOF || !is_HexDigit(NEXT))
		    SYNTAX_ERROR(STR(hex_literal_detritus));
		while (!ATEOF && is_HexDigit(NEXT)) {
		    SEE_string_addch(s, (SEE_char_t)NEXT);
		    SKIP;
		}
		if (!ATEOF && is_IdentifierStart(lex))
		    SYNTAX_ERROR(STR(hex_literal_detritus));
		/* Sum the digits from least to most significant */
		e = 1;
		for (i = 0; i < s->length; i++) {
		    n += e * HexValue(s->data[s->length - i - 1]);
		    e *= 16;
		}
		SEE_SET_NUMBER(&lex->value, n);
		return tNUMBER;
	    }
	    /* Not hex: the '0' already consumed is an ordinary digit */
	    SEE_string_addch(s, '0');
	    seendigit = 1;
	}

	/* Integer part */
	while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
	    SEE_string_addch(s, (SEE_char_t)NEXT);
	    seendigit = 1;
	    SKIP;
	}

	/* Octal integers */
	if (SEE_GET_JS_COMPAT(interp)
	    && seendigit
	    && (ATEOF || (NEXT != '.' && NEXT != 'e' && NEXT != 'E'))
	    && s->length > 1
	    && s->data[0] == '0')
	{
		/* Octal integers start with 0 and dont follow with . or e */
		n = 0;
		for (i = 1; i < s->length; i++) {
		    /* An 8 or 9 means this is a decimal literal after all */
		    if (s->data[i] > '7')
			goto not_octal;
		    n = n * 8 + s->data[i] - '0';
		}
		if (!ATEOF && is_IdentifierStart(lex))
		    goto not_octal;
		SEE_SET_NUMBER(&lex->value, n);
		return tNUMBER;
	}
    not_octal:

	/* Fraction part */
	if (!ATEOF && NEXT == '.') {
	    SEE_string_addch(s, '.');
	    SKIP;
	    while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
		seendigit = 1;
	        SEE_string_addch(s, (SEE_char_t)NEXT);
		SKIP;
	    }
	}
	if (!seendigit) {
	    /* free(s) */
	    return '.';		/* Actually matched Punctuator! */
	}

	/* Exponent part: must contain at least one digit */
	if (!ATEOF && (NEXT == 'e' || NEXT == 'E')) {
	    SEE_string_addch(s, (SEE_char_t)NEXT);
	    SKIP;
	    seendigit = 0;
	    if (!ATEOF && NEXT == '-') {
	        SEE_string_addch(s, '-');
		SKIP;
	    } else if (!ATEOF && NEXT == '+') {
	        SEE_string_addch(s, '+');
		SKIP;
	    }
	    /* NOTE(review): e is assigned here but not read afterwards */
	    e = 0;
	    while (!ATEOF && '0' <= NEXT && NEXT <= '9') {
		seendigit = 1;
	        SEE_string_addch(s, (SEE_char_t)NEXT);
		SKIP;
	    }
	    if (!seendigit)
		SYNTAX_ERROR(STR(dec_literal_detritus));
	}

	/* Narrow the collected text to a C string and let SEE_strtod parse it */
	numbuf = SEE_STRING_ALLOCA(interp, char, s->length + 1);
	for (i = 0; i < s->length; i++)
		numbuf[i] = s->data[i] & 0x7f;
	numbuf[i] = '\0';
	endstr = NULL;
	n = SEE_strtod(numbuf, &endstr);
	if (!endstr || *endstr) 		/* impossible condition? */
		SYNTAX_ERROR(STR(dec_literal_detritus));
	SEE_SET_NUMBER(&lex->value, n);
	return tNUMBER;
}
648 
649 static int
CommentDiv(lex)650 CommentDiv(lex)
651 	struct lex *lex;			/* 7.4 la / */
652 {
653 	SEE_unicode_t lookahead[2];
654 	int lookahead_len;
655 
656 	lookahead_len = LOOKAHEAD(lookahead, 2);
657 
658 	if (lookahead_len >= 2 && lookahead[0] == '/' && lookahead[1] == '*') {
659 		int starprev = 0, contains_newline = 0;
660 		SKIP;
661 		SKIP;
662 		while (!ATEOF) {
663 			if (starprev && NEXT == '/') {
664 			    CONSUME('/');
665 			    return contains_newline
666 				? tLINETERMINATOR
667 				: tCOMMENT;
668 			}
669 			if (is_LineTerminator(NEXT)) {
670 			    (void)LineTerminator(lex);
671 			    contains_newline = 1;
672 			    starprev = 0;
673 			} else {
674 			    starprev = (NEXT == '*');
675 			    SKIP;
676 			}
677 		}
678 		SYNTAX_ERROR(STR(eof_in_c_comment));
679 	}
680 	if (lookahead_len >= 2 && lookahead[0] == '/' && lookahead[1] == '/')
681 		return SkipToEndOfLine(lex);
682 
683 	/*
684 	 * NB: This assumes regular expressions not wanted,
685 	 * and that the rest of the regex can be scanned later
686 	 * if the parser wants it.
687 	 */
688 	return DivPunctuator(lex);
689 }
690 
691 static int
Token(lex)692 Token(lex)
693 	struct lex *lex;				/* 7.5 */
694 {
695 	struct SEE_interpreter *interp = lex->input->interpreter;
696 
697 	if (ATEOF)
698 		return tEND;
699 
700 	if (NEXT == '\'' || NEXT == '\"')
701 		return StringLiteral(lex);
702 
703 	if ((NEXT >= '0' && NEXT <= '9') || NEXT == '.')
704 		return NumericLiteral(lex);
705 
706 	if (is_IdentifierStart(lex)) {
707 		int hasescape = 0, i;
708 		struct SEE_string *s;
709 		SEE_unicode_t c;
710 
711 		s = SEE_string_new(interp, 0);
712 		do {
713 			if (is_UnicodeEscape(lex)) {
714 				c = UnicodeEscape(lex);
715 				if (s->length == 0) {
716 				    if (!UNICODE_IS_IS(c))
717 					SYNTAX_ERROR(STR(bad_unicode_ident));
718 				} else
719 				    if (!UNICODE_IS_IP(c))
720 					SYNTAX_ERROR(STR(bad_unicode_ident));
721 				hasescape = 1;
722 			} else  {
723 				c = NEXT;
724 				SKIP;
725 			}
726 			SEE_string_append_unicode(s, c);
727 		} while (is_IdentifierPart(lex));
728 
729 		/* match keywords */
730 		if (!hasescape)
731 		    for (i = 0; i < SEE_tok_nkeywords; i++) {
732 			const struct SEE_string *keyword;
733 
734 			keyword = STRn(SEE_tok_keywords[i].index);
735 			if (keyword->length == s->length &&
736 		            SEE_string_cmp(keyword, s) == 0)
737 			{
738 			    int token = SEE_tok_keywords[i].token;
739 			    if (token == tRESERVED &&
740 /* EXT:3 */			SEE_COMPAT_JS(interp, >=, JS11))
741 			    {
742 #ifndef NDEBUG
743 				dprintf("Warning: line %d: reserved token '",
744 				    lex->next_lineno);
745 				dprints(s);
746 				dprintf("' treated as identifier\n");
747 #endif
748 			        break;
749 			    }
750 			    return token;
751 			}
752 		     }
753 
754 		SEE_intern_and_free(interp, &s);
755 		SEE_SET_STRING(&lex->value, s);
756 		return tIDENT;
757 	}
758 
759 	return Punctuator(lex);
760 }
761 
762 
763 
764 /*
765  * Scanner grammar goal. Scans lex->input for a token, and returns it.
766  *
767  * May return multiple tLINETERMINATORs, but will never return tCOMMENT.
768  * Scans the InputElementDiv production (never InputElementRegex).
769  * If this function returns tDIV or tDIVEQ, and a regular expression is wanted,
770  * then SEE_lex_regex() should be called immediately.
771  */
772 static int
lex0(lex)773 lex0(lex)
774 	struct lex *lex;
775 {
776 	int ret;
777 
778     again:
779 
780 	while (!ATEOF && is_WhiteSpace(NEXT) && !is_LineTerminator(NEXT))
781 		SKIP;			/* skip non-newline whitespace */
782 	if (ATEOF)
783 		return tEND;
784 	if (is_LineTerminator(NEXT))
785 		return LineTerminator(lex);
786 
787 	switch (NEXT) {
788 	case '/':
789 		ret = CommentDiv(lex);
790 		if (ret == tCOMMENT)
791 			goto again;	/* Discard tCOMMENTs */
792 		return ret;
793 	case '\"':
794 	case '\'':
795 		return StringLiteral(lex);
796 	case '0': case '1': case '2': case '3': case '4':
797 	case '5': case '6': case '7': case '8': case '9':
798 		return NumericLiteral(lex);
799 	case '.':
800 	    {
801 		SEE_unicode_t lookahead[2];
802 		int lookahead_len;
803 
804 		lookahead_len = LOOKAHEAD(lookahead, 2);
805 		if (lookahead_len >= 2
806 		 && lookahead[1] >= '0'
807 		 && lookahead[1] <= '9')
808 			return NumericLiteral(lex);
809 		SKIP;
810 		return '.';
811 	    }
812 	default:
813 		return Token(lex);
814 	}
815 }
816 
817 /*------------------------------------------------------------
818  * Public API
819  */
820 
821 /*
822  * Initialises a tokenizer structure
823  */
824 void
SEE_lex_init(lex,inp)825 SEE_lex_init(lex, inp)
826 	struct lex *lex;
827 	struct SEE_input *inp;
828 {
829 	lex->input = inp;
830 	SEE_SET_UNDEFINED(&lex->value);
831 	lex->next_lineno = inp->first_lineno;
832 	lex->next_filename = SEE_intern(inp->interpreter, inp->filename);
833 	lex->next_at_bol = 1;
834 	(void)SEE_lex_next(lex);
835 }
836 
837 /*
838  * Main interface to the lexical anaylser.
839  *
840  * We keep a one-token lookahead.
841  * Each call to this function generates a new lookahead token
842  * (in lex->next) and returns the previous one, so
843  * the lex flags apply to the scanning of the NEXT token,
844  * and NOT to the token being returned. (ie The caller should
845  * generally refer to the resulting lex->next to make
846  * decisions. The value returned is merely a convenience.)
847  *
848  * On return, this function also sets (or clears) the
849  * lex->next_follows_nl flag when a newline is seen immediately
850  * before lex->next. The parser should use this information to
851  * perform automatic semicolon insertion. Note that the defined
852  * tLINETERMINATOR token is an internal scanner pseudo-token and
853  * is never returned by this function. Use the next_follows_nl flag.
854  *
855  * As a special case, if end-of-file (tEND) does not follow
856  * a line terminator, then this function pretends that it does.
857  *
858  * The lex->next_lineno field reflects the line number of
859  * lex->next.
860  */
861 int
SEE_lex_next(lex)862 SEE_lex_next(lex)
863 	struct lex *lex;
864 {
865 	int next, token;
866 
867 	lex->next_follows_nl = 0;
868 	next = lex->next;
869 
870 	token = lex0(lex);
871 	while (token == tLINETERMINATOR) {
872 #ifndef NDEBUG
873 		if (SEE_lex_debug && !lex->next_follows_nl)
874 		    dprintf("lex: [LINETERMINATOR]\n");
875 
876 #endif
877 		lex->next_follows_nl = 1;
878 		lex->next_at_bol = 1;
879 		token = lex0(lex);
880 	}
881 	lex->next_at_bol = 0;
882 
883 	if (token == tEND)
884 		lex->next_follows_nl = 1;
885 	lex->next = token;
886 
887 #ifndef NDEBUG
888 	if (SEE_lex_debug)
889 	    switch (lex->next) {
890 	    case tIDENT:
891 		  dprintf("lex: tIDENT ");
892 		  dprintv(lex->input->interpreter, &lex->value);
893 		  dprintf("\n"); break;
894 	    case tSTRING:
895 		  dprintf("lex: tSTRING ");
896 		  dprintv(lex->input->interpreter, &lex->value);
897 		  dprintf("\n"); break;
898 	    case tNUMBER:
899 		  dprintf("lex: tNUMBER ");
900 		  dprintv(lex->input->interpreter, &lex->value);
901 		  dprintf("\n"); break;
902 	    default:
903 		  dprintf("lex: %s\n", SEE_tokenname(lex->next));
904 	}
905 #endif
906 
907 	return next;
908 }
909 
910 /*
911  * Converts the next token (just scanned) into a regular expression,
912  * if possible.
913  */
914 void
SEE_lex_regex(lex)915 SEE_lex_regex(lex)
916 	struct lex *lex;
917 {
918 	if (lex->next == tDIV || lex->next == tDIVEQ)
919 		lex->next = RegularExpressionLiteral(lex, lex->next);
920 }
921 
/*
 * 9.3.1
 * Scans a SEE_string to convert it into a number.
 * On success, sets res to the resulting number and returns non-zero.
 * Returns zero, without writing to res, if the string is not a valid
 * numeric string.
 *
 * Leading and trailing white space and line terminators are ignored.
 * An empty or all-blank string converts to +0. Otherwise the string
 * is an optional sign followed by "Infinity", a hexadecimal integer
 * ("0x"/"0X" and digits), or a decimal literal with optional fraction
 * and exponent. A signed hexadecimal string is only accepted in JS
 * compatibility mode.
 *
 * This function is called by SEE_ToNumber().
 */
int
SEE_lex_number(interp, s, res)
	struct SEE_interpreter *interp;
	struct SEE_string *s;
	struct SEE_value *res;
{
	SEE_number_t n, sign;
	int seendig, hexok;
	int len = s->length;
	int i, pos;
	int start;
	char *numbuf, *endstr;

/*
 * From here on the scanning macros walk the string s by index
 * instead of reading from a lex input stream.
 * These work because we expect no Unicode surrogates in numbers
 */
#undef ATEOF
#undef NEXT
#undef SKIP
#define ATEOF	(pos >= len)
#define NEXT	(s->data[pos])
#define SKIP	pos++

	pos = 0;

	/* StrWhiteSpace */
	while (!ATEOF && (is_WhiteSpace(NEXT) || is_LineTerminator(NEXT)))
		SKIP;

	if (ATEOF) {
		SEE_SET_NUMBER(res, 0);		/* +0 */
		return 1;
	}

	/* Optional sign; zero means that no sign was written */
	sign = 0;
	if (NEXT == '-') {
		sign = NEGATIVE;
		SKIP;
	} else if (NEXT == '+') {
		sign = POSITIVE;
		SKIP;
	}

	/* Strict ECMA262-3 hex strings require no sign. Netscape relaxes this. */
	hexok = !sign || SEE_GET_JS_COMPAT(interp);

	if (ATEOF) goto fail;
	if (NEXT == 'I') {
		/* Only the exact spelling "Infinity" is accepted */
		SKIP; if (ATEOF || NEXT != 'n') goto fail;
		SKIP; if (ATEOF || NEXT != 'f') goto fail;
		SKIP; if (ATEOF || NEXT != 'i') goto fail;
		SKIP; if (ATEOF || NEXT != 'n') goto fail;
		SKIP; if (ATEOF || NEXT != 'i') goto fail;
		SKIP; if (ATEOF || NEXT != 't') goto fail;
		SKIP; if (ATEOF || NEXT != 'y') goto fail;
		SKIP; n = SEE_Infinity;
	} else {
		n = 0;
		start = pos;	/* first character of the unsigned number */

		/* Hexadecimal */
		if (hexok && pos + 1 < len && s->data[pos] == '0' &&
			(s->data[pos+1] == 'x' || s->data[pos+1] == 'X'))
		{
		    SKIP;
		    SKIP;
		    seendig = 0;
		    while (!ATEOF && is_HexDigit(NEXT)) {
			seendig = 1;
			n = 16 * n + HexValue(NEXT);
			SKIP;
		    }
		    if (!seendig) goto fail;
		    goto out;
		}

#if 0
		/* Octal */
		if (SEE_COMPAT_JS(interp, >=, JS11) && /* EXT:4 */
		    !ATEOF && NEXT == '0' &&
		    !(pos + 1 < len && (s->data[pos+1] == '.' ||
		      s->data[pos+1] == 'e' || s->data[pos+1] == 'E')))
		{
		    SKIP;
		    n = 0;
		    while (!ATEOF && NEXT >= '0' && NEXT <= '7') {
			n = 8 * n + NEXT - '0';
			SKIP;
		    }
		    goto out;
		}
#endif

		/*
		 * After this point, we expect to use strtod, so we
		 * just check for character validity, rather than computing n.
		 */
		seendig = 0;
		while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
		    seendig = 1;
		    SKIP;
		}
		if (!ATEOF && NEXT == '.') {
		    SKIP; /* '.' */
		    while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
			seendig = 1;
			SKIP;
		    }
		}
		if (!seendig) goto fail;	/* a lone dot is illegal */
		if (!ATEOF && (NEXT == 'e' || NEXT == 'E')) {
		    SKIP;
		    if (!ATEOF && NEXT == '-') {
			SKIP;
		    } else if (!ATEOF && NEXT == '+')
			SKIP;
		    /* The exponent needs at least one digit of its own */
		    seendig = 0;
		    while (!ATEOF && NEXT >= '0' && NEXT <= '9') {
			seendig = 1;
			SKIP;
		    }
		    if (!seendig) goto fail;
		}
		/* Copy the validated span [start, pos) into a C string */
		numbuf = SEE_STRING_ALLOCA(interp, char, pos - start + 1);
		for (i = 0; i < pos - start; i++)
			numbuf[i] = s->data[i + start] & 0x7f;
		numbuf[i] = '\0';
		endstr = NULL;
		n = SEE_strtod(numbuf, &endstr);
		if (!endstr || *endstr != '\0')
			goto fail;
	}

   out:
	if (!sign) sign = POSITIVE;

	/* trailing StrWhiteSpace */
	while (!ATEOF && (is_WhiteSpace(NEXT) || is_LineTerminator(NEXT)))
		SKIP;
	/* Succeed only if the whole string was used up */
	if (ATEOF) {
	    SEE_SET_NUMBER(res, SEE_COPYSIGN(n, sign));
	    return 1;
	}

    fail:
	return 0;
}
1074 
1075