1 /*
2  *  Lexer defines.
3  */
4 
5 #if !defined(DUK_LEXER_H_INCLUDED)
6 #define DUK_LEXER_H_INCLUDED
7 
8 typedef void (*duk_re_range_callback)(void *user, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct);
9 
10 /*
11  *  A token is interpreted as any possible production of InputElementDiv
12  *  and InputElementRegExp, see E5 Section 7 in its entirety.  Note that
13  *  the E5 "Token" production does not cover all actual tokens of the
14  *  language (which is explicitly stated in the specification, Section 7.5).
15  *  Null and boolean literals are defined as part of both ReservedWord
16  *  (E5 Section 7.6.1) and Literal (E5 Section 7.8) productions.  Here,
17  *  null and boolean values have literal tokens, and are not reserved
18  *  words.
19  *
20  *  Decimal literal negative/positive sign is -not- part of DUK_TOK_NUMBER.
21  *  The number tokens always have a non-negative value.  The unary minus
22  *  operator in "-1.0" is optimized during compilation to yield a single
23  *  negative constant.
24  *
25  *  Token numbering is free except that reserved words are required to be
26  *  in a continuous range and in a particular order.  See genstrings.py.
27  */
28 
/* Thin macro wrappers for the lexer context functions: each one forwards
 * its (parenthesized) arguments to the function with the same name in
 * lowercase.
 */
#define DUK_LEXER_INITCTX(ctx)         duk_lexer_initctx((ctx))
#define DUK_LEXER_SETPOINT(ctx, pt)    duk_lexer_setpoint((ctx), (pt))
#define DUK_LEXER_GETPOINT(ctx, pt)    duk_lexer_getpoint((ctx), (pt))
34 
/* Number of entries in the lexer codepoint window.  Currently 6 characters
 * of lookahead are actually needed by the lexer (duk_lexer.c).
 */
#define DUK_LEXER_WINDOW_SIZE    6

/* Size of the codepoint buffer; only defined (and only used by the lexer
 * context) when the sliding window variant is enabled.
 */
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
#define DUK_LEXER_BUFFER_SIZE    64
#endif
40 
/* Lower bound of the token number range; same value as DUK_TOK_EOF. */
#define DUK_TOK_MINVAL        0

/* End of input.  Once EOF has been reached this token is returned again
 * on every request (an unlimited number of times).
 */
#define DUK_TOK_EOF           0

/* Identifier names (E5 Section 7.6). */
#define DUK_TOK_IDENTIFIER    1
48 
/* Reserved words: keywords.  DUK_TOK_START_RESERVED marks the first
 * reserved word token and has the same value as DUK_TOK_BREAK.
 */
#define DUK_TOK_START_RESERVED           2
#define DUK_TOK_BREAK                    2
#define DUK_TOK_CASE                     3
#define DUK_TOK_CATCH                    4
#define DUK_TOK_CONTINUE                 5
#define DUK_TOK_DEBUGGER                 6
#define DUK_TOK_DEFAULT                  7
#define DUK_TOK_DELETE                   8
#define DUK_TOK_DO                       9
#define DUK_TOK_ELSE                     10
#define DUK_TOK_FINALLY                  11
#define DUK_TOK_FOR                      12
#define DUK_TOK_FUNCTION                 13
#define DUK_TOK_IF                       14
#define DUK_TOK_IN                       15
#define DUK_TOK_INSTANCEOF               16
#define DUK_TOK_NEW                      17
#define DUK_TOK_RETURN                   18
#define DUK_TOK_SWITCH                   19
#define DUK_TOK_THIS                     20
#define DUK_TOK_THROW                    21
#define DUK_TOK_TRY                      22
#define DUK_TOK_TYPEOF                   23
#define DUK_TOK_VAR                      24
#define DUK_TOK_CONST                    25
#define DUK_TOK_VOID                     26
#define DUK_TOK_WHILE                    27
#define DUK_TOK_WITH                     28

/* Reserved words: future reserved words. */
#define DUK_TOK_CLASS                    29
#define DUK_TOK_ENUM                     30
#define DUK_TOK_EXPORT                   31
#define DUK_TOK_EXTENDS                  32
#define DUK_TOK_IMPORT                   33
#define DUK_TOK_SUPER                    34

/* "null", "true", and "false" are always reserved words.
 * Note that "get" and "set" are not!
 */
#define DUK_TOK_NULL                     35
#define DUK_TOK_TRUE                     36
#define DUK_TOK_FALSE                    37

/* Reserved words: additional future reserved words in strict mode.
 * DUK_TOK_START_STRICT_RESERVED is an inclusive lower bound and has the
 * same value as DUK_TOK_IMPLEMENTS.
 */
#define DUK_TOK_START_STRICT_RESERVED    38
#define DUK_TOK_IMPLEMENTS               38
#define DUK_TOK_INTERFACE                39
#define DUK_TOK_LET                      40
#define DUK_TOK_PACKAGE                  41
#define DUK_TOK_PRIVATE                  42
#define DUK_TOK_PROTECTED                43
#define DUK_TOK_PUBLIC                   44
#define DUK_TOK_STATIC                   45
#define DUK_TOK_YIELD                    46

/* Exclusive upper bound of the reserved word token range. */
#define DUK_TOK_END_RESERVED             47
107 
/* "get" and "set" are tokens but NOT ReservedWords.  They are currently
 * parsed as identifiers, so these defines are actually unused for now.
 * Note that DUK_TOK_GET has the same value as the exclusive upper bound
 * DUK_TOK_END_RESERVED.
 */
#define DUK_TOK_GET    47
#define DUK_TOK_SET    48
113 
/* Punctuators.  Unlike in the specification, "/" and "/=" are included
 * here as well (DUK_TOK_DIV and DUK_TOK_DIV_EQ).
 */

/* Brackets and separators. */
#define DUK_TOK_LCURLY        49
#define DUK_TOK_RCURLY        50
#define DUK_TOK_LBRACKET      51
#define DUK_TOK_RBRACKET      52
#define DUK_TOK_LPAREN        53
#define DUK_TOK_RPAREN        54
#define DUK_TOK_PERIOD        55
#define DUK_TOK_SEMICOLON     56
#define DUK_TOK_COMMA         57

/* Comparison. */
#define DUK_TOK_LT            58
#define DUK_TOK_GT            59
#define DUK_TOK_LE            60
#define DUK_TOK_GE            61
#define DUK_TOK_EQ            62
#define DUK_TOK_NEQ           63
#define DUK_TOK_SEQ           64
#define DUK_TOK_SNEQ          65

/* Arithmetic, increment and decrement. */
#define DUK_TOK_ADD           66
#define DUK_TOK_SUB           67
#define DUK_TOK_MUL           68
#define DUK_TOK_DIV           69
#define DUK_TOK_MOD           70
#define DUK_TOK_EXP           71
#define DUK_TOK_INCREMENT     72
#define DUK_TOK_DECREMENT     73

/* Shifts.  ALSHIFT is named "arithmetic" because the result is signed. */
#define DUK_TOK_ALSHIFT       74
#define DUK_TOK_ARSHIFT       75
#define DUK_TOK_RSHIFT        76

/* Bitwise and logical operators. */
#define DUK_TOK_BAND          77
#define DUK_TOK_BOR           78
#define DUK_TOK_BXOR          79
#define DUK_TOK_LNOT          80
#define DUK_TOK_BNOT          81
#define DUK_TOK_LAND          82
#define DUK_TOK_LOR           83

/* Conditional operator parts. */
#define DUK_TOK_QUESTION      84
#define DUK_TOK_COLON         85

/* Assignment and compound assignment. */
#define DUK_TOK_EQUALSIGN     86
#define DUK_TOK_ADD_EQ        87
#define DUK_TOK_SUB_EQ        88
#define DUK_TOK_MUL_EQ        89
#define DUK_TOK_DIV_EQ        90
#define DUK_TOK_MOD_EQ        91
#define DUK_TOK_EXP_EQ        92
#define DUK_TOK_ALSHIFT_EQ    93
#define DUK_TOK_ARSHIFT_EQ    94
#define DUK_TOK_RSHIFT_EQ     95
#define DUK_TOK_BAND_EQ       96
#define DUK_TOK_BOR_EQ        97
#define DUK_TOK_BXOR_EQ       98
165 
/* Literals (E5 Section 7.8).  null, true, and false are not listed here
 * because they are numbered together with the reserved words.
 */
#define DUK_TOK_NUMBER     99
#define DUK_TOK_STRING     100
#define DUK_TOK_REGEXP     101

/* Largest token number (inclusive); same value as DUK_TOK_REGEXP. */
#define DUK_TOK_MAXVAL     101

/* Invalid token marker; DUK_SMALL_UINT_MAX is defined outside this header. */
#define DUK_TOK_INVALID    DUK_SMALL_UINT_MAX
176 
/* Convert the heap string index of a reserved word into the matching
 * token number by rebasing it from DUK_STRIDX_START_RESERVED onto
 * DUK_TOK_START_RESERVED.
 */
#define DUK_STRIDX_TO_TOK(x)    (DUK_TOK_START_RESERVED + ((x) - DUK_STRIDX_START_RESERVED))

/* Sanity check: code assumes that token numbers fit into 8 bits. */
#if DUK_TOK_MAXVAL > 255
#error DUK_TOK_MAXVAL too large, code assumes it fits into 8 bits
#endif
184 
185 /* Sanity checks for string and token defines */
186 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_BREAK) != DUK_TOK_BREAK)
187 #error mismatch in token defines
188 #endif
189 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_CASE) != DUK_TOK_CASE)
190 #error mismatch in token defines
191 #endif
192 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_CATCH) != DUK_TOK_CATCH)
193 #error mismatch in token defines
194 #endif
195 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_CONTINUE) != DUK_TOK_CONTINUE)
196 #error mismatch in token defines
197 #endif
198 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_DEBUGGER) != DUK_TOK_DEBUGGER)
199 #error mismatch in token defines
200 #endif
201 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_DEFAULT) != DUK_TOK_DEFAULT)
202 #error mismatch in token defines
203 #endif
204 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_DELETE) != DUK_TOK_DELETE)
205 #error mismatch in token defines
206 #endif
207 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_DO) != DUK_TOK_DO)
208 #error mismatch in token defines
209 #endif
210 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_ELSE) != DUK_TOK_ELSE)
211 #error mismatch in token defines
212 #endif
213 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_FINALLY) != DUK_TOK_FINALLY)
214 #error mismatch in token defines
215 #endif
216 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_FOR) != DUK_TOK_FOR)
217 #error mismatch in token defines
218 #endif
219 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_LC_FUNCTION) != DUK_TOK_FUNCTION)
220 #error mismatch in token defines
221 #endif
222 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_IF) != DUK_TOK_IF)
223 #error mismatch in token defines
224 #endif
225 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_IN) != DUK_TOK_IN)
226 #error mismatch in token defines
227 #endif
228 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_INSTANCEOF) != DUK_TOK_INSTANCEOF)
229 #error mismatch in token defines
230 #endif
231 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_NEW) != DUK_TOK_NEW)
232 #error mismatch in token defines
233 #endif
234 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_RETURN) != DUK_TOK_RETURN)
235 #error mismatch in token defines
236 #endif
237 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_SWITCH) != DUK_TOK_SWITCH)
238 #error mismatch in token defines
239 #endif
240 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_THIS) != DUK_TOK_THIS)
241 #error mismatch in token defines
242 #endif
243 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_THROW) != DUK_TOK_THROW)
244 #error mismatch in token defines
245 #endif
246 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_TRY) != DUK_TOK_TRY)
247 #error mismatch in token defines
248 #endif
249 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_TYPEOF) != DUK_TOK_TYPEOF)
250 #error mismatch in token defines
251 #endif
252 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_VAR) != DUK_TOK_VAR)
253 #error mismatch in token defines
254 #endif
255 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_VOID) != DUK_TOK_VOID)
256 #error mismatch in token defines
257 #endif
258 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_WHILE) != DUK_TOK_WHILE)
259 #error mismatch in token defines
260 #endif
261 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_WITH) != DUK_TOK_WITH)
262 #error mismatch in token defines
263 #endif
264 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_CLASS) != DUK_TOK_CLASS)
265 #error mismatch in token defines
266 #endif
267 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_CONST) != DUK_TOK_CONST)
268 #error mismatch in token defines
269 #endif
270 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_ENUM) != DUK_TOK_ENUM)
271 #error mismatch in token defines
272 #endif
273 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_EXPORT) != DUK_TOK_EXPORT)
274 #error mismatch in token defines
275 #endif
276 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_EXTENDS) != DUK_TOK_EXTENDS)
277 #error mismatch in token defines
278 #endif
279 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_IMPORT) != DUK_TOK_IMPORT)
280 #error mismatch in token defines
281 #endif
282 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_SUPER) != DUK_TOK_SUPER)
283 #error mismatch in token defines
284 #endif
285 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_LC_NULL) != DUK_TOK_NULL)
286 #error mismatch in token defines
287 #endif
288 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_TRUE) != DUK_TOK_TRUE)
289 #error mismatch in token defines
290 #endif
291 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_FALSE) != DUK_TOK_FALSE)
292 #error mismatch in token defines
293 #endif
294 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_IMPLEMENTS) != DUK_TOK_IMPLEMENTS)
295 #error mismatch in token defines
296 #endif
297 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_INTERFACE) != DUK_TOK_INTERFACE)
298 #error mismatch in token defines
299 #endif
300 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_LET) != DUK_TOK_LET)
301 #error mismatch in token defines
302 #endif
303 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_PACKAGE) != DUK_TOK_PACKAGE)
304 #error mismatch in token defines
305 #endif
306 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_PRIVATE) != DUK_TOK_PRIVATE)
307 #error mismatch in token defines
308 #endif
309 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_PROTECTED) != DUK_TOK_PROTECTED)
310 #error mismatch in token defines
311 #endif
312 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_PUBLIC) != DUK_TOK_PUBLIC)
313 #error mismatch in token defines
314 #endif
315 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_STATIC) != DUK_TOK_STATIC)
316 #error mismatch in token defines
317 #endif
318 #if (DUK_STRIDX_TO_TOK(DUK_STRIDX_YIELD) != DUK_TOK_YIELD)
319 #error mismatch in token defines
320 #endif
321 
/* Regexp tokens. */
#define DUK_RETOK_EOF                              0
#define DUK_RETOK_DISJUNCTION                      1
#define DUK_RETOK_QUANTIFIER                       2

/* Assertions. */
#define DUK_RETOK_ASSERT_START                     3
#define DUK_RETOK_ASSERT_END                       4
#define DUK_RETOK_ASSERT_WORD_BOUNDARY             5
#define DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY         6
#define DUK_RETOK_ASSERT_START_POS_LOOKAHEAD       7
#define DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD       8

/* Atoms. */
#define DUK_RETOK_ATOM_PERIOD                      9
#define DUK_RETOK_ATOM_CHAR                        10

/* The regexp compiler makes assumptions about the following six defines
 * (DUK_RETOK_ATOM_DIGIT .. DUK_RETOK_ATOM_NOT_WORD_CHAR), so don't change
 * them without checking the compiler.
 */
#define DUK_RETOK_ATOM_DIGIT                       11
#define DUK_RETOK_ATOM_NOT_DIGIT                   12
#define DUK_RETOK_ATOM_WHITE                       13
#define DUK_RETOK_ATOM_NOT_WHITE                   14
#define DUK_RETOK_ATOM_WORD_CHAR                   15
#define DUK_RETOK_ATOM_NOT_WORD_CHAR               16

#define DUK_RETOK_ATOM_BACKREFERENCE               17
#define DUK_RETOK_ATOM_START_CAPTURE_GROUP         18
#define DUK_RETOK_ATOM_START_NONCAPTURE_GROUP      19
#define DUK_RETOK_ATOM_START_CHARCLASS             20
#define DUK_RETOK_ATOM_START_CHARCLASS_INVERTED    21
#define DUK_RETOK_ATOM_END_GROUP                   22
346 
/* Constant for duk_lexer_ctx.buf, the temp accumulation buffer.
 * NOTE(review): the unit and how the limit is applied are not visible in
 * this header; confirm against the lexer implementation.
 */
#define DUK_LEXER_TEMP_BUF_LIMIT    256
349 
350 /* A token value.  Can be memcpy()'d, but note that slot1/slot2 values are on the valstack.
351  * Some fields (like num, str1, str2) are only valid for specific token types and may have
352  * stale values otherwise.
353  */
354 struct duk_token {
355 	duk_small_uint_t t;           /* token type (with reserved word identification) */
356 	duk_small_uint_t t_nores;     /* token type (with reserved words as DUK_TOK_IDENTIFER) */
357 	duk_double_t num;             /* numeric value of token */
358 	duk_hstring *str1;            /* string 1 of token (borrowed, stored to ctx->slot1_idx) */
359 	duk_hstring *str2;            /* string 2 of token (borrowed, stored to ctx->slot2_idx) */
360 	duk_size_t start_offset;      /* start byte offset of token in lexer input */
361 	duk_int_t start_line;         /* start line of token (first char) */
362 	duk_int_t num_escapes;        /* number of escapes and line continuations (for directive prologue) */
363 	duk_bool_t lineterm;          /* token was preceded by a lineterm */
364 	duk_bool_t allow_auto_semi;   /* token allows automatic semicolon insertion (eof or preceded by newline) */
365 };
366 
367 #define DUK_RE_QUANTIFIER_INFINITE         ((duk_uint32_t) 0xffffffffUL)
368 
369 /* A regexp token value. */
370 struct duk_re_token {
371 	duk_small_uint_t t;          /* token type */
372 	duk_small_uint_t greedy;
373 	duk_uint32_t num;            /* numeric value (character, count) */
374 	duk_uint32_t qmin;
375 	duk_uint32_t qmax;
376 };
377 
378 /* A structure for 'snapshotting' a point for rewinding */
379 struct duk_lexer_point {
380 	duk_size_t offset;
381 	duk_int_t line;
382 };
383 
384 /* Lexer codepoint with additional info like offset/line number */
385 struct duk_lexer_codepoint {
386 	duk_codepoint_t codepoint;
387 	duk_size_t offset;
388 	duk_int_t line;
389 };
390 
391 /* Lexer context.  Same context is used for ECMAScript and Regexp parsing. */
392 struct duk_lexer_ctx {
393 #if defined(DUK_USE_LEXER_SLIDING_WINDOW)
394 	duk_lexer_codepoint *window; /* unicode code points, window[0] is always next, points to 'buffer' */
395 	duk_lexer_codepoint buffer[DUK_LEXER_BUFFER_SIZE];
396 #else
397 	duk_lexer_codepoint window[DUK_LEXER_WINDOW_SIZE]; /* unicode code points, window[0] is always next */
398 #endif
399 
400 	duk_hthread *thr;                              /* thread; minimizes argument passing */
401 
402 	const duk_uint8_t *input;                      /* input string (may be a user pointer) */
403 	duk_size_t input_length;                       /* input byte length */
404 	duk_size_t input_offset;                       /* input offset for window leading edge (not window[0]) */
405 	duk_int_t input_line;                          /* input linenumber at input_offset (not window[0]), init to 1 */
406 
407 	duk_idx_t slot1_idx;                           /* valstack slot for 1st token value */
408 	duk_idx_t slot2_idx;                           /* valstack slot for 2nd token value */
409 	duk_idx_t buf_idx;                             /* valstack slot for temp buffer */
410 	duk_hbuffer_dynamic *buf;                      /* temp accumulation buffer */
411 	duk_bufwriter_ctx bw;                          /* bufwriter for temp accumulation */
412 
413 	duk_int_t token_count;                         /* number of tokens parsed */
414 	duk_int_t token_limit;                         /* maximum token count before error (sanity backstop) */
415 
416 	duk_small_uint_t flags;                        /* lexer flags, use compiler flag defines for now */
417 };
418 
419 /*
420  *  Prototypes
421  */
422 
423 DUK_INTERNAL_DECL void duk_lexer_initctx(duk_lexer_ctx *lex_ctx);
424 
425 DUK_INTERNAL_DECL void duk_lexer_getpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt);
426 DUK_INTERNAL_DECL void duk_lexer_setpoint(duk_lexer_ctx *lex_ctx, duk_lexer_point *pt);
427 
428 DUK_INTERNAL_DECL
429 void duk_lexer_parse_js_input_element(duk_lexer_ctx *lex_ctx,
430                                       duk_token *out_token,
431                                       duk_bool_t strict_mode,
432                                       duk_bool_t regexp_mode);
433 #if defined(DUK_USE_REGEXP_SUPPORT)
434 DUK_INTERNAL_DECL void duk_lexer_parse_re_token(duk_lexer_ctx *lex_ctx, duk_re_token *out_token);
435 DUK_INTERNAL_DECL void duk_lexer_parse_re_ranges(duk_lexer_ctx *lex_ctx, duk_re_range_callback gen_range, void *userdata);
436 #endif  /* DUK_USE_REGEXP_SUPPORT */
437 
438 #endif  /* DUK_LEXER_H_INCLUDED */
439