1 /*
2 * Regexp compilation.
3 *
4 * See doc/regexp.rst for a discussion of the compilation approach and
5 * current limitations.
6 *
7 * Regexp bytecode assumes jumps can be expressed with signed 32-bit
8 * integers. Consequently the bytecode size must not exceed 0x7fffffffL.
9 * The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
10 * in many places. Although this could be changed, the bytecode format
11 * limit would still prevent regexps exceeding the signed 32-bit limit
12 * from working.
13 *
14 * XXX: The implementation does not prevent bytecode from exceeding the
15 * maximum supported size. This could be done by limiting the maximum
16 * input string size (assuming an upper bound can be computed for number
17 * of bytecode bytes emitted per input byte) or checking buffer maximum
18 * size when emitting bytecode (slower).
19 */
20
21 #include "duk_internal.h"
22
23 #if defined(DUK_USE_REGEXP_SUPPORT)
24
25 /*
26 * Helper macros
27 */
28
29 #define DUK__RE_INITIAL_BUFSIZE 64
30
31 #define DUK__RE_BUFLEN(re_ctx) \
32 DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)
33
34 /*
35 * Disjunction struct: result of parsing a disjunction
36 */
37
38 typedef struct {
39 /* Number of characters that the atom matches (e.g. 3 for 'abc'),
40 * -1 if atom is complex and number of matched characters either
41 * varies or is not known.
42 */
43 duk_int32_t charlen;
44
45 #if 0
46 /* These are not needed to implement quantifier capture handling,
47 * but might be needed at some point.
48 */
49
50 /* re_ctx->captures at start and end of atom parsing.
51 * Since 'captures' indicates highest capture number emitted
52 * so far in a DUK_REOP_SAVE, the captures numbers saved by
53 * the atom are: ]start_captures,end_captures].
54 */
55 duk_uint32_t start_captures;
56 duk_uint32_t end_captures;
57 #endif
58 } duk__re_disjunction_info;
59
60 /*
61 * Encoding helpers
62 *
63 * Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
64 * even though the buffer operations will use duk_size_t.
65 */
66
67 /* XXX: the insert helpers should ensure that the bytecode result is not
68 * larger than expected (or at least assert for it). Many things in the
69 * bytecode, like skip offsets, won't work correctly if the bytecode is
70 * larger than say 2G.
71 */
72
duk__encode_i32(duk_int32_t x)73 DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
74 if (x < 0) {
75 return ((duk_uint32_t) (-x)) * 2 + 1;
76 } else {
77 return ((duk_uint32_t) x) * 2;
78 }
79 }
80
81 /* XXX: return type should probably be duk_size_t, or explicit checks are needed for
82 * maximum size.
83 */
duk__insert_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t x)84 DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
85 duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
86 duk_small_int_t len;
87
88 len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
89 DUK_ASSERT(len >= 0);
90 DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, (duk_size_t) len);
91 return (duk_uint32_t) len;
92 }
93
duk__append_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t x)94 DUK_LOCAL void duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
95 DUK_BW_WRITE_ENSURE_XUTF8(re_ctx->thr, &re_ctx->bw, x);
96 }
97
duk__append_7bit(duk_re_compiler_ctx * re_ctx,duk_uint32_t x)98 DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
99 #if defined(DUK_USE_PREFER_SIZE)
100 duk__append_u32(re_ctx, x);
101 #else
102 DUK_ASSERT(x <= 0x7fU);
103 DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);
104 #endif
105 }
106
107 #if 0
108 DUK_LOCAL void duk__append_2bytes(duk_re_compiler_ctx *re_ctx, duk_uint8_t x, duk_uint8_t y) {
109 DUK_BW_WRITE_ENSURE_U8_2(re_ctx->thr, &re_ctx->bw, x, y);
110 }
111 #endif
112
duk__insert_i32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t x)113 DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
114 return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
115 }
116
duk__append_reop(duk_re_compiler_ctx * re_ctx,duk_uint32_t reop)117 DUK_LOCAL void duk__append_reop(duk_re_compiler_ctx *re_ctx, duk_uint32_t reop) {
118 DUK_ASSERT(reop <= 0x7fU);
119 (void) duk__append_7bit(re_ctx, reop);
120 }
121
122 #if 0 /* unused */
123 DUK_LOCAL void duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
124 duk__append_u32(re_ctx, duk__encode_i32(x));
125 }
126 #endif
127
128 /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
duk__append_u16_list(duk_re_compiler_ctx * re_ctx,const duk_uint16_t * values,duk_uint32_t count)129 DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
130 /* Call sites don't need the result length so it's not accumulated. */
131 while (count-- > 0) {
132 duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
133 }
134 }
135
duk__insert_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t data_offset,duk_uint32_t data_length)136 DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {
137 DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);
138 }
139
duk__append_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)140 DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
141 DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
142 }
143
duk__remove_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)144 DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
145 DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
146 }
147
148 /*
149 * Insert a jump offset at 'offset' to complete an instruction
150 * (the jump offset is always the last component of an instruction).
151 * The 'skip' argument must be computed relative to 'offset',
152 * -without- taking into account the skip field being inserted.
153 *
154 * ... A B C ins X Y Z ... (ins may be a JUMP, SPLIT1/SPLIT2, etc)
155 * => ... A B C ins SKIP X Y Z
156 *
157 * Computing the final (adjusted) skip value, which is relative to the
158 * first byte of the next instruction, is a bit tricky because of the
159 * variable length UTF-8 encoding. See doc/regexp.rst for discussion.
160 */
duk__insert_jump_offset(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t skip)161 DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {
162 #if 0
163 /* Iterative solution. */
164 if (skip < 0) {
165 duk_small_int_t len;
166 /* two encoding attempts suffices */
167 len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip));
168 len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len));
169 DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len); /* no change */
170 skip -= (duk_int32_t) len;
171 }
172 #endif
173
174 #if defined(DUK_USE_PREFER_SIZE)
175 /* Closed form solution, this produces smallest code.
176 * See re_neg_jump_offset (closed2).
177 */
178 if (skip < 0) {
179 skip--;
180 if (skip < -0x3fL) {
181 skip--;
182 }
183 if (skip < -0x3ffL) {
184 skip--;
185 }
186 if (skip < -0x7fffL) {
187 skip--;
188 }
189 if (skip < -0xfffffL) {
190 skip--;
191 }
192 if (skip < -0x1ffffffL) {
193 skip--;
194 }
195 if (skip < -0x3fffffffL) {
196 skip--;
197 }
198 }
199 #else /* DUK_USE_PREFER_SIZE */
200 /* Closed form solution, this produces fastest code.
201 * See re_neg_jump_offset (closed1).
202 */
203 if (skip < 0) {
204 if (skip >= -0x3eL) {
205 skip -= 1;
206 } else if (skip >= -0x3fdL) {
207 skip -= 2;
208 } else if (skip >= -0x7ffcL) {
209 skip -= 3;
210 } else if (skip >= -0xffffbL) {
211 skip -= 4;
212 } else if (skip >= -0x1fffffaL) {
213 skip -= 5;
214 } else if (skip >= -0x3ffffff9L) {
215 skip -= 6;
216 } else {
217 skip -= 7;
218 }
219 }
220 #endif /* DUK_USE_PREFER_SIZE */
221
222 return duk__insert_i32(re_ctx, offset, skip);
223 }
224
duk__append_jump_offset(duk_re_compiler_ctx * re_ctx,duk_int32_t skip)225 DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {
226 return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip);
227 }
228
229 /*
230 * duk_re_range_callback for generating character class ranges.
231 *
232 * When ignoreCase is false, the range is simply emitted as is. We don't,
233 * for instance, eliminate duplicates or overlapping ranges in a character
234 * class.
235 *
236 * When ignoreCase is true but the 'direct' flag is set, the caller knows
237 * that the range canonicalizes to itself for case insensitive matching,
238 * so the range is emitted as is. This is mainly useful for built-in ranges
239 * like \W.
240 *
241 * Otherwise, when ignoreCase is true, the range needs to be normalized
242 * through canonicalization. Unfortunately a canonicalized version of a
243 * continuous range is not necessarily continuous (e.g. [x-{] is continuous
244 * but [X-{] is not). As a result, a single input range may expand to a lot
245 * of output ranges. The current algorithm creates the canonicalized ranges
246 * footprint efficiently at the cost of compile time execution time; see
247 * doc/regexp.rst for discussion, and some more details below.
248 *
249 * Note that the ctx->nranges is a context-wide temporary value. This is OK
250 * because there cannot be multiple character classes being parsed
251 * simultaneously.
252 *
253 * More detail on canonicalization:
254 *
255 * Conceptually, a range is canonicalized by scanning the entire range,
256 * normalizing each codepoint by converting it to uppercase, and generating
257 * a set of result ranges.
258 *
259 * Ideally a minimal set of output ranges would be emitted by merging all
260 * possible ranges even if they're emitted out of sequence. Because the
261 * input string is also case normalized during matching, some codepoints
262 * never occur at runtime; these "don't care" codepoints can be included or
263 * excluded from ranges when merging/optimizing ranges.
264 *
265 * The current algorithm does not do optimal range merging. Rather, output
266 * codepoints are generated in sequence, and when the output codepoints are
267 * continuous (CP, CP+1, CP+2, ...), they are merged locally into as large a
268 * range as possible. A small canonicalization bitmap is used to reduce
269 * actual codepoint canonicalizations which are quite slow at present. The
270 * bitmap provides a "codepoint block is continuous with respect to
271 * canonicalization" for N-codepoint blocks. This allows blocks to be
272 * skipped quickly.
273 *
274 * There are a number of shortcomings and future work here:
275 *
276 * - Individual codepoint normalizations are slow because they involve
277 * walking bit-packed rules without a lookup index.
278 *
279 * - The conceptual algorithm needs to canonicalize every codepoint in the
280 * input range to figure out the output range(s). Even with the small
281 * canonicalization bitmap the algorithm runs quite slowly for worst case
282 * inputs. There are many data structure alternatives to improve this.
283 *
284 * - While the current algorithm generates maximal output ranges when the
285 * output codepoints are emitted linearly, output ranges are not sorted or
286 * merged otherwise. In the worst case a lot of ranges are emitted when
287 * most of the ranges could be merged. In this process one could take
288 * advantage of "don't care" codepoints, which are never matched against at
289 * runtime due to canonicalization of input codepoints before comparison,
290 * to merge otherwise discontinuous output ranges.
291 *
292 * - The runtime data structure is just a linear list of ranges to match
293 * against. This can be quite slow if there are a lot of output ranges.
294 * There are various ways to make matching against the ranges faster,
295 * e.g. sorting the ranges and using a binary search; skip lists; tree
296 * based representations; full or approximate codepoint bitmaps, etc.
297 *
298 * - Only BMP is supported, codepoints above BMP are assumed to canonicalize
299 * to themselves. For now this is one place where we don't want to
300 * support chars outside the BMP, because the exhaustive search would be
301 * massively larger. It would be possible to support non-BMP with a
302 * different algorithm, or perhaps doing case normalization only at match
303 * time.
304 */
305
duk__regexp_emit_range(duk_re_compiler_ctx * re_ctx,duk_codepoint_t r1,duk_codepoint_t r2)306 DUK_LOCAL void duk__regexp_emit_range(duk_re_compiler_ctx *re_ctx, duk_codepoint_t r1, duk_codepoint_t r2) {
307 DUK_ASSERT(r2 >= r1);
308 duk__append_u32(re_ctx, (duk_uint32_t) r1);
309 duk__append_u32(re_ctx, (duk_uint32_t) r2);
310 re_ctx->nranges++;
311 }
312
313 #if defined(DUK_USE_REGEXP_CANON_BITMAP)
314 /* Find next canonicalization discontinuity (conservative estimate) starting
315 * from 'start', not exceeding 'end'. If continuity is fine up to 'end'
316 * inclusive, returns end. Minimum possible return value is start.
317 */
duk__re_canon_next_discontinuity(duk_codepoint_t start,duk_codepoint_t end)318 DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
319 duk_uint_t start_blk;
320 duk_uint_t end_blk;
321 duk_uint_t blk;
322 duk_uint_t offset;
323 duk_uint8_t mask;
324
325 /* Inclusive block range. */
326 DUK_ASSERT(start >= 0);
327 DUK_ASSERT(end >= 0);
328 DUK_ASSERT(end >= start);
329 start_blk = (duk_uint_t) (start >> DUK_CANON_BITMAP_BLKSHIFT);
330 end_blk = (duk_uint_t) (end >> DUK_CANON_BITMAP_BLKSHIFT);
331
332 for (blk = start_blk; blk <= end_blk; blk++) {
333 offset = blk >> 3;
334 mask = 1U << (blk & 0x07);
335 if (offset >= sizeof(duk_unicode_re_canon_bitmap)) {
336 /* Reached non-BMP range which is assumed continuous. */
337 return end;
338 }
339 DUK_ASSERT(offset < sizeof(duk_unicode_re_canon_bitmap));
340 if ((duk_unicode_re_canon_bitmap[offset] & mask) == 0) {
341 /* Block is discontinuous, continuity is guaranteed
342 * only up to end of previous block (+1 for exclusive
343 * return value => start of current block). Start
344 * block requires special handling.
345 */
346 if (blk > start_blk) {
347 return (duk_codepoint_t) (blk << DUK_CANON_BITMAP_BLKSHIFT);
348 } else {
349 return start;
350 }
351 }
352 }
353 DUK_ASSERT(blk == end_blk + 1); /* Reached end block which is continuous. */
354 return end;
355 }
356 #else /* DUK_USE_REGEXP_CANON_BITMAP */
duk__re_canon_next_discontinuity(duk_codepoint_t start,duk_codepoint_t end)357 DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
358 DUK_ASSERT(start >= 0);
359 DUK_ASSERT(end >= 0);
360 DUK_ASSERT(end >= start);
361 if (start >= 0x10000) {
362 /* Even without the bitmap, treat non-BMP as continuous. */
363 return end;
364 }
365 return start;
366 }
367 #endif /* DUK_USE_REGEXP_CANON_BITMAP */
368
duk__regexp_generate_ranges(void * userdata,duk_codepoint_t r1,duk_codepoint_t r2,duk_bool_t direct)369 DUK_LOCAL void duk__regexp_generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
370 duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;
371 duk_codepoint_t r_start;
372 duk_codepoint_t r_end;
373 duk_codepoint_t i;
374 duk_codepoint_t t;
375 duk_codepoint_t r_disc;
376
377 DUK_DD(DUK_DDPRINT("duk__regexp_generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
378 (void *) re_ctx, (long) r1, (long) r2, (long) direct));
379
380 DUK_ASSERT(r2 >= r1); /* SyntaxError for out of order range. */
381
382 if (direct || (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) == 0) {
383 DUK_DD(DUK_DDPRINT("direct or not case sensitive, emit range: [%ld,%ld]", (long) r1, (long) r2));
384 duk__regexp_emit_range(re_ctx, r1, r2);
385 return;
386 }
387
388 DUK_DD(DUK_DDPRINT("case sensitive, process range: [%ld,%ld]", (long) r1, (long) r2));
389
390 r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
391 r_end = r_start;
392
393 for (i = r1 + 1; i <= r2;) {
394 /* Input codepoint space processed up to i-1, and
395 * current range in r_{start,end} is up-to-date
396 * (inclusive) and may either break or continue.
397 */
398 r_disc = duk__re_canon_next_discontinuity(i, r2);
399 DUK_ASSERT(r_disc >= i);
400 DUK_ASSERT(r_disc <= r2);
401
402 r_end += r_disc - i; /* May be zero. */
403 t = duk_unicode_re_canonicalize_char(re_ctx->thr, r_disc);
404 if (t == r_end + 1) {
405 /* Not actually a discontinuity, continue range
406 * to r_disc and recheck.
407 */
408 r_end = t;
409 } else {
410 duk__regexp_emit_range(re_ctx, r_start, r_end);
411 r_start = t;
412 r_end = t;
413 }
414 i = r_disc + 1; /* Guarantees progress. */
415 }
416 duk__regexp_emit_range(re_ctx, r_start, r_end);
417
418 #if 0 /* Exhaustive search, very slow. */
419 r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
420 r_end = r_start;
421 for (i = r1 + 1; i <= r2; i++) {
422 t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);
423 if (t == r_end + 1) {
424 r_end = t;
425 } else {
426 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
427 duk__append_u32(re_ctx, (duk_uint32_t) r_start);
428 duk__append_u32(re_ctx, (duk_uint32_t) r_end);
429 re_ctx->nranges++;
430 r_start = t;
431 r_end = t;
432 }
433 }
434 DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
435 duk__append_u32(re_ctx, (duk_uint32_t) r_start);
436 duk__append_u32(re_ctx, (duk_uint32_t) r_end);
437 re_ctx->nranges++;
438 #endif
439 }
440
441 /*
442 * Parse regexp Disjunction. Most of regexp compilation happens here.
443 *
444 * Handles Disjunction, Alternative, and Term productions directly without
445 * recursion. The only constructs requiring recursion are positive/negative
446 * lookaheads, capturing parentheses, and non-capturing parentheses.
447 *
448 * The function determines whether the entire disjunction is a 'simple atom'
449 * (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
450 * returns the atom character length which is needed by the caller to keep
451 * track of its own atom character length. A disjunction with more than one
452 * alternative is never considered a simple atom (although in some cases
453 * that might be the case).
454 *
455 * Return value: simple atom character length or < 0 if not a simple atom.
456 * Appends the bytecode for the disjunction matcher to the end of the temp
457 * buffer.
458 *
459 * Regexp top level structure is:
460 *
461 * Disjunction = Term*
462 * | Term* | Disjunction
463 *
464 * Term = Assertion
465 * | Atom
466 * | Atom Quantifier
467 *
468 * An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
469 *
470 * Notes:
471 *
472 * * Tracking of the 'simple-ness' of the current atom vs. the entire
473 * disjunction are separate matters. For instance, the disjunction
474 * may be complex, but individual atoms may be simple. Furthermore,
475 * simple quantifiers are used whenever possible, even if the
476 * disjunction as a whole is complex.
477 *
478 * * The estimate of whether an atom is simple is conservative now,
479 * and it would be possible to expand it. For instance, captures
480 * cause the disjunction to be marked complex, even though captures
481 * -can- be handled by simple quantifiers with some minor modifications.
482 *
483 * * Disjunction 'tainting' as 'complex' is handled at the end of the
484 * main for loop collectively for atoms. Assertions, quantifiers,
485 * and '|' tokens need to taint the result manually if necessary.
486 * Assertions cannot add to result char length, only atoms (and
487 * quantifiers) can; currently quantifiers will taint the result
488 * as complex though.
489 */
490
491 DUK_LOCAL const duk_uint16_t * const duk__re_range_lookup1[3] = {
492 duk_unicode_re_ranges_digit,
493 duk_unicode_re_ranges_white,
494 duk_unicode_re_ranges_wordchar
495 };
496 DUK_LOCAL const duk_uint8_t duk__re_range_lookup2[3] = {
497 sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)),
498 sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)),
499 sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t))
500 };
501
duk__append_range_atom_matcher(duk_re_compiler_ctx * re_ctx,duk_small_uint_t re_op,const duk_uint16_t * ranges,duk_small_uint_t count)502 DUK_LOCAL void duk__append_range_atom_matcher(duk_re_compiler_ctx *re_ctx, duk_small_uint_t re_op, const duk_uint16_t *ranges, duk_small_uint_t count) {
503 #if 0
504 DUK_ASSERT(re_op <= 0x7fUL);
505 DUK_ASSERT(count <= 0x7fUL);
506 duk__append_2bytes(re_ctx, (duk_uint8_t) re_op, (duk_uint8_t) count);
507 #endif
508 duk__append_reop(re_ctx, re_op);
509 duk__append_7bit(re_ctx, count);
510 duk__append_u16_list(re_ctx, ranges, count * 2);
511 }
512
duk__parse_disjunction(duk_re_compiler_ctx * re_ctx,duk_bool_t expect_eof,duk__re_disjunction_info * out_atom_info)513 DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
514 duk_int32_t atom_start_offset = -1; /* negative -> no atom matched on previous round */
515 duk_int32_t atom_char_length = 0; /* negative -> complex atom */
516 duk_uint32_t atom_start_captures = re_ctx->captures; /* value of re_ctx->captures at start of atom */
517 duk_int32_t unpatched_disjunction_split = -1;
518 duk_int32_t unpatched_disjunction_jump = -1;
519 duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
520 duk_int32_t res_charlen = 0; /* -1 if disjunction is complex, char length if simple */
521 duk__re_disjunction_info tmp_disj;
522
523 DUK_ASSERT(out_atom_info != NULL);
524
525 duk_native_stack_check(re_ctx->thr);
526 if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
527 DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
528 DUK_WO_NORETURN(return;);
529 }
530 re_ctx->recursion_depth++;
531
532 #if 0
533 out_atom_info->start_captures = re_ctx->captures;
534 #endif
535
536 for (;;) {
537 /* atom_char_length, atom_start_offset, atom_start_offset reflect the
538 * atom matched on the previous loop. If a quantifier is encountered
539 * on this loop, these are needed to handle the quantifier correctly.
540 * new_atom_char_length etc are for the atom parsed on this round;
541 * they're written to atom_char_length etc at the end of the round.
542 */
543 duk_int32_t new_atom_char_length; /* char length of the atom parsed in this loop */
544 duk_int32_t new_atom_start_offset; /* bytecode start offset of the atom parsed in this loop
545 * (allows quantifiers to copy the atom bytecode)
546 */
547 duk_uint32_t new_atom_start_captures; /* re_ctx->captures at the start of the atom parsed in this loop */
548
549 duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);
550
551 DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
552 (long) re_ctx->curr_token.t,
553 (long) re_ctx->curr_token.num,
554 (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
555 (int) re_ctx->curr_token.num : (int) '?'));
556
557 /* set by atom case clauses */
558 new_atom_start_offset = -1;
559 new_atom_char_length = -1;
560 new_atom_start_captures = re_ctx->captures;
561
562 switch (re_ctx->curr_token.t) {
563 case DUK_RETOK_DISJUNCTION: {
564 /*
565 * The handling here is a bit tricky. If a previous '|' has been processed,
566 * we have a pending split1 and a pending jump (for a previous match). These
567 * need to be back-patched carefully. See docs for a detailed example.
568 */
569
570 /* patch pending jump and split */
571 if (unpatched_disjunction_jump >= 0) {
572 duk_uint32_t offset;
573
574 DUK_ASSERT(unpatched_disjunction_split >= 0);
575 offset = (duk_uint32_t) unpatched_disjunction_jump;
576 offset += duk__insert_jump_offset(re_ctx,
577 offset,
578 (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
579 /* offset is now target of the pending split (right after jump) */
580 duk__insert_jump_offset(re_ctx,
581 (duk_uint32_t) unpatched_disjunction_split,
582 (duk_int32_t) offset - unpatched_disjunction_split);
583 }
584
585 /* add a new pending split to the beginning of the entire disjunction */
586 (void) duk__insert_u32(re_ctx,
587 entry_offset,
588 DUK_REOP_SPLIT1); /* prefer direct execution */
589 unpatched_disjunction_split = (duk_int32_t) (entry_offset + 1); /* +1 for opcode */
590
591 /* add a new pending match jump for latest finished alternative */
592 duk__append_reop(re_ctx, DUK_REOP_JUMP);
593 unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
594
595 /* 'taint' result as complex */
596 res_charlen = -1;
597 break;
598 }
599 case DUK_RETOK_QUANTIFIER: {
600 if (atom_start_offset < 0) {
601 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
602 DUK_WO_NORETURN(return;);
603 }
604 if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
605 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
606 DUK_WO_NORETURN(return;);
607 }
608 if (atom_char_length >= 0) {
609 /*
610 * Simple atom
611 *
612 * If atom_char_length is zero, we'll have unbounded execution time for e.g.
613 * /()*x/.exec('x'). We can't just skip the match because it might have some
614 * side effects (for instance, if we allowed captures in simple atoms, the
615 * capture needs to happen). The simple solution below is to force the
616 * quantifier to match at most once, since the additional matches have no effect.
617 *
618 * With a simple atom there can be no capture groups, so no captures need
619 * to be reset.
620 */
621 duk_int32_t atom_code_length;
622 duk_uint32_t offset;
623 duk_uint32_t qmin, qmax;
624
625 qmin = re_ctx->curr_token.qmin;
626 qmax = re_ctx->curr_token.qmax;
627 if (atom_char_length == 0) {
628 /* qmin and qmax will be 0 or 1 */
629 if (qmin > 1) {
630 qmin = 1;
631 }
632 if (qmax > 1) {
633 qmax = 1;
634 }
635 }
636
637 duk__append_reop(re_ctx, DUK_REOP_MATCH); /* complete 'sub atom' */
638 atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (duk_size_t) atom_start_offset);
639
640 offset = (duk_uint32_t) atom_start_offset;
641 if (re_ctx->curr_token.greedy) {
642 offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
643 offset += duk__insert_u32(re_ctx, offset, qmin);
644 offset += duk__insert_u32(re_ctx, offset, qmax);
645 offset += duk__insert_u32(re_ctx, offset, (duk_uint32_t) atom_char_length);
646 offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
647 } else {
648 offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
649 offset += duk__insert_u32(re_ctx, offset, qmin);
650 offset += duk__insert_u32(re_ctx, offset, qmax);
651 offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
652 }
653 DUK_UNREF(offset); /* silence scan-build warning */
654 } else {
655 /*
656 * Complex atom
657 *
658 * The original code is used as a template, and removed at the end
659 * (this differs from the handling of simple quantifiers).
660 *
661 * NOTE: there is no current solution for empty atoms in complex
662 * quantifiers. This would need some sort of a 'progress' instruction.
663 *
664 * XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
665 */
666 duk_int32_t atom_code_length;
667 duk_uint32_t atom_copies;
668 duk_uint32_t tmp_qmin, tmp_qmax;
669
670 /* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
671 atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
672 re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
673 if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
674 DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
675 DUK_WO_NORETURN(return;);
676 }
677
678 /* wipe the capture range made by the atom (if any) */
679 DUK_ASSERT(atom_start_captures <= re_ctx->captures);
680 if (atom_start_captures != re_ctx->captures) {
681 DUK_ASSERT(atom_start_captures < re_ctx->captures);
682 DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
683 (long) atom_start_captures, (long) re_ctx->captures));
684
685 /* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
686 duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (re_ctx->captures - atom_start_captures) * 2U);
687 duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (atom_start_captures + 1) * 2);
688 duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, DUK_REOP_WIPERANGE);
689 } else {
690 DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
691 (long) atom_start_captures));
692 }
693
694 atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;
695
696 /* insert the required matches (qmin) by copying the atom */
697 tmp_qmin = re_ctx->curr_token.qmin;
698 tmp_qmax = re_ctx->curr_token.qmax;
699 while (tmp_qmin > 0) {
700 duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
701 tmp_qmin--;
702 if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
703 tmp_qmax--;
704 }
705 }
706 DUK_ASSERT(tmp_qmin == 0);
707
708 /* insert code for matching the remainder - infinite or finite */
709 if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
710 /* reuse last emitted atom for remaining 'infinite' quantifier */
711
712 if (re_ctx->curr_token.qmin == 0) {
713 /* Special case: original qmin was zero so there is nothing
714 * to repeat. Emit an atom copy but jump over it here.
715 */
716 duk__append_reop(re_ctx, DUK_REOP_JUMP);
717 duk__append_jump_offset(re_ctx, atom_code_length);
718 duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
719 }
720 if (re_ctx->curr_token.greedy) {
721 duk__append_reop(re_ctx, DUK_REOP_SPLIT2); /* prefer jump */
722 } else {
723 duk__append_reop(re_ctx, DUK_REOP_SPLIT1); /* prefer direct */
724 }
725 duk__append_jump_offset(re_ctx, -atom_code_length - 1); /* -1 for opcode */
726 } else {
727 /*
728 * The remaining matches are emitted as sequence of SPLITs and atom
729 * copies; the SPLITs skip the remaining copies and match the sequel.
730 * This sequence needs to be emitted starting from the last copy
731 * because the SPLITs are variable length due to the variable length
732 * skip offset. This causes a lot of memory copying now.
733 *
734 * Example structure (greedy, match maximum # atoms):
735 *
736 * SPLIT1 LSEQ
737 * (atom)
738 * SPLIT1 LSEQ ; <- the byte length of this instruction is needed
739 * (atom) ; to encode the above SPLIT1 correctly
740 * ...
741 * LSEQ:
742 */
743 duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
744 while (tmp_qmax > 0) {
745 duk__insert_slice(re_ctx, offset, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
746 if (re_ctx->curr_token.greedy) {
747 duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1); /* prefer direct */
748 } else {
749 duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2); /* prefer jump */
750 }
751 duk__insert_jump_offset(re_ctx,
752 offset + 1, /* +1 for opcode */
753 (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
754 tmp_qmax--;
755 }
756 }
757
758 /* remove the original 'template' atom */
759 duk__remove_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
760 }
761
762 /* 'taint' result as complex */
763 res_charlen = -1;
764 break;
765 }
766 case DUK_RETOK_ASSERT_START: {
767 duk__append_reop(re_ctx, DUK_REOP_ASSERT_START);
768 break;
769 }
770 case DUK_RETOK_ASSERT_END: {
771 duk__append_reop(re_ctx, DUK_REOP_ASSERT_END);
772 break;
773 }
774 case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
775 duk__append_reop(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
776 break;
777 }
778 case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
779 duk__append_reop(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
780 break;
781 }
782 case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
783 case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
784 duk_uint32_t offset;
785 duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
786 DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;
787
788 offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
789 duk__parse_disjunction(re_ctx, 0, &tmp_disj);
790 duk__append_reop(re_ctx, DUK_REOP_MATCH);
791
792 (void) duk__insert_u32(re_ctx, offset, opcode);
793 (void) duk__insert_jump_offset(re_ctx,
794 offset + 1, /* +1 for opcode */
795 (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
796
797 /* 'taint' result as complex -- this is conservative,
798 * as lookaheads do not backtrack.
799 */
800 res_charlen = -1;
801 break;
802 }
803 case DUK_RETOK_ATOM_PERIOD: {
804 new_atom_char_length = 1;
805 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
806 duk__append_reop(re_ctx, DUK_REOP_PERIOD);
807 break;
808 }
809 case DUK_RETOK_ATOM_CHAR: {
810 /* Note: successive characters could be joined into string matches
811 * but this is not trivial (consider e.g. '/xyz+/); see docs for
812 * more discussion.
813 *
814 * No support for \u{H+} yet. While only BMP Unicode escapes are
815 * supported for RegExps at present, 'ch' may still be a non-BMP
816 * codepoint if it is decoded straight from source text UTF-8.
817 * There's no non-BMP support yet so this is handled simply by
818 * matching the non-BMP character (which is custom behavior).
819 */
820 duk_uint32_t ch;
821
822 new_atom_char_length = 1;
823 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
824 duk__append_reop(re_ctx, DUK_REOP_CHAR);
825 ch = re_ctx->curr_token.num;
826 if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
827 ch = (duk_uint32_t) duk_unicode_re_canonicalize_char(re_ctx->thr, (duk_codepoint_t) ch);
828 }
829 duk__append_u32(re_ctx, ch);
830 break;
831 }
832 case DUK_RETOK_ATOM_DIGIT:
833 case DUK_RETOK_ATOM_NOT_DIGIT:
834 case DUK_RETOK_ATOM_WHITE:
835 case DUK_RETOK_ATOM_NOT_WHITE:
836 case DUK_RETOK_ATOM_WORD_CHAR:
837 case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
838 duk_small_uint_t re_op;
839 duk_small_uint_t idx;
840
841 new_atom_char_length = 1;
842 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
843
844 DUK_ASSERT((DUK_RETOK_ATOM_DIGIT & 0x01) != 0);
845 DUK_ASSERT((DUK_RETOK_ATOM_WHITE & 0x01) != 0);
846 DUK_ASSERT((DUK_RETOK_ATOM_WORD_CHAR & 0x01) != 0);
847 DUK_ASSERT((DUK_RETOK_ATOM_NOT_DIGIT & 0x01) == 0);
848 DUK_ASSERT((DUK_RETOK_ATOM_NOT_WHITE & 0x01) == 0);
849 DUK_ASSERT((DUK_RETOK_ATOM_NOT_WORD_CHAR & 0x01) == 0);
850 re_op = (re_ctx->curr_token.t & 0x01) ? DUK_REOP_RANGES : DUK_REOP_INVRANGES;
851
852 DUK_ASSERT(DUK_RETOK_ATOM_WHITE == DUK_RETOK_ATOM_DIGIT + 2);
853 DUK_ASSERT(DUK_RETOK_ATOM_WORD_CHAR == DUK_RETOK_ATOM_DIGIT + 4);
854 idx = (duk_small_uint_t) ((re_ctx->curr_token.t - DUK_RETOK_ATOM_DIGIT) >> 1U);
855 DUK_ASSERT(idx <= 2U); /* Assume continuous token numbers; also checks negative underflow. */
856
857 duk__append_range_atom_matcher(re_ctx, re_op, duk__re_range_lookup1[idx], duk__re_range_lookup2[idx]);
858 break;
859 }
860 case DUK_RETOK_ATOM_BACKREFERENCE: {
861 duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
862 if (backref > re_ctx->highest_backref) {
863 re_ctx->highest_backref = backref;
864 }
865 new_atom_char_length = -1; /* mark as complex */
866 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
867 duk__append_reop(re_ctx, DUK_REOP_BACKREFERENCE);
868 duk__append_u32(re_ctx, backref);
869 break;
870 }
871 case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
872 duk_uint32_t cap;
873
874 new_atom_char_length = -1; /* mark as complex (capture handling) */
875 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
876 cap = ++re_ctx->captures;
877 duk__append_reop(re_ctx, DUK_REOP_SAVE);
878 duk__append_u32(re_ctx, cap * 2);
879 duk__parse_disjunction(re_ctx, 0, &tmp_disj); /* retval (sub-atom char length) unused, tainted as complex above */
880 duk__append_reop(re_ctx, DUK_REOP_SAVE);
881 duk__append_u32(re_ctx, cap * 2 + 1);
882 break;
883 }
884 case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
885 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
886 duk__parse_disjunction(re_ctx, 0, &tmp_disj);
887 new_atom_char_length = tmp_disj.charlen;
888 break;
889 }
890 case DUK_RETOK_ATOM_START_CHARCLASS:
891 case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
892 /*
893 * Range parsing is done with a special lexer function which calls
894 * us for every range parsed. This is different from how rest of
895 * the parsing works, but avoids a heavy, arbitrary size intermediate
896 * value type to hold the ranges.
897 *
898 * Another complication is the handling of character ranges when
899 * case insensitive matching is used (see docs for discussion).
900 * The range handler callback given to the lexer takes care of this
901 * as well.
902 *
903 * Note that duplicate ranges are not eliminated when parsing character
904 * classes, so that canonicalization of
905 *
906 * [0-9a-fA-Fx-{]
907 *
908 * creates the result (note the duplicate ranges):
909 *
910 * [0-9A-FA-FX-Z{-{]
911 *
912 * where [x-{] is split as a result of canonicalization. The duplicate
913 * ranges are not a semantics issue: they work correctly.
914 */
915
916 duk_uint32_t offset;
917
918 DUK_DD(DUK_DDPRINT("character class"));
919
920 /* insert ranges instruction, range count patched in later */
921 new_atom_char_length = 1;
922 new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
923 duk__append_reop(re_ctx,
924 (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
925 DUK_REOP_RANGES : DUK_REOP_INVRANGES);
926 offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx); /* patch in range count later */
927
928 /* parse ranges until character class ends */
929 re_ctx->nranges = 0; /* note: ctx-wide temporary */
930 duk_lexer_parse_re_ranges(&re_ctx->lex, duk__regexp_generate_ranges, (void *) re_ctx);
931
932 /* insert range count */
933 duk__insert_u32(re_ctx, offset, re_ctx->nranges);
934 break;
935 }
936 case DUK_RETOK_ATOM_END_GROUP: {
937 if (expect_eof) {
938 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
939 DUK_WO_NORETURN(return;);
940 }
941 goto done;
942 }
943 case DUK_RETOK_EOF: {
944 if (!expect_eof) {
945 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
946 DUK_WO_NORETURN(return;);
947 }
948 goto done;
949 }
950 default: {
951 DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
952 DUK_WO_NORETURN(return;);
953 }
954 }
955
956 /* a complex (new) atom taints the result */
957 if (new_atom_start_offset >= 0) {
958 if (new_atom_char_length < 0) {
959 res_charlen = -1;
960 } else if (res_charlen >= 0) {
961 /* only advance if not tainted */
962 res_charlen += new_atom_char_length;
963 }
964 }
965
966 /* record previous atom info in case next token is a quantifier */
967 atom_start_offset = new_atom_start_offset;
968 atom_char_length = new_atom_char_length;
969 atom_start_captures = new_atom_start_captures;
970 }
971
972 done:
973
974 /* finish up pending jump and split for last alternative */
975 if (unpatched_disjunction_jump >= 0) {
976 duk_uint32_t offset;
977
978 DUK_ASSERT(unpatched_disjunction_split >= 0);
979 offset = (duk_uint32_t) unpatched_disjunction_jump;
980 offset += duk__insert_jump_offset(re_ctx,
981 offset,
982 (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
983 /* offset is now target of the pending split (right after jump) */
984 duk__insert_jump_offset(re_ctx,
985 (duk_uint32_t) unpatched_disjunction_split,
986 (duk_int32_t) offset - unpatched_disjunction_split);
987 }
988
989 #if 0
990 out_atom_info->end_captures = re_ctx->captures;
991 #endif
992 out_atom_info->charlen = res_charlen;
993 DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
994 (long) out_atom_info->charlen));
995
996 re_ctx->recursion_depth--;
997 }
998
999 /*
1000 * Flags parsing (see E5 Section 15.10.4.1).
1001 */
1002
duk__parse_regexp_flags(duk_hthread * thr,duk_hstring * h)1003 DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {
1004 const duk_uint8_t *p;
1005 const duk_uint8_t *p_end;
1006 duk_uint32_t flags = 0;
1007
1008 p = DUK_HSTRING_GET_DATA(h);
1009 p_end = p + DUK_HSTRING_GET_BYTELEN(h);
1010
1011 /* Note: can be safely scanned as bytes (undecoded) */
1012
1013 while (p < p_end) {
1014 duk_uint8_t c = *p++;
1015 switch (c) {
1016 case (duk_uint8_t) 'g': {
1017 if (flags & DUK_RE_FLAG_GLOBAL) {
1018 goto flags_error;
1019 }
1020 flags |= DUK_RE_FLAG_GLOBAL;
1021 break;
1022 }
1023 case (duk_uint8_t) 'i': {
1024 if (flags & DUK_RE_FLAG_IGNORE_CASE) {
1025 goto flags_error;
1026 }
1027 flags |= DUK_RE_FLAG_IGNORE_CASE;
1028 break;
1029 }
1030 case (duk_uint8_t) 'm': {
1031 if (flags & DUK_RE_FLAG_MULTILINE) {
1032 goto flags_error;
1033 }
1034 flags |= DUK_RE_FLAG_MULTILINE;
1035 break;
1036 }
1037 default: {
1038 goto flags_error;
1039 }
1040 }
1041 }
1042
1043 return flags;
1044
1045 flags_error:
1046 DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);
1047 DUK_WO_NORETURN(return 0U;);
1048 }
1049
1050 /*
1051 * Create escaped RegExp source (E5 Section 15.10.3).
1052 *
1053 * The current approach is to special case the empty RegExp
1054 * ('' -> '(?:)') and otherwise replace unescaped '/' characters
1055 * with '\/' regardless of where they occur in the regexp.
1056 *
1057 * Note that normalization does not seem to be necessary for
1058 * RegExp literals (e.g. '/foo/') because to be acceptable as
1059 * a RegExp literal, the text between forward slashes must
1060 * already match the escaping requirements (e.g. must not contain
1061 * unescaped forward slashes or be empty). Escaping IS needed
1062 * for expressions like 'new Regexp("...", "")' however.
1063 * Currently, we re-escape in either case.
1064 *
1065 * Also note that we process the source here in UTF-8 encoded
1066 * form. This is correct, because any non-ASCII characters are
1067 * passed through without change.
1068 */
1069
duk__create_escaped_source(duk_hthread * thr,int idx_pattern)1070 DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {
1071 duk_hstring *h;
1072 const duk_uint8_t *p;
1073 duk_bufwriter_ctx bw_alloc;
1074 duk_bufwriter_ctx *bw;
1075 duk_uint8_t *q;
1076 duk_size_t i, n;
1077 duk_uint_fast8_t c_prev, c;
1078
1079 h = duk_known_hstring(thr, idx_pattern);
1080 p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
1081 n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
1082
1083 if (n == 0) {
1084 duk_push_literal(thr, "(?:)");
1085 return;
1086 }
1087
1088 bw = &bw_alloc;
1089 DUK_BW_INIT_PUSHBUF(thr, bw, n);
1090 q = DUK_BW_GET_PTR(thr, bw);
1091
1092 c_prev = (duk_uint_fast8_t) 0;
1093
1094 for (i = 0; i < n; i++) {
1095 c = p[i];
1096
1097 q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);
1098
1099 if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') {
1100 /* Unescaped '/' ANYWHERE in the regexp (in disjunction,
1101 * inside a character class, ...) => same escape works.
1102 */
1103 *q++ = DUK_ASC_BACKSLASH;
1104 }
1105 *q++ = (duk_uint8_t) c;
1106
1107 c_prev = c;
1108 }
1109
1110 DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);
1111 (void) duk_buffer_to_string(thr, -1); /* Safe if input is safe. */
1112
1113 /* [ ... escaped_source ] */
1114 }
1115
1116 /*
1117 * Exposed regexp compilation primitive.
1118 *
1119 * Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
1120 * actual parsing. Handles generation of the compiled regexp header and the
1121 * "boilerplate" capture of the matching substring (save 0 and 1). Also does some
1122 * global level regexp checks after recursive compilation has finished.
1123 *
1124 * An escaped version of the regexp source, suitable for use as a RegExp instance
1125 * 'source' property (see E5 Section 15.10.3), is also left on the stack.
1126 *
1127 * Input stack: [ pattern flags ]
1128 * Output stack: [ bytecode escaped_source ] (both as strings)
1129 */
1130
duk_regexp_compile(duk_hthread * thr)1131 DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
1132 duk_re_compiler_ctx re_ctx;
1133 duk_lexer_point lex_point;
1134 duk_hstring *h_pattern;
1135 duk_hstring *h_flags;
1136 duk__re_disjunction_info ign_disj;
1137
1138 DUK_ASSERT(thr != NULL);
1139
1140 /*
1141 * Args validation
1142 */
1143
1144 /* TypeError if fails */
1145 h_pattern = duk_require_hstring_notsymbol(thr, -2);
1146 h_flags = duk_require_hstring_notsymbol(thr, -1);
1147
1148 /*
1149 * Create normalized 'source' property (E5 Section 15.10.3).
1150 */
1151
1152 /* [ ... pattern flags ] */
1153
1154 duk__create_escaped_source(thr, -2);
1155
1156 /* [ ... pattern flags escaped_source ] */
1157
1158 /*
1159 * Init compilation context
1160 */
1161
1162 /* [ ... pattern flags escaped_source buffer ] */
1163
1164 duk_memzero(&re_ctx, sizeof(re_ctx));
1165 DUK_LEXER_INITCTX(&re_ctx.lex); /* duplicate zeroing, expect for (possible) NULL inits */
1166 re_ctx.thr = thr;
1167 re_ctx.lex.thr = thr;
1168 re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
1169 re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
1170 re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;
1171 re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;
1172 re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);
1173
1174 DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);
1175
1176 DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
1177 (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));
1178
1179 /*
1180 * Init lexer
1181 */
1182
1183 lex_point.offset = 0; /* expensive init, just want to fill window */
1184 lex_point.line = 1;
1185 DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);
1186
1187 /*
1188 * Compilation
1189 */
1190
1191 DUK_DD(DUK_DDPRINT("starting regexp compilation"));
1192
1193 duk__append_reop(&re_ctx, DUK_REOP_SAVE);
1194 duk__append_7bit(&re_ctx, 0);
1195 duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);
1196 duk__append_reop(&re_ctx, DUK_REOP_SAVE);
1197 duk__append_7bit(&re_ctx, 1);
1198 duk__append_reop(&re_ctx, DUK_REOP_MATCH);
1199
1200 /*
1201 * Check for invalid backreferences; note that it is NOT an error
1202 * to back-reference a capture group which has not yet been introduced
1203 * in the pattern (as in /\1(foo)/); in fact, the backreference will
1204 * always match! It IS an error to back-reference a capture group
1205 * which will never be introduced in the pattern. Thus, we can check
1206 * for such references only after parsing is complete.
1207 */
1208
1209 if (re_ctx.highest_backref > re_ctx.captures) {
1210 DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
1211 DUK_WO_NORETURN(return;);
1212 }
1213
1214 /*
1215 * Emit compiled regexp header: flags, ncaptures
1216 * (insertion order inverted on purpose)
1217 */
1218
1219 duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
1220 duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);
1221
1222 /* [ ... pattern flags escaped_source buffer ] */
1223
1224 DUK_BW_COMPACT(thr, &re_ctx.bw);
1225 (void) duk_buffer_to_string(thr, -1); /* Safe because flags is at most 7 bit. */
1226
1227 /* [ ... pattern flags escaped_source bytecode ] */
1228
1229 /*
1230 * Finalize stack
1231 */
1232
1233 duk_remove(thr, -4); /* -> [ ... flags escaped_source bytecode ] */
1234 duk_remove(thr, -3); /* -> [ ... escaped_source bytecode ] */
1235
1236 DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
1237 (duk_tval *) duk_get_tval(thr, -1), (duk_tval *) duk_get_tval(thr, -2)));
1238 }
1239
1240 /*
1241 * Create a RegExp instance (E5 Section 15.10.7).
1242 *
1243 * Note: the output stack left by duk_regexp_compile() is directly compatible
1244 * with the input here.
1245 *
1246 * Input stack: [ escaped_source bytecode ] (both as strings)
1247 * Output stack: [ RegExp ]
1248 */
1249
duk_regexp_create_instance(duk_hthread * thr)1250 DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {
1251 duk_hobject *h;
1252
1253 /* [ ... escaped_source bytecode ] */
1254
1255 duk_push_object(thr);
1256 h = duk_known_hobject(thr, -1);
1257 duk_insert(thr, -3);
1258
1259 /* [ ... regexp_object escaped_source bytecode ] */
1260
1261 DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP);
1262 DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);
1263
1264 duk_xdef_prop_stridx_short(thr, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);
1265
1266 /* [ ... regexp_object escaped_source ] */
1267
1268 /* In ES2015 .source, and the .global, .multiline, etc flags are
1269 * inherited getters. Store the escaped source as an internal
1270 * property for the getter.
1271 */
1272
1273 duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_SOURCE, DUK_PROPDESC_FLAGS_NONE);
1274
1275 /* [ ... regexp_object ] */
1276
1277 duk_push_int(thr, 0);
1278 duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);
1279
1280 /* [ ... regexp_object ] */
1281 }
1282
1283 #else /* DUK_USE_REGEXP_SUPPORT */
1284
1285 /* regexp support disabled */
1286
1287 #endif /* DUK_USE_REGEXP_SUPPORT */
1288