1 /*
2  *  Regexp compilation.
3  *
4  *  See doc/regexp.rst for a discussion of the compilation approach and
5  *  current limitations.
6  *
7  *  Regexp bytecode assumes jumps can be expressed with signed 32-bit
8  *  integers.  Consequently the bytecode size must not exceed 0x7fffffffL.
9  *  The implementation casts duk_size_t (buffer size) to duk_(u)int32_t
10  *  in many places.  Although this could be changed, the bytecode format
11  *  limit would still prevent regexps exceeding the signed 32-bit limit
12  *  from working.
13  *
14  *  XXX: The implementation does not prevent bytecode from exceeding the
15  *  maximum supported size.  This could be done by limiting the maximum
16  *  input string size (assuming an upper bound can be computed for number
17  *  of bytecode bytes emitted per input byte) or checking buffer maximum
18  *  size when emitting bytecode (slower).
19  */
20 
21 #include "duk_internal.h"
22 
23 #if defined(DUK_USE_REGEXP_SUPPORT)
24 
25 /*
26  *  Helper macros
27  */
28 
29 #define DUK__RE_INITIAL_BUFSIZE 64
30 
31 #define DUK__RE_BUFLEN(re_ctx) \
32 	DUK_BW_GET_SIZE(re_ctx->thr, &re_ctx->bw)
33 
34 /*
35  *  Disjunction struct: result of parsing a disjunction
36  */
37 
38 typedef struct {
39 	/* Number of characters that the atom matches (e.g. 3 for 'abc'),
40 	 * -1 if atom is complex and number of matched characters either
41 	 * varies or is not known.
42 	 */
43 	duk_int32_t charlen;
44 
45 #if 0
46 	/* These are not needed to implement quantifier capture handling,
47 	 * but might be needed at some point.
48 	 */
49 
50 	/* re_ctx->captures at start and end of atom parsing.
51 	 * Since 'captures' indicates highest capture number emitted
52 	 * so far in a DUK_REOP_SAVE, the captures numbers saved by
53 	 * the atom are: ]start_captures,end_captures].
54 	 */
55 	duk_uint32_t start_captures;
56 	duk_uint32_t end_captures;
57 #endif
58 } duk__re_disjunction_info;
59 
60 /*
61  *  Encoding helpers
62  *
63  *  Some of the typing is bytecode based, e.g. slice sizes are unsigned 32-bit
64  *  even though the buffer operations will use duk_size_t.
65  */
66 
67 /* XXX: the insert helpers should ensure that the bytecode result is not
68  * larger than expected (or at least assert for it).  Many things in the
69  * bytecode, like skip offsets, won't work correctly if the bytecode is
70  * larger than say 2G.
71  */
72 
duk__encode_i32(duk_int32_t x)73 DUK_LOCAL duk_uint32_t duk__encode_i32(duk_int32_t x) {
74 	if (x < 0) {
75 		return ((duk_uint32_t) (-x)) * 2 + 1;
76 	} else {
77 		return ((duk_uint32_t) x) * 2;
78 	}
79 }
80 
81 /* XXX: return type should probably be duk_size_t, or explicit checks are needed for
82  * maximum size.
83  */
duk__insert_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t x)84 DUK_LOCAL duk_uint32_t duk__insert_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t x) {
85 	duk_uint8_t buf[DUK_UNICODE_MAX_XUTF8_LENGTH];
86 	duk_small_int_t len;
87 
88 	len = duk_unicode_encode_xutf8((duk_ucodepoint_t) x, buf);
89 	DUK_ASSERT(len >= 0);
90 	DUK_BW_INSERT_ENSURE_BYTES(re_ctx->thr, &re_ctx->bw, offset, buf, (duk_size_t) len);
91 	return (duk_uint32_t) len;
92 }
93 
duk__append_u32(duk_re_compiler_ctx * re_ctx,duk_uint32_t x)94 DUK_LOCAL void duk__append_u32(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
95 	DUK_BW_WRITE_ENSURE_XUTF8(re_ctx->thr, &re_ctx->bw, x);
96 }
97 
duk__append_7bit(duk_re_compiler_ctx * re_ctx,duk_uint32_t x)98 DUK_LOCAL void duk__append_7bit(duk_re_compiler_ctx *re_ctx, duk_uint32_t x) {
99 #if defined(DUK_USE_PREFER_SIZE)
100 	duk__append_u32(re_ctx, x);
101 #else
102 	DUK_ASSERT(x <= 0x7fU);
103 	DUK_BW_WRITE_ENSURE_U8(re_ctx->thr, &re_ctx->bw, (duk_uint8_t) x);
104 #endif
105 }
106 
107 #if 0
108 DUK_LOCAL void duk__append_2bytes(duk_re_compiler_ctx *re_ctx, duk_uint8_t x, duk_uint8_t y) {
109 	DUK_BW_WRITE_ENSURE_U8_2(re_ctx->thr, &re_ctx->bw, x, y);
110 }
111 #endif
112 
duk__insert_i32(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t x)113 DUK_LOCAL duk_uint32_t duk__insert_i32(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t x) {
114 	return duk__insert_u32(re_ctx, offset, duk__encode_i32(x));
115 }
116 
duk__append_reop(duk_re_compiler_ctx * re_ctx,duk_uint32_t reop)117 DUK_LOCAL void duk__append_reop(duk_re_compiler_ctx *re_ctx, duk_uint32_t reop) {
118 	DUK_ASSERT(reop <= 0x7fU);
119 	(void) duk__append_7bit(re_ctx, reop);
120 }
121 
122 #if 0  /* unused */
123 DUK_LOCAL void duk__append_i32(duk_re_compiler_ctx *re_ctx, duk_int32_t x) {
124 	duk__append_u32(re_ctx, duk__encode_i32(x));
125 }
126 #endif
127 
128 /* special helper for emitting u16 lists (used for character ranges for built-in char classes) */
duk__append_u16_list(duk_re_compiler_ctx * re_ctx,const duk_uint16_t * values,duk_uint32_t count)129 DUK_LOCAL void duk__append_u16_list(duk_re_compiler_ctx *re_ctx, const duk_uint16_t *values, duk_uint32_t count) {
130 	/* Call sites don't need the result length so it's not accumulated. */
131 	while (count-- > 0) {
132 		duk__append_u32(re_ctx, (duk_uint32_t) (*values++));
133 	}
134 }
135 
duk__insert_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_uint32_t data_offset,duk_uint32_t data_length)136 DUK_LOCAL void duk__insert_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_uint32_t data_offset, duk_uint32_t data_length) {
137 	DUK_BW_INSERT_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, offset, data_offset, data_length);
138 }
139 
duk__append_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)140 DUK_LOCAL void duk__append_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
141 	DUK_BW_WRITE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
142 }
143 
duk__remove_slice(duk_re_compiler_ctx * re_ctx,duk_uint32_t data_offset,duk_uint32_t data_length)144 DUK_LOCAL void duk__remove_slice(duk_re_compiler_ctx *re_ctx, duk_uint32_t data_offset, duk_uint32_t data_length) {
145 	DUK_BW_REMOVE_ENSURE_SLICE(re_ctx->thr, &re_ctx->bw, data_offset, data_length);
146 }
147 
148 /*
149  *  Insert a jump offset at 'offset' to complete an instruction
150  *  (the jump offset is always the last component of an instruction).
151  *  The 'skip' argument must be computed relative to 'offset',
152  *  -without- taking into account the skip field being inserted.
153  *
154  *       ... A B C ins X Y Z ...   (ins may be a JUMP, SPLIT1/SPLIT2, etc)
155  *   =>  ... A B C ins SKIP X Y Z
156  *
157  *  Computing the final (adjusted) skip value, which is relative to the
158  *  first byte of the next instruction, is a bit tricky because of the
159  *  variable length UTF-8 encoding.  See doc/regexp.rst for discussion.
160  */
duk__insert_jump_offset(duk_re_compiler_ctx * re_ctx,duk_uint32_t offset,duk_int32_t skip)161 DUK_LOCAL duk_uint32_t duk__insert_jump_offset(duk_re_compiler_ctx *re_ctx, duk_uint32_t offset, duk_int32_t skip) {
162 #if 0
163 	/* Iterative solution. */
164 	if (skip < 0) {
165 		duk_small_int_t len;
166 		/* two encoding attempts suffices */
167 		len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip));
168 		len = duk_unicode_get_xutf8_length((duk_codepoint_t) duk__encode_i32(skip - (duk_int32_t) len));
169 		DUK_ASSERT(duk_unicode_get_xutf8_length(duk__encode_i32(skip - (duk_int32_t) len)) == len);  /* no change */
170 		skip -= (duk_int32_t) len;
171 	}
172 #endif
173 
174 #if defined(DUK_USE_PREFER_SIZE)
175 	/* Closed form solution, this produces smallest code.
176 	 * See re_neg_jump_offset (closed2).
177 	 */
178 	if (skip < 0) {
179 		skip--;
180 		if (skip < -0x3fL) {
181 			skip--;
182 		}
183 		if (skip < -0x3ffL) {
184 			skip--;
185 		}
186 		if (skip < -0x7fffL) {
187 			skip--;
188 		}
189 		if (skip < -0xfffffL) {
190 			skip--;
191 		}
192 		if (skip < -0x1ffffffL) {
193 			skip--;
194 		}
195 		if (skip < -0x3fffffffL) {
196 			skip--;
197 		}
198 	}
199 #else  /* DUK_USE_PREFER_SIZE */
200 	/* Closed form solution, this produces fastest code.
201 	 * See re_neg_jump_offset (closed1).
202 	 */
203 	if (skip < 0) {
204 		if (skip >= -0x3eL) {
205 			skip -= 1;
206 		} else if (skip >= -0x3fdL) {
207 			skip -= 2;
208 		} else if (skip >= -0x7ffcL) {
209 			skip -= 3;
210 		} else if (skip >= -0xffffbL) {
211 			skip -= 4;
212 		} else if (skip >= -0x1fffffaL) {
213 			skip -= 5;
214 		} else if (skip >= -0x3ffffff9L) {
215 			skip -= 6;
216 		} else {
217 			skip -= 7;
218 		}
219 	}
220 #endif  /* DUK_USE_PREFER_SIZE */
221 
222 	return duk__insert_i32(re_ctx, offset, skip);
223 }
224 
duk__append_jump_offset(duk_re_compiler_ctx * re_ctx,duk_int32_t skip)225 DUK_LOCAL duk_uint32_t duk__append_jump_offset(duk_re_compiler_ctx *re_ctx, duk_int32_t skip) {
226 	return (duk_uint32_t) duk__insert_jump_offset(re_ctx, (duk_uint32_t) DUK__RE_BUFLEN(re_ctx), skip);
227 }
228 
229 /*
230  *  duk_re_range_callback for generating character class ranges.
231  *
232  *  When ignoreCase is false, the range is simply emitted as is.  We don't,
233  *  for instance, eliminate duplicates or overlapping ranges in a character
234  *  class.
235  *
236  *  When ignoreCase is true but the 'direct' flag is set, the caller knows
237  *  that the range canonicalizes to itself for case insensitive matching,
238  *  so the range is emitted as is.  This is mainly useful for built-in ranges
239  *  like \W.
240  *
241  *  Otherwise, when ignoreCase is true, the range needs to be normalized
242  *  through canonicalization.  Unfortunately a canonicalized version of a
243  *  continuous range is not necessarily continuous (e.g. [x-{] is continuous
244  *  but [X-{] is not).  As a result, a single input range may expand to a lot
245  *  of output ranges.  The current algorithm creates the canonicalized ranges
246  *  footprint efficiently at the cost of compile time execution time; see
247  *  doc/regexp.rst for discussion, and some more details below.
248  *
249  *  Note that the ctx->nranges is a context-wide temporary value.  This is OK
250  *  because there cannot be multiple character classes being parsed
251  *  simultaneously.
252  *
253  *  More detail on canonicalization:
254  *
255  *  Conceptually, a range is canonicalized by scanning the entire range,
256  *  normalizing each codepoint by converting it to uppercase, and generating
257  *  a set of result ranges.
258  *
259  *  Ideally a minimal set of output ranges would be emitted by merging all
260  *  possible ranges even if they're emitted out of sequence.  Because the
261  *  input string is also case normalized during matching, some codepoints
262  *  never occur at runtime; these "don't care" codepoints can be included or
263  *  excluded from ranges when merging/optimizing ranges.
264  *
265  *  The current algorithm does not do optimal range merging.  Rather, output
266  *  codepoints are generated in sequence, and when the output codepoints are
267  *  continuous (CP, CP+1, CP+2, ...), they are merged locally into as large a
268  *  range as possible.  A small canonicalization bitmap is used to reduce
269  *  actual codepoint canonicalizations which are quite slow at present.  The
270  *  bitmap provides a "codepoint block is continuous with respect to
271  *  canonicalization" for N-codepoint blocks.  This allows blocks to be
272  *  skipped quickly.
273  *
274  *  There are a number of shortcomings and future work here:
275  *
276  *    - Individual codepoint normalizations are slow because they involve
277  *      walking bit-packed rules without a lookup index.
278  *
279  *    - The conceptual algorithm needs to canonicalize every codepoint in the
280  *      input range to figure out the output range(s).  Even with the small
281  *      canonicalization bitmap the algorithm runs quite slowly for worst case
282  *      inputs.  There are many data structure alternatives to improve this.
283  *
284  *    - While the current algorithm generates maximal output ranges when the
285  *      output codepoints are emitted linearly, output ranges are not sorted or
286  *      merged otherwise.  In the worst case a lot of ranges are emitted when
287  *      most of the ranges could be merged.  In this process one could take
288  *      advantage of "don't care" codepoints, which are never matched against at
289  *      runtime due to canonicalization of input codepoints before comparison,
290  *      to merge otherwise discontinuous output ranges.
291  *
292  *    - The runtime data structure is just a linear list of ranges to match
293  *      against.  This can be quite slow if there are a lot of output ranges.
294  *      There are various ways to make matching against the ranges faster,
295  *      e.g. sorting the ranges and using a binary search; skip lists; tree
296  *      based representations; full or approximate codepoint bitmaps, etc.
297  *
298  *    - Only BMP is supported, codepoints above BMP are assumed to canonicalize
299  *      to themselves.  For now this is one place where we don't want to
300  *      support chars outside the BMP, because the exhaustive search would be
301  *      massively larger.  It would be possible to support non-BMP with a
302  *      different algorithm, or perhaps doing case normalization only at match
303  *      time.
304  */
305 
duk__regexp_emit_range(duk_re_compiler_ctx * re_ctx,duk_codepoint_t r1,duk_codepoint_t r2)306 DUK_LOCAL void duk__regexp_emit_range(duk_re_compiler_ctx *re_ctx, duk_codepoint_t r1, duk_codepoint_t r2) {
307 	DUK_ASSERT(r2 >= r1);
308 	duk__append_u32(re_ctx, (duk_uint32_t) r1);
309 	duk__append_u32(re_ctx, (duk_uint32_t) r2);
310 	re_ctx->nranges++;
311 }
312 
313 #if defined(DUK_USE_REGEXP_CANON_BITMAP)
314 /* Find next canonicalization discontinuity (conservative estimate) starting
315  * from 'start', not exceeding 'end'.  If continuity is fine up to 'end'
316  * inclusive, returns end.  Minimum possible return value is start.
317  */
duk__re_canon_next_discontinuity(duk_codepoint_t start,duk_codepoint_t end)318 DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
319 	duk_uint_t start_blk;
320 	duk_uint_t end_blk;
321 	duk_uint_t blk;
322 	duk_uint_t offset;
323 	duk_uint8_t mask;
324 
325 	/* Inclusive block range. */
326 	DUK_ASSERT(start >= 0);
327 	DUK_ASSERT(end >= 0);
328 	DUK_ASSERT(end >= start);
329 	start_blk = (duk_uint_t) (start >> DUK_CANON_BITMAP_BLKSHIFT);
330 	end_blk = (duk_uint_t) (end >> DUK_CANON_BITMAP_BLKSHIFT);
331 
332 	for (blk = start_blk; blk <= end_blk; blk++) {
333 		offset = blk >> 3;
334 		mask = 1U << (blk & 0x07);
335 		if (offset >= sizeof(duk_unicode_re_canon_bitmap)) {
336 			/* Reached non-BMP range which is assumed continuous. */
337 			return end;
338 		}
339 		DUK_ASSERT(offset < sizeof(duk_unicode_re_canon_bitmap));
340 		if ((duk_unicode_re_canon_bitmap[offset] & mask) == 0) {
341 			/* Block is discontinuous, continuity is guaranteed
342 			 * only up to end of previous block (+1 for exclusive
343 			 * return value => start of current block).  Start
344 			 * block requires special handling.
345 			 */
346 			if (blk > start_blk) {
347 				return (duk_codepoint_t) (blk << DUK_CANON_BITMAP_BLKSHIFT);
348 			} else {
349 				return start;
350 			}
351 		}
352 	}
353 	DUK_ASSERT(blk == end_blk + 1);  /* Reached end block which is continuous. */
354 	return end;
355 }
356 #else  /* DUK_USE_REGEXP_CANON_BITMAP */
duk__re_canon_next_discontinuity(duk_codepoint_t start,duk_codepoint_t end)357 DUK_LOCAL duk_codepoint_t duk__re_canon_next_discontinuity(duk_codepoint_t start, duk_codepoint_t end) {
358 	DUK_ASSERT(start >= 0);
359 	DUK_ASSERT(end >= 0);
360 	DUK_ASSERT(end >= start);
361 	if (start >= 0x10000) {
362 		/* Even without the bitmap, treat non-BMP as continuous. */
363 		return end;
364 	}
365 	return start;
366 }
367 #endif  /* DUK_USE_REGEXP_CANON_BITMAP */
368 
duk__regexp_generate_ranges(void * userdata,duk_codepoint_t r1,duk_codepoint_t r2,duk_bool_t direct)369 DUK_LOCAL void duk__regexp_generate_ranges(void *userdata, duk_codepoint_t r1, duk_codepoint_t r2, duk_bool_t direct) {
370 	duk_re_compiler_ctx *re_ctx = (duk_re_compiler_ctx *) userdata;
371 	duk_codepoint_t r_start;
372 	duk_codepoint_t r_end;
373 	duk_codepoint_t i;
374 	duk_codepoint_t t;
375 	duk_codepoint_t r_disc;
376 
377 	DUK_DD(DUK_DDPRINT("duk__regexp_generate_ranges(): re_ctx=%p, range=[%ld,%ld] direct=%ld",
378 	                   (void *) re_ctx, (long) r1, (long) r2, (long) direct));
379 
380 	DUK_ASSERT(r2 >= r1);  /* SyntaxError for out of order range. */
381 
382 	if (direct || (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) == 0) {
383 		DUK_DD(DUK_DDPRINT("direct or not case sensitive, emit range: [%ld,%ld]", (long) r1, (long) r2));
384 		duk__regexp_emit_range(re_ctx, r1, r2);
385 		return;
386 	}
387 
388 	DUK_DD(DUK_DDPRINT("case sensitive, process range: [%ld,%ld]", (long) r1, (long) r2));
389 
390 	r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
391 	r_end = r_start;
392 
393 	for (i = r1 + 1; i <= r2;) {
394 		/* Input codepoint space processed up to i-1, and
395 		 * current range in r_{start,end} is up-to-date
396 		 * (inclusive) and may either break or continue.
397 		 */
398 		r_disc = duk__re_canon_next_discontinuity(i, r2);
399 		DUK_ASSERT(r_disc >= i);
400 		DUK_ASSERT(r_disc <= r2);
401 
402 		r_end += r_disc - i;  /* May be zero. */
403 		t = duk_unicode_re_canonicalize_char(re_ctx->thr, r_disc);
404 		if (t == r_end + 1) {
405 			/* Not actually a discontinuity, continue range
406 			 * to r_disc and recheck.
407 			 */
408 			r_end = t;
409 		} else {
410 			duk__regexp_emit_range(re_ctx, r_start, r_end);
411 			r_start = t;
412 			r_end = t;
413 		}
414 		i = r_disc + 1;  /* Guarantees progress. */
415 	}
416 	duk__regexp_emit_range(re_ctx, r_start, r_end);
417 
418 #if 0  /* Exhaustive search, very slow. */
419 	r_start = duk_unicode_re_canonicalize_char(re_ctx->thr, r1);
420 	r_end = r_start;
421 	for (i = r1 + 1; i <= r2; i++) {
422 		t = duk_unicode_re_canonicalize_char(re_ctx->thr, i);
423 		if (t == r_end + 1) {
424 			r_end = t;
425 		} else {
426 			DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
427 			duk__append_u32(re_ctx, (duk_uint32_t) r_start);
428 			duk__append_u32(re_ctx, (duk_uint32_t) r_end);
429 			re_ctx->nranges++;
430 			r_start = t;
431 			r_end = t;
432 		}
433 	}
434 	DUK_DD(DUK_DDPRINT("canonicalized, emit range: [%ld,%ld]", (long) r_start, (long) r_end));
435 	duk__append_u32(re_ctx, (duk_uint32_t) r_start);
436 	duk__append_u32(re_ctx, (duk_uint32_t) r_end);
437 	re_ctx->nranges++;
438 #endif
439 }
440 
441 /*
442  *  Parse regexp Disjunction.  Most of regexp compilation happens here.
443  *
444  *  Handles Disjunction, Alternative, and Term productions directly without
445  *  recursion.  The only constructs requiring recursion are positive/negative
446  *  lookaheads, capturing parentheses, and non-capturing parentheses.
447  *
448  *  The function determines whether the entire disjunction is a 'simple atom'
449  *  (see doc/regexp.rst discussion on 'simple quantifiers') and if so,
450  *  returns the atom character length which is needed by the caller to keep
451  *  track of its own atom character length.  A disjunction with more than one
452  *  alternative is never considered a simple atom (although in some cases
453  *  that might be the case).
454  *
455  *  Return value: simple atom character length or < 0 if not a simple atom.
456  *  Appends the bytecode for the disjunction matcher to the end of the temp
457  *  buffer.
458  *
459  *  Regexp top level structure is:
460  *
461  *    Disjunction = Term*
462  *                | Term* | Disjunction
463  *
464  *    Term = Assertion
465  *         | Atom
466  *         | Atom Quantifier
467  *
468  *  An empty Term sequence is a valid disjunction alternative (e.g. /|||c||/).
469  *
470  *  Notes:
471  *
472  *    * Tracking of the 'simple-ness' of the current atom vs. the entire
473  *      disjunction are separate matters.  For instance, the disjunction
474  *      may be complex, but individual atoms may be simple.  Furthermore,
475  *      simple quantifiers are used whenever possible, even if the
476  *      disjunction as a whole is complex.
477  *
478  *    * The estimate of whether an atom is simple is conservative now,
479  *      and it would be possible to expand it.  For instance, captures
480  *      cause the disjunction to be marked complex, even though captures
481  *      -can- be handled by simple quantifiers with some minor modifications.
482  *
483  *    * Disjunction 'tainting' as 'complex' is handled at the end of the
484  *      main for loop collectively for atoms.  Assertions, quantifiers,
485  *      and '|' tokens need to taint the result manually if necessary.
486  *      Assertions cannot add to result char length, only atoms (and
487  *      quantifiers) can; currently quantifiers will taint the result
488  *      as complex though.
489  */
490 
491 DUK_LOCAL const duk_uint16_t * const duk__re_range_lookup1[3] = {
492 	duk_unicode_re_ranges_digit,
493 	duk_unicode_re_ranges_white,
494 	duk_unicode_re_ranges_wordchar
495 };
496 DUK_LOCAL const duk_uint8_t duk__re_range_lookup2[3] = {
497 	sizeof(duk_unicode_re_ranges_digit) / (2 * sizeof(duk_uint16_t)),
498 	sizeof(duk_unicode_re_ranges_white) / (2 * sizeof(duk_uint16_t)),
499 	sizeof(duk_unicode_re_ranges_wordchar) / (2 * sizeof(duk_uint16_t))
500 };
501 
duk__append_range_atom_matcher(duk_re_compiler_ctx * re_ctx,duk_small_uint_t re_op,const duk_uint16_t * ranges,duk_small_uint_t count)502 DUK_LOCAL void duk__append_range_atom_matcher(duk_re_compiler_ctx *re_ctx, duk_small_uint_t re_op, const duk_uint16_t *ranges, duk_small_uint_t count) {
503 #if 0
504 	DUK_ASSERT(re_op <= 0x7fUL);
505 	DUK_ASSERT(count <= 0x7fUL);
506 	duk__append_2bytes(re_ctx, (duk_uint8_t) re_op, (duk_uint8_t) count);
507 #endif
508 	duk__append_reop(re_ctx, re_op);
509 	duk__append_7bit(re_ctx, count);
510 	duk__append_u16_list(re_ctx, ranges, count * 2);
511 }
512 
duk__parse_disjunction(duk_re_compiler_ctx * re_ctx,duk_bool_t expect_eof,duk__re_disjunction_info * out_atom_info)513 DUK_LOCAL void duk__parse_disjunction(duk_re_compiler_ctx *re_ctx, duk_bool_t expect_eof, duk__re_disjunction_info *out_atom_info) {
514 	duk_int32_t atom_start_offset = -1;                   /* negative -> no atom matched on previous round */
515 	duk_int32_t atom_char_length = 0;                     /* negative -> complex atom */
516 	duk_uint32_t atom_start_captures = re_ctx->captures;  /* value of re_ctx->captures at start of atom */
517 	duk_int32_t unpatched_disjunction_split = -1;
518 	duk_int32_t unpatched_disjunction_jump = -1;
519 	duk_uint32_t entry_offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
520 	duk_int32_t res_charlen = 0;  /* -1 if disjunction is complex, char length if simple */
521 	duk__re_disjunction_info tmp_disj;
522 
523 	DUK_ASSERT(out_atom_info != NULL);
524 
525 	duk_native_stack_check(re_ctx->thr);
526 	if (re_ctx->recursion_depth >= re_ctx->recursion_limit) {
527 		DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_REGEXP_COMPILER_RECURSION_LIMIT);
528 		DUK_WO_NORETURN(return;);
529 	}
530 	re_ctx->recursion_depth++;
531 
532 #if 0
533 	out_atom_info->start_captures = re_ctx->captures;
534 #endif
535 
536 	for (;;) {
537 		/* atom_char_length, atom_start_offset, atom_start_offset reflect the
538 		 * atom matched on the previous loop.  If a quantifier is encountered
539 		 * on this loop, these are needed to handle the quantifier correctly.
540 		 * new_atom_char_length etc are for the atom parsed on this round;
541 		 * they're written to atom_char_length etc at the end of the round.
542 		 */
543 		duk_int32_t new_atom_char_length;   /* char length of the atom parsed in this loop */
544 		duk_int32_t new_atom_start_offset;  /* bytecode start offset of the atom parsed in this loop
545 		                                     * (allows quantifiers to copy the atom bytecode)
546 		                                     */
547 		duk_uint32_t new_atom_start_captures;  /* re_ctx->captures at the start of the atom parsed in this loop */
548 
549 		duk_lexer_parse_re_token(&re_ctx->lex, &re_ctx->curr_token);
550 
551 		DUK_DD(DUK_DDPRINT("re token: %ld (num=%ld, char=%c)",
552 		                   (long) re_ctx->curr_token.t,
553 		                   (long) re_ctx->curr_token.num,
554 		                   (re_ctx->curr_token.num >= 0x20 && re_ctx->curr_token.num <= 0x7e) ?
555 		                   (int) re_ctx->curr_token.num : (int) '?'));
556 
557 		/* set by atom case clauses */
558 		new_atom_start_offset = -1;
559 		new_atom_char_length = -1;
560 		new_atom_start_captures = re_ctx->captures;
561 
562 		switch (re_ctx->curr_token.t) {
563 		case DUK_RETOK_DISJUNCTION: {
564 			/*
565 			 *  The handling here is a bit tricky.  If a previous '|' has been processed,
566 			 *  we have a pending split1 and a pending jump (for a previous match).  These
567 			 *  need to be back-patched carefully.  See docs for a detailed example.
568 			 */
569 
570 			/* patch pending jump and split */
571 			if (unpatched_disjunction_jump >= 0) {
572 				duk_uint32_t offset;
573 
574 				DUK_ASSERT(unpatched_disjunction_split >= 0);
575 				offset = (duk_uint32_t) unpatched_disjunction_jump;
576 				offset += duk__insert_jump_offset(re_ctx,
577 				                                  offset,
578 				                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
579 				/* offset is now target of the pending split (right after jump) */
580 				duk__insert_jump_offset(re_ctx,
581 				                        (duk_uint32_t) unpatched_disjunction_split,
582 				                        (duk_int32_t) offset - unpatched_disjunction_split);
583 			}
584 
585 			/* add a new pending split to the beginning of the entire disjunction */
586 			(void) duk__insert_u32(re_ctx,
587 			                       entry_offset,
588 			                       DUK_REOP_SPLIT1);   /* prefer direct execution */
589 			unpatched_disjunction_split = (duk_int32_t) (entry_offset + 1);   /* +1 for opcode */
590 
591 			/* add a new pending match jump for latest finished alternative */
592 			duk__append_reop(re_ctx, DUK_REOP_JUMP);
593 			unpatched_disjunction_jump = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
594 
595 			/* 'taint' result as complex */
596 			res_charlen = -1;
597 			break;
598 		}
599 		case DUK_RETOK_QUANTIFIER: {
600 			if (atom_start_offset < 0) {
601 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_NO_ATOM);
602 				DUK_WO_NORETURN(return;);
603 			}
604 			if (re_ctx->curr_token.qmin > re_ctx->curr_token.qmax) {
605 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_INVALID_QUANTIFIER_VALUES);
606 				DUK_WO_NORETURN(return;);
607 			}
608 			if (atom_char_length >= 0) {
609 				/*
610 				 *  Simple atom
611 				 *
612 				 *  If atom_char_length is zero, we'll have unbounded execution time for e.g.
613 				 *  /()*x/.exec('x').  We can't just skip the match because it might have some
614 				 *  side effects (for instance, if we allowed captures in simple atoms, the
615 				 *  capture needs to happen).  The simple solution below is to force the
616 				 *  quantifier to match at most once, since the additional matches have no effect.
617 				 *
618 				 *  With a simple atom there can be no capture groups, so no captures need
619 				 *  to be reset.
620 				 */
621 				duk_int32_t atom_code_length;
622 				duk_uint32_t offset;
623 				duk_uint32_t qmin, qmax;
624 
625 				qmin = re_ctx->curr_token.qmin;
626 				qmax = re_ctx->curr_token.qmax;
627 				if (atom_char_length == 0) {
628 					/* qmin and qmax will be 0 or 1 */
629 					if (qmin > 1) {
630 						qmin = 1;
631 					}
632 					if (qmax > 1) {
633 						qmax = 1;
634 					}
635 				}
636 
637 				duk__append_reop(re_ctx, DUK_REOP_MATCH);   /* complete 'sub atom' */
638 				atom_code_length = (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (duk_size_t) atom_start_offset);
639 
640 				offset = (duk_uint32_t) atom_start_offset;
641 				if (re_ctx->curr_token.greedy) {
642 					offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQGREEDY);
643 					offset += duk__insert_u32(re_ctx, offset, qmin);
644 					offset += duk__insert_u32(re_ctx, offset, qmax);
645 					offset += duk__insert_u32(re_ctx, offset, (duk_uint32_t) atom_char_length);
646 					offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
647 				} else {
648 					offset += duk__insert_u32(re_ctx, offset, DUK_REOP_SQMINIMAL);
649 					offset += duk__insert_u32(re_ctx, offset, qmin);
650 					offset += duk__insert_u32(re_ctx, offset, qmax);
651 					offset += duk__insert_jump_offset(re_ctx, offset, atom_code_length);
652 				}
653 				DUK_UNREF(offset);  /* silence scan-build warning */
654 			} else {
655 				/*
656 				 *  Complex atom
657 				 *
658 				 *  The original code is used as a template, and removed at the end
659 				 *  (this differs from the handling of simple quantifiers).
660 				 *
661 				 *  NOTE: there is no current solution for empty atoms in complex
662 				 *  quantifiers.  This would need some sort of a 'progress' instruction.
663 				 *
664 				 *  XXX: impose limit on maximum result size, i.e. atom_code_len * atom_copies?
665 				 */
666 				duk_int32_t atom_code_length;
667 				duk_uint32_t atom_copies;
668 				duk_uint32_t tmp_qmin, tmp_qmax;
669 
670 				/* pre-check how many atom copies we're willing to make (atom_copies not needed below) */
671 				atom_copies = (re_ctx->curr_token.qmax == DUK_RE_QUANTIFIER_INFINITE) ?
672 				              re_ctx->curr_token.qmin : re_ctx->curr_token.qmax;
673 				if (atom_copies > DUK_RE_MAX_ATOM_COPIES) {
674 					DUK_ERROR_RANGE(re_ctx->thr, DUK_STR_QUANTIFIER_TOO_MANY_COPIES);
675 					DUK_WO_NORETURN(return;);
676 				}
677 
678 				/* wipe the capture range made by the atom (if any) */
679 				DUK_ASSERT(atom_start_captures <= re_ctx->captures);
680 				if (atom_start_captures != re_ctx->captures) {
681 					DUK_ASSERT(atom_start_captures < re_ctx->captures);
682 					DUK_DDD(DUK_DDDPRINT("must wipe ]atom_start_captures,re_ctx->captures]: ]%ld,%ld]",
683 					                     (long) atom_start_captures, (long) re_ctx->captures));
684 
685 					/* insert (DUK_REOP_WIPERANGE, start, count) in reverse order so the order ends up right */
686 					duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (re_ctx->captures - atom_start_captures) * 2U);
687 					duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, (atom_start_captures + 1) * 2);
688 					duk__insert_u32(re_ctx, (duk_uint32_t) atom_start_offset, DUK_REOP_WIPERANGE);
689 				} else {
690 					DUK_DDD(DUK_DDDPRINT("no need to wipe captures: atom_start_captures == re_ctx->captures == %ld",
691 					                     (long) atom_start_captures));
692 				}
693 
694 				atom_code_length = (duk_int32_t) DUK__RE_BUFLEN(re_ctx) - atom_start_offset;
695 
696 				/* insert the required matches (qmin) by copying the atom */
697 				tmp_qmin = re_ctx->curr_token.qmin;
698 				tmp_qmax = re_ctx->curr_token.qmax;
699 				while (tmp_qmin > 0) {
700 					duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
701 					tmp_qmin--;
702 					if (tmp_qmax != DUK_RE_QUANTIFIER_INFINITE) {
703 						tmp_qmax--;
704 					}
705 				}
706 				DUK_ASSERT(tmp_qmin == 0);
707 
708 				/* insert code for matching the remainder - infinite or finite */
709 				if (tmp_qmax == DUK_RE_QUANTIFIER_INFINITE) {
710 					/* reuse last emitted atom for remaining 'infinite' quantifier */
711 
712 					if (re_ctx->curr_token.qmin == 0) {
713 						/* Special case: original qmin was zero so there is nothing
714 						 * to repeat.  Emit an atom copy but jump over it here.
715 						 */
716 						duk__append_reop(re_ctx, DUK_REOP_JUMP);
717 						duk__append_jump_offset(re_ctx, atom_code_length);
718 						duk__append_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
719 					}
720 					if (re_ctx->curr_token.greedy) {
721 						duk__append_reop(re_ctx, DUK_REOP_SPLIT2);   /* prefer jump */
722 					} else {
723 						duk__append_reop(re_ctx, DUK_REOP_SPLIT1);   /* prefer direct */
724 					}
725 					duk__append_jump_offset(re_ctx, -atom_code_length - 1);  /* -1 for opcode */
726 				} else {
727 					/*
728 					 *  The remaining matches are emitted as sequence of SPLITs and atom
729 					 *  copies; the SPLITs skip the remaining copies and match the sequel.
730 					 *  This sequence needs to be emitted starting from the last copy
731 					 *  because the SPLITs are variable length due to the variable length
732 					 *  skip offset.  This causes a lot of memory copying now.
733 					 *
734 					 *  Example structure (greedy, match maximum # atoms):
735 					 *
736 					 *      SPLIT1 LSEQ
737 					 *      (atom)
738 					 *      SPLIT1 LSEQ    ; <- the byte length of this instruction is needed
739 					 *      (atom)         ; to encode the above SPLIT1 correctly
740 					 *      ...
741 					 *   LSEQ:
742 					 */
743 					duk_uint32_t offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
744 					while (tmp_qmax > 0) {
745 						duk__insert_slice(re_ctx, offset, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
746 						if (re_ctx->curr_token.greedy) {
747 							duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT1);   /* prefer direct */
748 						} else {
749 							duk__insert_u32(re_ctx, offset, DUK_REOP_SPLIT2);   /* prefer jump */
750 						}
751 						duk__insert_jump_offset(re_ctx,
752 						                        offset + 1,   /* +1 for opcode */
753 						                        (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
754 						tmp_qmax--;
755 					}
756 				}
757 
758 				/* remove the original 'template' atom */
759 				duk__remove_slice(re_ctx, (duk_uint32_t) atom_start_offset, (duk_uint32_t) atom_code_length);
760 			}
761 
762 			/* 'taint' result as complex */
763 			res_charlen = -1;
764 			break;
765 		}
766 		case DUK_RETOK_ASSERT_START: {
767 			duk__append_reop(re_ctx, DUK_REOP_ASSERT_START);
768 			break;
769 		}
770 		case DUK_RETOK_ASSERT_END: {
771 			duk__append_reop(re_ctx, DUK_REOP_ASSERT_END);
772 			break;
773 		}
774 		case DUK_RETOK_ASSERT_WORD_BOUNDARY: {
775 			duk__append_reop(re_ctx, DUK_REOP_ASSERT_WORD_BOUNDARY);
776 			break;
777 		}
778 		case DUK_RETOK_ASSERT_NOT_WORD_BOUNDARY: {
779 			duk__append_reop(re_ctx, DUK_REOP_ASSERT_NOT_WORD_BOUNDARY);
780 			break;
781 		}
782 		case DUK_RETOK_ASSERT_START_POS_LOOKAHEAD:
783 		case DUK_RETOK_ASSERT_START_NEG_LOOKAHEAD: {
784 			duk_uint32_t offset;
785 			duk_uint32_t opcode = (re_ctx->curr_token.t == DUK_RETOK_ASSERT_START_POS_LOOKAHEAD) ?
786 			                      DUK_REOP_LOOKPOS : DUK_REOP_LOOKNEG;
787 
788 			offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);
789 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);
790 			duk__append_reop(re_ctx, DUK_REOP_MATCH);
791 
792 			(void) duk__insert_u32(re_ctx, offset, opcode);
793 			(void) duk__insert_jump_offset(re_ctx,
794 			                               offset + 1,   /* +1 for opcode */
795 			                               (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - (offset + 1)));
796 
797 			/* 'taint' result as complex -- this is conservative,
798 			 * as lookaheads do not backtrack.
799 			 */
800 			res_charlen = -1;
801 			break;
802 		}
803 		case DUK_RETOK_ATOM_PERIOD: {
804 			new_atom_char_length = 1;
805 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
806 			duk__append_reop(re_ctx, DUK_REOP_PERIOD);
807 			break;
808 		}
809 		case DUK_RETOK_ATOM_CHAR: {
810 			/* Note: successive characters could be joined into string matches
811 			 * but this is not trivial (consider e.g. '/xyz+/); see docs for
812 			 * more discussion.
813 			 *
814 			 * No support for \u{H+} yet.  While only BMP Unicode escapes are
815 			 * supported for RegExps at present, 'ch' may still be a non-BMP
816 			 * codepoint if it is decoded straight from source text UTF-8.
817 			 * There's no non-BMP support yet so this is handled simply by
818 			 * matching the non-BMP character (which is custom behavior).
819 			 */
820 			duk_uint32_t ch;
821 
822 			new_atom_char_length = 1;
823 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
824 			duk__append_reop(re_ctx, DUK_REOP_CHAR);
825 			ch = re_ctx->curr_token.num;
826 			if (re_ctx->re_flags & DUK_RE_FLAG_IGNORE_CASE) {
827 				ch = (duk_uint32_t) duk_unicode_re_canonicalize_char(re_ctx->thr, (duk_codepoint_t) ch);
828 			}
829 			duk__append_u32(re_ctx, ch);
830 			break;
831 		}
832 		case DUK_RETOK_ATOM_DIGIT:
833 		case DUK_RETOK_ATOM_NOT_DIGIT:
834 		case DUK_RETOK_ATOM_WHITE:
835 		case DUK_RETOK_ATOM_NOT_WHITE:
836 		case DUK_RETOK_ATOM_WORD_CHAR:
837 		case DUK_RETOK_ATOM_NOT_WORD_CHAR: {
838 			duk_small_uint_t re_op;
839 			duk_small_uint_t idx;
840 
841 			new_atom_char_length = 1;
842 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
843 
844 			DUK_ASSERT((DUK_RETOK_ATOM_DIGIT & 0x01) != 0);
845 			DUK_ASSERT((DUK_RETOK_ATOM_WHITE & 0x01) != 0);
846 			DUK_ASSERT((DUK_RETOK_ATOM_WORD_CHAR & 0x01) != 0);
847 			DUK_ASSERT((DUK_RETOK_ATOM_NOT_DIGIT & 0x01) == 0);
848 			DUK_ASSERT((DUK_RETOK_ATOM_NOT_WHITE & 0x01) == 0);
849 			DUK_ASSERT((DUK_RETOK_ATOM_NOT_WORD_CHAR & 0x01) == 0);
850 			re_op = (re_ctx->curr_token.t & 0x01) ? DUK_REOP_RANGES : DUK_REOP_INVRANGES;
851 
852 			DUK_ASSERT(DUK_RETOK_ATOM_WHITE == DUK_RETOK_ATOM_DIGIT + 2);
853 			DUK_ASSERT(DUK_RETOK_ATOM_WORD_CHAR == DUK_RETOK_ATOM_DIGIT + 4);
854 			idx = (duk_small_uint_t) ((re_ctx->curr_token.t - DUK_RETOK_ATOM_DIGIT) >> 1U);
855 			DUK_ASSERT(idx <= 2U);  /* Assume continuous token numbers; also checks negative underflow. */
856 
857 			duk__append_range_atom_matcher(re_ctx, re_op, duk__re_range_lookup1[idx], duk__re_range_lookup2[idx]);
858 			break;
859 		}
860 		case DUK_RETOK_ATOM_BACKREFERENCE: {
861 			duk_uint32_t backref = (duk_uint32_t) re_ctx->curr_token.num;
862 			if (backref > re_ctx->highest_backref) {
863 				re_ctx->highest_backref = backref;
864 			}
865 			new_atom_char_length = -1;   /* mark as complex */
866 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
867 			duk__append_reop(re_ctx, DUK_REOP_BACKREFERENCE);
868 			duk__append_u32(re_ctx, backref);
869 			break;
870 		}
871 		case DUK_RETOK_ATOM_START_CAPTURE_GROUP: {
872 			duk_uint32_t cap;
873 
874 			new_atom_char_length = -1;   /* mark as complex (capture handling) */
875 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
876 			cap = ++re_ctx->captures;
877 			duk__append_reop(re_ctx, DUK_REOP_SAVE);
878 			duk__append_u32(re_ctx, cap * 2);
879 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);  /* retval (sub-atom char length) unused, tainted as complex above */
880 			duk__append_reop(re_ctx, DUK_REOP_SAVE);
881 			duk__append_u32(re_ctx, cap * 2 + 1);
882 			break;
883 		}
884 		case DUK_RETOK_ATOM_START_NONCAPTURE_GROUP: {
885 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
886 			duk__parse_disjunction(re_ctx, 0, &tmp_disj);
887 			new_atom_char_length = tmp_disj.charlen;
888 			break;
889 		}
890 		case DUK_RETOK_ATOM_START_CHARCLASS:
891 		case DUK_RETOK_ATOM_START_CHARCLASS_INVERTED: {
892 			/*
893 			 *  Range parsing is done with a special lexer function which calls
894 			 *  us for every range parsed.  This is different from how rest of
895 			 *  the parsing works, but avoids a heavy, arbitrary size intermediate
896 			 *  value type to hold the ranges.
897 			 *
898 			 *  Another complication is the handling of character ranges when
899 			 *  case insensitive matching is used (see docs for discussion).
900 			 *  The range handler callback given to the lexer takes care of this
901 			 *  as well.
902 			 *
903 			 *  Note that duplicate ranges are not eliminated when parsing character
904 			 *  classes, so that canonicalization of
905 			 *
906 			 *    [0-9a-fA-Fx-{]
907 			 *
908 			 *  creates the result (note the duplicate ranges):
909 			 *
910 			 *    [0-9A-FA-FX-Z{-{]
911 			 *
912 			 *  where [x-{] is split as a result of canonicalization.  The duplicate
913 			 *  ranges are not a semantics issue: they work correctly.
914 			 */
915 
916 			duk_uint32_t offset;
917 
918 			DUK_DD(DUK_DDPRINT("character class"));
919 
920 			/* insert ranges instruction, range count patched in later */
921 			new_atom_char_length = 1;
922 			new_atom_start_offset = (duk_int32_t) DUK__RE_BUFLEN(re_ctx);
923 			duk__append_reop(re_ctx,
924 			                 (re_ctx->curr_token.t == DUK_RETOK_ATOM_START_CHARCLASS) ?
925 			                 DUK_REOP_RANGES : DUK_REOP_INVRANGES);
926 			offset = (duk_uint32_t) DUK__RE_BUFLEN(re_ctx);    /* patch in range count later */
927 
928 			/* parse ranges until character class ends */
929 			re_ctx->nranges = 0;    /* note: ctx-wide temporary */
930 			duk_lexer_parse_re_ranges(&re_ctx->lex, duk__regexp_generate_ranges, (void *) re_ctx);
931 
932 			/* insert range count */
933 			duk__insert_u32(re_ctx, offset, re_ctx->nranges);
934 			break;
935 		}
936 		case DUK_RETOK_ATOM_END_GROUP: {
937 			if (expect_eof) {
938 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_CLOSING_PAREN);
939 				DUK_WO_NORETURN(return;);
940 			}
941 			goto done;
942 		}
943 		case DUK_RETOK_EOF: {
944 			if (!expect_eof) {
945 				DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_END_OF_PATTERN);
946 				DUK_WO_NORETURN(return;);
947 			}
948 			goto done;
949 		}
950 		default: {
951 			DUK_ERROR_SYNTAX(re_ctx->thr, DUK_STR_UNEXPECTED_REGEXP_TOKEN);
952 			DUK_WO_NORETURN(return;);
953 		}
954 		}
955 
956 		/* a complex (new) atom taints the result */
957 		if (new_atom_start_offset >= 0) {
958 			if (new_atom_char_length < 0) {
959 				res_charlen = -1;
960 			} else if (res_charlen >= 0) {
961 				/* only advance if not tainted */
962 				res_charlen += new_atom_char_length;
963 			}
964 		}
965 
966 		/* record previous atom info in case next token is a quantifier */
967 		atom_start_offset = new_atom_start_offset;
968 		atom_char_length = new_atom_char_length;
969 		atom_start_captures = new_atom_start_captures;
970 	}
971 
972  done:
973 
974 	/* finish up pending jump and split for last alternative */
975 	if (unpatched_disjunction_jump >= 0) {
976 		duk_uint32_t offset;
977 
978 		DUK_ASSERT(unpatched_disjunction_split >= 0);
979 		offset = (duk_uint32_t) unpatched_disjunction_jump;
980 		offset += duk__insert_jump_offset(re_ctx,
981 		                                  offset,
982 		                                  (duk_int32_t) (DUK__RE_BUFLEN(re_ctx) - offset));
983 		/* offset is now target of the pending split (right after jump) */
984 		duk__insert_jump_offset(re_ctx,
985 		                        (duk_uint32_t) unpatched_disjunction_split,
986 		                        (duk_int32_t) offset - unpatched_disjunction_split);
987 	}
988 
989 #if 0
990 	out_atom_info->end_captures = re_ctx->captures;
991 #endif
992 	out_atom_info->charlen = res_charlen;
993 	DUK_DDD(DUK_DDDPRINT("parse disjunction finished: charlen=%ld",
994 	                     (long) out_atom_info->charlen));
995 
996 	re_ctx->recursion_depth--;
997 }
998 
999 /*
1000  *  Flags parsing (see E5 Section 15.10.4.1).
1001  */
1002 
duk__parse_regexp_flags(duk_hthread * thr,duk_hstring * h)1003 DUK_LOCAL duk_uint32_t duk__parse_regexp_flags(duk_hthread *thr, duk_hstring *h) {
1004 	const duk_uint8_t *p;
1005 	const duk_uint8_t *p_end;
1006 	duk_uint32_t flags = 0;
1007 
1008 	p = DUK_HSTRING_GET_DATA(h);
1009 	p_end = p + DUK_HSTRING_GET_BYTELEN(h);
1010 
1011 	/* Note: can be safely scanned as bytes (undecoded) */
1012 
1013 	while (p < p_end) {
1014 		duk_uint8_t c = *p++;
1015 		switch (c) {
1016 		case (duk_uint8_t) 'g': {
1017 			if (flags & DUK_RE_FLAG_GLOBAL) {
1018 				goto flags_error;
1019 			}
1020 			flags |= DUK_RE_FLAG_GLOBAL;
1021 			break;
1022 		}
1023 		case (duk_uint8_t) 'i': {
1024 			if (flags & DUK_RE_FLAG_IGNORE_CASE) {
1025 				goto flags_error;
1026 			}
1027 			flags |= DUK_RE_FLAG_IGNORE_CASE;
1028 			break;
1029 		}
1030 		case (duk_uint8_t) 'm': {
1031 			if (flags & DUK_RE_FLAG_MULTILINE) {
1032 				goto flags_error;
1033 			}
1034 			flags |= DUK_RE_FLAG_MULTILINE;
1035 			break;
1036 		}
1037 		default: {
1038 			goto flags_error;
1039 		}
1040 		}
1041 	}
1042 
1043 	return flags;
1044 
1045  flags_error:
1046 	DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_REGEXP_FLAGS);
1047 	DUK_WO_NORETURN(return 0U;);
1048 }
1049 
1050 /*
1051  *  Create escaped RegExp source (E5 Section 15.10.3).
1052  *
1053  *  The current approach is to special case the empty RegExp
1054  *  ('' -> '(?:)') and otherwise replace unescaped '/' characters
1055  *  with '\/' regardless of where they occur in the regexp.
1056  *
1057  *  Note that normalization does not seem to be necessary for
1058  *  RegExp literals (e.g. '/foo/') because to be acceptable as
1059  *  a RegExp literal, the text between forward slashes must
1060  *  already match the escaping requirements (e.g. must not contain
1061  *  unescaped forward slashes or be empty).  Escaping IS needed
1062  *  for expressions like 'new Regexp("...", "")' however.
1063  *  Currently, we re-escape in either case.
1064  *
1065  *  Also note that we process the source here in UTF-8 encoded
1066  *  form.  This is correct, because any non-ASCII characters are
1067  *  passed through without change.
1068  */
1069 
duk__create_escaped_source(duk_hthread * thr,int idx_pattern)1070 DUK_LOCAL void duk__create_escaped_source(duk_hthread *thr, int idx_pattern) {
1071 	duk_hstring *h;
1072 	const duk_uint8_t *p;
1073 	duk_bufwriter_ctx bw_alloc;
1074 	duk_bufwriter_ctx *bw;
1075 	duk_uint8_t *q;
1076 	duk_size_t i, n;
1077 	duk_uint_fast8_t c_prev, c;
1078 
1079 	h = duk_known_hstring(thr, idx_pattern);
1080 	p = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
1081 	n = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
1082 
1083 	if (n == 0) {
1084 		duk_push_literal(thr, "(?:)");
1085 		return;
1086 	}
1087 
1088 	bw = &bw_alloc;
1089 	DUK_BW_INIT_PUSHBUF(thr, bw, n);
1090 	q = DUK_BW_GET_PTR(thr, bw);
1091 
1092 	c_prev = (duk_uint_fast8_t) 0;
1093 
1094 	for (i = 0; i < n; i++) {
1095 		c = p[i];
1096 
1097 		q = DUK_BW_ENSURE_RAW(thr, bw, 2, q);
1098 
1099 		if (c == (duk_uint_fast8_t) '/' && c_prev != (duk_uint_fast8_t) '\\') {
1100 			/* Unescaped '/' ANYWHERE in the regexp (in disjunction,
1101 			 * inside a character class, ...) => same escape works.
1102 			 */
1103 			*q++ = DUK_ASC_BACKSLASH;
1104 		}
1105 		*q++ = (duk_uint8_t) c;
1106 
1107 		c_prev = c;
1108 	}
1109 
1110 	DUK_BW_SETPTR_AND_COMPACT(thr, bw, q);
1111 	(void) duk_buffer_to_string(thr, -1);  /* Safe if input is safe. */
1112 
1113 	/* [ ... escaped_source ] */
1114 }
1115 
1116 /*
1117  *  Exposed regexp compilation primitive.
1118  *
1119  *  Sets up a regexp compilation context, and calls duk__parse_disjunction() to do the
1120  *  actual parsing.  Handles generation of the compiled regexp header and the
1121  *  "boilerplate" capture of the matching substring (save 0 and 1).  Also does some
1122  *  global level regexp checks after recursive compilation has finished.
1123  *
1124  *  An escaped version of the regexp source, suitable for use as a RegExp instance
1125  *  'source' property (see E5 Section 15.10.3), is also left on the stack.
1126  *
1127  *  Input stack:  [ pattern flags ]
1128  *  Output stack: [ bytecode escaped_source ]  (both as strings)
1129  */
1130 
duk_regexp_compile(duk_hthread * thr)1131 DUK_INTERNAL void duk_regexp_compile(duk_hthread *thr) {
1132 	duk_re_compiler_ctx re_ctx;
1133 	duk_lexer_point lex_point;
1134 	duk_hstring *h_pattern;
1135 	duk_hstring *h_flags;
1136 	duk__re_disjunction_info ign_disj;
1137 
1138 	DUK_ASSERT(thr != NULL);
1139 
1140 	/*
1141 	 *  Args validation
1142 	 */
1143 
1144 	/* TypeError if fails */
1145 	h_pattern = duk_require_hstring_notsymbol(thr, -2);
1146 	h_flags = duk_require_hstring_notsymbol(thr, -1);
1147 
1148 	/*
1149 	 *  Create normalized 'source' property (E5 Section 15.10.3).
1150 	 */
1151 
1152 	/* [ ... pattern flags ] */
1153 
1154 	duk__create_escaped_source(thr, -2);
1155 
1156 	/* [ ... pattern flags escaped_source ] */
1157 
1158 	/*
1159 	 *  Init compilation context
1160 	 */
1161 
1162 	/* [ ... pattern flags escaped_source buffer ] */
1163 
1164 	duk_memzero(&re_ctx, sizeof(re_ctx));
1165 	DUK_LEXER_INITCTX(&re_ctx.lex);  /* duplicate zeroing, expect for (possible) NULL inits */
1166 	re_ctx.thr = thr;
1167 	re_ctx.lex.thr = thr;
1168 	re_ctx.lex.input = DUK_HSTRING_GET_DATA(h_pattern);
1169 	re_ctx.lex.input_length = DUK_HSTRING_GET_BYTELEN(h_pattern);
1170 	re_ctx.lex.token_limit = DUK_RE_COMPILE_TOKEN_LIMIT;
1171 	re_ctx.recursion_limit = DUK_USE_REGEXP_COMPILER_RECLIMIT;
1172 	re_ctx.re_flags = duk__parse_regexp_flags(thr, h_flags);
1173 
1174 	DUK_BW_INIT_PUSHBUF(thr, &re_ctx.bw, DUK__RE_INITIAL_BUFSIZE);
1175 
1176 	DUK_DD(DUK_DDPRINT("regexp compiler ctx initialized, flags=0x%08lx, recursion_limit=%ld",
1177 	                   (unsigned long) re_ctx.re_flags, (long) re_ctx.recursion_limit));
1178 
1179 	/*
1180 	 *  Init lexer
1181 	 */
1182 
1183 	lex_point.offset = 0;  /* expensive init, just want to fill window */
1184 	lex_point.line = 1;
1185 	DUK_LEXER_SETPOINT(&re_ctx.lex, &lex_point);
1186 
1187 	/*
1188 	 *  Compilation
1189 	 */
1190 
1191 	DUK_DD(DUK_DDPRINT("starting regexp compilation"));
1192 
1193 	duk__append_reop(&re_ctx, DUK_REOP_SAVE);
1194 	duk__append_7bit(&re_ctx, 0);
1195 	duk__parse_disjunction(&re_ctx, 1 /*expect_eof*/, &ign_disj);
1196 	duk__append_reop(&re_ctx, DUK_REOP_SAVE);
1197 	duk__append_7bit(&re_ctx, 1);
1198 	duk__append_reop(&re_ctx, DUK_REOP_MATCH);
1199 
1200 	/*
1201 	 *  Check for invalid backreferences; note that it is NOT an error
1202 	 *  to back-reference a capture group which has not yet been introduced
1203 	 *  in the pattern (as in /\1(foo)/); in fact, the backreference will
1204 	 *  always match!  It IS an error to back-reference a capture group
1205 	 *  which will never be introduced in the pattern.  Thus, we can check
1206 	 *  for such references only after parsing is complete.
1207 	 */
1208 
1209 	if (re_ctx.highest_backref > re_ctx.captures) {
1210 		DUK_ERROR_SYNTAX(thr, DUK_STR_INVALID_BACKREFS);
1211 		DUK_WO_NORETURN(return;);
1212 	}
1213 
1214 	/*
1215 	 *  Emit compiled regexp header: flags, ncaptures
1216 	 *  (insertion order inverted on purpose)
1217 	 */
1218 
1219 	duk__insert_u32(&re_ctx, 0, (re_ctx.captures + 1) * 2);
1220 	duk__insert_u32(&re_ctx, 0, re_ctx.re_flags);
1221 
1222 	/* [ ... pattern flags escaped_source buffer ] */
1223 
1224 	DUK_BW_COMPACT(thr, &re_ctx.bw);
1225 	(void) duk_buffer_to_string(thr, -1);  /* Safe because flags is at most 7 bit. */
1226 
1227 	/* [ ... pattern flags escaped_source bytecode ] */
1228 
1229 	/*
1230 	 *  Finalize stack
1231 	 */
1232 
1233 	duk_remove(thr, -4);     /* -> [ ... flags escaped_source bytecode ] */
1234 	duk_remove(thr, -3);     /* -> [ ... escaped_source bytecode ] */
1235 
1236 	DUK_DD(DUK_DDPRINT("regexp compilation successful, bytecode: %!T, escaped source: %!T",
1237 	                   (duk_tval *) duk_get_tval(thr, -1), (duk_tval *) duk_get_tval(thr, -2)));
1238 }
1239 
1240 /*
1241  *  Create a RegExp instance (E5 Section 15.10.7).
1242  *
1243  *  Note: the output stack left by duk_regexp_compile() is directly compatible
1244  *  with the input here.
1245  *
1246  *  Input stack:  [ escaped_source bytecode ]  (both as strings)
1247  *  Output stack: [ RegExp ]
1248  */
1249 
duk_regexp_create_instance(duk_hthread * thr)1250 DUK_INTERNAL void duk_regexp_create_instance(duk_hthread *thr) {
1251 	duk_hobject *h;
1252 
1253 	/* [ ... escaped_source bytecode ] */
1254 
1255 	duk_push_object(thr);
1256 	h = duk_known_hobject(thr, -1);
1257 	duk_insert(thr, -3);
1258 
1259 	/* [ ... regexp_object escaped_source bytecode ] */
1260 
1261 	DUK_HOBJECT_SET_CLASS_NUMBER(h, DUK_HOBJECT_CLASS_REGEXP);
1262 	DUK_HOBJECT_SET_PROTOTYPE_UPDREF(thr, h, thr->builtins[DUK_BIDX_REGEXP_PROTOTYPE]);
1263 
1264 	duk_xdef_prop_stridx_short(thr, -3, DUK_STRIDX_INT_BYTECODE, DUK_PROPDESC_FLAGS_NONE);
1265 
1266 	/* [ ... regexp_object escaped_source ] */
1267 
1268 	/* In ES2015 .source, and the .global, .multiline, etc flags are
1269 	 * inherited getters.  Store the escaped source as an internal
1270 	 * property for the getter.
1271 	 */
1272 
1273 	duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_SOURCE, DUK_PROPDESC_FLAGS_NONE);
1274 
1275 	/* [ ... regexp_object ] */
1276 
1277 	duk_push_int(thr, 0);
1278 	duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_LAST_INDEX, DUK_PROPDESC_FLAGS_W);
1279 
1280 	/* [ ... regexp_object ] */
1281 }
1282 
1283 #else  /* DUK_USE_REGEXP_SUPPORT */
1284 
1285 /* regexp support disabled */
1286 
1287 #endif  /* DUK_USE_REGEXP_SUPPORT */
1288