1 /*
2  *  String built-ins
3  *
4  *  Most String built-ins must only accept strings (or String objects).
5  *  Symbols, represented internally as strings, must be generally rejected.
6  *  The duk_push_this_coercible_to_string() helper does this automatically.
7  */
8 
9 /* XXX: There are several limitations in the current implementation for
10  * strings with >= 0x80000000UL characters.  In some cases one would need
11  * to be able to represent the range [-0xffffffff,0xffffffff] and so on.
12  * Generally character and byte length are assumed to fit into signed 32
13  * bits (< 0x80000000UL).  Places with issues are not marked explicitly
14  * below in all cases, look for signed type usage (duk_int_t etc) for
15  * offsets/lengths.
16  */
17 
18 #include "duk_internal.h"
19 
20 #if defined(DUK_USE_STRING_BUILTIN)
21 
22 /*
23  *  Helpers
24  */
25 
duk__str_tostring_notregexp(duk_hthread * thr,duk_idx_t idx)26 DUK_LOCAL duk_hstring *duk__str_tostring_notregexp(duk_hthread *thr, duk_idx_t idx) {
27 	duk_hstring *h;
28 
29 	if (duk_get_class_number(thr, idx) == DUK_HOBJECT_CLASS_REGEXP) {
30 		DUK_ERROR_TYPE_INVALID_ARGS(thr);
31 		DUK_WO_NORETURN(return NULL;);
32 	}
33 	h = duk_to_hstring(thr, idx);
34 	DUK_ASSERT(h != NULL);
35 
36 	return h;
37 }
38 
duk__str_search_shared(duk_hthread * thr,duk_hstring * h_this,duk_hstring * h_search,duk_int_t start_cpos,duk_bool_t backwards)39 DUK_LOCAL duk_int_t duk__str_search_shared(duk_hthread *thr, duk_hstring *h_this, duk_hstring *h_search, duk_int_t start_cpos, duk_bool_t backwards) {
40 	duk_int_t cpos;
41 	duk_int_t bpos;
42 	const duk_uint8_t *p_start, *p_end, *p;
43 	const duk_uint8_t *q_start;
44 	duk_int_t q_blen;
45 	duk_uint8_t firstbyte;
46 	duk_uint8_t t;
47 
48 	cpos = start_cpos;
49 
50 	/* Empty searchstring always matches; cpos must be clamped here.
51 	 * (If q_blen were < 0 due to clamped coercion, it would also be
52 	 * caught here.)
53 	 */
54 	q_start = DUK_HSTRING_GET_DATA(h_search);
55 	q_blen = (duk_int_t) DUK_HSTRING_GET_BYTELEN(h_search);
56 	if (q_blen <= 0) {
57 		return cpos;
58 	}
59 	DUK_ASSERT(q_blen > 0);
60 
61 	bpos = (duk_int_t) duk_heap_strcache_offset_char2byte(thr, h_this, (duk_uint32_t) cpos);
62 
63 	p_start = DUK_HSTRING_GET_DATA(h_this);
64 	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_this);
65 	p = p_start + bpos;
66 
67 	/* This loop is optimized for size.  For speed, there should be
68 	 * two separate loops, and we should ensure that memcmp() can be
69 	 * used without an extra "will searchstring fit" check.  Doing
70 	 * the preconditioning for 'p' and 'p_end' is easy but cpos
71 	 * must be updated if 'p' is wound back (backward scanning).
72 	 */
73 
74 	firstbyte = q_start[0];  /* leading byte of match string */
75 	while (p <= p_end && p >= p_start) {
76 		t = *p;
77 
78 		/* For ECMAScript strings, this check can only match for
79 		 * initial UTF-8 bytes (not continuation bytes).  For other
80 		 * strings all bets are off.
81 		 */
82 
83 		if ((t == firstbyte) && ((duk_size_t) (p_end - p) >= (duk_size_t) q_blen)) {
84 			DUK_ASSERT(q_blen > 0);
85 			if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
86 				return cpos;
87 			}
88 		}
89 
90 		/* track cpos while scanning */
91 		if (backwards) {
92 			/* when going backwards, we decrement cpos 'early';
93 			 * 'p' may point to a continuation byte of the char
94 			 * at offset 'cpos', but that's OK because we'll
95 			 * backtrack all the way to the initial byte.
96 			 */
97 			if ((t & 0xc0) != 0x80) {
98 				cpos--;
99 			}
100 			p--;
101 		} else {
102 			if ((t & 0xc0) != 0x80) {
103 				cpos++;
104 			}
105 			p++;
106 		}
107 	}
108 
109 	/* Not found.  Empty string case is handled specially above. */
110 	return -1;
111 }
112 
113 /*
114  *  Constructor
115  */
116 
duk_bi_string_constructor(duk_hthread * thr)117 DUK_INTERNAL duk_ret_t duk_bi_string_constructor(duk_hthread *thr) {
118 	duk_hstring *h;
119 	duk_uint_t flags;
120 
121 	/* String constructor needs to distinguish between an argument not given at all
122 	 * vs. given as 'undefined'.  We're a vararg function to handle this properly.
123 	 */
124 
125 	/* XXX: copy current activation flags to thr, including current magic,
126 	 * is_constructor_call etc.  This takes a few bytes in duk_hthread but
127 	 * makes call sites smaller (there are >30 is_constructor_call and get
128 	 * current magic call sites.
129 	 */
130 
131 	if (duk_get_top(thr) == 0) {
132 		duk_push_hstring_empty(thr);
133 	} else {
134 		h = duk_to_hstring_acceptsymbol(thr, 0);
135 		if (DUK_UNLIKELY(DUK_HSTRING_HAS_SYMBOL(h) && !duk_is_constructor_call(thr))) {
136 			duk_push_symbol_descriptive_string(thr, h);
137 			duk_replace(thr, 0);
138 		}
139 	}
140 	duk_to_string(thr, 0);  /* catches symbol argument for constructor call */
141 	DUK_ASSERT(duk_is_string(thr, 0));
142 	duk_set_top(thr, 1);  /* Top may be 1 or larger. */
143 
144 	if (duk_is_constructor_call(thr)) {
145 		/* String object internal value is immutable */
146 		flags = DUK_HOBJECT_FLAG_EXTENSIBLE |
147 		        DUK_HOBJECT_FLAG_FASTREFS |
148 		        DUK_HOBJECT_FLAG_EXOTIC_STRINGOBJ |
149 		        DUK_HOBJECT_CLASS_AS_FLAGS(DUK_HOBJECT_CLASS_STRING);
150 		duk_push_object_helper(thr, flags, DUK_BIDX_STRING_PROTOTYPE);
151 		duk_dup_0(thr);
152 		duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_VALUE, DUK_PROPDESC_FLAGS_NONE);
153 	}
154 	/* Note: unbalanced stack on purpose */
155 
156 	return 1;
157 }
158 
duk__construct_from_codepoints(duk_hthread * thr,duk_bool_t nonbmp)159 DUK_LOCAL duk_ret_t duk__construct_from_codepoints(duk_hthread *thr, duk_bool_t nonbmp) {
160 	duk_bufwriter_ctx bw_alloc;
161 	duk_bufwriter_ctx *bw;
162 	duk_idx_t i, n;
163 	duk_ucodepoint_t cp;
164 
165 	/* XXX: It would be nice to build the string directly but ToUint16()
166 	 * coercion is needed so a generic helper would not be very
167 	 * helpful (perhaps coerce the value stack first here and then
168 	 * build a string from a duk_tval number sequence in one go?).
169 	 */
170 
171 	n = duk_get_top(thr);
172 
173 	bw = &bw_alloc;
174 	DUK_BW_INIT_PUSHBUF(thr, bw, (duk_size_t) n);  /* initial estimate for ASCII only codepoints */
175 
176 	for (i = 0; i < n; i++) {
177 		/* XXX: could improve bufwriter handling to write multiple codepoints
178 		 * with one ensure call but the relative benefit would be quite small.
179 		 */
180 
181 		if (nonbmp) {
182 			/* ES2015 requires that (1) SameValue(cp, ToInteger(cp)) and
183 			 * (2) cp >= 0 and cp <= 0x10ffff.  This check does not
184 			 * implement the steps exactly but the outcome should be
185 			 * the same.
186 			 */
187 			duk_int32_t i32 = 0;
188 			if (!duk_is_whole_get_int32(duk_to_number(thr, i), &i32) ||
189 			    i32 < 0 || i32 > 0x10ffffL) {
190 				DUK_DCERROR_RANGE_INVALID_ARGS(thr);
191 			}
192 			DUK_ASSERT(i32 >= 0 && i32 <= 0x10ffffL);
193 			cp = (duk_ucodepoint_t) i32;
194 			DUK_BW_WRITE_ENSURE_CESU8(thr, bw, cp);
195 		} else {
196 #if defined(DUK_USE_NONSTD_STRING_FROMCHARCODE_32BIT)
197 			/* ToUint16() coercion is mandatory in the E5.1 specification, but
198 			 * this non-compliant behavior makes more sense because we support
199 			 * non-BMP codepoints.  Don't use CESU-8 because that'd create
200 			 * surrogate pairs.
201 			 */
202 			cp = (duk_ucodepoint_t) duk_to_uint32(thr, i);
203 			DUK_BW_WRITE_ENSURE_XUTF8(thr, bw, cp);
204 #else
205 			cp = (duk_ucodepoint_t) duk_to_uint16(thr, i);
206 			DUK_ASSERT(cp >= 0 && cp <= 0x10ffffL);
207 			DUK_BW_WRITE_ENSURE_CESU8(thr, bw, cp);
208 #endif
209 		}
210 	}
211 
212 	DUK_BW_COMPACT(thr, bw);
213 	(void) duk_buffer_to_string(thr, -1);  /* Safe, extended UTF-8 or CESU-8 encoded. */
214 	return 1;
215 }
216 
duk_bi_string_constructor_from_char_code(duk_hthread * thr)217 DUK_INTERNAL duk_ret_t duk_bi_string_constructor_from_char_code(duk_hthread *thr) {
218 	return duk__construct_from_codepoints(thr, 0 /*nonbmp*/);
219 }
220 
221 #if defined(DUK_USE_ES6)
duk_bi_string_constructor_from_code_point(duk_hthread * thr)222 DUK_INTERNAL duk_ret_t duk_bi_string_constructor_from_code_point(duk_hthread *thr) {
223 	return duk__construct_from_codepoints(thr, 1 /*nonbmp*/);
224 }
225 #endif
226 
227 /*
228  *  toString(), valueOf()
229  */
230 
duk_bi_string_prototype_to_string(duk_hthread * thr)231 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_to_string(duk_hthread *thr) {
232 	duk_tval *tv;
233 
234 	duk_push_this(thr);
235 	tv = duk_require_tval(thr, -1);
236 	DUK_ASSERT(tv != NULL);
237 
238 	if (DUK_TVAL_IS_STRING(tv)) {
239 		/* return as is */
240 	} else if (DUK_TVAL_IS_OBJECT(tv)) {
241 		duk_hobject *h = DUK_TVAL_GET_OBJECT(tv);
242 		DUK_ASSERT(h != NULL);
243 
244 		/* Must be a "string object", i.e. class "String" */
245 		if (DUK_HOBJECT_GET_CLASS_NUMBER(h) != DUK_HOBJECT_CLASS_STRING) {
246 			goto type_error;
247 		}
248 
249 		duk_xget_owndataprop_stridx_short(thr, -1, DUK_STRIDX_INT_VALUE);
250 		DUK_ASSERT(duk_is_string(thr, -1));
251 	} else {
252 		goto type_error;
253 	}
254 
255 	(void) duk_require_hstring_notsymbol(thr, -1);  /* Reject symbols (and wrapped symbols). */
256 	return 1;
257 
258  type_error:
259 	DUK_DCERROR_TYPE_INVALID_ARGS(thr);
260 }
261 
262 /*
263  *  Character and charcode access
264  */
265 
duk_bi_string_prototype_char_at(duk_hthread * thr)266 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_char_at(duk_hthread *thr) {
267 	duk_hstring *h;
268 	duk_int_t pos;
269 
270 	/* XXX: faster implementation */
271 
272 	h = duk_push_this_coercible_to_string(thr);
273 	DUK_ASSERT(h != NULL);
274 
275 	pos = duk_to_int(thr, 0);
276 
277 	if (sizeof(duk_size_t) >= sizeof(duk_uint_t)) {
278 		/* Cast to duk_size_t works in this case:
279 		 * - If pos < 0, (duk_size_t) pos will always be
280 		 *   >= max_charlen, and result will be the empty string
281 		 *   (see duk_substring()).
282 		 * - If pos >= 0, pos + 1 cannot wrap.
283 		 */
284 		DUK_ASSERT((duk_size_t) DUK_INT_MIN >= DUK_HSTRING_MAX_BYTELEN);
285 		DUK_ASSERT((duk_size_t) DUK_INT_MAX + 1U > (duk_size_t) DUK_INT_MAX);
286 		duk_substring(thr, -1, (duk_size_t) pos, (duk_size_t) pos + 1U);
287 	} else {
288 		/* If size_t is smaller than int, explicit bounds checks
289 		 * are needed because an int may wrap multiple times.
290 		 */
291 		if (DUK_UNLIKELY(pos < 0 || (duk_uint_t) pos >= (duk_uint_t) DUK_HSTRING_GET_CHARLEN(h))) {
292 			duk_push_hstring_empty(thr);
293 		} else {
294 			duk_substring(thr, -1, (duk_size_t) pos, (duk_size_t) pos + 1U);
295 		}
296 	}
297 
298 	return 1;
299 }
300 
301 /* Magic: 0=charCodeAt, 1=codePointAt */
duk_bi_string_prototype_char_code_at(duk_hthread * thr)302 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_char_code_at(duk_hthread *thr) {
303 	duk_int_t pos;
304 	duk_hstring *h;
305 	duk_bool_t clamped;
306 	duk_uint32_t cp;
307 	duk_int_t magic;
308 
309 	/* XXX: faster implementation */
310 
311 	DUK_DDD(DUK_DDDPRINT("arg=%!T", (duk_tval *) duk_get_tval(thr, 0)));
312 
313 	h = duk_push_this_coercible_to_string(thr);
314 	DUK_ASSERT(h != NULL);
315 
316 	pos = duk_to_int_clamped_raw(thr,
317 	                             0 /*index*/,
318 	                             0 /*min(incl)*/,
319 	                             (duk_int_t) DUK_HSTRING_GET_CHARLEN(h) - 1 /*max(incl)*/,
320 	                             &clamped /*out_clamped*/);
321 #if defined(DUK_USE_ES6)
322 	magic = duk_get_current_magic(thr);
323 #else
324 	DUK_ASSERT(duk_get_current_magic(thr) == 0);
325 	magic = 0;
326 #endif
327 	if (clamped) {
328 		/* For out-of-bounds indices .charCodeAt() returns NaN and
329 		 * .codePointAt() returns undefined.
330 		 */
331 		if (magic != 0) {
332 			return 0;
333 		}
334 		duk_push_nan(thr);
335 	} else {
336 		DUK_ASSERT(pos >= 0);
337 		cp = (duk_uint32_t) duk_hstring_char_code_at_raw(thr, h, (duk_uint_t) pos, (duk_bool_t) magic /*surrogate_aware*/);
338 		duk_push_u32(thr, cp);
339 	}
340 	return 1;
341 }
342 
343 /*
344  *  substring(), substr(), slice()
345  */
346 
347 /* XXX: any chance of merging these three similar but still slightly
348  * different algorithms so that footprint would be reduced?
349  */
350 
duk_bi_string_prototype_substring(duk_hthread * thr)351 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_substring(duk_hthread *thr) {
352 	duk_hstring *h;
353 	duk_int_t start_pos, end_pos;
354 	duk_int_t len;
355 
356 	h = duk_push_this_coercible_to_string(thr);
357 	DUK_ASSERT(h != NULL);
358 	len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
359 
360 	/* [ start end str ] */
361 
362 	start_pos = duk_to_int_clamped(thr, 0, 0, len);
363 	if (duk_is_undefined(thr, 1)) {
364 		end_pos = len;
365 	} else {
366 		end_pos = duk_to_int_clamped(thr, 1, 0, len);
367 	}
368 	DUK_ASSERT(start_pos >= 0 && start_pos <= len);
369 	DUK_ASSERT(end_pos >= 0 && end_pos <= len);
370 
371 	if (start_pos > end_pos) {
372 		duk_int_t tmp = start_pos;
373 		start_pos = end_pos;
374 		end_pos = tmp;
375 	}
376 
377 	DUK_ASSERT(end_pos >= start_pos);
378 
379 	duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
380 	return 1;
381 }
382 
383 #if defined(DUK_USE_SECTION_B)
duk_bi_string_prototype_substr(duk_hthread * thr)384 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_substr(duk_hthread *thr) {
385 	duk_hstring *h;
386 	duk_int_t start_pos, end_pos;
387 	duk_int_t len;
388 
389 	/* Unlike non-obsolete String calls, substr() algorithm in E5.1
390 	 * specification will happily coerce undefined and null to strings
391 	 * ("undefined" and "null").
392 	 */
393 	duk_push_this(thr);
394 	h = duk_to_hstring_m1(thr);  /* Reject Symbols. */
395 	DUK_ASSERT(h != NULL);
396 	len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
397 
398 	/* [ start length str ] */
399 
400 	/* The implementation for computing of start_pos and end_pos differs
401 	 * from the standard algorithm, but is intended to result in the exactly
402 	 * same behavior.  This is not always obvious.
403 	 */
404 
405 	/* combines steps 2 and 5; -len ensures max() not needed for step 5 */
406 	start_pos = duk_to_int_clamped(thr, 0, -len, len);
407 	if (start_pos < 0) {
408 		start_pos = len + start_pos;
409 	}
410 	DUK_ASSERT(start_pos >= 0 && start_pos <= len);
411 
412 	/* combines steps 3, 6; step 7 is not needed */
413 	if (duk_is_undefined(thr, 1)) {
414 		end_pos = len;
415 	} else {
416 		DUK_ASSERT(start_pos <= len);
417 		end_pos = start_pos + duk_to_int_clamped(thr, 1, 0, len - start_pos);
418 	}
419 	DUK_ASSERT(start_pos >= 0 && start_pos <= len);
420 	DUK_ASSERT(end_pos >= 0 && end_pos <= len);
421 	DUK_ASSERT(end_pos >= start_pos);
422 
423 	duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
424 	return 1;
425 }
426 #endif  /* DUK_USE_SECTION_B */
427 
duk_bi_string_prototype_slice(duk_hthread * thr)428 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_slice(duk_hthread *thr) {
429 	duk_hstring *h;
430 	duk_int_t start_pos, end_pos;
431 	duk_int_t len;
432 
433 	h = duk_push_this_coercible_to_string(thr);
434 	DUK_ASSERT(h != NULL);
435 	len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
436 
437 	/* [ start end str ] */
438 
439 	start_pos = duk_to_int_clamped(thr, 0, -len, len);
440 	if (start_pos < 0) {
441 		start_pos = len + start_pos;
442 	}
443 	if (duk_is_undefined(thr, 1)) {
444 		end_pos = len;
445 	} else {
446 		end_pos = duk_to_int_clamped(thr, 1, -len, len);
447 		if (end_pos < 0) {
448 			end_pos = len + end_pos;
449 		}
450 	}
451 	DUK_ASSERT(start_pos >= 0 && start_pos <= len);
452 	DUK_ASSERT(end_pos >= 0 && end_pos <= len);
453 
454 	if (end_pos < start_pos) {
455 		end_pos = start_pos;
456 	}
457 
458 	DUK_ASSERT(end_pos >= start_pos);
459 
460 	duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
461 	return 1;
462 }
463 
464 /*
465  *  Case conversion
466  */
467 
duk_bi_string_prototype_caseconv_shared(duk_hthread * thr)468 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_caseconv_shared(duk_hthread *thr) {
469 	duk_small_int_t uppercase = duk_get_current_magic(thr);
470 
471 	(void) duk_push_this_coercible_to_string(thr);
472 	duk_unicode_case_convert_string(thr, (duk_bool_t) uppercase);
473 	return 1;
474 }
475 
476 /*
477  *  indexOf() and lastIndexOf()
478  */
479 
duk_bi_string_prototype_indexof_shared(duk_hthread * thr)480 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_indexof_shared(duk_hthread *thr) {
481 	duk_hstring *h_this;
482 	duk_hstring *h_search;
483 	duk_int_t clen_this;
484 	duk_int_t cpos;
485 	duk_small_uint_t is_lastindexof = (duk_small_uint_t) duk_get_current_magic(thr);  /* 0=indexOf, 1=lastIndexOf */
486 
487 	h_this = duk_push_this_coercible_to_string(thr);
488 	DUK_ASSERT(h_this != NULL);
489 	clen_this = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h_this);
490 
491 	h_search = duk_to_hstring(thr, 0);
492 	DUK_ASSERT(h_search != NULL);
493 
494 	duk_to_number(thr, 1);
495 	if (duk_is_nan(thr, 1) && is_lastindexof) {
496 		/* indexOf: NaN should cause pos to be zero.
497 		 * lastIndexOf: NaN should cause pos to be +Infinity
498 		 * (and later be clamped to len).
499 		 */
500 		cpos = clen_this;
501 	} else {
502 		cpos = duk_to_int_clamped(thr, 1, 0, clen_this);
503 	}
504 
505 	cpos = duk__str_search_shared(thr, h_this, h_search, cpos, is_lastindexof /*backwards*/);
506 	duk_push_int(thr, cpos);
507 	return 1;
508 }
509 
510 /*
511  *  replace()
512  */
513 
514 /* XXX: the current implementation works but is quite clunky; it compiles
515  * to almost 1,4kB of x86 code so it needs to be simplified (better approach,
516  * shared helpers, etc).  Some ideas for refactoring:
517  *
518  * - a primitive to convert a string into a regexp matcher (reduces matching
519  *   code at the cost of making matching much slower)
520  * - use replace() as a basic helper for match() and split(), which are both
521  *   much simpler
522  * - API call to get_prop and to_boolean
523  */
524 
duk_bi_string_prototype_replace(duk_hthread * thr)525 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_replace(duk_hthread *thr) {
526 	duk_hstring *h_input;
527 	duk_hstring *h_match;
528 	duk_hstring *h_search;
529 	duk_hobject *h_re;
530 	duk_bufwriter_ctx bw_alloc;
531 	duk_bufwriter_ctx *bw;
532 #if defined(DUK_USE_REGEXP_SUPPORT)
533 	duk_bool_t is_regexp;
534 	duk_bool_t is_global;
535 #endif
536 	duk_bool_t is_repl_func;
537 	duk_uint32_t match_start_coff, match_start_boff;
538 #if defined(DUK_USE_REGEXP_SUPPORT)
539 	duk_int_t match_caps;
540 #endif
541 	duk_uint32_t prev_match_end_boff;
542 	const duk_uint8_t *r_start, *r_end, *r;   /* repl string scan */
543 	duk_size_t tmp_sz;
544 
545 	DUK_ASSERT_TOP(thr, 2);
546 	h_input = duk_push_this_coercible_to_string(thr);
547 	DUK_ASSERT(h_input != NULL);
548 
549 	bw = &bw_alloc;
550 	DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));  /* input size is good output starting point */
551 
552 	DUK_ASSERT_TOP(thr, 4);
553 
554 	/* stack[0] = search value
555 	 * stack[1] = replace value
556 	 * stack[2] = input string
557 	 * stack[3] = result buffer
558 	 */
559 
560 	h_re = duk_get_hobject_with_class(thr, 0, DUK_HOBJECT_CLASS_REGEXP);
561 	if (h_re) {
562 #if defined(DUK_USE_REGEXP_SUPPORT)
563 		is_regexp = 1;
564 		is_global = duk_get_prop_stridx_boolean(thr, 0, DUK_STRIDX_GLOBAL, NULL);
565 
566 		if (is_global) {
567 			/* start match from beginning */
568 			duk_push_int(thr, 0);
569 			duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
570 		}
571 #else  /* DUK_USE_REGEXP_SUPPORT */
572 		DUK_DCERROR_UNSUPPORTED(thr);
573 #endif  /* DUK_USE_REGEXP_SUPPORT */
574 	} else {
575 		duk_to_string(thr, 0);  /* rejects symbols */
576 #if defined(DUK_USE_REGEXP_SUPPORT)
577 		is_regexp = 0;
578 		is_global = 0;
579 #endif
580 	}
581 
582 	if (duk_is_function(thr, 1)) {
583 		is_repl_func = 1;
584 		r_start = NULL;
585 		r_end = NULL;
586 	} else {
587 		duk_hstring *h_repl;
588 
589 		is_repl_func = 0;
590 		h_repl = duk_to_hstring(thr, 1);  /* reject symbols */
591 		DUK_ASSERT(h_repl != NULL);
592 		r_start = DUK_HSTRING_GET_DATA(h_repl);
593 		r_end = r_start + DUK_HSTRING_GET_BYTELEN(h_repl);
594 	}
595 
596 	prev_match_end_boff = 0;
597 
598 	for (;;) {
599 		/*
600 		 *  If matching with a regexp:
601 		 *    - non-global RegExp: lastIndex not touched on a match, zeroed
602 		 *      on a non-match
603 		 *    - global RegExp: on match, lastIndex will be updated by regexp
604 		 *      executor to point to next char after the matching part (so that
605 		 *      characters in the matching part are not matched again)
606 		 *
607 		 *  If matching with a string:
608 		 *    - always non-global match, find first occurrence
609 		 *
610 		 *  We need:
611 		 *    - The character offset of start-of-match for the replacer function
612 		 *    - The byte offsets for start-of-match and end-of-match to implement
613 		 *      the replacement values $&, $`, and $', and to copy non-matching
614 		 *      input string portions (including header and trailer) verbatim.
615 		 *
616 		 *  NOTE: the E5.1 specification is a bit vague how the RegExp should
617 		 *  behave in the replacement process; e.g. is matching done first for
618 		 *  all matches (in the global RegExp case) before any replacer calls
619 		 *  are made?  See: test-bi-string-proto-replace.js for discussion.
620 		 */
621 
622 		DUK_ASSERT_TOP(thr, 4);
623 
624 #if defined(DUK_USE_REGEXP_SUPPORT)
625 		if (is_regexp) {
626 			duk_dup_0(thr);
627 			duk_dup_2(thr);
628 			duk_regexp_match(thr);  /* [ ... regexp input ] -> [ res_obj ] */
629 			if (!duk_is_object(thr, -1)) {
630 				duk_pop(thr);
631 				break;
632 			}
633 
634 			duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
635 			DUK_ASSERT(duk_is_number(thr, -1));
636 			match_start_coff = duk_get_uint(thr, -1);
637 			duk_pop(thr);
638 
639 			duk_get_prop_index(thr, -1, 0);
640 			DUK_ASSERT(duk_is_string(thr, -1));
641 			h_match = duk_known_hstring(thr, -1);
642 			duk_pop(thr);  /* h_match is borrowed, remains reachable through match_obj */
643 
644 			if (DUK_HSTRING_GET_BYTELEN(h_match) == 0) {
645 				/* This should be equivalent to match() algorithm step 8.f.iii.2:
646 				 * detect an empty match and allow it, but don't allow it twice.
647 				 */
648 				duk_uint32_t last_index;
649 
650 				duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
651 				last_index = (duk_uint32_t) duk_get_uint(thr, -1);
652 				DUK_DDD(DUK_DDDPRINT("empty match, bump lastIndex: %ld -> %ld",
653 				                     (long) last_index, (long) (last_index + 1)));
654 				duk_pop(thr);
655 				duk_push_uint(thr, (duk_uint_t) (last_index + 1));
656 				duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
657 			}
658 
659 			DUK_ASSERT(duk_get_length(thr, -1) <= DUK_INT_MAX);  /* string limits */
660 			match_caps = (duk_int_t) duk_get_length(thr, -1);
661 		} else {
662 #else  /* DUK_USE_REGEXP_SUPPORT */
663 		{  /* unconditionally */
664 #endif  /* DUK_USE_REGEXP_SUPPORT */
665 			const duk_uint8_t *p_start, *p_end, *p;   /* input string scan */
666 			const duk_uint8_t *q_start;               /* match string */
667 			duk_size_t p_blen;
668 			duk_size_t q_blen;
669 
670 #if defined(DUK_USE_REGEXP_SUPPORT)
671 			DUK_ASSERT(!is_global);  /* single match always */
672 #endif
673 
674 			p_start = DUK_HSTRING_GET_DATA(h_input);
675 			p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
676 			p_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_input);
677 			p = p_start;
678 
679 			h_search = duk_known_hstring(thr, 0);
680 			q_start = DUK_HSTRING_GET_DATA(h_search);
681 			q_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_search);
682 
683 			if (q_blen > p_blen) {
684 				break;  /* no match */
685 			}
686 
687 			p_end -= q_blen;  /* ensure full memcmp() fits in while */
688 			DUK_ASSERT(p_end >= p);
689 
690 			match_start_coff = 0;
691 
692 			while (p <= p_end) {
693 				DUK_ASSERT(p + q_blen <= DUK_HSTRING_GET_DATA(h_input) + DUK_HSTRING_GET_BYTELEN(h_input));
694 				if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
695 					duk_dup_0(thr);
696 					h_match = duk_known_hstring(thr, -1);
697 #if defined(DUK_USE_REGEXP_SUPPORT)
698 					match_caps = 0;
699 #endif
700 					goto found;
701 				}
702 
703 				/* track utf-8 non-continuation bytes */
704 				if ((p[0] & 0xc0) != 0x80) {
705 					match_start_coff++;
706 				}
707 				p++;
708 			}
709 
710 			/* not found */
711 			break;
712 		}
713 	 found:
714 
715 		/* stack[0] = search value
716 		 * stack[1] = replace value
717 		 * stack[2] = input string
718 		 * stack[3] = result buffer
719 		 * stack[4] = regexp match OR match string
720 		 */
721 
722 		match_start_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_start_coff);
723 
724 		tmp_sz = (duk_size_t) (match_start_boff - prev_match_end_boff);
725 		DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff, tmp_sz);
726 
727 		prev_match_end_boff = match_start_boff + DUK_HSTRING_GET_BYTELEN(h_match);
728 
729 		if (is_repl_func) {
730 			duk_idx_t idx_args;
731 			duk_hstring *h_repl;
732 
733 			/* regexp res_obj is at index 4 */
734 
735 			duk_dup_1(thr);
736 			idx_args = duk_get_top(thr);
737 
738 #if defined(DUK_USE_REGEXP_SUPPORT)
739 			if (is_regexp) {
740 				duk_int_t idx;
741 				duk_require_stack(thr, match_caps + 2);
742 				for (idx = 0; idx < match_caps; idx++) {
743 					/* match followed by capture(s) */
744 					duk_get_prop_index(thr, 4, (duk_uarridx_t) idx);
745 				}
746 			} else {
747 #else  /* DUK_USE_REGEXP_SUPPORT */
748 			{  /* unconditionally */
749 #endif  /* DUK_USE_REGEXP_SUPPORT */
750 				/* match == search string, by definition */
751 				duk_dup_0(thr);
752 			}
753 			duk_push_uint(thr, (duk_uint_t) match_start_coff);
754 			duk_dup_2(thr);
755 
756 			/* [ ... replacer match [captures] match_char_offset input ] */
757 
758 			duk_call(thr, duk_get_top(thr) - idx_args);
759 			h_repl = duk_to_hstring_m1(thr);  /* -> [ ... repl_value ] */
760 			DUK_ASSERT(h_repl != NULL);
761 
762 			DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_repl);
763 
764 			duk_pop(thr);  /* repl_value */
765 		} else {
766 			r = r_start;
767 
768 			while (r < r_end) {
769 				duk_int_t ch1;
770 				duk_int_t ch2;
771 #if defined(DUK_USE_REGEXP_SUPPORT)
772 				duk_int_t ch3;
773 #endif
774 				duk_size_t left;
775 
776 				ch1 = *r++;
777 				if (ch1 != DUK_ASC_DOLLAR) {
778 					goto repl_write;
779 				}
780 				DUK_ASSERT(r <= r_end);
781 				left = (duk_size_t) (r_end - r);
782 
783 				if (left <= 0) {
784 					goto repl_write;
785 				}
786 
787 				ch2 = r[0];
788 				switch (ch2) {
789 				case DUK_ASC_DOLLAR: {
790 					ch1 = (1 << 8) + DUK_ASC_DOLLAR;
791 					goto repl_write;
792 				}
793 				case DUK_ASC_AMP: {
794 					DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_match);
795 					r++;
796 					continue;
797 				}
798 				case DUK_ASC_GRAVE: {
799 					tmp_sz = (duk_size_t) match_start_boff;
800 					DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input), tmp_sz);
801 					r++;
802 					continue;
803 				}
804 				case DUK_ASC_SINGLEQUOTE: {
805 					duk_uint32_t match_end_boff;
806 
807 					/* Use match charlen instead of bytelen, just in case the input and
808 					 * match codepoint encodings would have different lengths.
809 					 */
810 					/* XXX: charlen computed here, and also in char2byte helper. */
811 					match_end_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr,
812 					                                                                   h_input,
813 					                                                                   match_start_coff + (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h_match));
814 
815 					tmp_sz = (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - match_end_boff);
816 					DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + match_end_boff, tmp_sz);
817 					r++;
818 					continue;
819 				}
820 				default: {
821 #if defined(DUK_USE_REGEXP_SUPPORT)
822 					duk_int_t capnum, captmp, capadv;
823 					/* XXX: optional check, match_caps is zero if no regexp,
824 					 * so dollar will be interpreted literally anyway.
825 					 */
826 
827 					if (!is_regexp) {
828 						goto repl_write;
829 					}
830 
831 					if (!(ch2 >= DUK_ASC_0 && ch2 <= DUK_ASC_9)) {
832 						goto repl_write;
833 					}
834 					capnum = ch2 - DUK_ASC_0;
835 					capadv = 1;
836 
837 					if (left >= 2) {
838 						ch3 = r[1];
839 						if (ch3 >= DUK_ASC_0 && ch3 <= DUK_ASC_9) {
840 							captmp = capnum * 10 + (ch3 - DUK_ASC_0);
841 							if (captmp < match_caps) {
842 								capnum = captmp;
843 								capadv = 2;
844 							}
845 						}
846 					}
847 
848 					if (capnum > 0 && capnum < match_caps) {
849 						DUK_ASSERT(is_regexp != 0);  /* match_caps == 0 without regexps */
850 
851 						/* regexp res_obj is at offset 4 */
852 						duk_get_prop_index(thr, 4, (duk_uarridx_t) capnum);
853 						if (duk_is_string(thr, -1)) {
854 							duk_hstring *h_tmp_str;
855 
856 							h_tmp_str = duk_known_hstring(thr, -1);
857 
858 							DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_tmp_str);
859 						} else {
860 							/* undefined -> skip (replaced with empty) */
861 						}
862 						duk_pop(thr);
863 						r += capadv;
864 						continue;
865 					} else {
866 						goto repl_write;
867 					}
868 #else  /* DUK_USE_REGEXP_SUPPORT */
869 					goto repl_write;  /* unconditionally */
870 #endif  /* DUK_USE_REGEXP_SUPPORT */
871 				}  /* default case */
872 				}  /* switch (ch2) */
873 
874 			 repl_write:
875 				/* ch1 = (r_increment << 8) + byte */
876 
877 				DUK_BW_WRITE_ENSURE_U8(thr, bw, (duk_uint8_t) (ch1 & 0xff));
878 				r += ch1 >> 8;
879 			}  /* while repl */
880 		}  /* if (is_repl_func) */
881 
882 		duk_pop(thr);  /* pop regexp res_obj or match string */
883 
884 #if defined(DUK_USE_REGEXP_SUPPORT)
885 		if (!is_global) {
886 #else
887 		{  /* unconditionally; is_global==0 */
888 #endif
889 			break;
890 		}
891 	}
892 
893 	/* trailer */
894 	tmp_sz = (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - prev_match_end_boff);
895 	DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff, tmp_sz);
896 
897 	DUK_ASSERT_TOP(thr, 4);
898 	DUK_BW_COMPACT(thr, bw);
899 	(void) duk_buffer_to_string(thr, -1);  /* Safe if inputs are safe. */
900 	return 1;
901 }
902 
903 /*
904  *  split()
905  */
906 
907 /* XXX: very messy now, but works; clean up, remove unused variables (nomimally
908  * used so compiler doesn't complain).
909  */
910 
911 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_split(duk_hthread *thr) {
912 	duk_hstring *h_input;
913 	duk_hstring *h_sep;
914 	duk_uint32_t limit;
915 	duk_uint32_t arr_idx;
916 #if defined(DUK_USE_REGEXP_SUPPORT)
917 	duk_bool_t is_regexp;
918 #endif
919 	duk_bool_t matched;  /* set to 1 if any match exists (needed for empty input special case) */
920 	duk_uint32_t prev_match_end_coff, prev_match_end_boff;
921 	duk_uint32_t match_start_boff, match_start_coff;
922 	duk_uint32_t match_end_boff, match_end_coff;
923 
924 	h_input = duk_push_this_coercible_to_string(thr);
925 	DUK_ASSERT(h_input != NULL);
926 
927 	duk_push_array(thr);
928 
929 	if (duk_is_undefined(thr, 1)) {
930 		limit = 0xffffffffUL;
931 	} else {
932 		limit = duk_to_uint32(thr, 1);
933 	}
934 
935 	if (limit == 0) {
936 		return 1;
937 	}
938 
939 	/* If the separator is a RegExp, make a "clone" of it.  The specification
940 	 * algorithm calls [[Match]] directly for specific indices; we emulate this
941 	 * by tweaking lastIndex and using a "force global" variant of duk_regexp_match()
942 	 * which will use global-style matching even when the RegExp itself is non-global.
943 	 */
944 
945 	if (duk_is_undefined(thr, 0)) {
946 		/* The spec algorithm first does "R = ToString(separator)" before checking
947 		 * whether separator is undefined.  Since this is side effect free, we can
948 		 * skip the ToString() here.
949 		 */
950 		duk_dup_2(thr);
951 		duk_put_prop_index(thr, 3, 0);
952 		return 1;
953 	} else if (duk_get_hobject_with_class(thr, 0, DUK_HOBJECT_CLASS_REGEXP) != NULL) {
954 #if defined(DUK_USE_REGEXP_SUPPORT)
955 		duk_push_hobject_bidx(thr, DUK_BIDX_REGEXP_CONSTRUCTOR);
956 		duk_dup_0(thr);
957 		duk_new(thr, 1);  /* [ ... RegExp val ] -> [ ... res ] */
958 		duk_replace(thr, 0);
959 		/* lastIndex is initialized to zero by new RegExp() */
960 		is_regexp = 1;
961 #else
962 		DUK_DCERROR_UNSUPPORTED(thr);
963 #endif
964 	} else {
965 		duk_to_string(thr, 0);
966 #if defined(DUK_USE_REGEXP_SUPPORT)
967 		is_regexp = 0;
968 #endif
969 	}
970 
971 	/* stack[0] = separator (string or regexp)
972 	 * stack[1] = limit
973 	 * stack[2] = input string
974 	 * stack[3] = result array
975 	 */
976 
977 	prev_match_end_boff = 0;
978 	prev_match_end_coff = 0;
979 	arr_idx = 0;
980 	matched = 0;
981 
982 	for (;;) {
983 		/*
984 		 *  The specification uses RegExp [[Match]] to attempt match at specific
985 		 *  offsets.  We don't have such a primitive, so we use an actual RegExp
986 		 *  and tweak lastIndex.  Since the RegExp may be non-global, we use a
987 		 *  special variant which forces global-like behavior for matching.
988 		 */
989 
990 		DUK_ASSERT_TOP(thr, 4);
991 
992 #if defined(DUK_USE_REGEXP_SUPPORT)
993 		if (is_regexp) {
994 			duk_dup_0(thr);
995 			duk_dup_2(thr);
996 			duk_regexp_match_force_global(thr);  /* [ ... regexp input ] -> [ res_obj ] */
997 			if (!duk_is_object(thr, -1)) {
998 				duk_pop(thr);
999 				break;
1000 			}
1001 			matched = 1;
1002 
1003 			duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
1004 			DUK_ASSERT(duk_is_number(thr, -1));
1005 			match_start_coff = duk_get_uint(thr, -1);
1006 			match_start_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_start_coff);
1007 			duk_pop(thr);
1008 
1009 			if (match_start_coff == DUK_HSTRING_GET_CHARLEN(h_input)) {
1010 				/* don't allow an empty match at the end of the string */
1011 				duk_pop(thr);
1012 				break;
1013 			}
1014 
1015 			duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1016 			DUK_ASSERT(duk_is_number(thr, -1));
1017 			match_end_coff = duk_get_uint(thr, -1);
1018 			match_end_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_end_coff);
1019 			duk_pop(thr);
1020 
1021 			/* empty match -> bump and continue */
1022 			if (prev_match_end_boff == match_end_boff) {
1023 				duk_push_uint(thr, (duk_uint_t) (match_end_coff + 1));
1024 				duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1025 				duk_pop(thr);
1026 				continue;
1027 			}
1028 		} else {
1029 #else  /* DUK_USE_REGEXP_SUPPORT */
1030 		{  /* unconditionally */
1031 #endif  /* DUK_USE_REGEXP_SUPPORT */
1032 			const duk_uint8_t *p_start, *p_end, *p;   /* input string scan */
1033 			const duk_uint8_t *q_start;               /* match string */
1034 			duk_size_t q_blen, q_clen;
1035 
1036 			p_start = DUK_HSTRING_GET_DATA(h_input);
1037 			p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
1038 			p = p_start + prev_match_end_boff;
1039 
1040 			h_sep = duk_known_hstring(thr, 0);  /* symbol already rejected above */
1041 			q_start = DUK_HSTRING_GET_DATA(h_sep);
1042 			q_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_sep);
1043 			q_clen = (duk_size_t) DUK_HSTRING_GET_CHARLEN(h_sep);
1044 
1045 			p_end -= q_blen;  /* ensure full memcmp() fits in while */
1046 
1047 			match_start_coff = prev_match_end_coff;
1048 
1049 			if (q_blen == 0) {
1050 				/* Handle empty separator case: it will always match, and always
1051 				 * triggers the check in step 13.c.iii initially.  Note that we
1052 				 * must skip to either end of string or start of first codepoint,
1053 				 * skipping over any continuation bytes!
1054 				 *
1055 				 * Don't allow an empty string to match at the end of the input.
1056 				 */
1057 
1058 				matched = 1;  /* empty separator can always match */
1059 
1060 				match_start_coff++;
1061 				p++;
1062 				while (p < p_end) {
1063 					if ((p[0] & 0xc0) != 0x80) {
1064 						goto found;
1065 					}
1066 					p++;
1067 				}
1068 				goto not_found;
1069 			}
1070 
1071 			DUK_ASSERT(q_blen > 0 && q_clen > 0);
1072 			while (p <= p_end) {
1073 				DUK_ASSERT(p + q_blen <= DUK_HSTRING_GET_DATA(h_input) + DUK_HSTRING_GET_BYTELEN(h_input));
1074 				DUK_ASSERT(q_blen > 0);  /* no issues with empty memcmp() */
1075 				if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
1076 					/* never an empty match, so step 13.c.iii can't be triggered */
1077 					goto found;
1078 				}
1079 
1080 				/* track utf-8 non-continuation bytes */
1081 				if ((p[0] & 0xc0) != 0x80) {
1082 					match_start_coff++;
1083 				}
1084 				p++;
1085 			}
1086 
1087 		 not_found:
1088 			/* not found */
1089 			break;
1090 
1091 		 found:
1092 			matched = 1;
1093 			match_start_boff = (duk_uint32_t) (p - p_start);
1094 			match_end_coff = (duk_uint32_t) (match_start_coff + q_clen);  /* constrained by string length */
1095 			match_end_boff = (duk_uint32_t) (match_start_boff + q_blen);  /* ditto */
1096 
1097 			/* empty match (may happen with empty separator) -> bump and continue */
1098 			if (prev_match_end_boff == match_end_boff) {
1099 				prev_match_end_boff++;
1100 				prev_match_end_coff++;
1101 				continue;
1102 			}
1103 		}  /* if (is_regexp) */
1104 
1105 		/* stack[0] = separator (string or regexp)
1106 		 * stack[1] = limit
1107 		 * stack[2] = input string
1108 		 * stack[3] = result array
1109 		 * stack[4] = regexp res_obj (if is_regexp)
1110 		 */
1111 
1112 		DUK_DDD(DUK_DDDPRINT("split; match_start b=%ld,c=%ld, match_end b=%ld,c=%ld, prev_end b=%ld,c=%ld",
1113 		                     (long) match_start_boff, (long) match_start_coff,
1114 		                     (long) match_end_boff, (long) match_end_coff,
1115 		                     (long) prev_match_end_boff, (long) prev_match_end_coff));
1116 
1117 		duk_push_lstring(thr,
1118 		                 (const char *) (DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff),
1119 		                 (duk_size_t) (match_start_boff - prev_match_end_boff));
1120 		duk_put_prop_index(thr, 3, arr_idx);
1121 		arr_idx++;
1122 		if (arr_idx >= limit) {
1123 			goto hit_limit;
1124 		}
1125 
1126 #if defined(DUK_USE_REGEXP_SUPPORT)
1127 		if (is_regexp) {
1128 			duk_size_t i, len;
1129 
1130 			len = duk_get_length(thr, 4);
1131 			for (i = 1; i < len; i++) {
1132 				DUK_ASSERT(i <= DUK_UARRIDX_MAX);  /* cannot have >4G captures */
1133 				duk_get_prop_index(thr, 4, (duk_uarridx_t) i);
1134 				duk_put_prop_index(thr, 3, arr_idx);
1135 				arr_idx++;
1136 				if (arr_idx >= limit) {
1137 					goto hit_limit;
1138 				}
1139 			}
1140 
1141 			duk_pop(thr);
1142 			/* lastIndex already set up for next match */
1143 		} else {
1144 #else  /* DUK_USE_REGEXP_SUPPORT */
1145 		{  /* unconditionally */
1146 #endif  /* DUK_USE_REGEXP_SUPPORT */
1147 			/* no action */
1148 		}
1149 
1150 		prev_match_end_boff = match_end_boff;
1151 		prev_match_end_coff = match_end_coff;
1152 		continue;
1153 	}  /* for */
1154 
1155 	/* Combined step 11 (empty string special case) and 14-15. */
1156 
1157 	DUK_DDD(DUK_DDDPRINT("split trailer; prev_end b=%ld,c=%ld",
1158 	                     (long) prev_match_end_boff, (long) prev_match_end_coff));
1159 
1160 	if (DUK_HSTRING_GET_BYTELEN(h_input) > 0 || !matched) {
1161 		/* Add trailer if:
1162 		 *   a) non-empty input
1163 		 *   b) empty input and no (zero size) match found (step 11)
1164 		 */
1165 
1166 		duk_push_lstring(thr,
1167 		                 (const char *) DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff,
1168 		                 (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - prev_match_end_boff));
1169 		duk_put_prop_index(thr, 3, arr_idx);
1170 		/* No arr_idx update or limit check */
1171 	}
1172 
1173 	return 1;
1174 
1175  hit_limit:
1176 #if defined(DUK_USE_REGEXP_SUPPORT)
1177 	if (is_regexp) {
1178 		duk_pop(thr);
1179 	}
1180 #endif
1181 
1182 	return 1;
1183 }
1184 
1185 /*
1186  *  Various
1187  */
1188 
1189 #if defined(DUK_USE_REGEXP_SUPPORT)
1190 DUK_LOCAL void duk__to_regexp_helper(duk_hthread *thr, duk_idx_t idx, duk_bool_t force_new) {
1191 	duk_hobject *h;
1192 
1193 	/* Shared helper for match() steps 3-4, search() steps 3-4. */
1194 
1195 	DUK_ASSERT(idx >= 0);
1196 
1197 	if (force_new) {
1198 		goto do_new;
1199 	}
1200 
1201 	h = duk_get_hobject_with_class(thr, idx, DUK_HOBJECT_CLASS_REGEXP);
1202 	if (!h) {
1203 		goto do_new;
1204 	}
1205 	return;
1206 
1207  do_new:
1208 	duk_push_hobject_bidx(thr, DUK_BIDX_REGEXP_CONSTRUCTOR);
1209 	duk_dup(thr, idx);
1210 	duk_new(thr, 1);  /* [ ... RegExp val ] -> [ ... res ] */
1211 	duk_replace(thr, idx);
1212 }
1213 #endif  /* DUK_USE_REGEXP_SUPPORT */
1214 
1215 #if defined(DUK_USE_REGEXP_SUPPORT)
1216 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_search(duk_hthread *thr) {
1217 	/* Easiest way to implement the search required by the specification
1218 	 * is to do a RegExp test() with lastIndex forced to zero.  To avoid
1219 	 * side effects on the argument, "clone" the RegExp if a RegExp was
1220 	 * given as input.
1221 	 *
1222 	 * The global flag of the RegExp should be ignored; setting lastIndex
1223 	 * to zero (which happens when "cloning" the RegExp) should have an
1224 	 * equivalent effect.
1225 	 */
1226 
1227 	DUK_ASSERT_TOP(thr, 1);
1228 	(void) duk_push_this_coercible_to_string(thr);  /* at index 1 */
1229 	duk__to_regexp_helper(thr, 0 /*index*/, 1 /*force_new*/);
1230 
1231 	/* stack[0] = regexp
1232 	 * stack[1] = string
1233 	 */
1234 
1235 	/* Avoid using RegExp.prototype methods, as they're writable and
1236 	 * configurable and may have been changed.
1237 	 */
1238 
1239 	duk_dup_0(thr);
1240 	duk_dup_1(thr);  /* [ ... re_obj input ] */
1241 	duk_regexp_match(thr);  /* -> [ ... res_obj ] */
1242 
1243 	if (!duk_is_object(thr, -1)) {
1244 		duk_push_int(thr, -1);
1245 		return 1;
1246 	}
1247 
1248 	duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
1249 	DUK_ASSERT(duk_is_number(thr, -1));
1250 	return 1;
1251 }
1252 #endif  /* DUK_USE_REGEXP_SUPPORT */
1253 
1254 #if defined(DUK_USE_REGEXP_SUPPORT)
1255 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_match(duk_hthread *thr) {
1256 	duk_bool_t global;
1257 	duk_int_t prev_last_index;
1258 	duk_int_t this_index;
1259 	duk_int_t arr_idx;
1260 
1261 	DUK_ASSERT_TOP(thr, 1);
1262 	(void) duk_push_this_coercible_to_string(thr);
1263 	duk__to_regexp_helper(thr, 0 /*index*/, 0 /*force_new*/);
1264 	global = duk_get_prop_stridx_boolean(thr, 0, DUK_STRIDX_GLOBAL, NULL);
1265 	DUK_ASSERT_TOP(thr, 2);
1266 
1267 	/* stack[0] = regexp
1268 	 * stack[1] = string
1269 	 */
1270 
1271 	if (!global) {
1272 		duk_regexp_match(thr);  /* -> [ res_obj ] */
1273 		return 1;  /* return 'res_obj' */
1274 	}
1275 
1276 	/* Global case is more complex. */
1277 
1278 	/* [ regexp string ] */
1279 
1280 	duk_push_int(thr, 0);
1281 	duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1282 	duk_push_array(thr);
1283 
1284 	/* [ regexp string res_arr ] */
1285 
1286 	prev_last_index = 0;
1287 	arr_idx = 0;
1288 
1289 	for (;;) {
1290 		DUK_ASSERT_TOP(thr, 3);
1291 
1292 		duk_dup_0(thr);
1293 		duk_dup_1(thr);
1294 		duk_regexp_match(thr);  /* -> [ ... regexp string ] -> [ ... res_obj ] */
1295 
1296 		if (!duk_is_object(thr, -1)) {
1297 			duk_pop(thr);
1298 			break;
1299 		}
1300 
1301 		duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1302 		DUK_ASSERT(duk_is_number(thr, -1));
1303 		this_index = duk_get_int(thr, -1);
1304 		duk_pop(thr);
1305 
1306 		if (this_index == prev_last_index) {
1307 			this_index++;
1308 			duk_push_int(thr, this_index);
1309 			duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1310 		}
1311 		prev_last_index = this_index;
1312 
1313 		duk_get_prop_index(thr, -1, 0);  /* match string */
1314 		duk_put_prop_index(thr, 2, (duk_uarridx_t) arr_idx);
1315 		arr_idx++;
1316 		duk_pop(thr);  /* res_obj */
1317 	}
1318 
1319 	if (arr_idx == 0) {
1320 		duk_push_null(thr);
1321 	}
1322 
1323 	return 1;  /* return 'res_arr' or 'null' */
1324 }
1325 #endif  /* DUK_USE_REGEXP_SUPPORT */
1326 
1327 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_concat(duk_hthread *thr) {
1328 	/* duk_concat() coerces arguments with ToString() in correct order */
1329 	(void) duk_push_this_coercible_to_string(thr);
1330 	duk_insert(thr, 0);  /* this is relatively expensive */
1331 	duk_concat(thr, duk_get_top(thr));
1332 	return 1;
1333 }
1334 
1335 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_trim(duk_hthread *thr) {
1336 	DUK_ASSERT_TOP(thr, 0);
1337 	(void) duk_push_this_coercible_to_string(thr);
1338 	duk_trim(thr, 0);
1339 	DUK_ASSERT_TOP(thr, 1);
1340 	return 1;
1341 }
1342 
1343 #if defined(DUK_USE_ES6)
1344 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_repeat(duk_hthread *thr) {
1345 	duk_hstring *h_input;
1346 	duk_size_t input_blen;
1347 	duk_size_t result_len;
1348 	duk_int_t count_signed;
1349 	duk_uint_t count;
1350 	const duk_uint8_t *src;
1351 	duk_uint8_t *buf;
1352 	duk_uint8_t *p;
1353 	duk_double_t d;
1354 #if !defined(DUK_USE_PREFER_SIZE)
1355 	duk_size_t copy_size;
1356 	duk_uint8_t *p_end;
1357 #endif
1358 
1359 	DUK_ASSERT_TOP(thr, 1);
1360 	h_input = duk_push_this_coercible_to_string(thr);
1361 	DUK_ASSERT(h_input != NULL);
1362 	input_blen = DUK_HSTRING_GET_BYTELEN(h_input);
1363 
1364 	/* Count is ToNumber() coerced; +Infinity must be always rejected
1365 	 * (even if input string is zero length), as well as negative values
1366 	 * and -Infinity.  -Infinity doesn't require an explicit check
1367 	 * because duk_get_int() clamps it to DUK_INT_MIN which gets rejected
1368 	 * as a negative value (regardless of input string length).
1369 	 */
1370 	d = duk_to_number(thr, 0);
1371 	if (duk_double_is_posinf(d)) {
1372 		goto fail_range;
1373 	}
1374 	count_signed = duk_get_int(thr, 0);
1375 	if (count_signed < 0) {
1376 		goto fail_range;
1377 	}
1378 	count = (duk_uint_t) count_signed;
1379 
1380 	/* Overflow check for result length. */
1381 	result_len = count * input_blen;
1382 	if (count != 0 && result_len / count != input_blen) {
1383 		goto fail_range;
1384 	}
1385 
1386 	/* Temporary fixed buffer, later converted to string. */
1387 	buf = (duk_uint8_t *) duk_push_fixed_buffer_nozero(thr, result_len);
1388 	DUK_ASSERT(buf != NULL);
1389 	src = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
1390 	DUK_ASSERT(src != NULL);
1391 
1392 #if defined(DUK_USE_PREFER_SIZE)
1393 	p = buf;
1394 	while (count-- > 0) {
1395 		duk_memcpy((void *) p, (const void *) src, input_blen);  /* copy size may be zero, but pointers are valid */
1396 		p += input_blen;
1397 	}
1398 #else  /* DUK_USE_PREFER_SIZE */
1399 	/* Take advantage of already copied pieces to speed up the process
1400 	 * especially for small repeated strings.
1401 	 */
1402 	p = buf;
1403 	p_end = p + result_len;
1404 	copy_size = input_blen;
1405 	for (;;) {
1406 		duk_size_t remain = (duk_size_t) (p_end - p);
1407 		DUK_DDD(DUK_DDDPRINT("remain=%ld, copy_size=%ld, input_blen=%ld, result_len=%ld",
1408 		                     (long) remain, (long) copy_size, (long) input_blen,
1409 		                     (long) result_len));
1410 		if (remain <= copy_size) {
1411 			/* If result_len is zero, this case is taken and does
1412 			 * a zero size copy (with valid pointers).
1413 			 */
1414 			duk_memcpy((void *) p, (const void *) src, remain);
1415 			break;
1416 		} else {
1417 			duk_memcpy((void *) p, (const void *) src, copy_size);
1418 			p += copy_size;
1419 		}
1420 
1421 		src = (const duk_uint8_t *) buf;  /* Use buf as source for larger copies. */
1422 		copy_size = (duk_size_t) (p - buf);
1423 	}
1424 #endif  /* DUK_USE_PREFER_SIZE */
1425 
1426 	/* XXX: It would be useful to be able to create a duk_hstring with
1427 	 * a certain byte size whose data area wasn't initialized and which
1428 	 * wasn't in the string table yet.  This would allow a string to be
1429 	 * constructed directly without a buffer temporary and when it was
1430 	 * finished, it could be injected into the string table.  Currently
1431 	 * this isn't possible because duk_hstrings are only tracked by the
1432 	 * intern table (they are not in heap_allocated).
1433 	 */
1434 
1435 	duk_buffer_to_string(thr, -1);  /* Safe if input is safe. */
1436 	return 1;
1437 
1438  fail_range:
1439 	DUK_DCERROR_RANGE_INVALID_ARGS(thr);
1440 }
1441 #endif  /* DUK_USE_ES6 */
1442 
1443 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_locale_compare(duk_hthread *thr) {
1444 	duk_hstring *h1;
1445 	duk_hstring *h2;
1446 	duk_size_t h1_len, h2_len, prefix_len;
1447 	duk_small_int_t ret = 0;
1448 	duk_small_int_t rc;
1449 
1450 	/* The current implementation of localeCompare() is simply a codepoint
1451 	 * by codepoint comparison, implemented with a simple string compare
1452 	 * because UTF-8 should preserve codepoint ordering (assuming valid
1453 	 * shortest UTF-8 encoding).
1454 	 *
1455 	 * The specification requires that the return value must be related
1456 	 * to the sort order: e.g. negative means that 'this' comes before
1457 	 * 'that' in sort order.  We assume an ascending sort order.
1458 	 */
1459 
1460 	/* XXX: could share code with duk_js_ops.c, duk_js_compare_helper */
1461 
1462 	h1 = duk_push_this_coercible_to_string(thr);
1463 	DUK_ASSERT(h1 != NULL);
1464 
1465 	h2 = duk_to_hstring(thr, 0);
1466 	DUK_ASSERT(h2 != NULL);
1467 
1468 	h1_len = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h1);
1469 	h2_len = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h2);
1470 	prefix_len = (h1_len <= h2_len ? h1_len : h2_len);
1471 
1472 	rc = (duk_small_int_t) duk_memcmp((const void *) DUK_HSTRING_GET_DATA(h1),
1473 	                                  (const void *) DUK_HSTRING_GET_DATA(h2),
1474 	                                  (size_t) prefix_len);
1475 
1476 	if (rc < 0) {
1477 		ret = -1;
1478 		goto done;
1479 	} else if (rc > 0) {
1480 		ret = 1;
1481 		goto done;
1482 	}
1483 
1484 	/* prefix matches, lengths matter now */
1485 	if (h1_len > h2_len) {
1486 		ret = 1;
1487 		goto done;
1488 	} else if (h1_len == h2_len) {
1489 		DUK_ASSERT(ret == 0);
1490 		goto done;
1491 	}
1492 	ret = -1;
1493 	goto done;
1494 
1495  done:
1496 	duk_push_int(thr, (duk_int_t) ret);
1497 	return 1;
1498 }
1499 
1500 #if defined(DUK_USE_ES6)
1501 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_startswith_endswith(duk_hthread *thr) {
1502 	duk_int_t magic;
1503 	duk_hstring *h_target;
1504 	duk_size_t blen_target;
1505 	duk_hstring *h_search;
1506 	duk_size_t blen_search;
1507 	duk_int_t off;
1508 	duk_bool_t result = 0;
1509 	duk_size_t blen_left;
1510 
1511 	/* Because string byte lengths are in [0,DUK_INT_MAX] it's safe to
1512 	 * subtract two string lengths without overflow.
1513 	 */
1514 	DUK_ASSERT(DUK_HSTRING_MAX_BYTELEN <= DUK_INT_MAX);
1515 
1516 	h_target = duk_push_this_coercible_to_string(thr);
1517 	DUK_ASSERT(h_target != NULL);
1518 
1519 	h_search = duk__str_tostring_notregexp(thr, 0);
1520 	DUK_ASSERT(h_search != NULL);
1521 
1522 	magic = duk_get_current_magic(thr);
1523 
1524 	/* Careful to avoid pointer overflows in the matching logic. */
1525 
1526 	blen_target = DUK_HSTRING_GET_BYTELEN(h_target);
1527 	blen_search = DUK_HSTRING_GET_BYTELEN(h_search);
1528 
1529 #if 0
1530 	/* If search string is longer than the target string, we can
1531 	 * never match.  Could check explicitly, but should be handled
1532 	 * correctly below.
1533 	 */
1534 	if (blen_search > blen_target) {
1535 		goto finish;
1536 	}
1537 #endif
1538 
1539 	off = 0;
1540 	if (duk_is_undefined(thr, 1)) {
1541 		if (magic) {
1542 			off = (duk_int_t) blen_target - (duk_int_t) blen_search;
1543 		} else {
1544 			DUK_ASSERT(off == 0);
1545 		}
1546 	} else {
1547 		duk_int_t len;
1548 		duk_int_t pos;
1549 
1550 		DUK_ASSERT(DUK_HSTRING_MAX_BYTELEN <= DUK_INT_MAX);
1551 		len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h_target);
1552 		pos = duk_to_int_clamped(thr, 1, 0, len);
1553 		DUK_ASSERT(pos >= 0 && pos <= len);
1554 
1555 		off = (duk_int_t) duk_heap_strcache_offset_char2byte(thr, h_target, (duk_uint_fast32_t) pos);
1556 		if (magic) {
1557 			off -= (duk_int_t) blen_search;
1558 		}
1559 	}
1560 	if (off < 0 || off > (duk_int_t) blen_target) {
1561 		goto finish;
1562 	}
1563 
1564 	/* The main comparison can be done using a memcmp() rather than
1565 	 * doing codepoint comparisons: for CESU-8 strings there is a
1566 	 * canonical representation for every codepoint.  But we do need
1567 	 * to deal with the char/byte offset translation to find the
1568 	 * comparison range.
1569 	 */
1570 
1571 	DUK_ASSERT(off >= 0);
1572 	DUK_ASSERT((duk_size_t) off <= blen_target);
1573 	blen_left = blen_target - (duk_size_t) off;
1574 	if (blen_left >= blen_search) {
1575 		const duk_uint8_t *p_cmp_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_target) + off;
1576 		const duk_uint8_t *p_search = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_search);
1577 		if (duk_memcmp_unsafe((const void *) p_cmp_start, (const void *) p_search, (size_t) blen_search) == 0) {
1578 			result = 1;
1579 		}
1580 	}
1581 
1582  finish:
1583 	duk_push_boolean(thr, result);
1584 	return 1;
1585 }
1586 #endif  /* DUK_USE_ES6 */
1587 
1588 #if defined(DUK_USE_ES6)
1589 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_includes(duk_hthread *thr) {
1590 	duk_hstring *h;
1591 	duk_hstring *h_search;
1592 	duk_int_t len;
1593 	duk_int_t pos;
1594 
1595 	h = duk_push_this_coercible_to_string(thr);
1596 	DUK_ASSERT(h != NULL);
1597 
1598 	h_search = duk__str_tostring_notregexp(thr, 0);
1599 	DUK_ASSERT(h_search != NULL);
1600 
1601 	len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
1602 	pos = duk_to_int_clamped(thr, 1, 0, len);
1603 	DUK_ASSERT(pos >= 0 && pos <= len);
1604 
1605 	pos = duk__str_search_shared(thr, h, h_search, pos, 0 /*backwards*/);
1606 	duk_push_boolean(thr, pos >= 0);
1607 	return 1;
1608 }
1609 #endif  /* DUK_USE_ES6 */
1610 #endif  /* DUK_USE_STRING_BUILTIN */
1611