1 /*
2 * String built-ins
3 *
4 * Most String built-ins must only accept strings (or String objects).
5 * Symbols, represented internally as strings, must be generally rejected.
6 * The duk_push_this_coercible_to_string() helper does this automatically.
7 */
8
9 /* XXX: There are several limitations in the current implementation for
10 * strings with >= 0x80000000UL characters. In some cases one would need
11 * to be able to represent the range [-0xffffffff,0xffffffff] and so on.
12 * Generally character and byte length are assumed to fit into signed 32
13 * bits (< 0x80000000UL). Places with issues are not marked explicitly
14 * below in all cases, look for signed type usage (duk_int_t etc) for
15 * offsets/lengths.
16 */
17
18 #include "duk_internal.h"
19
20 #if defined(DUK_USE_STRING_BUILTIN)
21
22 /*
23 * Helpers
24 */
25
duk__str_tostring_notregexp(duk_hthread * thr,duk_idx_t idx)26 DUK_LOCAL duk_hstring *duk__str_tostring_notregexp(duk_hthread *thr, duk_idx_t idx) {
27 duk_hstring *h;
28
29 if (duk_get_class_number(thr, idx) == DUK_HOBJECT_CLASS_REGEXP) {
30 DUK_ERROR_TYPE_INVALID_ARGS(thr);
31 DUK_WO_NORETURN(return NULL;);
32 }
33 h = duk_to_hstring(thr, idx);
34 DUK_ASSERT(h != NULL);
35
36 return h;
37 }
38
duk__str_search_shared(duk_hthread * thr,duk_hstring * h_this,duk_hstring * h_search,duk_int_t start_cpos,duk_bool_t backwards)39 DUK_LOCAL duk_int_t duk__str_search_shared(duk_hthread *thr, duk_hstring *h_this, duk_hstring *h_search, duk_int_t start_cpos, duk_bool_t backwards) {
40 duk_int_t cpos;
41 duk_int_t bpos;
42 const duk_uint8_t *p_start, *p_end, *p;
43 const duk_uint8_t *q_start;
44 duk_int_t q_blen;
45 duk_uint8_t firstbyte;
46 duk_uint8_t t;
47
48 cpos = start_cpos;
49
50 /* Empty searchstring always matches; cpos must be clamped here.
51 * (If q_blen were < 0 due to clamped coercion, it would also be
52 * caught here.)
53 */
54 q_start = DUK_HSTRING_GET_DATA(h_search);
55 q_blen = (duk_int_t) DUK_HSTRING_GET_BYTELEN(h_search);
56 if (q_blen <= 0) {
57 return cpos;
58 }
59 DUK_ASSERT(q_blen > 0);
60
61 bpos = (duk_int_t) duk_heap_strcache_offset_char2byte(thr, h_this, (duk_uint32_t) cpos);
62
63 p_start = DUK_HSTRING_GET_DATA(h_this);
64 p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_this);
65 p = p_start + bpos;
66
67 /* This loop is optimized for size. For speed, there should be
68 * two separate loops, and we should ensure that memcmp() can be
69 * used without an extra "will searchstring fit" check. Doing
70 * the preconditioning for 'p' and 'p_end' is easy but cpos
71 * must be updated if 'p' is wound back (backward scanning).
72 */
73
74 firstbyte = q_start[0]; /* leading byte of match string */
75 while (p <= p_end && p >= p_start) {
76 t = *p;
77
78 /* For ECMAScript strings, this check can only match for
79 * initial UTF-8 bytes (not continuation bytes). For other
80 * strings all bets are off.
81 */
82
83 if ((t == firstbyte) && ((duk_size_t) (p_end - p) >= (duk_size_t) q_blen)) {
84 DUK_ASSERT(q_blen > 0);
85 if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
86 return cpos;
87 }
88 }
89
90 /* track cpos while scanning */
91 if (backwards) {
92 /* when going backwards, we decrement cpos 'early';
93 * 'p' may point to a continuation byte of the char
94 * at offset 'cpos', but that's OK because we'll
95 * backtrack all the way to the initial byte.
96 */
97 if ((t & 0xc0) != 0x80) {
98 cpos--;
99 }
100 p--;
101 } else {
102 if ((t & 0xc0) != 0x80) {
103 cpos++;
104 }
105 p++;
106 }
107 }
108
109 /* Not found. Empty string case is handled specially above. */
110 return -1;
111 }
112
113 /*
114 * Constructor
115 */
116
duk_bi_string_constructor(duk_hthread * thr)117 DUK_INTERNAL duk_ret_t duk_bi_string_constructor(duk_hthread *thr) {
118 duk_hstring *h;
119 duk_uint_t flags;
120
121 /* String constructor needs to distinguish between an argument not given at all
122 * vs. given as 'undefined'. We're a vararg function to handle this properly.
123 */
124
125 /* XXX: copy current activation flags to thr, including current magic,
126 * is_constructor_call etc. This takes a few bytes in duk_hthread but
127 * makes call sites smaller (there are >30 is_constructor_call and get
128 * current magic call sites.
129 */
130
131 if (duk_get_top(thr) == 0) {
132 duk_push_hstring_empty(thr);
133 } else {
134 h = duk_to_hstring_acceptsymbol(thr, 0);
135 if (DUK_UNLIKELY(DUK_HSTRING_HAS_SYMBOL(h) && !duk_is_constructor_call(thr))) {
136 duk_push_symbol_descriptive_string(thr, h);
137 duk_replace(thr, 0);
138 }
139 }
140 duk_to_string(thr, 0); /* catches symbol argument for constructor call */
141 DUK_ASSERT(duk_is_string(thr, 0));
142 duk_set_top(thr, 1); /* Top may be 1 or larger. */
143
144 if (duk_is_constructor_call(thr)) {
145 /* String object internal value is immutable */
146 flags = DUK_HOBJECT_FLAG_EXTENSIBLE |
147 DUK_HOBJECT_FLAG_FASTREFS |
148 DUK_HOBJECT_FLAG_EXOTIC_STRINGOBJ |
149 DUK_HOBJECT_CLASS_AS_FLAGS(DUK_HOBJECT_CLASS_STRING);
150 duk_push_object_helper(thr, flags, DUK_BIDX_STRING_PROTOTYPE);
151 duk_dup_0(thr);
152 duk_xdef_prop_stridx_short(thr, -2, DUK_STRIDX_INT_VALUE, DUK_PROPDESC_FLAGS_NONE);
153 }
154 /* Note: unbalanced stack on purpose */
155
156 return 1;
157 }
158
duk__construct_from_codepoints(duk_hthread * thr,duk_bool_t nonbmp)159 DUK_LOCAL duk_ret_t duk__construct_from_codepoints(duk_hthread *thr, duk_bool_t nonbmp) {
160 duk_bufwriter_ctx bw_alloc;
161 duk_bufwriter_ctx *bw;
162 duk_idx_t i, n;
163 duk_ucodepoint_t cp;
164
165 /* XXX: It would be nice to build the string directly but ToUint16()
166 * coercion is needed so a generic helper would not be very
167 * helpful (perhaps coerce the value stack first here and then
168 * build a string from a duk_tval number sequence in one go?).
169 */
170
171 n = duk_get_top(thr);
172
173 bw = &bw_alloc;
174 DUK_BW_INIT_PUSHBUF(thr, bw, (duk_size_t) n); /* initial estimate for ASCII only codepoints */
175
176 for (i = 0; i < n; i++) {
177 /* XXX: could improve bufwriter handling to write multiple codepoints
178 * with one ensure call but the relative benefit would be quite small.
179 */
180
181 if (nonbmp) {
182 /* ES2015 requires that (1) SameValue(cp, ToInteger(cp)) and
183 * (2) cp >= 0 and cp <= 0x10ffff. This check does not
184 * implement the steps exactly but the outcome should be
185 * the same.
186 */
187 duk_int32_t i32 = 0;
188 if (!duk_is_whole_get_int32(duk_to_number(thr, i), &i32) ||
189 i32 < 0 || i32 > 0x10ffffL) {
190 DUK_DCERROR_RANGE_INVALID_ARGS(thr);
191 }
192 DUK_ASSERT(i32 >= 0 && i32 <= 0x10ffffL);
193 cp = (duk_ucodepoint_t) i32;
194 DUK_BW_WRITE_ENSURE_CESU8(thr, bw, cp);
195 } else {
196 #if defined(DUK_USE_NONSTD_STRING_FROMCHARCODE_32BIT)
197 /* ToUint16() coercion is mandatory in the E5.1 specification, but
198 * this non-compliant behavior makes more sense because we support
199 * non-BMP codepoints. Don't use CESU-8 because that'd create
200 * surrogate pairs.
201 */
202 cp = (duk_ucodepoint_t) duk_to_uint32(thr, i);
203 DUK_BW_WRITE_ENSURE_XUTF8(thr, bw, cp);
204 #else
205 cp = (duk_ucodepoint_t) duk_to_uint16(thr, i);
206 DUK_ASSERT(cp >= 0 && cp <= 0x10ffffL);
207 DUK_BW_WRITE_ENSURE_CESU8(thr, bw, cp);
208 #endif
209 }
210 }
211
212 DUK_BW_COMPACT(thr, bw);
213 (void) duk_buffer_to_string(thr, -1); /* Safe, extended UTF-8 or CESU-8 encoded. */
214 return 1;
215 }
216
duk_bi_string_constructor_from_char_code(duk_hthread * thr)217 DUK_INTERNAL duk_ret_t duk_bi_string_constructor_from_char_code(duk_hthread *thr) {
218 return duk__construct_from_codepoints(thr, 0 /*nonbmp*/);
219 }
220
221 #if defined(DUK_USE_ES6)
duk_bi_string_constructor_from_code_point(duk_hthread * thr)222 DUK_INTERNAL duk_ret_t duk_bi_string_constructor_from_code_point(duk_hthread *thr) {
223 return duk__construct_from_codepoints(thr, 1 /*nonbmp*/);
224 }
225 #endif
226
227 /*
228 * toString(), valueOf()
229 */
230
duk_bi_string_prototype_to_string(duk_hthread * thr)231 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_to_string(duk_hthread *thr) {
232 duk_tval *tv;
233
234 duk_push_this(thr);
235 tv = duk_require_tval(thr, -1);
236 DUK_ASSERT(tv != NULL);
237
238 if (DUK_TVAL_IS_STRING(tv)) {
239 /* return as is */
240 } else if (DUK_TVAL_IS_OBJECT(tv)) {
241 duk_hobject *h = DUK_TVAL_GET_OBJECT(tv);
242 DUK_ASSERT(h != NULL);
243
244 /* Must be a "string object", i.e. class "String" */
245 if (DUK_HOBJECT_GET_CLASS_NUMBER(h) != DUK_HOBJECT_CLASS_STRING) {
246 goto type_error;
247 }
248
249 duk_xget_owndataprop_stridx_short(thr, -1, DUK_STRIDX_INT_VALUE);
250 DUK_ASSERT(duk_is_string(thr, -1));
251 } else {
252 goto type_error;
253 }
254
255 (void) duk_require_hstring_notsymbol(thr, -1); /* Reject symbols (and wrapped symbols). */
256 return 1;
257
258 type_error:
259 DUK_DCERROR_TYPE_INVALID_ARGS(thr);
260 }
261
262 /*
263 * Character and charcode access
264 */
265
duk_bi_string_prototype_char_at(duk_hthread * thr)266 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_char_at(duk_hthread *thr) {
267 duk_hstring *h;
268 duk_int_t pos;
269
270 /* XXX: faster implementation */
271
272 h = duk_push_this_coercible_to_string(thr);
273 DUK_ASSERT(h != NULL);
274
275 pos = duk_to_int(thr, 0);
276
277 if (sizeof(duk_size_t) >= sizeof(duk_uint_t)) {
278 /* Cast to duk_size_t works in this case:
279 * - If pos < 0, (duk_size_t) pos will always be
280 * >= max_charlen, and result will be the empty string
281 * (see duk_substring()).
282 * - If pos >= 0, pos + 1 cannot wrap.
283 */
284 DUK_ASSERT((duk_size_t) DUK_INT_MIN >= DUK_HSTRING_MAX_BYTELEN);
285 DUK_ASSERT((duk_size_t) DUK_INT_MAX + 1U > (duk_size_t) DUK_INT_MAX);
286 duk_substring(thr, -1, (duk_size_t) pos, (duk_size_t) pos + 1U);
287 } else {
288 /* If size_t is smaller than int, explicit bounds checks
289 * are needed because an int may wrap multiple times.
290 */
291 if (DUK_UNLIKELY(pos < 0 || (duk_uint_t) pos >= (duk_uint_t) DUK_HSTRING_GET_CHARLEN(h))) {
292 duk_push_hstring_empty(thr);
293 } else {
294 duk_substring(thr, -1, (duk_size_t) pos, (duk_size_t) pos + 1U);
295 }
296 }
297
298 return 1;
299 }
300
301 /* Magic: 0=charCodeAt, 1=codePointAt */
duk_bi_string_prototype_char_code_at(duk_hthread * thr)302 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_char_code_at(duk_hthread *thr) {
303 duk_int_t pos;
304 duk_hstring *h;
305 duk_bool_t clamped;
306 duk_uint32_t cp;
307 duk_int_t magic;
308
309 /* XXX: faster implementation */
310
311 DUK_DDD(DUK_DDDPRINT("arg=%!T", (duk_tval *) duk_get_tval(thr, 0)));
312
313 h = duk_push_this_coercible_to_string(thr);
314 DUK_ASSERT(h != NULL);
315
316 pos = duk_to_int_clamped_raw(thr,
317 0 /*index*/,
318 0 /*min(incl)*/,
319 (duk_int_t) DUK_HSTRING_GET_CHARLEN(h) - 1 /*max(incl)*/,
320 &clamped /*out_clamped*/);
321 #if defined(DUK_USE_ES6)
322 magic = duk_get_current_magic(thr);
323 #else
324 DUK_ASSERT(duk_get_current_magic(thr) == 0);
325 magic = 0;
326 #endif
327 if (clamped) {
328 /* For out-of-bounds indices .charCodeAt() returns NaN and
329 * .codePointAt() returns undefined.
330 */
331 if (magic != 0) {
332 return 0;
333 }
334 duk_push_nan(thr);
335 } else {
336 DUK_ASSERT(pos >= 0);
337 cp = (duk_uint32_t) duk_hstring_char_code_at_raw(thr, h, (duk_uint_t) pos, (duk_bool_t) magic /*surrogate_aware*/);
338 duk_push_u32(thr, cp);
339 }
340 return 1;
341 }
342
343 /*
344 * substring(), substr(), slice()
345 */
346
347 /* XXX: any chance of merging these three similar but still slightly
348 * different algorithms so that footprint would be reduced?
349 */
350
duk_bi_string_prototype_substring(duk_hthread * thr)351 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_substring(duk_hthread *thr) {
352 duk_hstring *h;
353 duk_int_t start_pos, end_pos;
354 duk_int_t len;
355
356 h = duk_push_this_coercible_to_string(thr);
357 DUK_ASSERT(h != NULL);
358 len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
359
360 /* [ start end str ] */
361
362 start_pos = duk_to_int_clamped(thr, 0, 0, len);
363 if (duk_is_undefined(thr, 1)) {
364 end_pos = len;
365 } else {
366 end_pos = duk_to_int_clamped(thr, 1, 0, len);
367 }
368 DUK_ASSERT(start_pos >= 0 && start_pos <= len);
369 DUK_ASSERT(end_pos >= 0 && end_pos <= len);
370
371 if (start_pos > end_pos) {
372 duk_int_t tmp = start_pos;
373 start_pos = end_pos;
374 end_pos = tmp;
375 }
376
377 DUK_ASSERT(end_pos >= start_pos);
378
379 duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
380 return 1;
381 }
382
383 #if defined(DUK_USE_SECTION_B)
duk_bi_string_prototype_substr(duk_hthread * thr)384 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_substr(duk_hthread *thr) {
385 duk_hstring *h;
386 duk_int_t start_pos, end_pos;
387 duk_int_t len;
388
389 /* Unlike non-obsolete String calls, substr() algorithm in E5.1
390 * specification will happily coerce undefined and null to strings
391 * ("undefined" and "null").
392 */
393 duk_push_this(thr);
394 h = duk_to_hstring_m1(thr); /* Reject Symbols. */
395 DUK_ASSERT(h != NULL);
396 len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
397
398 /* [ start length str ] */
399
400 /* The implementation for computing of start_pos and end_pos differs
401 * from the standard algorithm, but is intended to result in the exactly
402 * same behavior. This is not always obvious.
403 */
404
405 /* combines steps 2 and 5; -len ensures max() not needed for step 5 */
406 start_pos = duk_to_int_clamped(thr, 0, -len, len);
407 if (start_pos < 0) {
408 start_pos = len + start_pos;
409 }
410 DUK_ASSERT(start_pos >= 0 && start_pos <= len);
411
412 /* combines steps 3, 6; step 7 is not needed */
413 if (duk_is_undefined(thr, 1)) {
414 end_pos = len;
415 } else {
416 DUK_ASSERT(start_pos <= len);
417 end_pos = start_pos + duk_to_int_clamped(thr, 1, 0, len - start_pos);
418 }
419 DUK_ASSERT(start_pos >= 0 && start_pos <= len);
420 DUK_ASSERT(end_pos >= 0 && end_pos <= len);
421 DUK_ASSERT(end_pos >= start_pos);
422
423 duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
424 return 1;
425 }
426 #endif /* DUK_USE_SECTION_B */
427
duk_bi_string_prototype_slice(duk_hthread * thr)428 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_slice(duk_hthread *thr) {
429 duk_hstring *h;
430 duk_int_t start_pos, end_pos;
431 duk_int_t len;
432
433 h = duk_push_this_coercible_to_string(thr);
434 DUK_ASSERT(h != NULL);
435 len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
436
437 /* [ start end str ] */
438
439 start_pos = duk_to_int_clamped(thr, 0, -len, len);
440 if (start_pos < 0) {
441 start_pos = len + start_pos;
442 }
443 if (duk_is_undefined(thr, 1)) {
444 end_pos = len;
445 } else {
446 end_pos = duk_to_int_clamped(thr, 1, -len, len);
447 if (end_pos < 0) {
448 end_pos = len + end_pos;
449 }
450 }
451 DUK_ASSERT(start_pos >= 0 && start_pos <= len);
452 DUK_ASSERT(end_pos >= 0 && end_pos <= len);
453
454 if (end_pos < start_pos) {
455 end_pos = start_pos;
456 }
457
458 DUK_ASSERT(end_pos >= start_pos);
459
460 duk_substring(thr, -1, (duk_size_t) start_pos, (duk_size_t) end_pos);
461 return 1;
462 }
463
464 /*
465 * Case conversion
466 */
467
duk_bi_string_prototype_caseconv_shared(duk_hthread * thr)468 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_caseconv_shared(duk_hthread *thr) {
469 duk_small_int_t uppercase = duk_get_current_magic(thr);
470
471 (void) duk_push_this_coercible_to_string(thr);
472 duk_unicode_case_convert_string(thr, (duk_bool_t) uppercase);
473 return 1;
474 }
475
476 /*
477 * indexOf() and lastIndexOf()
478 */
479
duk_bi_string_prototype_indexof_shared(duk_hthread * thr)480 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_indexof_shared(duk_hthread *thr) {
481 duk_hstring *h_this;
482 duk_hstring *h_search;
483 duk_int_t clen_this;
484 duk_int_t cpos;
485 duk_small_uint_t is_lastindexof = (duk_small_uint_t) duk_get_current_magic(thr); /* 0=indexOf, 1=lastIndexOf */
486
487 h_this = duk_push_this_coercible_to_string(thr);
488 DUK_ASSERT(h_this != NULL);
489 clen_this = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h_this);
490
491 h_search = duk_to_hstring(thr, 0);
492 DUK_ASSERT(h_search != NULL);
493
494 duk_to_number(thr, 1);
495 if (duk_is_nan(thr, 1) && is_lastindexof) {
496 /* indexOf: NaN should cause pos to be zero.
497 * lastIndexOf: NaN should cause pos to be +Infinity
498 * (and later be clamped to len).
499 */
500 cpos = clen_this;
501 } else {
502 cpos = duk_to_int_clamped(thr, 1, 0, clen_this);
503 }
504
505 cpos = duk__str_search_shared(thr, h_this, h_search, cpos, is_lastindexof /*backwards*/);
506 duk_push_int(thr, cpos);
507 return 1;
508 }
509
510 /*
511 * replace()
512 */
513
514 /* XXX: the current implementation works but is quite clunky; it compiles
515 * to almost 1,4kB of x86 code so it needs to be simplified (better approach,
516 * shared helpers, etc). Some ideas for refactoring:
517 *
518 * - a primitive to convert a string into a regexp matcher (reduces matching
519 * code at the cost of making matching much slower)
520 * - use replace() as a basic helper for match() and split(), which are both
521 * much simpler
522 * - API call to get_prop and to_boolean
523 */
524
duk_bi_string_prototype_replace(duk_hthread * thr)525 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_replace(duk_hthread *thr) {
526 duk_hstring *h_input;
527 duk_hstring *h_match;
528 duk_hstring *h_search;
529 duk_hobject *h_re;
530 duk_bufwriter_ctx bw_alloc;
531 duk_bufwriter_ctx *bw;
532 #if defined(DUK_USE_REGEXP_SUPPORT)
533 duk_bool_t is_regexp;
534 duk_bool_t is_global;
535 #endif
536 duk_bool_t is_repl_func;
537 duk_uint32_t match_start_coff, match_start_boff;
538 #if defined(DUK_USE_REGEXP_SUPPORT)
539 duk_int_t match_caps;
540 #endif
541 duk_uint32_t prev_match_end_boff;
542 const duk_uint8_t *r_start, *r_end, *r; /* repl string scan */
543 duk_size_t tmp_sz;
544
545 DUK_ASSERT_TOP(thr, 2);
546 h_input = duk_push_this_coercible_to_string(thr);
547 DUK_ASSERT(h_input != NULL);
548
549 bw = &bw_alloc;
550 DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input)); /* input size is good output starting point */
551
552 DUK_ASSERT_TOP(thr, 4);
553
554 /* stack[0] = search value
555 * stack[1] = replace value
556 * stack[2] = input string
557 * stack[3] = result buffer
558 */
559
560 h_re = duk_get_hobject_with_class(thr, 0, DUK_HOBJECT_CLASS_REGEXP);
561 if (h_re) {
562 #if defined(DUK_USE_REGEXP_SUPPORT)
563 is_regexp = 1;
564 is_global = duk_get_prop_stridx_boolean(thr, 0, DUK_STRIDX_GLOBAL, NULL);
565
566 if (is_global) {
567 /* start match from beginning */
568 duk_push_int(thr, 0);
569 duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
570 }
571 #else /* DUK_USE_REGEXP_SUPPORT */
572 DUK_DCERROR_UNSUPPORTED(thr);
573 #endif /* DUK_USE_REGEXP_SUPPORT */
574 } else {
575 duk_to_string(thr, 0); /* rejects symbols */
576 #if defined(DUK_USE_REGEXP_SUPPORT)
577 is_regexp = 0;
578 is_global = 0;
579 #endif
580 }
581
582 if (duk_is_function(thr, 1)) {
583 is_repl_func = 1;
584 r_start = NULL;
585 r_end = NULL;
586 } else {
587 duk_hstring *h_repl;
588
589 is_repl_func = 0;
590 h_repl = duk_to_hstring(thr, 1); /* reject symbols */
591 DUK_ASSERT(h_repl != NULL);
592 r_start = DUK_HSTRING_GET_DATA(h_repl);
593 r_end = r_start + DUK_HSTRING_GET_BYTELEN(h_repl);
594 }
595
596 prev_match_end_boff = 0;
597
598 for (;;) {
599 /*
600 * If matching with a regexp:
601 * - non-global RegExp: lastIndex not touched on a match, zeroed
602 * on a non-match
603 * - global RegExp: on match, lastIndex will be updated by regexp
604 * executor to point to next char after the matching part (so that
605 * characters in the matching part are not matched again)
606 *
607 * If matching with a string:
608 * - always non-global match, find first occurrence
609 *
610 * We need:
611 * - The character offset of start-of-match for the replacer function
612 * - The byte offsets for start-of-match and end-of-match to implement
613 * the replacement values $&, $`, and $', and to copy non-matching
614 * input string portions (including header and trailer) verbatim.
615 *
616 * NOTE: the E5.1 specification is a bit vague how the RegExp should
617 * behave in the replacement process; e.g. is matching done first for
618 * all matches (in the global RegExp case) before any replacer calls
619 * are made? See: test-bi-string-proto-replace.js for discussion.
620 */
621
622 DUK_ASSERT_TOP(thr, 4);
623
624 #if defined(DUK_USE_REGEXP_SUPPORT)
625 if (is_regexp) {
626 duk_dup_0(thr);
627 duk_dup_2(thr);
628 duk_regexp_match(thr); /* [ ... regexp input ] -> [ res_obj ] */
629 if (!duk_is_object(thr, -1)) {
630 duk_pop(thr);
631 break;
632 }
633
634 duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
635 DUK_ASSERT(duk_is_number(thr, -1));
636 match_start_coff = duk_get_uint(thr, -1);
637 duk_pop(thr);
638
639 duk_get_prop_index(thr, -1, 0);
640 DUK_ASSERT(duk_is_string(thr, -1));
641 h_match = duk_known_hstring(thr, -1);
642 duk_pop(thr); /* h_match is borrowed, remains reachable through match_obj */
643
644 if (DUK_HSTRING_GET_BYTELEN(h_match) == 0) {
645 /* This should be equivalent to match() algorithm step 8.f.iii.2:
646 * detect an empty match and allow it, but don't allow it twice.
647 */
648 duk_uint32_t last_index;
649
650 duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
651 last_index = (duk_uint32_t) duk_get_uint(thr, -1);
652 DUK_DDD(DUK_DDDPRINT("empty match, bump lastIndex: %ld -> %ld",
653 (long) last_index, (long) (last_index + 1)));
654 duk_pop(thr);
655 duk_push_uint(thr, (duk_uint_t) (last_index + 1));
656 duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
657 }
658
659 DUK_ASSERT(duk_get_length(thr, -1) <= DUK_INT_MAX); /* string limits */
660 match_caps = (duk_int_t) duk_get_length(thr, -1);
661 } else {
662 #else /* DUK_USE_REGEXP_SUPPORT */
663 { /* unconditionally */
664 #endif /* DUK_USE_REGEXP_SUPPORT */
665 const duk_uint8_t *p_start, *p_end, *p; /* input string scan */
666 const duk_uint8_t *q_start; /* match string */
667 duk_size_t p_blen;
668 duk_size_t q_blen;
669
670 #if defined(DUK_USE_REGEXP_SUPPORT)
671 DUK_ASSERT(!is_global); /* single match always */
672 #endif
673
674 p_start = DUK_HSTRING_GET_DATA(h_input);
675 p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
676 p_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_input);
677 p = p_start;
678
679 h_search = duk_known_hstring(thr, 0);
680 q_start = DUK_HSTRING_GET_DATA(h_search);
681 q_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_search);
682
683 if (q_blen > p_blen) {
684 break; /* no match */
685 }
686
687 p_end -= q_blen; /* ensure full memcmp() fits in while */
688 DUK_ASSERT(p_end >= p);
689
690 match_start_coff = 0;
691
692 while (p <= p_end) {
693 DUK_ASSERT(p + q_blen <= DUK_HSTRING_GET_DATA(h_input) + DUK_HSTRING_GET_BYTELEN(h_input));
694 if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
695 duk_dup_0(thr);
696 h_match = duk_known_hstring(thr, -1);
697 #if defined(DUK_USE_REGEXP_SUPPORT)
698 match_caps = 0;
699 #endif
700 goto found;
701 }
702
703 /* track utf-8 non-continuation bytes */
704 if ((p[0] & 0xc0) != 0x80) {
705 match_start_coff++;
706 }
707 p++;
708 }
709
710 /* not found */
711 break;
712 }
713 found:
714
715 /* stack[0] = search value
716 * stack[1] = replace value
717 * stack[2] = input string
718 * stack[3] = result buffer
719 * stack[4] = regexp match OR match string
720 */
721
722 match_start_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_start_coff);
723
724 tmp_sz = (duk_size_t) (match_start_boff - prev_match_end_boff);
725 DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff, tmp_sz);
726
727 prev_match_end_boff = match_start_boff + DUK_HSTRING_GET_BYTELEN(h_match);
728
729 if (is_repl_func) {
730 duk_idx_t idx_args;
731 duk_hstring *h_repl;
732
733 /* regexp res_obj is at index 4 */
734
735 duk_dup_1(thr);
736 idx_args = duk_get_top(thr);
737
738 #if defined(DUK_USE_REGEXP_SUPPORT)
739 if (is_regexp) {
740 duk_int_t idx;
741 duk_require_stack(thr, match_caps + 2);
742 for (idx = 0; idx < match_caps; idx++) {
743 /* match followed by capture(s) */
744 duk_get_prop_index(thr, 4, (duk_uarridx_t) idx);
745 }
746 } else {
747 #else /* DUK_USE_REGEXP_SUPPORT */
748 { /* unconditionally */
749 #endif /* DUK_USE_REGEXP_SUPPORT */
750 /* match == search string, by definition */
751 duk_dup_0(thr);
752 }
753 duk_push_uint(thr, (duk_uint_t) match_start_coff);
754 duk_dup_2(thr);
755
756 /* [ ... replacer match [captures] match_char_offset input ] */
757
758 duk_call(thr, duk_get_top(thr) - idx_args);
759 h_repl = duk_to_hstring_m1(thr); /* -> [ ... repl_value ] */
760 DUK_ASSERT(h_repl != NULL);
761
762 DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_repl);
763
764 duk_pop(thr); /* repl_value */
765 } else {
766 r = r_start;
767
768 while (r < r_end) {
769 duk_int_t ch1;
770 duk_int_t ch2;
771 #if defined(DUK_USE_REGEXP_SUPPORT)
772 duk_int_t ch3;
773 #endif
774 duk_size_t left;
775
776 ch1 = *r++;
777 if (ch1 != DUK_ASC_DOLLAR) {
778 goto repl_write;
779 }
780 DUK_ASSERT(r <= r_end);
781 left = (duk_size_t) (r_end - r);
782
783 if (left <= 0) {
784 goto repl_write;
785 }
786
787 ch2 = r[0];
788 switch (ch2) {
789 case DUK_ASC_DOLLAR: {
790 ch1 = (1 << 8) + DUK_ASC_DOLLAR;
791 goto repl_write;
792 }
793 case DUK_ASC_AMP: {
794 DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_match);
795 r++;
796 continue;
797 }
798 case DUK_ASC_GRAVE: {
799 tmp_sz = (duk_size_t) match_start_boff;
800 DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input), tmp_sz);
801 r++;
802 continue;
803 }
804 case DUK_ASC_SINGLEQUOTE: {
805 duk_uint32_t match_end_boff;
806
807 /* Use match charlen instead of bytelen, just in case the input and
808 * match codepoint encodings would have different lengths.
809 */
810 /* XXX: charlen computed here, and also in char2byte helper. */
811 match_end_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr,
812 h_input,
813 match_start_coff + (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h_match));
814
815 tmp_sz = (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - match_end_boff);
816 DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + match_end_boff, tmp_sz);
817 r++;
818 continue;
819 }
820 default: {
821 #if defined(DUK_USE_REGEXP_SUPPORT)
822 duk_int_t capnum, captmp, capadv;
823 /* XXX: optional check, match_caps is zero if no regexp,
824 * so dollar will be interpreted literally anyway.
825 */
826
827 if (!is_regexp) {
828 goto repl_write;
829 }
830
831 if (!(ch2 >= DUK_ASC_0 && ch2 <= DUK_ASC_9)) {
832 goto repl_write;
833 }
834 capnum = ch2 - DUK_ASC_0;
835 capadv = 1;
836
837 if (left >= 2) {
838 ch3 = r[1];
839 if (ch3 >= DUK_ASC_0 && ch3 <= DUK_ASC_9) {
840 captmp = capnum * 10 + (ch3 - DUK_ASC_0);
841 if (captmp < match_caps) {
842 capnum = captmp;
843 capadv = 2;
844 }
845 }
846 }
847
848 if (capnum > 0 && capnum < match_caps) {
849 DUK_ASSERT(is_regexp != 0); /* match_caps == 0 without regexps */
850
851 /* regexp res_obj is at offset 4 */
852 duk_get_prop_index(thr, 4, (duk_uarridx_t) capnum);
853 if (duk_is_string(thr, -1)) {
854 duk_hstring *h_tmp_str;
855
856 h_tmp_str = duk_known_hstring(thr, -1);
857
858 DUK_BW_WRITE_ENSURE_HSTRING(thr, bw, h_tmp_str);
859 } else {
860 /* undefined -> skip (replaced with empty) */
861 }
862 duk_pop(thr);
863 r += capadv;
864 continue;
865 } else {
866 goto repl_write;
867 }
868 #else /* DUK_USE_REGEXP_SUPPORT */
869 goto repl_write; /* unconditionally */
870 #endif /* DUK_USE_REGEXP_SUPPORT */
871 } /* default case */
872 } /* switch (ch2) */
873
874 repl_write:
875 /* ch1 = (r_increment << 8) + byte */
876
877 DUK_BW_WRITE_ENSURE_U8(thr, bw, (duk_uint8_t) (ch1 & 0xff));
878 r += ch1 >> 8;
879 } /* while repl */
880 } /* if (is_repl_func) */
881
882 duk_pop(thr); /* pop regexp res_obj or match string */
883
884 #if defined(DUK_USE_REGEXP_SUPPORT)
885 if (!is_global) {
886 #else
887 { /* unconditionally; is_global==0 */
888 #endif
889 break;
890 }
891 }
892
893 /* trailer */
894 tmp_sz = (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - prev_match_end_boff);
895 DUK_BW_WRITE_ENSURE_BYTES(thr, bw, DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff, tmp_sz);
896
897 DUK_ASSERT_TOP(thr, 4);
898 DUK_BW_COMPACT(thr, bw);
899 (void) duk_buffer_to_string(thr, -1); /* Safe if inputs are safe. */
900 return 1;
901 }
902
903 /*
904 * split()
905 */
906
907 /* XXX: very messy now, but works; clean up, remove unused variables (nomimally
908 * used so compiler doesn't complain).
909 */
910
911 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_split(duk_hthread *thr) {
912 duk_hstring *h_input;
913 duk_hstring *h_sep;
914 duk_uint32_t limit;
915 duk_uint32_t arr_idx;
916 #if defined(DUK_USE_REGEXP_SUPPORT)
917 duk_bool_t is_regexp;
918 #endif
919 duk_bool_t matched; /* set to 1 if any match exists (needed for empty input special case) */
920 duk_uint32_t prev_match_end_coff, prev_match_end_boff;
921 duk_uint32_t match_start_boff, match_start_coff;
922 duk_uint32_t match_end_boff, match_end_coff;
923
924 h_input = duk_push_this_coercible_to_string(thr);
925 DUK_ASSERT(h_input != NULL);
926
927 duk_push_array(thr);
928
929 if (duk_is_undefined(thr, 1)) {
930 limit = 0xffffffffUL;
931 } else {
932 limit = duk_to_uint32(thr, 1);
933 }
934
935 if (limit == 0) {
936 return 1;
937 }
938
939 /* If the separator is a RegExp, make a "clone" of it. The specification
940 * algorithm calls [[Match]] directly for specific indices; we emulate this
941 * by tweaking lastIndex and using a "force global" variant of duk_regexp_match()
942 * which will use global-style matching even when the RegExp itself is non-global.
943 */
944
945 if (duk_is_undefined(thr, 0)) {
946 /* The spec algorithm first does "R = ToString(separator)" before checking
947 * whether separator is undefined. Since this is side effect free, we can
948 * skip the ToString() here.
949 */
950 duk_dup_2(thr);
951 duk_put_prop_index(thr, 3, 0);
952 return 1;
953 } else if (duk_get_hobject_with_class(thr, 0, DUK_HOBJECT_CLASS_REGEXP) != NULL) {
954 #if defined(DUK_USE_REGEXP_SUPPORT)
955 duk_push_hobject_bidx(thr, DUK_BIDX_REGEXP_CONSTRUCTOR);
956 duk_dup_0(thr);
957 duk_new(thr, 1); /* [ ... RegExp val ] -> [ ... res ] */
958 duk_replace(thr, 0);
959 /* lastIndex is initialized to zero by new RegExp() */
960 is_regexp = 1;
961 #else
962 DUK_DCERROR_UNSUPPORTED(thr);
963 #endif
964 } else {
965 duk_to_string(thr, 0);
966 #if defined(DUK_USE_REGEXP_SUPPORT)
967 is_regexp = 0;
968 #endif
969 }
970
971 /* stack[0] = separator (string or regexp)
972 * stack[1] = limit
973 * stack[2] = input string
974 * stack[3] = result array
975 */
976
977 prev_match_end_boff = 0;
978 prev_match_end_coff = 0;
979 arr_idx = 0;
980 matched = 0;
981
982 for (;;) {
983 /*
984 * The specification uses RegExp [[Match]] to attempt match at specific
985 * offsets. We don't have such a primitive, so we use an actual RegExp
986 * and tweak lastIndex. Since the RegExp may be non-global, we use a
987 * special variant which forces global-like behavior for matching.
988 */
989
990 DUK_ASSERT_TOP(thr, 4);
991
992 #if defined(DUK_USE_REGEXP_SUPPORT)
993 if (is_regexp) {
994 duk_dup_0(thr);
995 duk_dup_2(thr);
996 duk_regexp_match_force_global(thr); /* [ ... regexp input ] -> [ res_obj ] */
997 if (!duk_is_object(thr, -1)) {
998 duk_pop(thr);
999 break;
1000 }
1001 matched = 1;
1002
1003 duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
1004 DUK_ASSERT(duk_is_number(thr, -1));
1005 match_start_coff = duk_get_uint(thr, -1);
1006 match_start_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_start_coff);
1007 duk_pop(thr);
1008
1009 if (match_start_coff == DUK_HSTRING_GET_CHARLEN(h_input)) {
1010 /* don't allow an empty match at the end of the string */
1011 duk_pop(thr);
1012 break;
1013 }
1014
1015 duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1016 DUK_ASSERT(duk_is_number(thr, -1));
1017 match_end_coff = duk_get_uint(thr, -1);
1018 match_end_boff = (duk_uint32_t) duk_heap_strcache_offset_char2byte(thr, h_input, match_end_coff);
1019 duk_pop(thr);
1020
1021 /* empty match -> bump and continue */
1022 if (prev_match_end_boff == match_end_boff) {
1023 duk_push_uint(thr, (duk_uint_t) (match_end_coff + 1));
1024 duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1025 duk_pop(thr);
1026 continue;
1027 }
1028 } else {
1029 #else /* DUK_USE_REGEXP_SUPPORT */
1030 { /* unconditionally */
1031 #endif /* DUK_USE_REGEXP_SUPPORT */
1032 const duk_uint8_t *p_start, *p_end, *p; /* input string scan */
1033 const duk_uint8_t *q_start; /* match string */
1034 duk_size_t q_blen, q_clen;
1035
1036 p_start = DUK_HSTRING_GET_DATA(h_input);
1037 p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
1038 p = p_start + prev_match_end_boff;
1039
1040 h_sep = duk_known_hstring(thr, 0); /* symbol already rejected above */
1041 q_start = DUK_HSTRING_GET_DATA(h_sep);
1042 q_blen = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h_sep);
1043 q_clen = (duk_size_t) DUK_HSTRING_GET_CHARLEN(h_sep);
1044
1045 p_end -= q_blen; /* ensure full memcmp() fits in while */
1046
1047 match_start_coff = prev_match_end_coff;
1048
1049 if (q_blen == 0) {
1050 /* Handle empty separator case: it will always match, and always
1051 * triggers the check in step 13.c.iii initially. Note that we
1052 * must skip to either end of string or start of first codepoint,
1053 * skipping over any continuation bytes!
1054 *
1055 * Don't allow an empty string to match at the end of the input.
1056 */
1057
1058 matched = 1; /* empty separator can always match */
1059
1060 match_start_coff++;
1061 p++;
1062 while (p < p_end) {
1063 if ((p[0] & 0xc0) != 0x80) {
1064 goto found;
1065 }
1066 p++;
1067 }
1068 goto not_found;
1069 }
1070
1071 DUK_ASSERT(q_blen > 0 && q_clen > 0);
1072 while (p <= p_end) {
1073 DUK_ASSERT(p + q_blen <= DUK_HSTRING_GET_DATA(h_input) + DUK_HSTRING_GET_BYTELEN(h_input));
1074 DUK_ASSERT(q_blen > 0); /* no issues with empty memcmp() */
1075 if (duk_memcmp((const void *) p, (const void *) q_start, (size_t) q_blen) == 0) {
1076 /* never an empty match, so step 13.c.iii can't be triggered */
1077 goto found;
1078 }
1079
1080 /* track utf-8 non-continuation bytes */
1081 if ((p[0] & 0xc0) != 0x80) {
1082 match_start_coff++;
1083 }
1084 p++;
1085 }
1086
1087 not_found:
1088 /* not found */
1089 break;
1090
1091 found:
1092 matched = 1;
1093 match_start_boff = (duk_uint32_t) (p - p_start);
1094 match_end_coff = (duk_uint32_t) (match_start_coff + q_clen); /* constrained by string length */
1095 match_end_boff = (duk_uint32_t) (match_start_boff + q_blen); /* ditto */
1096
1097 /* empty match (may happen with empty separator) -> bump and continue */
1098 if (prev_match_end_boff == match_end_boff) {
1099 prev_match_end_boff++;
1100 prev_match_end_coff++;
1101 continue;
1102 }
1103 } /* if (is_regexp) */
1104
1105 /* stack[0] = separator (string or regexp)
1106 * stack[1] = limit
1107 * stack[2] = input string
1108 * stack[3] = result array
1109 * stack[4] = regexp res_obj (if is_regexp)
1110 */
1111
1112 DUK_DDD(DUK_DDDPRINT("split; match_start b=%ld,c=%ld, match_end b=%ld,c=%ld, prev_end b=%ld,c=%ld",
1113 (long) match_start_boff, (long) match_start_coff,
1114 (long) match_end_boff, (long) match_end_coff,
1115 (long) prev_match_end_boff, (long) prev_match_end_coff));
1116
1117 duk_push_lstring(thr,
1118 (const char *) (DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff),
1119 (duk_size_t) (match_start_boff - prev_match_end_boff));
1120 duk_put_prop_index(thr, 3, arr_idx);
1121 arr_idx++;
1122 if (arr_idx >= limit) {
1123 goto hit_limit;
1124 }
1125
1126 #if defined(DUK_USE_REGEXP_SUPPORT)
1127 if (is_regexp) {
1128 duk_size_t i, len;
1129
1130 len = duk_get_length(thr, 4);
1131 for (i = 1; i < len; i++) {
1132 DUK_ASSERT(i <= DUK_UARRIDX_MAX); /* cannot have >4G captures */
1133 duk_get_prop_index(thr, 4, (duk_uarridx_t) i);
1134 duk_put_prop_index(thr, 3, arr_idx);
1135 arr_idx++;
1136 if (arr_idx >= limit) {
1137 goto hit_limit;
1138 }
1139 }
1140
1141 duk_pop(thr);
1142 /* lastIndex already set up for next match */
1143 } else {
1144 #else /* DUK_USE_REGEXP_SUPPORT */
1145 { /* unconditionally */
1146 #endif /* DUK_USE_REGEXP_SUPPORT */
1147 /* no action */
1148 }
1149
1150 prev_match_end_boff = match_end_boff;
1151 prev_match_end_coff = match_end_coff;
1152 continue;
1153 } /* for */
1154
1155 /* Combined step 11 (empty string special case) and 14-15. */
1156
1157 DUK_DDD(DUK_DDDPRINT("split trailer; prev_end b=%ld,c=%ld",
1158 (long) prev_match_end_boff, (long) prev_match_end_coff));
1159
1160 if (DUK_HSTRING_GET_BYTELEN(h_input) > 0 || !matched) {
1161 /* Add trailer if:
1162 * a) non-empty input
1163 * b) empty input and no (zero size) match found (step 11)
1164 */
1165
1166 duk_push_lstring(thr,
1167 (const char *) DUK_HSTRING_GET_DATA(h_input) + prev_match_end_boff,
1168 (duk_size_t) (DUK_HSTRING_GET_BYTELEN(h_input) - prev_match_end_boff));
1169 duk_put_prop_index(thr, 3, arr_idx);
1170 /* No arr_idx update or limit check */
1171 }
1172
1173 return 1;
1174
1175 hit_limit:
1176 #if defined(DUK_USE_REGEXP_SUPPORT)
1177 if (is_regexp) {
1178 duk_pop(thr);
1179 }
1180 #endif
1181
1182 return 1;
1183 }
1184
1185 /*
1186 * Various
1187 */
1188
1189 #if defined(DUK_USE_REGEXP_SUPPORT)
1190 DUK_LOCAL void duk__to_regexp_helper(duk_hthread *thr, duk_idx_t idx, duk_bool_t force_new) {
1191 duk_hobject *h;
1192
1193 /* Shared helper for match() steps 3-4, search() steps 3-4. */
1194
1195 DUK_ASSERT(idx >= 0);
1196
1197 if (force_new) {
1198 goto do_new;
1199 }
1200
1201 h = duk_get_hobject_with_class(thr, idx, DUK_HOBJECT_CLASS_REGEXP);
1202 if (!h) {
1203 goto do_new;
1204 }
1205 return;
1206
1207 do_new:
1208 duk_push_hobject_bidx(thr, DUK_BIDX_REGEXP_CONSTRUCTOR);
1209 duk_dup(thr, idx);
1210 duk_new(thr, 1); /* [ ... RegExp val ] -> [ ... res ] */
1211 duk_replace(thr, idx);
1212 }
1213 #endif /* DUK_USE_REGEXP_SUPPORT */
1214
1215 #if defined(DUK_USE_REGEXP_SUPPORT)
1216 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_search(duk_hthread *thr) {
1217 /* Easiest way to implement the search required by the specification
1218 * is to do a RegExp test() with lastIndex forced to zero. To avoid
1219 * side effects on the argument, "clone" the RegExp if a RegExp was
1220 * given as input.
1221 *
1222 * The global flag of the RegExp should be ignored; setting lastIndex
1223 * to zero (which happens when "cloning" the RegExp) should have an
1224 * equivalent effect.
1225 */
1226
1227 DUK_ASSERT_TOP(thr, 1);
1228 (void) duk_push_this_coercible_to_string(thr); /* at index 1 */
1229 duk__to_regexp_helper(thr, 0 /*index*/, 1 /*force_new*/);
1230
1231 /* stack[0] = regexp
1232 * stack[1] = string
1233 */
1234
1235 /* Avoid using RegExp.prototype methods, as they're writable and
1236 * configurable and may have been changed.
1237 */
1238
1239 duk_dup_0(thr);
1240 duk_dup_1(thr); /* [ ... re_obj input ] */
1241 duk_regexp_match(thr); /* -> [ ... res_obj ] */
1242
1243 if (!duk_is_object(thr, -1)) {
1244 duk_push_int(thr, -1);
1245 return 1;
1246 }
1247
1248 duk_get_prop_stridx_short(thr, -1, DUK_STRIDX_INDEX);
1249 DUK_ASSERT(duk_is_number(thr, -1));
1250 return 1;
1251 }
1252 #endif /* DUK_USE_REGEXP_SUPPORT */
1253
1254 #if defined(DUK_USE_REGEXP_SUPPORT)
1255 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_match(duk_hthread *thr) {
1256 duk_bool_t global;
1257 duk_int_t prev_last_index;
1258 duk_int_t this_index;
1259 duk_int_t arr_idx;
1260
1261 DUK_ASSERT_TOP(thr, 1);
1262 (void) duk_push_this_coercible_to_string(thr);
1263 duk__to_regexp_helper(thr, 0 /*index*/, 0 /*force_new*/);
1264 global = duk_get_prop_stridx_boolean(thr, 0, DUK_STRIDX_GLOBAL, NULL);
1265 DUK_ASSERT_TOP(thr, 2);
1266
1267 /* stack[0] = regexp
1268 * stack[1] = string
1269 */
1270
1271 if (!global) {
1272 duk_regexp_match(thr); /* -> [ res_obj ] */
1273 return 1; /* return 'res_obj' */
1274 }
1275
1276 /* Global case is more complex. */
1277
1278 /* [ regexp string ] */
1279
1280 duk_push_int(thr, 0);
1281 duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1282 duk_push_array(thr);
1283
1284 /* [ regexp string res_arr ] */
1285
1286 prev_last_index = 0;
1287 arr_idx = 0;
1288
1289 for (;;) {
1290 DUK_ASSERT_TOP(thr, 3);
1291
1292 duk_dup_0(thr);
1293 duk_dup_1(thr);
1294 duk_regexp_match(thr); /* -> [ ... regexp string ] -> [ ... res_obj ] */
1295
1296 if (!duk_is_object(thr, -1)) {
1297 duk_pop(thr);
1298 break;
1299 }
1300
1301 duk_get_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1302 DUK_ASSERT(duk_is_number(thr, -1));
1303 this_index = duk_get_int(thr, -1);
1304 duk_pop(thr);
1305
1306 if (this_index == prev_last_index) {
1307 this_index++;
1308 duk_push_int(thr, this_index);
1309 duk_put_prop_stridx_short(thr, 0, DUK_STRIDX_LAST_INDEX);
1310 }
1311 prev_last_index = this_index;
1312
1313 duk_get_prop_index(thr, -1, 0); /* match string */
1314 duk_put_prop_index(thr, 2, (duk_uarridx_t) arr_idx);
1315 arr_idx++;
1316 duk_pop(thr); /* res_obj */
1317 }
1318
1319 if (arr_idx == 0) {
1320 duk_push_null(thr);
1321 }
1322
1323 return 1; /* return 'res_arr' or 'null' */
1324 }
1325 #endif /* DUK_USE_REGEXP_SUPPORT */
1326
1327 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_concat(duk_hthread *thr) {
1328 /* duk_concat() coerces arguments with ToString() in correct order */
1329 (void) duk_push_this_coercible_to_string(thr);
1330 duk_insert(thr, 0); /* this is relatively expensive */
1331 duk_concat(thr, duk_get_top(thr));
1332 return 1;
1333 }
1334
1335 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_trim(duk_hthread *thr) {
1336 DUK_ASSERT_TOP(thr, 0);
1337 (void) duk_push_this_coercible_to_string(thr);
1338 duk_trim(thr, 0);
1339 DUK_ASSERT_TOP(thr, 1);
1340 return 1;
1341 }
1342
1343 #if defined(DUK_USE_ES6)
1344 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_repeat(duk_hthread *thr) {
1345 duk_hstring *h_input;
1346 duk_size_t input_blen;
1347 duk_size_t result_len;
1348 duk_int_t count_signed;
1349 duk_uint_t count;
1350 const duk_uint8_t *src;
1351 duk_uint8_t *buf;
1352 duk_uint8_t *p;
1353 duk_double_t d;
1354 #if !defined(DUK_USE_PREFER_SIZE)
1355 duk_size_t copy_size;
1356 duk_uint8_t *p_end;
1357 #endif
1358
1359 DUK_ASSERT_TOP(thr, 1);
1360 h_input = duk_push_this_coercible_to_string(thr);
1361 DUK_ASSERT(h_input != NULL);
1362 input_blen = DUK_HSTRING_GET_BYTELEN(h_input);
1363
1364 /* Count is ToNumber() coerced; +Infinity must be always rejected
1365 * (even if input string is zero length), as well as negative values
1366 * and -Infinity. -Infinity doesn't require an explicit check
1367 * because duk_get_int() clamps it to DUK_INT_MIN which gets rejected
1368 * as a negative value (regardless of input string length).
1369 */
1370 d = duk_to_number(thr, 0);
1371 if (duk_double_is_posinf(d)) {
1372 goto fail_range;
1373 }
1374 count_signed = duk_get_int(thr, 0);
1375 if (count_signed < 0) {
1376 goto fail_range;
1377 }
1378 count = (duk_uint_t) count_signed;
1379
1380 /* Overflow check for result length. */
1381 result_len = count * input_blen;
1382 if (count != 0 && result_len / count != input_blen) {
1383 goto fail_range;
1384 }
1385
1386 /* Temporary fixed buffer, later converted to string. */
1387 buf = (duk_uint8_t *) duk_push_fixed_buffer_nozero(thr, result_len);
1388 DUK_ASSERT(buf != NULL);
1389 src = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
1390 DUK_ASSERT(src != NULL);
1391
1392 #if defined(DUK_USE_PREFER_SIZE)
1393 p = buf;
1394 while (count-- > 0) {
1395 duk_memcpy((void *) p, (const void *) src, input_blen); /* copy size may be zero, but pointers are valid */
1396 p += input_blen;
1397 }
1398 #else /* DUK_USE_PREFER_SIZE */
1399 /* Take advantage of already copied pieces to speed up the process
1400 * especially for small repeated strings.
1401 */
1402 p = buf;
1403 p_end = p + result_len;
1404 copy_size = input_blen;
1405 for (;;) {
1406 duk_size_t remain = (duk_size_t) (p_end - p);
1407 DUK_DDD(DUK_DDDPRINT("remain=%ld, copy_size=%ld, input_blen=%ld, result_len=%ld",
1408 (long) remain, (long) copy_size, (long) input_blen,
1409 (long) result_len));
1410 if (remain <= copy_size) {
1411 /* If result_len is zero, this case is taken and does
1412 * a zero size copy (with valid pointers).
1413 */
1414 duk_memcpy((void *) p, (const void *) src, remain);
1415 break;
1416 } else {
1417 duk_memcpy((void *) p, (const void *) src, copy_size);
1418 p += copy_size;
1419 }
1420
1421 src = (const duk_uint8_t *) buf; /* Use buf as source for larger copies. */
1422 copy_size = (duk_size_t) (p - buf);
1423 }
1424 #endif /* DUK_USE_PREFER_SIZE */
1425
1426 /* XXX: It would be useful to be able to create a duk_hstring with
1427 * a certain byte size whose data area wasn't initialized and which
1428 * wasn't in the string table yet. This would allow a string to be
1429 * constructed directly without a buffer temporary and when it was
1430 * finished, it could be injected into the string table. Currently
1431 * this isn't possible because duk_hstrings are only tracked by the
1432 * intern table (they are not in heap_allocated).
1433 */
1434
1435 duk_buffer_to_string(thr, -1); /* Safe if input is safe. */
1436 return 1;
1437
1438 fail_range:
1439 DUK_DCERROR_RANGE_INVALID_ARGS(thr);
1440 }
1441 #endif /* DUK_USE_ES6 */
1442
1443 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_locale_compare(duk_hthread *thr) {
1444 duk_hstring *h1;
1445 duk_hstring *h2;
1446 duk_size_t h1_len, h2_len, prefix_len;
1447 duk_small_int_t ret = 0;
1448 duk_small_int_t rc;
1449
1450 /* The current implementation of localeCompare() is simply a codepoint
1451 * by codepoint comparison, implemented with a simple string compare
1452 * because UTF-8 should preserve codepoint ordering (assuming valid
1453 * shortest UTF-8 encoding).
1454 *
1455 * The specification requires that the return value must be related
1456 * to the sort order: e.g. negative means that 'this' comes before
1457 * 'that' in sort order. We assume an ascending sort order.
1458 */
1459
1460 /* XXX: could share code with duk_js_ops.c, duk_js_compare_helper */
1461
1462 h1 = duk_push_this_coercible_to_string(thr);
1463 DUK_ASSERT(h1 != NULL);
1464
1465 h2 = duk_to_hstring(thr, 0);
1466 DUK_ASSERT(h2 != NULL);
1467
1468 h1_len = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h1);
1469 h2_len = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h2);
1470 prefix_len = (h1_len <= h2_len ? h1_len : h2_len);
1471
1472 rc = (duk_small_int_t) duk_memcmp((const void *) DUK_HSTRING_GET_DATA(h1),
1473 (const void *) DUK_HSTRING_GET_DATA(h2),
1474 (size_t) prefix_len);
1475
1476 if (rc < 0) {
1477 ret = -1;
1478 goto done;
1479 } else if (rc > 0) {
1480 ret = 1;
1481 goto done;
1482 }
1483
1484 /* prefix matches, lengths matter now */
1485 if (h1_len > h2_len) {
1486 ret = 1;
1487 goto done;
1488 } else if (h1_len == h2_len) {
1489 DUK_ASSERT(ret == 0);
1490 goto done;
1491 }
1492 ret = -1;
1493 goto done;
1494
1495 done:
1496 duk_push_int(thr, (duk_int_t) ret);
1497 return 1;
1498 }
1499
1500 #if defined(DUK_USE_ES6)
1501 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_startswith_endswith(duk_hthread *thr) {
1502 duk_int_t magic;
1503 duk_hstring *h_target;
1504 duk_size_t blen_target;
1505 duk_hstring *h_search;
1506 duk_size_t blen_search;
1507 duk_int_t off;
1508 duk_bool_t result = 0;
1509 duk_size_t blen_left;
1510
1511 /* Because string byte lengths are in [0,DUK_INT_MAX] it's safe to
1512 * subtract two string lengths without overflow.
1513 */
1514 DUK_ASSERT(DUK_HSTRING_MAX_BYTELEN <= DUK_INT_MAX);
1515
1516 h_target = duk_push_this_coercible_to_string(thr);
1517 DUK_ASSERT(h_target != NULL);
1518
1519 h_search = duk__str_tostring_notregexp(thr, 0);
1520 DUK_ASSERT(h_search != NULL);
1521
1522 magic = duk_get_current_magic(thr);
1523
1524 /* Careful to avoid pointer overflows in the matching logic. */
1525
1526 blen_target = DUK_HSTRING_GET_BYTELEN(h_target);
1527 blen_search = DUK_HSTRING_GET_BYTELEN(h_search);
1528
1529 #if 0
1530 /* If search string is longer than the target string, we can
1531 * never match. Could check explicitly, but should be handled
1532 * correctly below.
1533 */
1534 if (blen_search > blen_target) {
1535 goto finish;
1536 }
1537 #endif
1538
1539 off = 0;
1540 if (duk_is_undefined(thr, 1)) {
1541 if (magic) {
1542 off = (duk_int_t) blen_target - (duk_int_t) blen_search;
1543 } else {
1544 DUK_ASSERT(off == 0);
1545 }
1546 } else {
1547 duk_int_t len;
1548 duk_int_t pos;
1549
1550 DUK_ASSERT(DUK_HSTRING_MAX_BYTELEN <= DUK_INT_MAX);
1551 len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h_target);
1552 pos = duk_to_int_clamped(thr, 1, 0, len);
1553 DUK_ASSERT(pos >= 0 && pos <= len);
1554
1555 off = (duk_int_t) duk_heap_strcache_offset_char2byte(thr, h_target, (duk_uint_fast32_t) pos);
1556 if (magic) {
1557 off -= (duk_int_t) blen_search;
1558 }
1559 }
1560 if (off < 0 || off > (duk_int_t) blen_target) {
1561 goto finish;
1562 }
1563
1564 /* The main comparison can be done using a memcmp() rather than
1565 * doing codepoint comparisons: for CESU-8 strings there is a
1566 * canonical representation for every codepoint. But we do need
1567 * to deal with the char/byte offset translation to find the
1568 * comparison range.
1569 */
1570
1571 DUK_ASSERT(off >= 0);
1572 DUK_ASSERT((duk_size_t) off <= blen_target);
1573 blen_left = blen_target - (duk_size_t) off;
1574 if (blen_left >= blen_search) {
1575 const duk_uint8_t *p_cmp_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_target) + off;
1576 const duk_uint8_t *p_search = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_search);
1577 if (duk_memcmp_unsafe((const void *) p_cmp_start, (const void *) p_search, (size_t) blen_search) == 0) {
1578 result = 1;
1579 }
1580 }
1581
1582 finish:
1583 duk_push_boolean(thr, result);
1584 return 1;
1585 }
1586 #endif /* DUK_USE_ES6 */
1587
1588 #if defined(DUK_USE_ES6)
1589 DUK_INTERNAL duk_ret_t duk_bi_string_prototype_includes(duk_hthread *thr) {
1590 duk_hstring *h;
1591 duk_hstring *h_search;
1592 duk_int_t len;
1593 duk_int_t pos;
1594
1595 h = duk_push_this_coercible_to_string(thr);
1596 DUK_ASSERT(h != NULL);
1597
1598 h_search = duk__str_tostring_notregexp(thr, 0);
1599 DUK_ASSERT(h_search != NULL);
1600
1601 len = (duk_int_t) DUK_HSTRING_GET_CHARLEN(h);
1602 pos = duk_to_int_clamped(thr, 1, 0, len);
1603 DUK_ASSERT(pos >= 0 && pos <= len);
1604
1605 pos = duk__str_search_shared(thr, h, h_search, pos, 0 /*backwards*/);
1606 duk_push_boolean(thr, pos >= 0);
1607 return 1;
1608 }
1609 #endif /* DUK_USE_ES6 */
1610 #endif /* DUK_USE_STRING_BUILTIN */
1611