1 /*
2  *  String manipulation
3  */
4 
5 #include "duk_internal.h"
6 
duk__concat_and_join_helper(duk_hthread * thr,duk_idx_t count_in,duk_bool_t is_join)7 DUK_LOCAL void duk__concat_and_join_helper(duk_hthread *thr, duk_idx_t count_in, duk_bool_t is_join) {
8 	duk_uint_t count;
9 	duk_uint_t i;
10 	duk_size_t idx;
11 	duk_size_t len;
12 	duk_hstring *h;
13 	duk_uint8_t *buf;
14 
15 	DUK_CTX_ASSERT_VALID(thr);
16 
17 	if (DUK_UNLIKELY(count_in <= 0)) {
18 		if (count_in < 0) {
19 			DUK_ERROR_RANGE_INVALID_COUNT(thr);
20 			DUK_WO_NORETURN(return;);
21 		}
22 		DUK_ASSERT(count_in == 0);
23 		duk_push_hstring_empty(thr);
24 		return;
25 	}
26 	count = (duk_uint_t) count_in;
27 
28 	if (is_join) {
29 		duk_size_t t1, t2, limit;
30 		h = duk_to_hstring(thr, -((duk_idx_t) count) - 1);
31 		DUK_ASSERT(h != NULL);
32 
33 		/* A bit tricky overflow test, see doc/code-issues.rst. */
34 		t1 = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
35 		t2 = (duk_size_t) (count - 1);
36 		limit = (duk_size_t) DUK_HSTRING_MAX_BYTELEN;
37 		if (DUK_UNLIKELY(t2 != 0 && t1 > limit / t2)) {
38 			/* Combined size of separators already overflows. */
39 			goto error_overflow;
40 		}
41 		len = (duk_size_t) (t1 * t2);
42 	} else {
43 		len = (duk_size_t) 0;
44 	}
45 
46 	for (i = count; i >= 1; i--) {
47 		duk_size_t new_len;
48 		h = duk_to_hstring(thr, -((duk_idx_t) i));
49 		new_len = len + (duk_size_t) DUK_HSTRING_GET_BYTELEN(h);
50 
51 		/* Impose a string maximum length, need to handle overflow
52 		 * correctly.
53 		 */
54 		if (new_len < len ||  /* wrapped */
55 		    new_len > (duk_size_t) DUK_HSTRING_MAX_BYTELEN) {
56 			goto error_overflow;
57 		}
58 		len = new_len;
59 	}
60 
61 	DUK_DDD(DUK_DDDPRINT("join/concat %lu strings, total length %lu bytes",
62 	                     (unsigned long) count, (unsigned long) len));
63 
64 	/* Use stack allocated buffer to ensure reachability in errors
65 	 * (e.g. intern error).
66 	 */
67 	buf = (duk_uint8_t *) duk_push_fixed_buffer_nozero(thr, len);
68 	DUK_ASSERT(buf != NULL);
69 
70 	/* [ ... (sep) str1 str2 ... strN buf ] */
71 
72 	idx = 0;
73 	for (i = count; i >= 1; i--) {
74 		if (is_join && i != count) {
75 			h = duk_require_hstring(thr, -((duk_idx_t) count) - 2);  /* extra -1 for buffer */
76 			duk_memcpy(buf + idx, DUK_HSTRING_GET_DATA(h), DUK_HSTRING_GET_BYTELEN(h));
77 			idx += DUK_HSTRING_GET_BYTELEN(h);
78 		}
79 		h = duk_require_hstring(thr, -((duk_idx_t) i) - 1);  /* extra -1 for buffer */
80 		duk_memcpy(buf + idx, DUK_HSTRING_GET_DATA(h), DUK_HSTRING_GET_BYTELEN(h));
81 		idx += DUK_HSTRING_GET_BYTELEN(h);
82 	}
83 
84 	DUK_ASSERT(idx == len);
85 
86 	/* [ ... (sep) str1 str2 ... strN buf ] */
87 
88 	/* Get rid of the strings early to minimize memory use before intern. */
89 
90 	if (is_join) {
91 		duk_replace(thr, -((duk_idx_t) count) - 2);  /* overwrite sep */
92 		duk_pop_n(thr, (duk_idx_t) count);
93 	} else {
94 		duk_replace(thr, -((duk_idx_t) count) - 1);  /* overwrite str1 */
95 		duk_pop_n(thr, (duk_idx_t) (count - 1));
96 	}
97 
98 	/* [ ... buf ] */
99 
100 	(void) duk_buffer_to_string(thr, -1);  /* Safe if inputs are safe. */
101 
102 	/* [ ... res ] */
103 	return;
104 
105  error_overflow:
106 	DUK_ERROR_RANGE(thr, DUK_STR_RESULT_TOO_LONG);
107 	DUK_WO_NORETURN(return;);
108 }
109 
duk_concat(duk_hthread * thr,duk_idx_t count)110 DUK_EXTERNAL void duk_concat(duk_hthread *thr, duk_idx_t count) {
111 	DUK_ASSERT_API_ENTRY(thr);
112 
113 	duk__concat_and_join_helper(thr, count, 0 /*is_join*/);
114 }
115 
116 #if defined(DUK_USE_PREFER_SIZE)
duk_concat_2(duk_hthread * thr)117 DUK_INTERNAL void duk_concat_2(duk_hthread *thr) {
118 	DUK_ASSERT_API_ENTRY(thr);
119 	duk_concat(thr, 2);
120 }
121 #else  /* DUK_USE_PREFER_SIZE */
duk_concat_2(duk_hthread * thr)122 DUK_INTERNAL void duk_concat_2(duk_hthread *thr) {
123 	duk_hstring *h1;
124 	duk_hstring *h2;
125 	duk_uint8_t *buf;
126 	duk_size_t len1;
127 	duk_size_t len2;
128 	duk_size_t len;
129 
130 	DUK_ASSERT_API_ENTRY(thr);
131 	DUK_ASSERT(duk_get_top(thr) >= 2);  /* Trusted caller. */
132 
133 	h1 = duk_to_hstring(thr, -2);
134 	h2 = duk_to_hstring(thr, -1);
135 	len1 = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h1);
136 	len2 = (duk_size_t) DUK_HSTRING_GET_BYTELEN(h2);
137 	len = len1 + len2;
138 	if (DUK_UNLIKELY(len < len1 ||  /* wrapped */
139 	                 len > (duk_size_t) DUK_HSTRING_MAX_BYTELEN)) {
140 		goto error_overflow;
141 	}
142 	buf = (duk_uint8_t *) duk_push_fixed_buffer_nozero(thr, len);
143 	DUK_ASSERT(buf != NULL);
144 
145 	duk_memcpy((void *) buf, (const void *) DUK_HSTRING_GET_DATA(h1), (size_t) len1);
146 	duk_memcpy((void *) (buf + len1), (const void *) DUK_HSTRING_GET_DATA(h2), (size_t) len2);
147 	(void) duk_buffer_to_string(thr, -1);  /* Safe if inputs are safe. */
148 
149 	/* [ ... str1 str2 buf ] */
150 
151 	duk_replace(thr, -3);
152 	duk_pop_unsafe(thr);
153 	return;
154 
155  error_overflow:
156 	DUK_ERROR_RANGE(thr, DUK_STR_RESULT_TOO_LONG);
157 	DUK_WO_NORETURN(return;);
158 }
159 #endif  /* DUK_USE_PREFER_SIZE */
160 
duk_join(duk_hthread * thr,duk_idx_t count)161 DUK_EXTERNAL void duk_join(duk_hthread *thr, duk_idx_t count) {
162 	DUK_ASSERT_API_ENTRY(thr);
163 
164 	duk__concat_and_join_helper(thr, count, 1 /*is_join*/);
165 }
166 
/* XXX: could map/decode be unified with the duk_unicode_support.c code?
 * Case conversion also needs the surrounding characters, though.
 */
170 
duk_decode_string(duk_hthread * thr,duk_idx_t idx,duk_decode_char_function callback,void * udata)171 DUK_EXTERNAL void duk_decode_string(duk_hthread *thr, duk_idx_t idx, duk_decode_char_function callback, void *udata) {
172 	duk_hstring *h_input;
173 	const duk_uint8_t *p, *p_start, *p_end;
174 	duk_codepoint_t cp;
175 
176 	DUK_ASSERT_API_ENTRY(thr);
177 
178 	h_input = duk_require_hstring(thr, idx);  /* Accept symbols. */
179 	DUK_ASSERT(h_input != NULL);
180 
181 	p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
182 	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
183 	p = p_start;
184 
185 	for (;;) {
186 		if (p >= p_end) {
187 			break;
188 		}
189 		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
190 		callback(udata, cp);
191 	}
192 }
193 
duk_map_string(duk_hthread * thr,duk_idx_t idx,duk_map_char_function callback,void * udata)194 DUK_EXTERNAL void duk_map_string(duk_hthread *thr, duk_idx_t idx, duk_map_char_function callback, void *udata) {
195 	duk_hstring *h_input;
196 	duk_bufwriter_ctx bw_alloc;
197 	duk_bufwriter_ctx *bw;
198 	const duk_uint8_t *p, *p_start, *p_end;
199 	duk_codepoint_t cp;
200 
201 	DUK_ASSERT_API_ENTRY(thr);
202 
203 	idx = duk_normalize_index(thr, idx);
204 
205 	h_input = duk_require_hstring(thr, idx);  /* Accept symbols. */
206 	DUK_ASSERT(h_input != NULL);
207 
208 	bw = &bw_alloc;
209 	DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));  /* Reasonable output estimate. */
210 
211 	p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
212 	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
213 	p = p_start;
214 
215 	for (;;) {
216 		/* XXX: could write output in chunks with fewer ensure calls,
217 		 * but relative benefit would be small here.
218 		 */
219 
220 		if (p >= p_end) {
221 			break;
222 		}
223 		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
224 		cp = callback(udata, cp);
225 
226 		DUK_BW_WRITE_ENSURE_XUTF8(thr, bw, cp);
227 	}
228 
229 	DUK_BW_COMPACT(thr, bw);
230 	(void) duk_buffer_to_string(thr, -1);  /* Safe, extended UTF-8 encoded. */
231 	duk_replace(thr, idx);
232 }
233 
duk_substring(duk_hthread * thr,duk_idx_t idx,duk_size_t start_offset,duk_size_t end_offset)234 DUK_EXTERNAL void duk_substring(duk_hthread *thr, duk_idx_t idx, duk_size_t start_offset, duk_size_t end_offset) {
235 	duk_hstring *h;
236 	duk_hstring *res;
237 	duk_size_t start_byte_offset;
238 	duk_size_t end_byte_offset;
239 	duk_size_t charlen;
240 
241 	DUK_ASSERT_API_ENTRY(thr);
242 
243 	idx = duk_require_normalize_index(thr, idx);  /* Accept symbols. */
244 	h = duk_require_hstring(thr, idx);
245 	DUK_ASSERT(h != NULL);
246 
247 	charlen = DUK_HSTRING_GET_CHARLEN(h);
248 	if (end_offset >= charlen) {
249 		end_offset = charlen;
250 	}
251 	if (start_offset > end_offset) {
252 		start_offset = end_offset;
253 	}
254 
255 	DUK_ASSERT_DISABLE(start_offset >= 0);
256 	DUK_ASSERT(start_offset <= end_offset && start_offset <= DUK_HSTRING_GET_CHARLEN(h));
257 	DUK_ASSERT_DISABLE(end_offset >= 0);
258 	DUK_ASSERT(end_offset >= start_offset && end_offset <= DUK_HSTRING_GET_CHARLEN(h));
259 
260 	/* Guaranteed by string limits. */
261 	DUK_ASSERT(start_offset <= DUK_UINT32_MAX);
262 	DUK_ASSERT(end_offset <= DUK_UINT32_MAX);
263 
264 	start_byte_offset = (duk_size_t) duk_heap_strcache_offset_char2byte(thr, h, (duk_uint_fast32_t) start_offset);
265 	end_byte_offset = (duk_size_t) duk_heap_strcache_offset_char2byte(thr, h, (duk_uint_fast32_t) end_offset);
266 
267 	DUK_ASSERT(end_byte_offset >= start_byte_offset);
268 	DUK_ASSERT(end_byte_offset - start_byte_offset <= DUK_UINT32_MAX);  /* Guaranteed by string limits. */
269 
270 	/* No size check is necessary. */
271 	res = duk_heap_strtable_intern_checked(thr,
272 	                                       DUK_HSTRING_GET_DATA(h) + start_byte_offset,
273 	                                       (duk_uint32_t) (end_byte_offset - start_byte_offset));
274 
275 	duk_push_hstring(thr, res);
276 	duk_replace(thr, idx);
277 }
278 
279 /* XXX: this is quite clunky.  Add Unicode helpers to scan backwards and
280  * forwards with a callback to process codepoints?
281  */
duk_trim(duk_hthread * thr,duk_idx_t idx)282 DUK_EXTERNAL void duk_trim(duk_hthread *thr, duk_idx_t idx) {
283 	duk_hstring *h;
284 	const duk_uint8_t *p, *p_start, *p_end, *p_tmp1, *p_tmp2;  /* pointers for scanning */
285 	const duk_uint8_t *q_start, *q_end;  /* start (incl) and end (excl) of trimmed part */
286 	duk_codepoint_t cp;
287 
288 	DUK_ASSERT_API_ENTRY(thr);
289 
290 	idx = duk_require_normalize_index(thr, idx);  /* Accept symbols. */
291 	h = duk_require_hstring(thr, idx);
292 	DUK_ASSERT(h != NULL);
293 
294 	p_start = DUK_HSTRING_GET_DATA(h);
295 	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h);
296 
297 	p = p_start;
298 	while (p < p_end) {
299 		p_tmp1 = p;
300 		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp1, p_start, p_end);
301 		if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
302 			break;
303 		}
304 		p = p_tmp1;
305 	}
306 	q_start = p;
307 	if (p == p_end) {
308 		/* Entire string is whitespace. */
309 		q_end = p;
310 		goto scan_done;
311 	}
312 
313 	p = p_end;
314 	while (p > p_start) {
315 		p_tmp1 = p;
316 		while (p > p_start) {
317 			p--;
318 			if (((*p) & 0xc0) != 0x80) {
319 				break;
320 			}
321 		}
322 		p_tmp2 = p;
323 
324 		cp = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p_tmp2, p_start, p_end);
325 		if (!(duk_unicode_is_whitespace(cp) || duk_unicode_is_line_terminator(cp))) {
326 			p = p_tmp1;
327 			break;
328 		}
329 	}
330 	q_end = p;
331 
332  scan_done:
333 	/* This may happen when forward and backward scanning disagree
334 	 * (possible for non-extended-UTF-8 strings).
335 	 */
336 	if (q_end < q_start) {
337 		q_end = q_start;
338 	}
339 
340 	DUK_ASSERT(q_start >= p_start && q_start <= p_end);
341 	DUK_ASSERT(q_end >= p_start && q_end <= p_end);
342 	DUK_ASSERT(q_end >= q_start);
343 
344 	DUK_DDD(DUK_DDDPRINT("trim: p_start=%p, p_end=%p, q_start=%p, q_end=%p",
345 	                     (const void *) p_start, (const void *) p_end,
346 	                     (const void *) q_start, (const void *) q_end));
347 
348 	if (q_start == p_start && q_end == p_end) {
349 		DUK_DDD(DUK_DDDPRINT("nothing was trimmed: avoid interning (hashing etc)"));
350 		return;
351 	}
352 
353 	duk_push_lstring(thr, (const char *) q_start, (duk_size_t) (q_end - q_start));
354 	duk_replace(thr, idx);
355 }
356 
duk_char_code_at(duk_hthread * thr,duk_idx_t idx,duk_size_t char_offset)357 DUK_EXTERNAL duk_codepoint_t duk_char_code_at(duk_hthread *thr, duk_idx_t idx, duk_size_t char_offset) {
358 	duk_hstring *h;
359 	duk_ucodepoint_t cp;
360 
361 	DUK_ASSERT_API_ENTRY(thr);
362 
363 	/* XXX: Share code with String.prototype.charCodeAt?  Main difference
364 	 * is handling of clamped offsets.
365 	 */
366 
367 	h = duk_require_hstring(thr, idx);  /* Accept symbols. */
368 	DUK_ASSERT(h != NULL);
369 
370 	DUK_ASSERT_DISABLE(char_offset >= 0);  /* Always true, arg is unsigned. */
371 	if (char_offset >= DUK_HSTRING_GET_CHARLEN(h)) {
372 		return 0;
373 	}
374 
375 	DUK_ASSERT(char_offset <= DUK_UINT_MAX);  /* Guaranteed by string limits. */
376 	cp = duk_hstring_char_code_at_raw(thr, h, (duk_uint_t) char_offset, 0 /*surrogate_aware*/);
377 	return (duk_codepoint_t) cp;
378 }
379