1 /*
2  *  Various Unicode help functions for character classification predicates,
3  *  case conversion, decoding, etc.
4  */
5 
6 #include "duk_internal.h"
7 
8 /*
9  *  Fast path tables
10  */
11 
12 #if defined(DUK_USE_IDCHAR_FASTPATH)
13 DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
14 	/* 0: not IdentifierStart or IdentifierPart
15 	 * 1: IdentifierStart and IdentifierPart
16 	 * -1: IdentifierPart only
17 	 */
18 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x00...0x0f */
19 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x10...0x1f */
20 	0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,   /* 0x20...0x2f */
21 	-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  0,  0,  0,  0,  0,   /* 0x30...0x3f */
22 	0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,   /* 0x40...0x4f */
23 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,   /* 0x50...0x5f */
24 	0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,   /* 0x60...0x6f */
25 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0    /* 0x70...0x7f */
26 };
27 #endif
28 
29 /*
30  *  XUTF-8 and CESU-8 encoding/decoding
31  */
32 
duk_unicode_get_xutf8_length(duk_ucodepoint_t cp)33 DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
34 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
35 	if (x < 0x80UL) {
36 		/* 7 bits */
37 		return 1;
38 	} else if (x < 0x800UL) {
39 		/* 11 bits */
40 		return 2;
41 	} else if (x < 0x10000UL) {
42 		/* 16 bits */
43 		return 3;
44 	} else if (x < 0x200000UL) {
45 		/* 21 bits */
46 		return 4;
47 	} else if (x < 0x4000000UL) {
48 		/* 26 bits */
49 		return 5;
50 	} else if (x < (duk_ucodepoint_t) 0x80000000UL) {
51 		/* 31 bits */
52 		return 6;
53 	} else {
54 		/* 36 bits */
55 		return 7;
56 	}
57 }
58 
59 #if defined(DUK_USE_ASSERTIONS)
duk_unicode_get_cesu8_length(duk_ucodepoint_t cp)60 DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
61 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
62 	if (x < 0x80UL) {
63 		/* 7 bits */
64 		return 1;
65 	} else if (x < 0x800UL) {
66 		/* 11 bits */
67 		return 2;
68 	} else if (x < 0x10000UL) {
69 		/* 16 bits */
70 		return 3;
71 	} else {
72 		/* Encoded as surrogate pair, each encoding to 3 bytes for
73 		 * 6 bytes total.  Codepoints above U+10FFFF encode as 6 bytes
74 		 * too, see duk_unicode_encode_cesu8().
75 		  */
76 		return 3 + 3;
77 	}
78 }
79 #endif  /* DUK_USE_ASSERTIONS */
80 
81 DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = {
82 	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
83 };
84 
85 /* Encode to extended UTF-8; 'out' must have space for at least
86  * DUK_UNICODE_MAX_XUTF8_LENGTH bytes.  Allows encoding of any
87  * 32-bit (unsigned) codepoint.
88  */
duk_unicode_encode_xutf8(duk_ucodepoint_t cp,duk_uint8_t * out)89 DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
90 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
91 	duk_small_int_t len;
92 	duk_uint8_t marker;
93 	duk_small_int_t i;
94 
95 	len = duk_unicode_get_xutf8_length(cp);
96 	DUK_ASSERT(len > 0);
97 
98 	marker = duk_unicode_xutf8_markers[len - 1];  /* 64-bit OK because always >= 0 */
99 
100 	i = len;
101 	DUK_ASSERT(i > 0);
102 	do {
103 		i--;
104 		if (i > 0) {
105 			out[i] = (duk_uint8_t) (0x80 + (x & 0x3f));
106 			x >>= 6;
107 		} else {
108 			/* Note: masking of 'x' is not necessary because of
109 			 * range check and shifting -> no bits overlapping
110 			 * the marker should be set.
111 			 */
112 			out[0] = (duk_uint8_t) (marker + x);
113 		}
114 	} while (i > 0);
115 
116 	return len;
117 }
118 
119 /* Encode to CESU-8; 'out' must have space for at least
120  * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
121  * will encode to garbage but won't overwrite the output buffer.
122  */
duk_unicode_encode_cesu8(duk_ucodepoint_t cp,duk_uint8_t * out)123 DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
124 	duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
125 	duk_small_int_t len;
126 
127 	if (x < 0x80UL) {
128 		out[0] = (duk_uint8_t) x;
129 		len = 1;
130 	} else if (x < 0x800UL) {
131 		out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f));
132 		out[1] = (duk_uint8_t) (0x80 + (x & 0x3f));
133 		len = 2;
134 	} else if (x < 0x10000UL) {
135 		/* surrogate pairs get encoded here */
136 		out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f));
137 		out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f));
138 		out[2] = (duk_uint8_t) (0x80 + (x & 0x3f));
139 		len = 3;
140 	} else {
141 		/*
142 		 *  Unicode codepoints above U+FFFF are encoded as surrogate
143 		 *  pairs here.  This ensures that all CESU-8 codepoints are
144 		 *  16-bit values as expected in ECMAScript.  The surrogate
145 		 *  pairs always get a 3-byte encoding (each) in CESU-8.
146 		 *  See: http://en.wikipedia.org/wiki/Surrogate_pair
147 		 *
148 		 *  20-bit codepoint, 10 bits (A and B) per surrogate pair:
149 		 *
150 		 *    x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
151 		 *  sp1 = 0b110110AA AAAAAAAA  (0xd800 + ((x >> 10) & 0x3ff))
152 		 *  sp2 = 0b110111BB BBBBBBBB  (0xdc00 + (x & 0x3ff))
153 		 *
154 		 *  Encoded into CESU-8:
155 		 *
156 		 *  sp1 -> 0b11101101  (0xe0 + ((sp1 >> 12) & 0x0f))
157 		 *      -> 0b1010AAAA  (0x80 + ((sp1 >> 6) & 0x3f))
158 		 *      -> 0b10AAAAAA  (0x80 + (sp1 & 0x3f))
159 		 *  sp2 -> 0b11101101  (0xe0 + ((sp2 >> 12) & 0x0f))
160 		 *      -> 0b1011BBBB  (0x80 + ((sp2 >> 6) & 0x3f))
161 		 *      -> 0b10BBBBBB  (0x80 + (sp2 & 0x3f))
162 		 *
163 		 *  Note that 0x10000 must be subtracted first.  The code below
164 		 *  avoids the sp1, sp2 temporaries which saves around 20 bytes
165 		 *  of code.
166 		 */
167 
168 		x -= 0x10000UL;
169 
170 		out[0] = (duk_uint8_t) (0xed);
171 		out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f));
172 		out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f));
173 		out[3] = (duk_uint8_t) (0xed);
174 		out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f));
175 		out[5] = (duk_uint8_t) (0x80 + (x & 0x3f));
176 		len = 6;
177 	}
178 
179 	return len;
180 }
181 
182 /* Decode helper.  Return zero on error. */
duk_unicode_decode_xutf8(duk_hthread * thr,const duk_uint8_t ** ptr,const duk_uint8_t * ptr_start,const duk_uint8_t * ptr_end,duk_ucodepoint_t * out_cp)183 DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) {
184 	const duk_uint8_t *p;
185 	duk_uint32_t res;
186 	duk_uint_fast8_t ch;
187 	duk_small_int_t n;
188 
189 	DUK_UNREF(thr);
190 
191 	p = *ptr;
192 	if (p < ptr_start || p >= ptr_end) {
193 		goto fail;
194 	}
195 
196 	/*
197 	 *  UTF-8 decoder which accepts longer than standard byte sequences.
198 	 *  This allows full 32-bit code points to be used.
199 	 */
200 
201 	ch = (duk_uint_fast8_t) (*p++);
202 	if (ch < 0x80) {
203 		/* 0xxx xxxx   [7 bits] */
204 		res = (duk_uint32_t) (ch & 0x7f);
205 		n = 0;
206 	} else if (ch < 0xc0) {
207 		/* 10xx xxxx -> invalid */
208 		goto fail;
209 	} else if (ch < 0xe0) {
210 		/* 110x xxxx   10xx xxxx   [11 bits] */
211 		res = (duk_uint32_t) (ch & 0x1f);
212 		n = 1;
213 	} else if (ch < 0xf0) {
214 		/* 1110 xxxx   10xx xxxx   10xx xxxx   [16 bits] */
215 		res = (duk_uint32_t) (ch & 0x0f);
216 		n = 2;
217 	} else if (ch < 0xf8) {
218 		/* 1111 0xxx   10xx xxxx   10xx xxxx   10xx xxxx   [21 bits] */
219 		res = (duk_uint32_t) (ch & 0x07);
220 		n = 3;
221 	} else if (ch < 0xfc) {
222 		/* 1111 10xx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [26 bits] */
223 		res = (duk_uint32_t) (ch & 0x03);
224 		n = 4;
225 	} else if (ch < 0xfe) {
226 		/* 1111 110x   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [31 bits] */
227 		res = (duk_uint32_t) (ch & 0x01);
228 		n = 5;
229 	} else if (ch < 0xff) {
230 		/* 1111 1110   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [36 bits] */
231 		res = (duk_uint32_t) (0);
232 		n = 6;
233 	} else {
234 		/* 8-byte format could be:
235 		 * 1111 1111   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   10xx xxxx   [41 bits]
236 		 *
237 		 * However, this format would not have a zero bit following the
238 		 * leading one bits and would not allow 0xFF to be used as an
239 		 * "invalid xutf-8" marker for internal keys.  Further, 8-byte
240 		 * encodings (up to 41 bit code points) are not currently needed.
241 		 */
242 		goto fail;
243 	}
244 
245 	DUK_ASSERT(p >= ptr_start);  /* verified at beginning */
246 	if (p + n > ptr_end) {
247 		/* check pointer at end */
248 		goto fail;
249 	}
250 
251 	while (n > 0) {
252 		DUK_ASSERT(p >= ptr_start && p < ptr_end);
253 		ch = (duk_uint_fast8_t) (*p++);
254 #if 0
255 		if (ch & 0xc0 != 0x80) {
256 			/* not a continuation byte */
257 			p--;
258 			*ptr = p;
259 			*out_cp = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
260 			return 1;
261 		}
262 #endif
263 		res = (res << 6) + (duk_uint32_t) (ch & 0x3f);
264 		n--;
265 	}
266 
267 	*ptr = p;
268 	*out_cp = res;
269 	return 1;
270 
271  fail:
272 	return 0;
273 }
274 
275 /* used by e.g. duk_regexp_executor.c, string built-ins */
duk_unicode_decode_xutf8_checked(duk_hthread * thr,const duk_uint8_t ** ptr,const duk_uint8_t * ptr_start,const duk_uint8_t * ptr_end)276 DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end) {
277 	duk_ucodepoint_t cp;
278 
279 	if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
280 		return cp;
281 	}
282 	DUK_ERROR_INTERNAL(thr);
283 	DUK_WO_NORETURN(return 0;);
284 }
285 
286 /* Compute (extended) utf-8 length without codepoint encoding validation,
287  * used for string interning.
288  *
289  * NOTE: This algorithm is performance critical, more so than string hashing
290  * in some cases.  It is needed when interning a string and needs to scan
291  * every byte of the string with no skipping.  Having an ASCII fast path
292  * is useful if possible in the algorithm.  The current algorithms were
293  * chosen from several variants, based on x64 gcc -O2 testing.  See:
294  * https://github.com/svaarala/duktape/pull/422
295  *
296  * NOTE: must match tools/dukutil.py:duk_unicode_unvalidated_utf8_length().
297  */
298 
299 #if defined(DUK_USE_PREFER_SIZE)
300 /* Small variant; roughly 150 bytes smaller than the fast variant. */
duk_unicode_unvalidated_utf8_length(const duk_uint8_t * data,duk_size_t blen)301 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
302 	const duk_uint8_t *p;
303 	const duk_uint8_t *p_end;
304 	duk_size_t ncont;
305 	duk_size_t clen;
306 
307 	p = data;
308 	p_end = data + blen;
309 	ncont = 0;
310 	while (p != p_end) {
311 		duk_uint8_t x;
312 		x = *p++;
313 		if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
314 			ncont++;
315 		}
316 	}
317 
318 	DUK_ASSERT(ncont <= blen);
319 	clen = blen - ncont;
320 	DUK_ASSERT(clen <= blen);
321 	return clen;
322 }
323 #else  /* DUK_USE_PREFER_SIZE */
324 /* This seems like a good overall approach.  Fast path for ASCII in 4 byte
325  * blocks.
326  */
duk_unicode_unvalidated_utf8_length(const duk_uint8_t * data,duk_size_t blen)327 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
328 	const duk_uint8_t *p;
329 	const duk_uint8_t *p_end;
330 	const duk_uint32_t *p32_end;
331 	const duk_uint32_t *p32;
332 	duk_size_t ncont;
333 	duk_size_t clen;
334 
335 	ncont = 0;  /* number of continuation (non-initial) bytes in [0x80,0xbf] */
336 	p = data;
337 	p_end = data + blen;
338 	if (blen < 16) {
339 		goto skip_fastpath;
340 	}
341 
342 	/* Align 'p' to 4; the input data may have arbitrary alignment.
343 	 * End of string check not needed because blen >= 16.
344 	 */
345 	while (((duk_size_t) (const void *) p) & 0x03U) {
346 		duk_uint8_t x;
347 		x = *p++;
348 		if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
349 			ncont++;
350 		}
351 	}
352 
353 	/* Full, aligned 4-byte reads. */
354 	p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03)));
355 	p32 = (const duk_uint32_t *) (const void *) p;
356 	while (p32 != (const duk_uint32_t *) p32_end) {
357 		duk_uint32_t x;
358 		x = *p32++;
359 		if (DUK_LIKELY((x & 0x80808080UL) == 0)) {
360 			;  /* ASCII fast path */
361 		} else {
362 			/* Flip highest bit of each byte which changes
363 			 * the bit pattern 10xxxxxx into 00xxxxxx which
364 			 * allows an easy bit mask test.
365 			 */
366 			x ^= 0x80808080UL;
367 			if (DUK_UNLIKELY(!(x & 0xc0000000UL))) {
368 				ncont++;
369 			}
370 			if (DUK_UNLIKELY(!(x & 0x00c00000UL))) {
371 				ncont++;
372 			}
373 			if (DUK_UNLIKELY(!(x & 0x0000c000UL))) {
374 				ncont++;
375 			}
376 			if (DUK_UNLIKELY(!(x & 0x000000c0UL))) {
377 				ncont++;
378 			}
379 		}
380 	}
381 	p = (const duk_uint8_t *) p32;
382 	/* Fall through to handle the rest. */
383 
384  skip_fastpath:
385 	while (p != p_end) {
386 		duk_uint8_t x;
387 		x = *p++;
388 		if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
389 			ncont++;
390 		}
391 	}
392 
393 	DUK_ASSERT(ncont <= blen);
394 	clen = blen - ncont;
395 	DUK_ASSERT(clen <= blen);
396 	return clen;
397 }
398 #endif  /* DUK_USE_PREFER_SIZE */
399 
400 /* Check whether a string is UTF-8 compatible or not. */
duk_unicode_is_utf8_compatible(const duk_uint8_t * buf,duk_size_t len)401 DUK_INTERNAL duk_bool_t duk_unicode_is_utf8_compatible(const duk_uint8_t *buf, duk_size_t len) {
402 	duk_size_t i = 0;
403 #if !defined(DUK_USE_PREFER_SIZE)
404 	duk_size_t len_safe;
405 #endif
406 
407 	/* Many practical strings are ASCII only, so use a fast path check
408 	 * to check chunks of bytes at once with minimal branch cost.
409 	 */
410 #if !defined(DUK_USE_PREFER_SIZE)
411 	len_safe = len & ~0x03UL;
412 	for (; i < len_safe; i += 4) {
413 		duk_uint8_t t = buf[i] | buf[i + 1] | buf[i + 2] | buf[i + 3];
414 		if (DUK_UNLIKELY((t & 0x80U) != 0U)) {
415 			/* At least one byte was outside 0x00-0x7f, break
416 			 * out to slow path (and remain there).
417 			 *
418 			 * XXX: We could also deal with the problem character
419 			 * and resume fast path later.
420 			 */
421 			break;
422 		}
423 	}
424 #endif
425 
426 	for (; i < len;) {
427 		duk_uint8_t t;
428 		duk_size_t left;
429 		duk_size_t ncont;
430 		duk_uint32_t cp;
431 		duk_uint32_t mincp;
432 
433 		t = buf[i++];
434 		if (DUK_LIKELY((t & 0x80U) == 0U)) {
435 			/* Fast path, ASCII. */
436 			continue;
437 		}
438 
439 		/* Non-ASCII start byte, slow path.
440 		 *
441 		 * 10xx xxxx          -> continuation byte
442 		 * 110x xxxx + 1*CONT -> [0x80, 0x7ff]
443 		 * 1110 xxxx + 2*CONT -> [0x800, 0xffff], must reject [0xd800,0xdfff]
444 		 * 1111 0xxx + 3*CONT -> [0x10000, 0x10ffff]
445 		 */
446 		left = len - i;
447 		if (t <= 0xdfU) {  /* 1101 1111 = 0xdf */
448 			if (t <= 0xbfU) {  /* 1011 1111 = 0xbf */
449 				return 0;
450 			}
451 			ncont = 1;
452 			mincp = 0x80UL;
453 			cp = t & 0x1fU;
454 		} else if (t <= 0xefU) {  /* 1110 1111 = 0xef */
455 			ncont = 2;
456 			mincp = 0x800UL;
457 			cp = t & 0x0fU;
458 		} else if (t <= 0xf7U) {  /* 1111 0111 = 0xf7 */
459 			ncont = 3;
460 			mincp = 0x10000UL;
461 			cp = t & 0x07U;
462 		} else {
463 			return 0;
464 		}
465 		if (left < ncont) {
466 			return 0;
467 		}
468 		while (ncont > 0U) {
469 			t = buf[i++];
470 			if ((t & 0xc0U) != 0x80U) {  /* 10xx xxxx */
471 				return 0;
472 			}
473 			cp = (cp << 6) + (t & 0x3fU);
474 			ncont--;
475 		}
476 		if (cp < mincp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
477 			return 0;
478 		}
479 	}
480 
481 	return 1;
482 }
483 
484 /*
485  *  Unicode range matcher
486  *
487  *  Matches a codepoint against a packed bitstream of character ranges.
488  *  Used for slow path Unicode matching.
489  */
490 
491 /* Must match tools/extract_chars.py, generate_match_table3(). */
duk__uni_decode_value(duk_bitdecoder_ctx * bd_ctx)492 DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
493 	duk_uint32_t t;
494 
495 	t = (duk_uint32_t) duk_bd_decode(bd_ctx, 4);
496 	if (t <= 0x0eU) {
497 		return t;
498 	}
499 	t = (duk_uint32_t) duk_bd_decode(bd_ctx, 8);
500 	if (t <= 0xfdU) {
501 		return t + 0x0f;
502 	}
503 	if (t == 0xfeU) {
504 		t = (duk_uint32_t) duk_bd_decode(bd_ctx, 12);
505 		return t + 0x0fU + 0xfeU;
506 	} else {
507 		t = (duk_uint32_t) duk_bd_decode(bd_ctx, 24);
508 		return t + 0x0fU + 0xfeU + 0x1000UL;
509 	}
510 }
511 
duk__uni_range_match(const duk_uint8_t * unitab,duk_size_t unilen,duk_codepoint_t cp)512 DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) {
513 	duk_bitdecoder_ctx bd_ctx;
514 	duk_codepoint_t prev_re;
515 
516 	duk_memzero(&bd_ctx, sizeof(bd_ctx));
517 	bd_ctx.data = (const duk_uint8_t *) unitab;
518 	bd_ctx.length = (duk_size_t) unilen;
519 
520 	prev_re = 0;
521 	for (;;) {
522 		duk_codepoint_t r1, r2;
523 		r1 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
524 		if (r1 == 0) {
525 			break;
526 		}
527 		r2 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
528 
529 		r1 = prev_re + r1;
530 		r2 = r1 + r2;
531 		prev_re = r2;
532 
533 		/* [r1,r2] is the range */
534 
535 		DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]",
536 		                     (unsigned long) cp, (unsigned long) r1, (unsigned long) r2));
537 		if (cp >= r1 && cp <= r2) {
538 			return 1;
539 		}
540 	}
541 
542 	return 0;
543 }
544 
545 /*
546  *  "WhiteSpace" production check.
547  */
548 
duk_unicode_is_whitespace(duk_codepoint_t cp)549 DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
550 	/*
551 	 *  E5 Section 7.2 specifies six characters specifically as
552 	 *  white space:
553 	 *
554 	 *    0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
555 	 *    000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;;
556 	 *    000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;;
557 	 *    0020;SPACE;Zs;0;WS;;;;;N;;;;;
558 	 *    00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
559 	 *    FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
560 	 *
561 	 *  It also specifies any Unicode category 'Zs' characters as white
562 	 *  space.  These can be extracted with the "tools/extract_chars.py" script.
563 	 *  Current result:
564 	 *
565 	 *    RAW OUTPUT:
566 	 *    ===========
567 	 *    0020;SPACE;Zs;0;WS;;;;;N;;;;;
568 	 *    00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
569 	 *    1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
570 	 *    180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
571 	 *    2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;;
572 	 *    2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;;
573 	 *    2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
574 	 *    2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
575 	 *    2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
576 	 *    2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
577 	 *    2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
578 	 *    2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
579 	 *    2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
580 	 *    2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
581 	 *    200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
582 	 *    202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
583 	 *    205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
584 	 *    3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
585 	 *
586 	 *    RANGES:
587 	 *    =======
588 	 *    0x0020
589 	 *    0x00a0
590 	 *    0x1680
591 	 *    0x180e
592 	 *    0x2000 ... 0x200a
593 	 *    0x202f
594 	 *    0x205f
595 	 *    0x3000
596 	 *
597 	 *  A manual decoder (below) is probably most compact for this.
598 	 */
599 
600 	duk_uint_fast8_t lo;
601 	duk_uint_fast32_t hi;
602 
603 	/* cp == -1 (EOF) never matches and causes return value 0 */
604 
605 	lo = (duk_uint_fast8_t) (cp & 0xff);
606 	hi = (duk_uint_fast32_t) (cp >> 8);  /* does not fit into an uchar */
607 
608 	if (hi == 0x0000UL) {
609 		if (lo == 0x09U || lo == 0x0bU || lo == 0x0cU ||
610 		    lo == 0x20U || lo == 0xa0U) {
611 			return 1;
612 		}
613 	} else if (hi == 0x0020UL) {
614 		if (lo <= 0x0aU || lo == 0x2fU || lo == 0x5fU) {
615 			return 1;
616 		}
617 	} else if (cp == 0x1680L || cp == 0x180eL || cp == 0x3000L ||
618 	           cp == 0xfeffL) {
619 		return 1;
620 	}
621 
622 	return 0;
623 }
624 
625 /*
626  *  "LineTerminator" production check.
627  */
628 
duk_unicode_is_line_terminator(duk_codepoint_t cp)629 DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
630 	/*
631 	 *  E5 Section 7.3
632 	 *
633 	 *  A LineTerminatorSequence essentially merges <CR> <LF> sequences
634 	 *  into a single line terminator.  This must be handled by the caller.
635 	 */
636 
637 	if (cp == 0x000aL || cp == 0x000dL || cp == 0x2028L ||
638 	    cp == 0x2029L) {
639 		return 1;
640 	}
641 
642 	return 0;
643 }
644 
645 /*
646  *  "IdentifierStart" production check.
647  */
648 
duk_unicode_is_identifier_start(duk_codepoint_t cp)649 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
650 	/*
651 	 *  E5 Section 7.6:
652 	 *
653 	 *    IdentifierStart:
654 	 *      UnicodeLetter
655 	 *      $
656 	 *      _
657 	 *      \ UnicodeEscapeSequence
658 	 *
659 	 *  IdentifierStart production has one multi-character production:
660 	 *
661 	 *    \ UnicodeEscapeSequence
662 	 *
663 	 *  The '\' character is -not- matched by this function.  Rather, the caller
664 	 *  should decode the escape and then call this function to check whether the
665 	 *  decoded character is acceptable (see discussion in E5 Section 7.6).
666 	 *
667 	 *  The "UnicodeLetter" alternative of the production allows letters
668 	 *  from various Unicode categories.  These can be extracted with the
669 	 *  "tools/extract_chars.py" script.
670 	 *
671 	 *  Because the result has hundreds of Unicode codepoint ranges, matching
672 	 *  for any values >= 0x80 are done using a very slow range-by-range scan
673 	 *  and a packed range format.
674 	 *
675 	 *  The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because
676 	 *  it matters the most.  The ASCII related ranges of IdentifierStart are:
677 	 *
678 	 *    0x0041 ... 0x005a     ['A' ... 'Z']
679 	 *    0x0061 ... 0x007a     ['a' ... 'z']
680 	 *    0x0024                ['$']
681 	 *    0x005f                ['_']
682 	 */
683 
684 	/* ASCII (and EOF) fast path -- quick accept and reject */
685 	if (cp <= 0x7fL) {
686 #if defined(DUK_USE_IDCHAR_FASTPATH)
687 		return (cp >= 0) && (duk_is_idchar_tab[cp] > 0);
688 #else
689 		if ((cp >= 'a' && cp <= 'z') ||
690 		    (cp >= 'A' && cp <= 'Z') ||
691 		    cp == '_' || cp == '$') {
692 			return 1;
693 		}
694 		return 0;
695 #endif
696 	}
697 
698 	/* Non-ASCII slow path (range-by-range linear comparison), very slow */
699 
700 #if defined(DUK_USE_SOURCE_NONBMP)
701 	if (duk__uni_range_match(duk_unicode_ids_noa,
702 	                         (duk_size_t) sizeof(duk_unicode_ids_noa),
703 	                         (duk_codepoint_t) cp)) {
704 		return 1;
705 	}
706 	return 0;
707 #else
708 	if (cp < 0x10000L) {
709 		if (duk__uni_range_match(duk_unicode_ids_noabmp,
710 		                         sizeof(duk_unicode_ids_noabmp),
711 		                         (duk_codepoint_t) cp)) {
712 			return 1;
713 		}
714 		return 0;
715 	} else {
716 		/* without explicit non-BMP support, assume non-BMP characters
717 		 * are always accepted as identifier characters.
718 		 */
719 		return 1;
720 	}
721 #endif
722 }
723 
724 /*
725  *  "IdentifierPart" production check.
726  */
727 
duk_unicode_is_identifier_part(duk_codepoint_t cp)728 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
729 	/*
730 	 *  E5 Section 7.6:
731 	 *
732 	 *    IdentifierPart:
733 	 *      IdentifierStart
734 	 *      UnicodeCombiningMark
735 	 *      UnicodeDigit
736 	 *      UnicodeConnectorPunctuation
737 	 *      <ZWNJ>  [U+200C]
738 	 *      <ZWJ>   [U+200D]
739 	 *
740 	 *  IdentifierPart production has one multi-character production
741 	 *  as part of its IdentifierStart alternative.  The '\' character
742 	 *  of an escape sequence is not matched here, see discussion in
743 	 *  duk_unicode_is_identifier_start().
744 	 *
745 	 *  To match non-ASCII characters (codepoints >= 0x80), a very slow
746 	 *  linear range-by-range scan is used.  The codepoint is first compared
747 	 *  to the IdentifierStart ranges, and if it doesn't match, then to a
748 	 *  set consisting of code points in IdentifierPart but not in
749 	 *  IdentifierStart.  This is done to keep the unicode range data small,
750 	 *  at the expense of speed.
751 	 *
752 	 *  The ASCII fast path consists of:
753 	 *
754 	 *    0x0030 ... 0x0039     ['0' ... '9', UnicodeDigit]
755 	 *    0x0041 ... 0x005a     ['A' ... 'Z', IdentifierStart]
756 	 *    0x0061 ... 0x007a     ['a' ... 'z', IdentifierStart]
757 	 *    0x0024                ['$', IdentifierStart]
758 	 *    0x005f                ['_', IdentifierStart and
759 	 *                                UnicodeConnectorPunctuation]
760 	 *
761 	 *  UnicodeCombiningMark has no code points <= 0x7f.
762 	 *
763 	 *  The matching code reuses the "identifier start" tables, and then
764 	 *  consults a separate range set for characters in "identifier part"
765 	 *  but not in "identifier start".  These can be extracted with the
766 	 *  "tools/extract_chars.py" script.
767 	 *
768 	 *  UnicodeCombiningMark -> categories Mn, Mc
769 	 *  UnicodeDigit -> categories Nd
770 	 *  UnicodeConnectorPunctuation -> categories Pc
771 	 */
772 
773 	/* ASCII (and EOF) fast path -- quick accept and reject */
774 	if (cp <= 0x7fL) {
775 #if defined(DUK_USE_IDCHAR_FASTPATH)
776 		return (cp >= 0) && (duk_is_idchar_tab[cp] != 0);
777 #else
778 		if ((cp >= 'a' && cp <= 'z') ||
779 		    (cp >= 'A' && cp <= 'Z') ||
780 		    (cp >= '0' && cp <= '9') ||
781 		    cp == '_' || cp == '$') {
782 			return 1;
783 		}
784 		return 0;
785 #endif
786 	}
787 
788 	/* Non-ASCII slow path (range-by-range linear comparison), very slow */
789 
790 #if defined(DUK_USE_SOURCE_NONBMP)
791 	if (duk__uni_range_match(duk_unicode_ids_noa,
792 	                         sizeof(duk_unicode_ids_noa),
793 	                         (duk_codepoint_t) cp) ||
794 	    duk__uni_range_match(duk_unicode_idp_m_ids_noa,
795 	                         sizeof(duk_unicode_idp_m_ids_noa),
796 	                         (duk_codepoint_t) cp)) {
797 		return 1;
798 	}
799 	return 0;
800 #else
801 	if (cp < 0x10000L) {
802 		if (duk__uni_range_match(duk_unicode_ids_noabmp,
803 		                         sizeof(duk_unicode_ids_noabmp),
804 		                         (duk_codepoint_t) cp) ||
805 		    duk__uni_range_match(duk_unicode_idp_m_ids_noabmp,
806 		                         sizeof(duk_unicode_idp_m_ids_noabmp),
807 		                         (duk_codepoint_t) cp)) {
808 			return 1;
809 		}
810 		return 0;
811 	} else {
812 		/* without explicit non-BMP support, assume non-BMP characters
813 		 * are always accepted as identifier characters.
814 		 */
815 		return 1;
816 	}
817 #endif
818 }
819 
820 /*
821  *  Unicode letter check.
822  */
823 
duk_unicode_is_letter(duk_codepoint_t cp)824 DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {
825 	/*
826 	 *  Unicode letter is now taken to be the categories:
827 	 *
828 	 *    Lu, Ll, Lt, Lm, Lo
829 	 *
830 	 *  (Not sure if this is exactly correct.)
831 	 *
832 	 *  The ASCII fast path consists of:
833 	 *
834 	 *    0x0041 ... 0x005a     ['A' ... 'Z']
835 	 *    0x0061 ... 0x007a     ['a' ... 'z']
836 	 */
837 
838 	/* ASCII (and EOF) fast path -- quick accept and reject */
839 	if (cp <= 0x7fL) {
840 		if ((cp >= 'a' && cp <= 'z') ||
841 		    (cp >= 'A' && cp <= 'Z')) {
842 			return 1;
843 		}
844 		return 0;
845 	}
846 
847 	/* Non-ASCII slow path (range-by-range linear comparison), very slow */
848 
849 #if defined(DUK_USE_SOURCE_NONBMP)
850 	if (duk__uni_range_match(duk_unicode_ids_noa,
851 	                         sizeof(duk_unicode_ids_noa),
852 	                         (duk_codepoint_t) cp) &&
853 	    !duk__uni_range_match(duk_unicode_ids_m_let_noa,
854 	                          sizeof(duk_unicode_ids_m_let_noa),
855 	                          (duk_codepoint_t) cp)) {
856 		return 1;
857 	}
858 	return 0;
859 #else
860 	if (cp < 0x10000L) {
861 		if (duk__uni_range_match(duk_unicode_ids_noabmp,
862 		                         sizeof(duk_unicode_ids_noabmp),
863 		                         (duk_codepoint_t) cp) &&
864 		    !duk__uni_range_match(duk_unicode_ids_m_let_noabmp,
865 		                          sizeof(duk_unicode_ids_m_let_noabmp),
866 		                          (duk_codepoint_t) cp)) {
867 			return 1;
868 		}
869 		return 0;
870 	} else {
871 		/* without explicit non-BMP support, assume non-BMP characters
872 		 * are always accepted as letters.
873 		 */
874 		return 1;
875 	}
876 #endif
877 }
878 
879 /*
880  *  Complex case conversion helper which decodes a bit-packed conversion
881  *  control stream generated by tools/extract_caseconv.py.  The conversion
882  *  is very slow because it runs through the conversion data in a linear
883  *  fashion to save space (which is why ASCII characters have a special
884  *  fast path before arriving here).
885  *
886  *  The particular bit counts etc have been determined experimentally to
887  *  be small but still sufficient, and must match the Python script
888  *  (tools/extract_caseconv.py).
889  *
890  *  The return value is the case converted codepoint or -1 if the conversion
891  *  results in multiple characters (this is useful for regexp Canonicalization
892  *  operation).  If 'buf' is not NULL, the result codepoint(s) are also
893  *  appended to the hbuffer.
894  *
895  *  Context and locale specific rules must be checked before consulting
896  *  this function.
897  */
898 
899 DUK_LOCAL
duk__slow_case_conversion(duk_hthread * thr,duk_bufwriter_ctx * bw,duk_codepoint_t cp,duk_bitdecoder_ctx * bd_ctx)900 duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
901                                           duk_bufwriter_ctx *bw,
902                                           duk_codepoint_t cp,
903                                           duk_bitdecoder_ctx *bd_ctx) {
904 	duk_small_int_t skip = 0;
905 	duk_small_int_t n;
906 	duk_small_int_t t;
907 	duk_small_int_t count;
908 	duk_codepoint_t tmp_cp;
909 	duk_codepoint_t start_i;
910 	duk_codepoint_t start_o;
911 
912 	DUK_ASSERT(bd_ctx != NULL);
913 	DUK_UNREF(thr);
914 
915 	DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));
916 
917 	/* range conversion with a "skip" */
918 	DUK_DDD(DUK_DDDPRINT("checking ranges"));
919 	for (;;) {
920 		skip++;
921 		n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
922 		if (n == 0x3f) {
923 			/* end marker */
924 			break;
925 		}
926 		DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip, (long) n));
927 
928 		while (n--) {
929 			start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
930 			start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
931 			count = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
932 			DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
933 			                     (long) start_i, (long) start_o, (long) count, (long) skip));
934 
935 			if (cp >= start_i) {
936 				tmp_cp = cp - start_i;  /* always >= 0 */
937 				if (tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip &&
938 				    (tmp_cp % (duk_codepoint_t) skip) == 0) {
939 					DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
940 					cp = start_o + tmp_cp;
941 					goto single;
942 				}
943 			}
944 		}
945 	}
946 
947 	/* 1:1 conversion */
948 	n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
949 	DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n));
950 	while (n--) {
951 		start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
952 		start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
953 		DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i, (long) start_o));
954 		if (cp == start_i) {
955 			DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
956 			cp = start_o;
957 			goto single;
958 		}
959 	}
960 
961 	/* complex, multicharacter conversion */
962 	n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
963 	DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n));
964 	while (n--) {
965 		start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
966 		t = (duk_small_int_t) duk_bd_decode(bd_ctx, 2);
967 		DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
968 		if (cp == start_i) {
969 			DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
970 			if (bw != NULL) {
971 				while (t--) {
972 					tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
973 					DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
974 				}
975 			}
976 			return -1;
977 		} else {
978 			while (t--) {
979 				(void) duk_bd_decode(bd_ctx, 16);
980 			}
981 		}
982 	}
983 
984 	/* default: no change */
985 	DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
986 	/* fall through */
987 
988  single:
989 	if (bw != NULL) {
990 		DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
991 	}
992 	return cp;
993 }
994 
995 /*
 *  Case conversion helper, with context/locale sensitivity.
997  *  For proper case conversion, one needs to know the character
998  *  and the preceding and following characters, as well as
999  *  locale/language.
1000  */
1001 
1002 /* XXX: add 'language' argument when locale/language sensitive rule
1003  * support added.
1004  */
1005 DUK_LOCAL
duk__case_transform_helper(duk_hthread * thr,duk_bufwriter_ctx * bw,duk_codepoint_t cp,duk_codepoint_t prev,duk_codepoint_t next,duk_bool_t uppercase)1006 duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
1007                                            duk_bufwriter_ctx *bw,
1008                                            duk_codepoint_t cp,
1009                                            duk_codepoint_t prev,
1010                                            duk_codepoint_t next,
1011                                            duk_bool_t uppercase) {
1012 	duk_bitdecoder_ctx bd_ctx;
1013 
1014 	/* fast path for ASCII */
1015 	if (cp < 0x80L) {
1016 		/* XXX: there are language sensitive rules for the ASCII range.
1017 		 * If/when language/locale support is implemented, they need to
1018 		 * be implemented here for the fast path.  There are no context
1019 		 * sensitive rules for ASCII range.
1020 		 */
1021 
1022 		if (uppercase) {
1023 			if (cp >= 'a' && cp <= 'z') {
1024 				cp = cp - 'a' + 'A';
1025 			}
1026 		} else {
1027 			if (cp >= 'A' && cp <= 'Z') {
1028 				cp = cp - 'A' + 'a';
1029 			}
1030 		}
1031 
1032 		if (bw != NULL) {
1033 			DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
1034 		}
1035 		return cp;
1036 	}
1037 
1038 	/* context and locale specific rules which cannot currently be represented
1039 	 * in the caseconv bitstream: hardcoded rules in C
1040 	 */
1041 	if (uppercase) {
1042 		/* XXX: turkish / azeri */
1043 	} else {
1044 		/*
1045 		 *  Final sigma context specific rule.  This is a rather tricky
1046 		 *  rule and this handling is probably not 100% correct now.
1047 		 *  The rule is not locale/language specific so it is supported.
1048 		 */
1049 
1050 		if (cp == 0x03a3L &&    /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
1051 		    duk_unicode_is_letter(prev) &&        /* prev exists and is not a letter */
1052 		    !duk_unicode_is_letter(next)) {       /* next does not exist or next is not a letter */
1053 			/* Capital sigma occurred at "end of word", lowercase to
1054 			 * U+03C2 = GREEK SMALL LETTER FINAL SIGMA.  Otherwise
1055 			 * fall through and let the normal rules lowercase it to
1056 			 * U+03C3 = GREEK SMALL LETTER SIGMA.
1057 			 */
1058 			cp = 0x03c2L;
1059 			goto singlechar;
1060 		}
1061 
1062 		/* XXX: lithuanian not implemented */
1063 		/* XXX: lithuanian, explicit dot rules */
1064 		/* XXX: turkish / azeri, lowercase rules */
1065 	}
1066 
1067 	/* 1:1 or special conversions, but not locale/context specific: script generated rules */
1068 	duk_memzero(&bd_ctx, sizeof(bd_ctx));
1069 	if (uppercase) {
1070 		bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_uc;
1071 		bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_uc);
1072 	} else {
1073 		bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_lc;
1074 		bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
1075 	}
1076 	return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);
1077 
1078  singlechar:
1079 	if (bw != NULL) {
1080 		DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
1081 	}
1082 	return cp;
1083 
1084  /* unused now, not needed until Turkish/Azeri */
1085 #if 0
1086  nochar:
1087 	return -1;
1088 #endif
1089 }
1090 
1091 /*
1092  *  Replace valstack top with case converted version.
1093  */
1094 
duk_unicode_case_convert_string(duk_hthread * thr,duk_bool_t uppercase)1095 DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase) {
1096 	duk_hstring *h_input;
1097 	duk_bufwriter_ctx bw_alloc;
1098 	duk_bufwriter_ctx *bw;
1099 	const duk_uint8_t *p, *p_start, *p_end;
1100 	duk_codepoint_t prev, curr, next;
1101 
1102 	h_input = duk_require_hstring(thr, -1);  /* Accept symbols. */
1103 	DUK_ASSERT(h_input != NULL);
1104 
1105 	bw = &bw_alloc;
1106 	DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));
1107 
1108 	/* [ ... input buffer ] */
1109 
1110 	p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
1111 	p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
1112 	p = p_start;
1113 
1114 	prev = -1; DUK_UNREF(prev);
1115 	curr = -1;
1116 	next = -1;
1117 	for (;;) {
1118 		prev = curr;
1119 		curr = next;
1120 		next = -1;
1121 		if (p < p_end) {
1122 			next = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
1123 		} else {
1124 			/* end of input and last char has been processed */
1125 			if (curr < 0) {
1126 				break;
1127 			}
1128 		}
1129 
1130 		/* on first round, skip */
1131 		if (curr >= 0) {
1132 			/* XXX: could add a fast path to process chunks of input codepoints,
1133 			 * but relative benefit would be quite small.
1134 			 */
1135 
1136 			/* Ensure space for maximum multi-character result; estimate is overkill. */
1137 			DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);
1138 
1139 			duk__case_transform_helper(thr,
1140 			                           bw,
1141 			                           (duk_codepoint_t) curr,
1142 			                           prev,
1143 			                           next,
1144 			                           uppercase);
1145 		}
1146 	}
1147 
1148 	DUK_BW_COMPACT(thr, bw);
1149 	(void) duk_buffer_to_string(thr, -1);  /* Safe, output is encoded. */
1150 	/* invalidates h_buf pointer */
1151 	duk_remove_m2(thr);
1152 }
1153 
1154 #if defined(DUK_USE_REGEXP_SUPPORT)
1155 
1156 /*
1157  *  Canonicalize() abstract operation needed for canonicalization of individual
1158  *  codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
1159  *  Note that codepoints are canonicalized one character at a time, so no context
1160  *  specific rules can apply.  Locale specific rules can apply, though.
1161  */
1162 
duk_unicode_re_canonicalize_char(duk_hthread * thr,duk_codepoint_t cp)1163 DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
1164 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
1165 	/* Fast canonicalization lookup at the cost of 128kB footprint. */
1166 	DUK_ASSERT(cp >= 0);
1167 	DUK_UNREF(thr);
1168 	if (DUK_LIKELY(cp < 0x10000L)) {
1169 		return (duk_codepoint_t) duk_unicode_re_canon_lookup[cp];
1170 	}
1171 	return cp;
1172 #else  /* DUK_USE_REGEXP_CANON_WORKAROUND */
1173 	duk_codepoint_t y;
1174 
1175 	y = duk__case_transform_helper(thr,
1176 	                               NULL,    /* NULL is allowed, no output */
1177 	                               cp,      /* curr char */
1178 	                               -1,      /* prev char */
1179 	                               -1,      /* next char */
1180 	                               1);      /* uppercase */
1181 
1182 	if ((y < 0) || (cp >= 0x80 && y < 0x80)) {
1183 		/* multiple codepoint conversion or non-ASCII mapped to ASCII
1184 		 * --> leave as is.
1185 		 */
1186 		return cp;
1187 	}
1188 
1189 	return y;
1190 #endif  /* DUK_USE_REGEXP_CANON_WORKAROUND */
1191 }
1192 
1193 /*
1194  *  E5 Section 15.10.2.6 "IsWordChar" abstract operation.  Assume
1195  *  x < 0 for characters read outside the string.
1196  */
1197 
duk_unicode_re_is_wordchar(duk_codepoint_t x)1198 DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
1199 	/*
1200 	 *  Note: the description in E5 Section 15.10.2.6 has a typo, it
1201 	 *  contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
1202 	 */
1203 	if ((x >= '0' && x <= '9') ||
1204 	    (x >= 'a' && x <= 'z') ||
1205 	    (x >= 'A' && x <= 'Z') ||
1206 	    (x == '_')) {
1207 		return 1;
1208 	}
1209 	return 0;
1210 }
1211 
1212 /*
1213  *  Regexp range tables
1214  */
1215 
/* exposed because lexer needs these too */
/* Digit range table: a flat list of inclusive [start, end] codepoint pairs.
 * Contains a single pair, U+0030 ('0') through U+0039 ('9').
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = {
	(duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
};
/* White space range table: a flat list of 11 inclusive [start, end]
 * codepoint pairs in ascending order.  Line terminator characters are
 * included alongside the space characters.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = {
	(duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL,  /* TAB, LF, VT, FF, CR */
	(duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL,  /* SPACE */
	(duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL,  /* NO-BREAK SPACE */
	(duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL,  /* OGHAM SPACE MARK */
	(duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL,  /* MONGOLIAN VOWEL SEPARATOR */
	(duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL,  /* EN QUAD ... HAIR SPACE */
	(duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL,  /* LINE SEPARATOR, PARAGRAPH SEPARATOR */
	(duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL,  /* NARROW NO-BREAK SPACE */
	(duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL,  /* MEDIUM MATHEMATICAL SPACE */
	(duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL,  /* IDEOGRAPHIC SPACE */
	(duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL,  /* ZERO WIDTH NO-BREAK SPACE (BOM) */
};
/* Word character range table: inclusive [start, end] codepoint pairs
 * covering the same set as duk_unicode_re_is_wordchar(), i.e. [0-9A-Z_a-z].
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
	(duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,  /* '0' ... '9' */
	(duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL,  /* 'A' ... 'Z' */
	(duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL,  /* '_' */
	(duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL,  /* 'a' ... 'z' */
};
/* Non-digit range table: inclusive [start, end] codepoint pairs forming the
 * complement of duk_unicode_re_ranges_digit within U+0000 ... U+FFFF.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
	(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,  /* everything below '0' */
	(duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL,  /* everything above '9' */
};
/* Non-white-space range table: inclusive [start, end] codepoint pairs
 * forming the complement of duk_unicode_re_ranges_white within
 * U+0000 ... U+FFFF.  Each pair fills the gap between two consecutive
 * entries of the white space table (plus the leading and trailing gaps).
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
	(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL,
	(duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL,
	(duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL,
	(duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL,
	(duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL,
	(duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL,
	(duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL,
	(duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL,
	(duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL,
	(duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL,
	(duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL,
	(duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL,
};
/* Non-word-character range table: inclusive [start, end] codepoint pairs
 * forming the complement of duk_unicode_re_ranges_wordchar within
 * U+0000 ... U+FFFF.
 */
DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
	(duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,  /* below '0' */
	(duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL,  /* between '9' and 'A' */
	(duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL,  /* between 'Z' and '_' */
	(duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL,  /* between '_' and 'a' */
	(duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL,  /* above 'z' */
};
1264 
1265 #endif  /* DUK_USE_REGEXP_SUPPORT */
1266