/*
 *  Various Unicode helper functions for character classification predicates,
 *  case conversion, decoding, etc.
 */
5
6 #include "duk_internal.h"
7
8 /*
9 * Fast path tables
10 */
11
12 #if defined(DUK_USE_IDCHAR_FASTPATH)
13 DUK_INTERNAL const duk_int8_t duk_is_idchar_tab[128] = {
14 /* 0: not IdentifierStart or IdentifierPart
15 * 1: IdentifierStart and IdentifierPart
16 * -1: IdentifierPart only
17 */
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00...0x0f */
19 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10...0x1f */
20 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20...0x2f */
21 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, /* 0x30...0x3f */
22 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40...0x4f */
23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 0x50...0x5f */
24 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60...0x6f */
25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 /* 0x70...0x7f */
26 };
27 #endif
28
29 /*
30 * XUTF-8 and CESU-8 encoding/decoding
31 */
32
duk_unicode_get_xutf8_length(duk_ucodepoint_t cp)33 DUK_INTERNAL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp) {
34 duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
35 if (x < 0x80UL) {
36 /* 7 bits */
37 return 1;
38 } else if (x < 0x800UL) {
39 /* 11 bits */
40 return 2;
41 } else if (x < 0x10000UL) {
42 /* 16 bits */
43 return 3;
44 } else if (x < 0x200000UL) {
45 /* 21 bits */
46 return 4;
47 } else if (x < 0x4000000UL) {
48 /* 26 bits */
49 return 5;
50 } else if (x < (duk_ucodepoint_t) 0x80000000UL) {
51 /* 31 bits */
52 return 6;
53 } else {
54 /* 36 bits */
55 return 7;
56 }
57 }
58
59 #if defined(DUK_USE_ASSERTIONS)
duk_unicode_get_cesu8_length(duk_ucodepoint_t cp)60 DUK_INTERNAL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp) {
61 duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
62 if (x < 0x80UL) {
63 /* 7 bits */
64 return 1;
65 } else if (x < 0x800UL) {
66 /* 11 bits */
67 return 2;
68 } else if (x < 0x10000UL) {
69 /* 16 bits */
70 return 3;
71 } else {
72 /* Encoded as surrogate pair, each encoding to 3 bytes for
73 * 6 bytes total. Codepoints above U+10FFFF encode as 6 bytes
74 * too, see duk_unicode_encode_cesu8().
75 */
76 return 3 + 3;
77 }
78 }
79 #endif /* DUK_USE_ASSERTIONS */
80
81 DUK_INTERNAL const duk_uint8_t duk_unicode_xutf8_markers[7] = {
82 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
83 };
84
85 /* Encode to extended UTF-8; 'out' must have space for at least
86 * DUK_UNICODE_MAX_XUTF8_LENGTH bytes. Allows encoding of any
87 * 32-bit (unsigned) codepoint.
88 */
duk_unicode_encode_xutf8(duk_ucodepoint_t cp,duk_uint8_t * out)89 DUK_INTERNAL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out) {
90 duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
91 duk_small_int_t len;
92 duk_uint8_t marker;
93 duk_small_int_t i;
94
95 len = duk_unicode_get_xutf8_length(cp);
96 DUK_ASSERT(len > 0);
97
98 marker = duk_unicode_xutf8_markers[len - 1]; /* 64-bit OK because always >= 0 */
99
100 i = len;
101 DUK_ASSERT(i > 0);
102 do {
103 i--;
104 if (i > 0) {
105 out[i] = (duk_uint8_t) (0x80 + (x & 0x3f));
106 x >>= 6;
107 } else {
108 /* Note: masking of 'x' is not necessary because of
109 * range check and shifting -> no bits overlapping
110 * the marker should be set.
111 */
112 out[0] = (duk_uint8_t) (marker + x);
113 }
114 } while (i > 0);
115
116 return len;
117 }
118
119 /* Encode to CESU-8; 'out' must have space for at least
120 * DUK_UNICODE_MAX_CESU8_LENGTH bytes; codepoints above U+10FFFF
121 * will encode to garbage but won't overwrite the output buffer.
122 */
duk_unicode_encode_cesu8(duk_ucodepoint_t cp,duk_uint8_t * out)123 DUK_INTERNAL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out) {
124 duk_uint_fast32_t x = (duk_uint_fast32_t) cp;
125 duk_small_int_t len;
126
127 if (x < 0x80UL) {
128 out[0] = (duk_uint8_t) x;
129 len = 1;
130 } else if (x < 0x800UL) {
131 out[0] = (duk_uint8_t) (0xc0 + ((x >> 6) & 0x1f));
132 out[1] = (duk_uint8_t) (0x80 + (x & 0x3f));
133 len = 2;
134 } else if (x < 0x10000UL) {
135 /* surrogate pairs get encoded here */
136 out[0] = (duk_uint8_t) (0xe0 + ((x >> 12) & 0x0f));
137 out[1] = (duk_uint8_t) (0x80 + ((x >> 6) & 0x3f));
138 out[2] = (duk_uint8_t) (0x80 + (x & 0x3f));
139 len = 3;
140 } else {
141 /*
142 * Unicode codepoints above U+FFFF are encoded as surrogate
143 * pairs here. This ensures that all CESU-8 codepoints are
144 * 16-bit values as expected in ECMAScript. The surrogate
145 * pairs always get a 3-byte encoding (each) in CESU-8.
146 * See: http://en.wikipedia.org/wiki/Surrogate_pair
147 *
148 * 20-bit codepoint, 10 bits (A and B) per surrogate pair:
149 *
150 * x = 0b00000000 0000AAAA AAAAAABB BBBBBBBB
151 * sp1 = 0b110110AA AAAAAAAA (0xd800 + ((x >> 10) & 0x3ff))
152 * sp2 = 0b110111BB BBBBBBBB (0xdc00 + (x & 0x3ff))
153 *
154 * Encoded into CESU-8:
155 *
156 * sp1 -> 0b11101101 (0xe0 + ((sp1 >> 12) & 0x0f))
157 * -> 0b1010AAAA (0x80 + ((sp1 >> 6) & 0x3f))
158 * -> 0b10AAAAAA (0x80 + (sp1 & 0x3f))
159 * sp2 -> 0b11101101 (0xe0 + ((sp2 >> 12) & 0x0f))
160 * -> 0b1011BBBB (0x80 + ((sp2 >> 6) & 0x3f))
161 * -> 0b10BBBBBB (0x80 + (sp2 & 0x3f))
162 *
163 * Note that 0x10000 must be subtracted first. The code below
164 * avoids the sp1, sp2 temporaries which saves around 20 bytes
165 * of code.
166 */
167
168 x -= 0x10000UL;
169
170 out[0] = (duk_uint8_t) (0xed);
171 out[1] = (duk_uint8_t) (0xa0 + ((x >> 16) & 0x0f));
172 out[2] = (duk_uint8_t) (0x80 + ((x >> 10) & 0x3f));
173 out[3] = (duk_uint8_t) (0xed);
174 out[4] = (duk_uint8_t) (0xb0 + ((x >> 6) & 0x0f));
175 out[5] = (duk_uint8_t) (0x80 + (x & 0x3f));
176 len = 6;
177 }
178
179 return len;
180 }
181
182 /* Decode helper. Return zero on error. */
duk_unicode_decode_xutf8(duk_hthread * thr,const duk_uint8_t ** ptr,const duk_uint8_t * ptr_start,const duk_uint8_t * ptr_end,duk_ucodepoint_t * out_cp)183 DUK_INTERNAL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp) {
184 const duk_uint8_t *p;
185 duk_uint32_t res;
186 duk_uint_fast8_t ch;
187 duk_small_int_t n;
188
189 DUK_UNREF(thr);
190
191 p = *ptr;
192 if (p < ptr_start || p >= ptr_end) {
193 goto fail;
194 }
195
196 /*
197 * UTF-8 decoder which accepts longer than standard byte sequences.
198 * This allows full 32-bit code points to be used.
199 */
200
201 ch = (duk_uint_fast8_t) (*p++);
202 if (ch < 0x80) {
203 /* 0xxx xxxx [7 bits] */
204 res = (duk_uint32_t) (ch & 0x7f);
205 n = 0;
206 } else if (ch < 0xc0) {
207 /* 10xx xxxx -> invalid */
208 goto fail;
209 } else if (ch < 0xe0) {
210 /* 110x xxxx 10xx xxxx [11 bits] */
211 res = (duk_uint32_t) (ch & 0x1f);
212 n = 1;
213 } else if (ch < 0xf0) {
214 /* 1110 xxxx 10xx xxxx 10xx xxxx [16 bits] */
215 res = (duk_uint32_t) (ch & 0x0f);
216 n = 2;
217 } else if (ch < 0xf8) {
218 /* 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx [21 bits] */
219 res = (duk_uint32_t) (ch & 0x07);
220 n = 3;
221 } else if (ch < 0xfc) {
222 /* 1111 10xx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [26 bits] */
223 res = (duk_uint32_t) (ch & 0x03);
224 n = 4;
225 } else if (ch < 0xfe) {
226 /* 1111 110x 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [31 bits] */
227 res = (duk_uint32_t) (ch & 0x01);
228 n = 5;
229 } else if (ch < 0xff) {
230 /* 1111 1110 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [36 bits] */
231 res = (duk_uint32_t) (0);
232 n = 6;
233 } else {
234 /* 8-byte format could be:
235 * 1111 1111 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx 10xx xxxx [41 bits]
236 *
237 * However, this format would not have a zero bit following the
238 * leading one bits and would not allow 0xFF to be used as an
239 * "invalid xutf-8" marker for internal keys. Further, 8-byte
240 * encodings (up to 41 bit code points) are not currently needed.
241 */
242 goto fail;
243 }
244
245 DUK_ASSERT(p >= ptr_start); /* verified at beginning */
246 if (p + n > ptr_end) {
247 /* check pointer at end */
248 goto fail;
249 }
250
251 while (n > 0) {
252 DUK_ASSERT(p >= ptr_start && p < ptr_end);
253 ch = (duk_uint_fast8_t) (*p++);
254 #if 0
255 if (ch & 0xc0 != 0x80) {
256 /* not a continuation byte */
257 p--;
258 *ptr = p;
259 *out_cp = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
260 return 1;
261 }
262 #endif
263 res = (res << 6) + (duk_uint32_t) (ch & 0x3f);
264 n--;
265 }
266
267 *ptr = p;
268 *out_cp = res;
269 return 1;
270
271 fail:
272 return 0;
273 }
274
275 /* used by e.g. duk_regexp_executor.c, string built-ins */
duk_unicode_decode_xutf8_checked(duk_hthread * thr,const duk_uint8_t ** ptr,const duk_uint8_t * ptr_start,const duk_uint8_t * ptr_end)276 DUK_INTERNAL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end) {
277 duk_ucodepoint_t cp;
278
279 if (duk_unicode_decode_xutf8(thr, ptr, ptr_start, ptr_end, &cp)) {
280 return cp;
281 }
282 DUK_ERROR_INTERNAL(thr);
283 DUK_WO_NORETURN(return 0;);
284 }
285
286 /* Compute (extended) utf-8 length without codepoint encoding validation,
287 * used for string interning.
288 *
289 * NOTE: This algorithm is performance critical, more so than string hashing
290 * in some cases. It is needed when interning a string and needs to scan
291 * every byte of the string with no skipping. Having an ASCII fast path
292 * is useful if possible in the algorithm. The current algorithms were
293 * chosen from several variants, based on x64 gcc -O2 testing. See:
294 * https://github.com/svaarala/duktape/pull/422
295 *
296 * NOTE: must match tools/dukutil.py:duk_unicode_unvalidated_utf8_length().
297 */
298
299 #if defined(DUK_USE_PREFER_SIZE)
300 /* Small variant; roughly 150 bytes smaller than the fast variant. */
duk_unicode_unvalidated_utf8_length(const duk_uint8_t * data,duk_size_t blen)301 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
302 const duk_uint8_t *p;
303 const duk_uint8_t *p_end;
304 duk_size_t ncont;
305 duk_size_t clen;
306
307 p = data;
308 p_end = data + blen;
309 ncont = 0;
310 while (p != p_end) {
311 duk_uint8_t x;
312 x = *p++;
313 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
314 ncont++;
315 }
316 }
317
318 DUK_ASSERT(ncont <= blen);
319 clen = blen - ncont;
320 DUK_ASSERT(clen <= blen);
321 return clen;
322 }
323 #else /* DUK_USE_PREFER_SIZE */
324 /* This seems like a good overall approach. Fast path for ASCII in 4 byte
325 * blocks.
326 */
duk_unicode_unvalidated_utf8_length(const duk_uint8_t * data,duk_size_t blen)327 DUK_INTERNAL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen) {
328 const duk_uint8_t *p;
329 const duk_uint8_t *p_end;
330 const duk_uint32_t *p32_end;
331 const duk_uint32_t *p32;
332 duk_size_t ncont;
333 duk_size_t clen;
334
335 ncont = 0; /* number of continuation (non-initial) bytes in [0x80,0xbf] */
336 p = data;
337 p_end = data + blen;
338 if (blen < 16) {
339 goto skip_fastpath;
340 }
341
342 /* Align 'p' to 4; the input data may have arbitrary alignment.
343 * End of string check not needed because blen >= 16.
344 */
345 while (((duk_size_t) (const void *) p) & 0x03U) {
346 duk_uint8_t x;
347 x = *p++;
348 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
349 ncont++;
350 }
351 }
352
353 /* Full, aligned 4-byte reads. */
354 p32_end = (const duk_uint32_t *) (const void *) (p + ((duk_size_t) (p_end - p) & (duk_size_t) (~0x03)));
355 p32 = (const duk_uint32_t *) (const void *) p;
356 while (p32 != (const duk_uint32_t *) p32_end) {
357 duk_uint32_t x;
358 x = *p32++;
359 if (DUK_LIKELY((x & 0x80808080UL) == 0)) {
360 ; /* ASCII fast path */
361 } else {
362 /* Flip highest bit of each byte which changes
363 * the bit pattern 10xxxxxx into 00xxxxxx which
364 * allows an easy bit mask test.
365 */
366 x ^= 0x80808080UL;
367 if (DUK_UNLIKELY(!(x & 0xc0000000UL))) {
368 ncont++;
369 }
370 if (DUK_UNLIKELY(!(x & 0x00c00000UL))) {
371 ncont++;
372 }
373 if (DUK_UNLIKELY(!(x & 0x0000c000UL))) {
374 ncont++;
375 }
376 if (DUK_UNLIKELY(!(x & 0x000000c0UL))) {
377 ncont++;
378 }
379 }
380 }
381 p = (const duk_uint8_t *) p32;
382 /* Fall through to handle the rest. */
383
384 skip_fastpath:
385 while (p != p_end) {
386 duk_uint8_t x;
387 x = *p++;
388 if (DUK_UNLIKELY(x >= 0x80 && x <= 0xbf)) {
389 ncont++;
390 }
391 }
392
393 DUK_ASSERT(ncont <= blen);
394 clen = blen - ncont;
395 DUK_ASSERT(clen <= blen);
396 return clen;
397 }
398 #endif /* DUK_USE_PREFER_SIZE */
399
400 /* Check whether a string is UTF-8 compatible or not. */
duk_unicode_is_utf8_compatible(const duk_uint8_t * buf,duk_size_t len)401 DUK_INTERNAL duk_bool_t duk_unicode_is_utf8_compatible(const duk_uint8_t *buf, duk_size_t len) {
402 duk_size_t i = 0;
403 #if !defined(DUK_USE_PREFER_SIZE)
404 duk_size_t len_safe;
405 #endif
406
407 /* Many practical strings are ASCII only, so use a fast path check
408 * to check chunks of bytes at once with minimal branch cost.
409 */
410 #if !defined(DUK_USE_PREFER_SIZE)
411 len_safe = len & ~0x03UL;
412 for (; i < len_safe; i += 4) {
413 duk_uint8_t t = buf[i] | buf[i + 1] | buf[i + 2] | buf[i + 3];
414 if (DUK_UNLIKELY((t & 0x80U) != 0U)) {
415 /* At least one byte was outside 0x00-0x7f, break
416 * out to slow path (and remain there).
417 *
418 * XXX: We could also deal with the problem character
419 * and resume fast path later.
420 */
421 break;
422 }
423 }
424 #endif
425
426 for (; i < len;) {
427 duk_uint8_t t;
428 duk_size_t left;
429 duk_size_t ncont;
430 duk_uint32_t cp;
431 duk_uint32_t mincp;
432
433 t = buf[i++];
434 if (DUK_LIKELY((t & 0x80U) == 0U)) {
435 /* Fast path, ASCII. */
436 continue;
437 }
438
439 /* Non-ASCII start byte, slow path.
440 *
441 * 10xx xxxx -> continuation byte
442 * 110x xxxx + 1*CONT -> [0x80, 0x7ff]
443 * 1110 xxxx + 2*CONT -> [0x800, 0xffff], must reject [0xd800,0xdfff]
444 * 1111 0xxx + 3*CONT -> [0x10000, 0x10ffff]
445 */
446 left = len - i;
447 if (t <= 0xdfU) { /* 1101 1111 = 0xdf */
448 if (t <= 0xbfU) { /* 1011 1111 = 0xbf */
449 return 0;
450 }
451 ncont = 1;
452 mincp = 0x80UL;
453 cp = t & 0x1fU;
454 } else if (t <= 0xefU) { /* 1110 1111 = 0xef */
455 ncont = 2;
456 mincp = 0x800UL;
457 cp = t & 0x0fU;
458 } else if (t <= 0xf7U) { /* 1111 0111 = 0xf7 */
459 ncont = 3;
460 mincp = 0x10000UL;
461 cp = t & 0x07U;
462 } else {
463 return 0;
464 }
465 if (left < ncont) {
466 return 0;
467 }
468 while (ncont > 0U) {
469 t = buf[i++];
470 if ((t & 0xc0U) != 0x80U) { /* 10xx xxxx */
471 return 0;
472 }
473 cp = (cp << 6) + (t & 0x3fU);
474 ncont--;
475 }
476 if (cp < mincp || cp > 0x10ffffUL || (cp >= 0xd800UL && cp <= 0xdfffUL)) {
477 return 0;
478 }
479 }
480
481 return 1;
482 }
483
484 /*
485 * Unicode range matcher
486 *
487 * Matches a codepoint against a packed bitstream of character ranges.
488 * Used for slow path Unicode matching.
489 */
490
491 /* Must match tools/extract_chars.py, generate_match_table3(). */
duk__uni_decode_value(duk_bitdecoder_ctx * bd_ctx)492 DUK_LOCAL duk_uint32_t duk__uni_decode_value(duk_bitdecoder_ctx *bd_ctx) {
493 duk_uint32_t t;
494
495 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 4);
496 if (t <= 0x0eU) {
497 return t;
498 }
499 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 8);
500 if (t <= 0xfdU) {
501 return t + 0x0f;
502 }
503 if (t == 0xfeU) {
504 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 12);
505 return t + 0x0fU + 0xfeU;
506 } else {
507 t = (duk_uint32_t) duk_bd_decode(bd_ctx, 24);
508 return t + 0x0fU + 0xfeU + 0x1000UL;
509 }
510 }
511
duk__uni_range_match(const duk_uint8_t * unitab,duk_size_t unilen,duk_codepoint_t cp)512 DUK_LOCAL duk_small_int_t duk__uni_range_match(const duk_uint8_t *unitab, duk_size_t unilen, duk_codepoint_t cp) {
513 duk_bitdecoder_ctx bd_ctx;
514 duk_codepoint_t prev_re;
515
516 duk_memzero(&bd_ctx, sizeof(bd_ctx));
517 bd_ctx.data = (const duk_uint8_t *) unitab;
518 bd_ctx.length = (duk_size_t) unilen;
519
520 prev_re = 0;
521 for (;;) {
522 duk_codepoint_t r1, r2;
523 r1 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
524 if (r1 == 0) {
525 break;
526 }
527 r2 = (duk_codepoint_t) duk__uni_decode_value(&bd_ctx);
528
529 r1 = prev_re + r1;
530 r2 = r1 + r2;
531 prev_re = r2;
532
533 /* [r1,r2] is the range */
534
535 DUK_DDD(DUK_DDDPRINT("duk__uni_range_match: cp=%06lx range=[0x%06lx,0x%06lx]",
536 (unsigned long) cp, (unsigned long) r1, (unsigned long) r2));
537 if (cp >= r1 && cp <= r2) {
538 return 1;
539 }
540 }
541
542 return 0;
543 }
544
545 /*
546 * "WhiteSpace" production check.
547 */
548
duk_unicode_is_whitespace(duk_codepoint_t cp)549 DUK_INTERNAL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp) {
550 /*
551 * E5 Section 7.2 specifies six characters specifically as
552 * white space:
553 *
554 * 0009;<control>;Cc;0;S;;;;;N;CHARACTER TABULATION;;;;
555 * 000B;<control>;Cc;0;S;;;;;N;LINE TABULATION;;;;
556 * 000C;<control>;Cc;0;WS;;;;;N;FORM FEED (FF);;;;
557 * 0020;SPACE;Zs;0;WS;;;;;N;;;;;
558 * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
559 * FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;;
560 *
561 * It also specifies any Unicode category 'Zs' characters as white
562 * space. These can be extracted with the "tools/extract_chars.py" script.
563 * Current result:
564 *
565 * RAW OUTPUT:
566 * ===========
567 * 0020;SPACE;Zs;0;WS;;;;;N;;;;;
568 * 00A0;NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;NON-BREAKING SPACE;;;;
569 * 1680;OGHAM SPACE MARK;Zs;0;WS;;;;;N;;;;;
570 * 180E;MONGOLIAN VOWEL SEPARATOR;Zs;0;WS;;;;;N;;;;;
571 * 2000;EN QUAD;Zs;0;WS;2002;;;;N;;;;;
572 * 2001;EM QUAD;Zs;0;WS;2003;;;;N;;;;;
573 * 2002;EN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
574 * 2003;EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
575 * 2004;THREE-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
576 * 2005;FOUR-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
577 * 2006;SIX-PER-EM SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
578 * 2007;FIGURE SPACE;Zs;0;WS;<noBreak> 0020;;;;N;;;;;
579 * 2008;PUNCTUATION SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
580 * 2009;THIN SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
581 * 200A;HAIR SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
582 * 202F;NARROW NO-BREAK SPACE;Zs;0;CS;<noBreak> 0020;;;;N;;;;;
583 * 205F;MEDIUM MATHEMATICAL SPACE;Zs;0;WS;<compat> 0020;;;;N;;;;;
584 * 3000;IDEOGRAPHIC SPACE;Zs;0;WS;<wide> 0020;;;;N;;;;;
585 *
586 * RANGES:
587 * =======
588 * 0x0020
589 * 0x00a0
590 * 0x1680
591 * 0x180e
592 * 0x2000 ... 0x200a
593 * 0x202f
594 * 0x205f
595 * 0x3000
596 *
597 * A manual decoder (below) is probably most compact for this.
598 */
599
600 duk_uint_fast8_t lo;
601 duk_uint_fast32_t hi;
602
603 /* cp == -1 (EOF) never matches and causes return value 0 */
604
605 lo = (duk_uint_fast8_t) (cp & 0xff);
606 hi = (duk_uint_fast32_t) (cp >> 8); /* does not fit into an uchar */
607
608 if (hi == 0x0000UL) {
609 if (lo == 0x09U || lo == 0x0bU || lo == 0x0cU ||
610 lo == 0x20U || lo == 0xa0U) {
611 return 1;
612 }
613 } else if (hi == 0x0020UL) {
614 if (lo <= 0x0aU || lo == 0x2fU || lo == 0x5fU) {
615 return 1;
616 }
617 } else if (cp == 0x1680L || cp == 0x180eL || cp == 0x3000L ||
618 cp == 0xfeffL) {
619 return 1;
620 }
621
622 return 0;
623 }
624
625 /*
626 * "LineTerminator" production check.
627 */
628
duk_unicode_is_line_terminator(duk_codepoint_t cp)629 DUK_INTERNAL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp) {
630 /*
631 * E5 Section 7.3
632 *
633 * A LineTerminatorSequence essentially merges <CR> <LF> sequences
634 * into a single line terminator. This must be handled by the caller.
635 */
636
637 if (cp == 0x000aL || cp == 0x000dL || cp == 0x2028L ||
638 cp == 0x2029L) {
639 return 1;
640 }
641
642 return 0;
643 }
644
645 /*
646 * "IdentifierStart" production check.
647 */
648
duk_unicode_is_identifier_start(duk_codepoint_t cp)649 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp) {
650 /*
651 * E5 Section 7.6:
652 *
653 * IdentifierStart:
654 * UnicodeLetter
655 * $
656 * _
657 * \ UnicodeEscapeSequence
658 *
659 * IdentifierStart production has one multi-character production:
660 *
661 * \ UnicodeEscapeSequence
662 *
663 * The '\' character is -not- matched by this function. Rather, the caller
664 * should decode the escape and then call this function to check whether the
665 * decoded character is acceptable (see discussion in E5 Section 7.6).
666 *
667 * The "UnicodeLetter" alternative of the production allows letters
668 * from various Unicode categories. These can be extracted with the
669 * "tools/extract_chars.py" script.
670 *
671 * Because the result has hundreds of Unicode codepoint ranges, matching
672 * for any values >= 0x80 are done using a very slow range-by-range scan
673 * and a packed range format.
674 *
675 * The ASCII portion (codepoints 0x00 ... 0x7f) is fast-pathed below because
676 * it matters the most. The ASCII related ranges of IdentifierStart are:
677 *
678 * 0x0041 ... 0x005a ['A' ... 'Z']
679 * 0x0061 ... 0x007a ['a' ... 'z']
680 * 0x0024 ['$']
681 * 0x005f ['_']
682 */
683
684 /* ASCII (and EOF) fast path -- quick accept and reject */
685 if (cp <= 0x7fL) {
686 #if defined(DUK_USE_IDCHAR_FASTPATH)
687 return (cp >= 0) && (duk_is_idchar_tab[cp] > 0);
688 #else
689 if ((cp >= 'a' && cp <= 'z') ||
690 (cp >= 'A' && cp <= 'Z') ||
691 cp == '_' || cp == '$') {
692 return 1;
693 }
694 return 0;
695 #endif
696 }
697
698 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
699
700 #if defined(DUK_USE_SOURCE_NONBMP)
701 if (duk__uni_range_match(duk_unicode_ids_noa,
702 (duk_size_t) sizeof(duk_unicode_ids_noa),
703 (duk_codepoint_t) cp)) {
704 return 1;
705 }
706 return 0;
707 #else
708 if (cp < 0x10000L) {
709 if (duk__uni_range_match(duk_unicode_ids_noabmp,
710 sizeof(duk_unicode_ids_noabmp),
711 (duk_codepoint_t) cp)) {
712 return 1;
713 }
714 return 0;
715 } else {
716 /* without explicit non-BMP support, assume non-BMP characters
717 * are always accepted as identifier characters.
718 */
719 return 1;
720 }
721 #endif
722 }
723
724 /*
725 * "IdentifierPart" production check.
726 */
727
duk_unicode_is_identifier_part(duk_codepoint_t cp)728 DUK_INTERNAL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp) {
729 /*
730 * E5 Section 7.6:
731 *
732 * IdentifierPart:
733 * IdentifierStart
734 * UnicodeCombiningMark
735 * UnicodeDigit
736 * UnicodeConnectorPunctuation
737 * <ZWNJ> [U+200C]
738 * <ZWJ> [U+200D]
739 *
740 * IdentifierPart production has one multi-character production
741 * as part of its IdentifierStart alternative. The '\' character
742 * of an escape sequence is not matched here, see discussion in
743 * duk_unicode_is_identifier_start().
744 *
745 * To match non-ASCII characters (codepoints >= 0x80), a very slow
746 * linear range-by-range scan is used. The codepoint is first compared
747 * to the IdentifierStart ranges, and if it doesn't match, then to a
748 * set consisting of code points in IdentifierPart but not in
749 * IdentifierStart. This is done to keep the unicode range data small,
750 * at the expense of speed.
751 *
752 * The ASCII fast path consists of:
753 *
754 * 0x0030 ... 0x0039 ['0' ... '9', UnicodeDigit]
755 * 0x0041 ... 0x005a ['A' ... 'Z', IdentifierStart]
756 * 0x0061 ... 0x007a ['a' ... 'z', IdentifierStart]
757 * 0x0024 ['$', IdentifierStart]
758 * 0x005f ['_', IdentifierStart and
759 * UnicodeConnectorPunctuation]
760 *
761 * UnicodeCombiningMark has no code points <= 0x7f.
762 *
763 * The matching code reuses the "identifier start" tables, and then
764 * consults a separate range set for characters in "identifier part"
765 * but not in "identifier start". These can be extracted with the
766 * "tools/extract_chars.py" script.
767 *
768 * UnicodeCombiningMark -> categories Mn, Mc
769 * UnicodeDigit -> categories Nd
770 * UnicodeConnectorPunctuation -> categories Pc
771 */
772
773 /* ASCII (and EOF) fast path -- quick accept and reject */
774 if (cp <= 0x7fL) {
775 #if defined(DUK_USE_IDCHAR_FASTPATH)
776 return (cp >= 0) && (duk_is_idchar_tab[cp] != 0);
777 #else
778 if ((cp >= 'a' && cp <= 'z') ||
779 (cp >= 'A' && cp <= 'Z') ||
780 (cp >= '0' && cp <= '9') ||
781 cp == '_' || cp == '$') {
782 return 1;
783 }
784 return 0;
785 #endif
786 }
787
788 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
789
790 #if defined(DUK_USE_SOURCE_NONBMP)
791 if (duk__uni_range_match(duk_unicode_ids_noa,
792 sizeof(duk_unicode_ids_noa),
793 (duk_codepoint_t) cp) ||
794 duk__uni_range_match(duk_unicode_idp_m_ids_noa,
795 sizeof(duk_unicode_idp_m_ids_noa),
796 (duk_codepoint_t) cp)) {
797 return 1;
798 }
799 return 0;
800 #else
801 if (cp < 0x10000L) {
802 if (duk__uni_range_match(duk_unicode_ids_noabmp,
803 sizeof(duk_unicode_ids_noabmp),
804 (duk_codepoint_t) cp) ||
805 duk__uni_range_match(duk_unicode_idp_m_ids_noabmp,
806 sizeof(duk_unicode_idp_m_ids_noabmp),
807 (duk_codepoint_t) cp)) {
808 return 1;
809 }
810 return 0;
811 } else {
812 /* without explicit non-BMP support, assume non-BMP characters
813 * are always accepted as identifier characters.
814 */
815 return 1;
816 }
817 #endif
818 }
819
820 /*
821 * Unicode letter check.
822 */
823
duk_unicode_is_letter(duk_codepoint_t cp)824 DUK_INTERNAL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp) {
825 /*
826 * Unicode letter is now taken to be the categories:
827 *
828 * Lu, Ll, Lt, Lm, Lo
829 *
830 * (Not sure if this is exactly correct.)
831 *
832 * The ASCII fast path consists of:
833 *
834 * 0x0041 ... 0x005a ['A' ... 'Z']
835 * 0x0061 ... 0x007a ['a' ... 'z']
836 */
837
838 /* ASCII (and EOF) fast path -- quick accept and reject */
839 if (cp <= 0x7fL) {
840 if ((cp >= 'a' && cp <= 'z') ||
841 (cp >= 'A' && cp <= 'Z')) {
842 return 1;
843 }
844 return 0;
845 }
846
847 /* Non-ASCII slow path (range-by-range linear comparison), very slow */
848
849 #if defined(DUK_USE_SOURCE_NONBMP)
850 if (duk__uni_range_match(duk_unicode_ids_noa,
851 sizeof(duk_unicode_ids_noa),
852 (duk_codepoint_t) cp) &&
853 !duk__uni_range_match(duk_unicode_ids_m_let_noa,
854 sizeof(duk_unicode_ids_m_let_noa),
855 (duk_codepoint_t) cp)) {
856 return 1;
857 }
858 return 0;
859 #else
860 if (cp < 0x10000L) {
861 if (duk__uni_range_match(duk_unicode_ids_noabmp,
862 sizeof(duk_unicode_ids_noabmp),
863 (duk_codepoint_t) cp) &&
864 !duk__uni_range_match(duk_unicode_ids_m_let_noabmp,
865 sizeof(duk_unicode_ids_m_let_noabmp),
866 (duk_codepoint_t) cp)) {
867 return 1;
868 }
869 return 0;
870 } else {
871 /* without explicit non-BMP support, assume non-BMP characters
872 * are always accepted as letters.
873 */
874 return 1;
875 }
876 #endif
877 }
878
879 /*
880 * Complex case conversion helper which decodes a bit-packed conversion
881 * control stream generated by tools/extract_caseconv.py. The conversion
882 * is very slow because it runs through the conversion data in a linear
883 * fashion to save space (which is why ASCII characters have a special
884 * fast path before arriving here).
885 *
886 * The particular bit counts etc have been determined experimentally to
887 * be small but still sufficient, and must match the Python script
888 * (tools/extract_caseconv.py).
889 *
890 * The return value is the case converted codepoint or -1 if the conversion
891 * results in multiple characters (this is useful for regexp Canonicalization
892 * operation). If 'buf' is not NULL, the result codepoint(s) are also
893 * appended to the hbuffer.
894 *
895 * Context and locale specific rules must be checked before consulting
896 * this function.
897 */
898
899 DUK_LOCAL
duk__slow_case_conversion(duk_hthread * thr,duk_bufwriter_ctx * bw,duk_codepoint_t cp,duk_bitdecoder_ctx * bd_ctx)900 duk_codepoint_t duk__slow_case_conversion(duk_hthread *thr,
901 duk_bufwriter_ctx *bw,
902 duk_codepoint_t cp,
903 duk_bitdecoder_ctx *bd_ctx) {
904 duk_small_int_t skip = 0;
905 duk_small_int_t n;
906 duk_small_int_t t;
907 duk_small_int_t count;
908 duk_codepoint_t tmp_cp;
909 duk_codepoint_t start_i;
910 duk_codepoint_t start_o;
911
912 DUK_ASSERT(bd_ctx != NULL);
913 DUK_UNREF(thr);
914
915 DUK_DDD(DUK_DDDPRINT("slow case conversion for codepoint: %ld", (long) cp));
916
917 /* range conversion with a "skip" */
918 DUK_DDD(DUK_DDDPRINT("checking ranges"));
919 for (;;) {
920 skip++;
921 n = (duk_small_int_t) duk_bd_decode(bd_ctx, 6);
922 if (n == 0x3f) {
923 /* end marker */
924 break;
925 }
926 DUK_DDD(DUK_DDDPRINT("skip=%ld, n=%ld", (long) skip, (long) n));
927
928 while (n--) {
929 start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
930 start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
931 count = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
932 DUK_DDD(DUK_DDDPRINT("range: start_i=%ld, start_o=%ld, count=%ld, skip=%ld",
933 (long) start_i, (long) start_o, (long) count, (long) skip));
934
935 if (cp >= start_i) {
936 tmp_cp = cp - start_i; /* always >= 0 */
937 if (tmp_cp < (duk_codepoint_t) count * (duk_codepoint_t) skip &&
938 (tmp_cp % (duk_codepoint_t) skip) == 0) {
939 DUK_DDD(DUK_DDDPRINT("range matches input codepoint"));
940 cp = start_o + tmp_cp;
941 goto single;
942 }
943 }
944 }
945 }
946
947 /* 1:1 conversion */
948 n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
949 DUK_DDD(DUK_DDDPRINT("checking 1:1 conversions (count %ld)", (long) n));
950 while (n--) {
951 start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
952 start_o = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
953 DUK_DDD(DUK_DDDPRINT("1:1 conversion %ld -> %ld", (long) start_i, (long) start_o));
954 if (cp == start_i) {
955 DUK_DDD(DUK_DDDPRINT("1:1 matches input codepoint"));
956 cp = start_o;
957 goto single;
958 }
959 }
960
961 /* complex, multicharacter conversion */
962 n = (duk_small_int_t) duk_bd_decode(bd_ctx, 7);
963 DUK_DDD(DUK_DDDPRINT("checking 1:n conversions (count %ld)", (long) n));
964 while (n--) {
965 start_i = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
966 t = (duk_small_int_t) duk_bd_decode(bd_ctx, 2);
967 DUK_DDD(DUK_DDDPRINT("1:n conversion %ld -> %ld chars", (long) start_i, (long) t));
968 if (cp == start_i) {
969 DUK_DDD(DUK_DDDPRINT("1:n matches input codepoint"));
970 if (bw != NULL) {
971 while (t--) {
972 tmp_cp = (duk_codepoint_t) duk_bd_decode(bd_ctx, 16);
973 DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) tmp_cp);
974 }
975 }
976 return -1;
977 } else {
978 while (t--) {
979 (void) duk_bd_decode(bd_ctx, 16);
980 }
981 }
982 }
983
984 /* default: no change */
985 DUK_DDD(DUK_DDDPRINT("no rule matches, output is same as input"));
986 /* fall through */
987
988 single:
989 if (bw != NULL) {
990 DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
991 }
992 return cp;
993 }
994
995 /*
996 * Case conversion helper, with context/local sensitivity.
997 * For proper case conversion, one needs to know the character
998 * and the preceding and following characters, as well as
999 * locale/language.
1000 */
1001
1002 /* XXX: add 'language' argument when locale/language sensitive rule
1003 * support added.
1004 */
1005 DUK_LOCAL
duk__case_transform_helper(duk_hthread * thr,duk_bufwriter_ctx * bw,duk_codepoint_t cp,duk_codepoint_t prev,duk_codepoint_t next,duk_bool_t uppercase)1006 duk_codepoint_t duk__case_transform_helper(duk_hthread *thr,
1007 duk_bufwriter_ctx *bw,
1008 duk_codepoint_t cp,
1009 duk_codepoint_t prev,
1010 duk_codepoint_t next,
1011 duk_bool_t uppercase) {
1012 duk_bitdecoder_ctx bd_ctx;
1013
1014 /* fast path for ASCII */
1015 if (cp < 0x80L) {
1016 /* XXX: there are language sensitive rules for the ASCII range.
1017 * If/when language/locale support is implemented, they need to
1018 * be implemented here for the fast path. There are no context
1019 * sensitive rules for ASCII range.
1020 */
1021
1022 if (uppercase) {
1023 if (cp >= 'a' && cp <= 'z') {
1024 cp = cp - 'a' + 'A';
1025 }
1026 } else {
1027 if (cp >= 'A' && cp <= 'Z') {
1028 cp = cp - 'A' + 'a';
1029 }
1030 }
1031
1032 if (bw != NULL) {
1033 DUK_BW_WRITE_RAW_U8(thr, bw, (duk_uint8_t) cp);
1034 }
1035 return cp;
1036 }
1037
1038 /* context and locale specific rules which cannot currently be represented
1039 * in the caseconv bitstream: hardcoded rules in C
1040 */
1041 if (uppercase) {
1042 /* XXX: turkish / azeri */
1043 } else {
1044 /*
1045 * Final sigma context specific rule. This is a rather tricky
1046 * rule and this handling is probably not 100% correct now.
1047 * The rule is not locale/language specific so it is supported.
1048 */
1049
1050 if (cp == 0x03a3L && /* U+03A3 = GREEK CAPITAL LETTER SIGMA */
1051 duk_unicode_is_letter(prev) && /* prev exists and is not a letter */
1052 !duk_unicode_is_letter(next)) { /* next does not exist or next is not a letter */
1053 /* Capital sigma occurred at "end of word", lowercase to
1054 * U+03C2 = GREEK SMALL LETTER FINAL SIGMA. Otherwise
1055 * fall through and let the normal rules lowercase it to
1056 * U+03C3 = GREEK SMALL LETTER SIGMA.
1057 */
1058 cp = 0x03c2L;
1059 goto singlechar;
1060 }
1061
1062 /* XXX: lithuanian not implemented */
1063 /* XXX: lithuanian, explicit dot rules */
1064 /* XXX: turkish / azeri, lowercase rules */
1065 }
1066
1067 /* 1:1 or special conversions, but not locale/context specific: script generated rules */
1068 duk_memzero(&bd_ctx, sizeof(bd_ctx));
1069 if (uppercase) {
1070 bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_uc;
1071 bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_uc);
1072 } else {
1073 bd_ctx.data = (const duk_uint8_t *) duk_unicode_caseconv_lc;
1074 bd_ctx.length = (duk_size_t) sizeof(duk_unicode_caseconv_lc);
1075 }
1076 return duk__slow_case_conversion(thr, bw, cp, &bd_ctx);
1077
1078 singlechar:
1079 if (bw != NULL) {
1080 DUK_BW_WRITE_RAW_XUTF8(thr, bw, (duk_ucodepoint_t) cp);
1081 }
1082 return cp;
1083
1084 /* unused now, not needed until Turkish/Azeri */
1085 #if 0
1086 nochar:
1087 return -1;
1088 #endif
1089 }
1090
1091 /*
1092 * Replace valstack top with case converted version.
1093 */
1094
duk_unicode_case_convert_string(duk_hthread * thr,duk_bool_t uppercase)1095 DUK_INTERNAL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase) {
1096 duk_hstring *h_input;
1097 duk_bufwriter_ctx bw_alloc;
1098 duk_bufwriter_ctx *bw;
1099 const duk_uint8_t *p, *p_start, *p_end;
1100 duk_codepoint_t prev, curr, next;
1101
1102 h_input = duk_require_hstring(thr, -1); /* Accept symbols. */
1103 DUK_ASSERT(h_input != NULL);
1104
1105 bw = &bw_alloc;
1106 DUK_BW_INIT_PUSHBUF(thr, bw, DUK_HSTRING_GET_BYTELEN(h_input));
1107
1108 /* [ ... input buffer ] */
1109
1110 p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h_input);
1111 p_end = p_start + DUK_HSTRING_GET_BYTELEN(h_input);
1112 p = p_start;
1113
1114 prev = -1; DUK_UNREF(prev);
1115 curr = -1;
1116 next = -1;
1117 for (;;) {
1118 prev = curr;
1119 curr = next;
1120 next = -1;
1121 if (p < p_end) {
1122 next = (duk_codepoint_t) duk_unicode_decode_xutf8_checked(thr, &p, p_start, p_end);
1123 } else {
1124 /* end of input and last char has been processed */
1125 if (curr < 0) {
1126 break;
1127 }
1128 }
1129
1130 /* on first round, skip */
1131 if (curr >= 0) {
1132 /* XXX: could add a fast path to process chunks of input codepoints,
1133 * but relative benefit would be quite small.
1134 */
1135
1136 /* Ensure space for maximum multi-character result; estimate is overkill. */
1137 DUK_BW_ENSURE(thr, bw, 8 * DUK_UNICODE_MAX_XUTF8_LENGTH);
1138
1139 duk__case_transform_helper(thr,
1140 bw,
1141 (duk_codepoint_t) curr,
1142 prev,
1143 next,
1144 uppercase);
1145 }
1146 }
1147
1148 DUK_BW_COMPACT(thr, bw);
1149 (void) duk_buffer_to_string(thr, -1); /* Safe, output is encoded. */
1150 /* invalidates h_buf pointer */
1151 duk_remove_m2(thr);
1152 }
1153
1154 #if defined(DUK_USE_REGEXP_SUPPORT)
1155
1156 /*
1157 * Canonicalize() abstract operation needed for canonicalization of individual
1158 * codepoints during regexp compilation and execution, see E5 Section 15.10.2.8.
1159 * Note that codepoints are canonicalized one character at a time, so no context
1160 * specific rules can apply. Locale specific rules can apply, though.
1161 */
1162
duk_unicode_re_canonicalize_char(duk_hthread * thr,duk_codepoint_t cp)1163 DUK_INTERNAL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp) {
1164 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
1165 /* Fast canonicalization lookup at the cost of 128kB footprint. */
1166 DUK_ASSERT(cp >= 0);
1167 DUK_UNREF(thr);
1168 if (DUK_LIKELY(cp < 0x10000L)) {
1169 return (duk_codepoint_t) duk_unicode_re_canon_lookup[cp];
1170 }
1171 return cp;
1172 #else /* DUK_USE_REGEXP_CANON_WORKAROUND */
1173 duk_codepoint_t y;
1174
1175 y = duk__case_transform_helper(thr,
1176 NULL, /* NULL is allowed, no output */
1177 cp, /* curr char */
1178 -1, /* prev char */
1179 -1, /* next char */
1180 1); /* uppercase */
1181
1182 if ((y < 0) || (cp >= 0x80 && y < 0x80)) {
1183 /* multiple codepoint conversion or non-ASCII mapped to ASCII
1184 * --> leave as is.
1185 */
1186 return cp;
1187 }
1188
1189 return y;
1190 #endif /* DUK_USE_REGEXP_CANON_WORKAROUND */
1191 }
1192
1193 /*
1194 * E5 Section 15.10.2.6 "IsWordChar" abstract operation. Assume
1195 * x < 0 for characters read outside the string.
1196 */
1197
duk_unicode_re_is_wordchar(duk_codepoint_t x)1198 DUK_INTERNAL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t x) {
1199 /*
1200 * Note: the description in E5 Section 15.10.2.6 has a typo, it
1201 * contains 'A' twice and lacks 'a'; the intent is [0-9a-zA-Z_].
1202 */
1203 if ((x >= '0' && x <= '9') ||
1204 (x >= 'a' && x <= 'z') ||
1205 (x >= 'A' && x <= 'Z') ||
1206 (x == '_')) {
1207 return 1;
1208 }
1209 return 0;
1210 }
1211
1212 /*
1213 * Regexp range tables
1214 */
1215
1216 /* exposed because lexer needs these too */
1217 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_digit[2] = {
1218 (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
1219 };
1220 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_white[22] = {
1221 (duk_uint16_t) 0x0009UL, (duk_uint16_t) 0x000DUL,
1222 (duk_uint16_t) 0x0020UL, (duk_uint16_t) 0x0020UL,
1223 (duk_uint16_t) 0x00A0UL, (duk_uint16_t) 0x00A0UL,
1224 (duk_uint16_t) 0x1680UL, (duk_uint16_t) 0x1680UL,
1225 (duk_uint16_t) 0x180EUL, (duk_uint16_t) 0x180EUL,
1226 (duk_uint16_t) 0x2000UL, (duk_uint16_t) 0x200AUL,
1227 (duk_uint16_t) 0x2028UL, (duk_uint16_t) 0x2029UL,
1228 (duk_uint16_t) 0x202FUL, (duk_uint16_t) 0x202FUL,
1229 (duk_uint16_t) 0x205FUL, (duk_uint16_t) 0x205FUL,
1230 (duk_uint16_t) 0x3000UL, (duk_uint16_t) 0x3000UL,
1231 (duk_uint16_t) 0xFEFFUL, (duk_uint16_t) 0xFEFFUL,
1232 };
1233 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_wordchar[8] = {
1234 (duk_uint16_t) 0x0030UL, (duk_uint16_t) 0x0039UL,
1235 (duk_uint16_t) 0x0041UL, (duk_uint16_t) 0x005AUL,
1236 (duk_uint16_t) 0x005FUL, (duk_uint16_t) 0x005FUL,
1237 (duk_uint16_t) 0x0061UL, (duk_uint16_t) 0x007AUL,
1238 };
1239 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_digit[4] = {
1240 (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
1241 (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0xFFFFUL,
1242 };
1243 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_white[24] = {
1244 (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x0008UL,
1245 (duk_uint16_t) 0x000EUL, (duk_uint16_t) 0x001FUL,
1246 (duk_uint16_t) 0x0021UL, (duk_uint16_t) 0x009FUL,
1247 (duk_uint16_t) 0x00A1UL, (duk_uint16_t) 0x167FUL,
1248 (duk_uint16_t) 0x1681UL, (duk_uint16_t) 0x180DUL,
1249 (duk_uint16_t) 0x180FUL, (duk_uint16_t) 0x1FFFUL,
1250 (duk_uint16_t) 0x200BUL, (duk_uint16_t) 0x2027UL,
1251 (duk_uint16_t) 0x202AUL, (duk_uint16_t) 0x202EUL,
1252 (duk_uint16_t) 0x2030UL, (duk_uint16_t) 0x205EUL,
1253 (duk_uint16_t) 0x2060UL, (duk_uint16_t) 0x2FFFUL,
1254 (duk_uint16_t) 0x3001UL, (duk_uint16_t) 0xFEFEUL,
1255 (duk_uint16_t) 0xFF00UL, (duk_uint16_t) 0xFFFFUL,
1256 };
1257 DUK_INTERNAL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10] = {
1258 (duk_uint16_t) 0x0000UL, (duk_uint16_t) 0x002FUL,
1259 (duk_uint16_t) 0x003AUL, (duk_uint16_t) 0x0040UL,
1260 (duk_uint16_t) 0x005BUL, (duk_uint16_t) 0x005EUL,
1261 (duk_uint16_t) 0x0060UL, (duk_uint16_t) 0x0060UL,
1262 (duk_uint16_t) 0x007BUL, (duk_uint16_t) 0xFFFFUL,
1263 };
1264
1265 #endif /* DUK_USE_REGEXP_SUPPORT */
1266