1 /*
2 * WHATWG Encoding API built-ins
3 *
4 * API specification: https://encoding.spec.whatwg.org/#api
5 * Web IDL: https://www.w3.org/TR/WebIDL/
6 */
7
8 #include "duk_internal.h"
9
10 /*
11 * Data structures for encoding/decoding
12 */
13
14 typedef struct {
15 duk_uint8_t *out; /* where to write next byte(s) */
16 duk_codepoint_t lead; /* lead surrogate */
17 } duk__encode_context;
18
19 typedef struct {
20 /* UTF-8 decoding state */
21 duk_codepoint_t codepoint; /* built up incrementally */
22 duk_uint8_t upper; /* max value of next byte (decode error otherwise) */
23 duk_uint8_t lower; /* min value of next byte (ditto) */
24 duk_uint8_t needed; /* how many more bytes we need */
25 duk_uint8_t bom_handled; /* BOM seen or no longer expected */
26
27 /* Decoder configuration */
28 duk_uint8_t fatal;
29 duk_uint8_t ignore_bom;
30 } duk__decode_context;
31
/* duk__utf8_decode_next() returns a signed duk_codepoint_t: a non-negative
 * value is a completed codepoint, a negative value is one of the status
 * codes below.
 */
#define DUK__CP_CONTINUE (-1) /* byte consumed, no completed codepoint yet */
#define DUK__CP_ERROR (-2) /* decoding error */
#define DUK__CP_RETRY (-3) /* decoding error; the last byte must be fed again */
38
39 /*
40 * Raw helpers for encoding/decoding
41 */
42
43 /* Emit UTF-8 (= CESU-8) encoded U+FFFD (replacement char), i.e. ef bf bd. */
duk__utf8_emit_repl(duk_uint8_t * ptr)44 DUK_LOCAL duk_uint8_t *duk__utf8_emit_repl(duk_uint8_t *ptr) {
45 *ptr++ = 0xef;
46 *ptr++ = 0xbf;
47 *ptr++ = 0xbd;
48 return ptr;
49 }
50
duk__utf8_decode_init(duk__decode_context * dec_ctx)51 DUK_LOCAL void duk__utf8_decode_init(duk__decode_context *dec_ctx) {
52 /* (Re)init the decoding state of 'dec_ctx' but leave decoder
53 * configuration fields untouched.
54 */
55 dec_ctx->codepoint = 0x0000L;
56 dec_ctx->upper = 0xbf;
57 dec_ctx->lower = 0x80;
58 dec_ctx->needed = 0;
59 dec_ctx->bom_handled = 0;
60 }
61
duk__utf8_decode_next(duk__decode_context * dec_ctx,duk_uint8_t x)62 DUK_LOCAL duk_codepoint_t duk__utf8_decode_next(duk__decode_context *dec_ctx, duk_uint8_t x) {
63 /*
64 * UTF-8 algorithm based on the Encoding specification:
65 * https://encoding.spec.whatwg.org/#utf-8-decoder
66 *
67 * Two main states: decoding initial byte vs. decoding continuation
68 * bytes. Shortest length encoding is validated by restricting the
69 * allowed range of first continuation byte using 'lower' and 'upper'.
70 */
71
72 if (dec_ctx->needed == 0) {
73 /* process initial byte */
74 if (x <= 0x7f) {
75 /* U+0000-U+007F, 1 byte (ASCII) */
76 return (duk_codepoint_t) x;
77 } else if (x >= 0xc2 && x <= 0xdf) {
78 /* U+0080-U+07FF, 2 bytes */
79 dec_ctx->needed = 1;
80 dec_ctx->codepoint = x & 0x1f;
81 DUK_ASSERT(dec_ctx->lower == 0x80);
82 DUK_ASSERT(dec_ctx->upper == 0xbf);
83 return DUK__CP_CONTINUE;
84 } else if (x >= 0xe0 && x <= 0xef) {
85 /* U+0800-U+FFFF, 3 bytes */
86 if (x == 0xe0) {
87 dec_ctx->lower = 0xa0;
88 DUK_ASSERT(dec_ctx->upper == 0xbf);
89 } else if (x == 0xed) {
90 DUK_ASSERT(dec_ctx->lower == 0x80);
91 dec_ctx->upper = 0x9f;
92 }
93 dec_ctx->needed = 2;
94 dec_ctx->codepoint = x & 0x0f;
95 return DUK__CP_CONTINUE;
96 } else if (x >= 0xf0 && x <= 0xf4) {
97 /* U+010000-U+10FFFF, 4 bytes */
98 if (x == 0xf0) {
99 dec_ctx->lower = 0x90;
100 DUK_ASSERT(dec_ctx->upper == 0xbf);
101 } else if (x == 0xf4) {
102 DUK_ASSERT(dec_ctx->lower == 0x80);
103 dec_ctx->upper = 0x8f;
104 }
105 dec_ctx->needed = 3;
106 dec_ctx->codepoint = x & 0x07;
107 return DUK__CP_CONTINUE;
108 } else {
109 /* not a legal initial byte */
110 return DUK__CP_ERROR;
111 }
112 } else {
113 /* process continuation byte */
114 if (x >= dec_ctx->lower && x <= dec_ctx->upper) {
115 dec_ctx->lower = 0x80;
116 dec_ctx->upper = 0xbf;
117 dec_ctx->codepoint = (dec_ctx->codepoint << 6) | (x & 0x3f);
118 if (--dec_ctx->needed > 0) {
119 /* need more bytes */
120 return DUK__CP_CONTINUE;
121 } else {
122 /* got a codepoint */
123 duk_codepoint_t ret;
124 DUK_ASSERT(dec_ctx->codepoint <= 0x10ffffL); /* Decoding rules guarantee. */
125 ret = dec_ctx->codepoint;
126 dec_ctx->codepoint = 0x0000L;
127 dec_ctx->needed = 0;
128 return ret;
129 }
130 } else {
131 /* We just encountered an illegal UTF-8 continuation byte. This might
132 * be the initial byte of the next character; if we return a plain
133 * error status and the decoder is in replacement mode, the character
134 * will be masked. We still need to alert the caller to the error
135 * though.
136 */
137 dec_ctx->codepoint = 0x0000L;
138 dec_ctx->needed = 0;
139 dec_ctx->lower = 0x80;
140 dec_ctx->upper = 0xbf;
141 return DUK__CP_RETRY;
142 }
143 }
144 }
145
146 #if defined(DUK_USE_ENCODING_BUILTINS)
duk__utf8_encode_char(void * udata,duk_codepoint_t codepoint)147 DUK_LOCAL void duk__utf8_encode_char(void *udata, duk_codepoint_t codepoint) {
148 duk__encode_context *enc_ctx;
149
150 DUK_ASSERT(codepoint >= 0);
151 enc_ctx = (duk__encode_context *) udata;
152 DUK_ASSERT(enc_ctx != NULL);
153
154 #if !defined(DUK_USE_PREFER_SIZE)
155 if (codepoint <= 0x7f && enc_ctx->lead == 0x0000L) {
156 /* Fast path for ASCII. */
157 *enc_ctx->out++ = (duk_uint8_t) codepoint;
158 return;
159 }
160 #endif
161
162 if (DUK_UNLIKELY(codepoint > 0x10ffffL)) {
163 /* cannot legally encode in UTF-8 */
164 codepoint = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
165 } else if (codepoint >= 0xd800L && codepoint <= 0xdfffL) {
166 if (codepoint <= 0xdbffL) {
167 /* high surrogate */
168 duk_codepoint_t prev_lead = enc_ctx->lead;
169 enc_ctx->lead = codepoint;
170 if (prev_lead == 0x0000L) {
171 /* high surrogate, no output */
172 return;
173 } else {
174 /* consecutive high surrogates, consider first one unpaired */
175 codepoint = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
176 }
177 } else {
178 /* low surrogate */
179 if (enc_ctx->lead != 0x0000L) {
180 codepoint = (duk_codepoint_t) (0x010000L + ((enc_ctx->lead - 0xd800L) << 10) + (codepoint - 0xdc00L));
181 enc_ctx->lead = 0x0000L;
182 } else {
183 /* unpaired low surrogate */
184 DUK_ASSERT(enc_ctx->lead == 0x0000L);
185 codepoint = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
186 }
187 }
188 } else {
189 if (enc_ctx->lead != 0x0000L) {
190 /* unpaired high surrogate: emit replacement character and the input codepoint */
191 enc_ctx->lead = 0x0000L;
192 enc_ctx->out = duk__utf8_emit_repl(enc_ctx->out);
193 }
194 }
195
196 /* Codepoint may be original input, a decoded surrogate pair, or may
197 * have been replaced with U+FFFD.
198 */
199 enc_ctx->out += duk_unicode_encode_xutf8((duk_ucodepoint_t) codepoint, enc_ctx->out);
200 }
201 #endif /* DUK_USE_ENCODING_BUILTINS */
202
203 /* Shared helper for buffer-to-string using a TextDecoder() compatible UTF-8
204 * decoder.
205 */
duk__decode_helper(duk_hthread * thr,duk__decode_context * dec_ctx)206 DUK_LOCAL duk_ret_t duk__decode_helper(duk_hthread *thr, duk__decode_context *dec_ctx) {
207 const duk_uint8_t *input;
208 duk_size_t len = 0;
209 duk_size_t len_tmp;
210 duk_bool_t stream = 0;
211 duk_codepoint_t codepoint;
212 duk_uint8_t *output;
213 const duk_uint8_t *in;
214 duk_uint8_t *out;
215
216 DUK_ASSERT(dec_ctx != NULL);
217
218 /* Careful with input buffer pointer: any side effects involving
219 * code execution (e.g. getters, coercion calls, and finalizers)
220 * may cause a resize and invalidate a pointer we've read. This
221 * is why the pointer is actually looked up at the last minute.
222 * Argument validation must still happen first to match WHATWG
223 * required side effect order.
224 */
225
226 if (duk_is_undefined(thr, 0)) {
227 duk_push_fixed_buffer_nozero(thr, 0);
228 duk_replace(thr, 0);
229 }
230 (void) duk_require_buffer_data(thr, 0, &len); /* Need 'len', avoid pointer. */
231
232 if (duk_check_type_mask(thr, 1, DUK_TYPE_MASK_UNDEFINED |
233 DUK_TYPE_MASK_NULL |
234 DUK_TYPE_MASK_NONE)) {
235 /* Use defaults, treat missing value like undefined. */
236 } else {
237 duk_require_type_mask(thr, 1, DUK_TYPE_MASK_UNDEFINED |
238 DUK_TYPE_MASK_NULL |
239 DUK_TYPE_MASK_LIGHTFUNC |
240 DUK_TYPE_MASK_BUFFER |
241 DUK_TYPE_MASK_OBJECT);
242 if (duk_get_prop_literal(thr, 1, "stream")) {
243 stream = duk_to_boolean(thr, -1);
244 }
245 }
246
247 /* Allowance is 3*len in the general case because all bytes may potentially
248 * become U+FFFD. If the first byte completes a non-BMP codepoint it will
249 * decode to a CESU-8 surrogate pair (6 bytes) so we allow 3 extra bytes to
250 * compensate: (1*3)+3 = 6. Non-BMP codepoints are safe otherwise because
251 * the 4->6 expansion is well under the 3x allowance.
252 *
253 * XXX: As with TextEncoder, need a better buffer allocation strategy here.
254 */
255 if (len >= (DUK_HBUFFER_MAX_BYTELEN / 3) - 3) {
256 DUK_ERROR_TYPE(thr, DUK_STR_RESULT_TOO_LONG);
257 DUK_WO_NORETURN(return 0;);
258 }
259 output = (duk_uint8_t *) duk_push_fixed_buffer_nozero(thr, 3 + (3 * len)); /* used parts will be always manually written over */
260
261 input = (const duk_uint8_t *) duk_get_buffer_data(thr, 0, &len_tmp);
262 DUK_ASSERT(input != NULL || len == 0);
263 if (DUK_UNLIKELY(len != len_tmp)) {
264 /* Very unlikely but possible: source buffer was resized by
265 * a side effect when fixed buffer was pushed. Output buffer
266 * may not be large enough to hold output, so just fail if
267 * length has changed.
268 */
269 DUK_D(DUK_DPRINT("input buffer resized by side effect, fail"));
270 goto fail_type;
271 }
272
273 /* From this point onwards it's critical that no side effect occur
274 * which may disturb 'input': finalizer execution, property accesses,
275 * active coercions, etc. Even an allocation related mark-and-sweep
276 * may affect the pointer because it may trigger a pending finalizer.
277 */
278
279 in = input;
280 out = output;
281 while (in < input + len) {
282 codepoint = duk__utf8_decode_next(dec_ctx, *in++);
283 if (codepoint < 0) {
284 if (codepoint == DUK__CP_CONTINUE) {
285 continue;
286 }
287
288 /* Decoding error with or without retry. */
289 DUK_ASSERT(codepoint == DUK__CP_ERROR || codepoint == DUK__CP_RETRY);
290 if (codepoint == DUK__CP_RETRY) {
291 --in; /* retry last byte */
292 }
293 /* replacement mode: replace with U+FFFD */
294 codepoint = DUK_UNICODE_CP_REPLACEMENT_CHARACTER;
295 if (dec_ctx->fatal) {
296 /* fatal mode: throw a TypeError */
297 goto fail_type;
298 }
299 /* Continue with 'codepoint', Unicode replacement. */
300 }
301 DUK_ASSERT(codepoint >= 0x0000L && codepoint <= 0x10ffffL);
302
303 if (!dec_ctx->bom_handled) {
304 dec_ctx->bom_handled = 1;
305 if (codepoint == 0xfeffL && !dec_ctx->ignore_bom) {
306 continue;
307 }
308 }
309
310 out += duk_unicode_encode_cesu8((duk_ucodepoint_t) codepoint, out);
311 DUK_ASSERT(out <= output + (3 + (3 * len)));
312 }
313
314 if (!stream) {
315 if (dec_ctx->needed != 0) {
316 /* truncated sequence at end of buffer */
317 if (dec_ctx->fatal) {
318 goto fail_type;
319 } else {
320 out += duk_unicode_encode_cesu8(DUK_UNICODE_CP_REPLACEMENT_CHARACTER, out);
321 DUK_ASSERT(out <= output + (3 + (3 * len)));
322 }
323 }
324 duk__utf8_decode_init(dec_ctx); /* Initialize decoding state for potential reuse. */
325 }
326
327 /* Output buffer is fixed and thus stable even if there had been
328 * side effects (which there shouldn't be).
329 */
330 duk_push_lstring(thr, (const char *) output, (duk_size_t) (out - output));
331 return 1;
332
333 fail_type:
334 DUK_ERROR_TYPE(thr, DUK_STR_UTF8_DECODE_FAILED);
335 DUK_WO_NORETURN(return 0;);
336 }
337
338 /*
339 * Built-in bindings
340 */
341
342 #if defined(DUK_USE_ENCODING_BUILTINS)
duk_bi_textencoder_constructor(duk_hthread * thr)343 DUK_INTERNAL duk_ret_t duk_bi_textencoder_constructor(duk_hthread *thr) {
344 /* TextEncoder currently requires no persistent state, so the constructor
345 * does nothing on purpose.
346 */
347
348 duk_require_constructor_call(thr);
349 return 0;
350 }
351
duk_bi_textencoder_prototype_encoding_getter(duk_hthread * thr)352 DUK_INTERNAL duk_ret_t duk_bi_textencoder_prototype_encoding_getter(duk_hthread *thr) {
353 duk_push_literal(thr, "utf-8");
354 return 1;
355 }
356
duk_bi_textencoder_prototype_encode(duk_hthread * thr)357 DUK_INTERNAL duk_ret_t duk_bi_textencoder_prototype_encode(duk_hthread *thr) {
358 duk__encode_context enc_ctx;
359 duk_size_t len;
360 duk_size_t final_len;
361 duk_uint8_t *output;
362
363 DUK_ASSERT_TOP(thr, 1);
364 if (duk_is_undefined(thr, 0)) {
365 len = 0;
366 } else {
367 duk_hstring *h_input;
368
369 h_input = duk_to_hstring(thr, 0);
370 DUK_ASSERT(h_input != NULL);
371
372 len = (duk_size_t) DUK_HSTRING_GET_CHARLEN(h_input);
373 if (len >= DUK_HBUFFER_MAX_BYTELEN / 3) {
374 DUK_ERROR_TYPE(thr, DUK_STR_RESULT_TOO_LONG);
375 DUK_WO_NORETURN(return 0;);
376 }
377 }
378
379 /* Allowance is 3*len because all bytes can potentially be replaced with
380 * U+FFFD -- which rather inconveniently encodes to 3 bytes in UTF-8.
381 * Rely on dynamic buffer data pointer stability: no other code has
382 * access to the data pointer.
383 *
384 * XXX: The buffer allocation strategy used here is rather inefficient.
385 * Maybe switch to a chunk-based strategy, or preprocess the string to
386 * figure out the space needed ahead of time?
387 */
388 DUK_ASSERT(3 * len >= len);
389 output = (duk_uint8_t *) duk_push_dynamic_buffer(thr, 3 * len);
390
391 if (len > 0) {
392 DUK_ASSERT(duk_is_string(thr, 0)); /* True if len > 0. */
393
394 /* XXX: duk_decode_string() is used to process the input
395 * string. For standard ECMAScript strings, represented
396 * internally as CESU-8, this is fine. However, behavior
397 * beyond CESU-8 is not very strict: codepoints using an
398 * extended form of UTF-8 are also accepted, and invalid
399 * codepoint sequences (which are allowed in Duktape strings)
400 * are not handled as well as they could (e.g. invalid
401 * continuation bytes may mask following codepoints).
402 * This is how ECMAScript code would also see such strings.
403 * Maybe replace duk_decode_string() with an explicit strict
404 * CESU-8 decoder here?
405 */
406 enc_ctx.lead = 0x0000L;
407 enc_ctx.out = output;
408 duk_decode_string(thr, 0, duk__utf8_encode_char, (void *) &enc_ctx);
409 if (enc_ctx.lead != 0x0000L) {
410 /* unpaired high surrogate at end of string */
411 enc_ctx.out = duk__utf8_emit_repl(enc_ctx.out);
412 DUK_ASSERT(enc_ctx.out <= output + (3 * len));
413 }
414
415 /* The output buffer is usually very much oversized, so shrink it to
416 * actually needed size. Pointer stability assumed up to this point.
417 */
418 DUK_ASSERT_TOP(thr, 2);
419 DUK_ASSERT(output == (duk_uint8_t *) duk_get_buffer_data(thr, -1, NULL));
420
421 final_len = (duk_size_t) (enc_ctx.out - output);
422 duk_resize_buffer(thr, -1, final_len);
423 /* 'output' and 'enc_ctx.out' are potentially invalidated by the resize. */
424 } else {
425 final_len = 0;
426 }
427
428 /* Standard WHATWG output is a Uint8Array. Here the Uint8Array will
429 * be backed by a dynamic buffer which differs from e.g. Uint8Arrays
430 * created as 'new Uint8Array(N)'. ECMAScript code won't see the
431 * difference but C code will. When bufferobjects are not supported,
432 * returns a plain dynamic buffer.
433 */
434 #if defined(DUK_USE_BUFFEROBJECT_SUPPORT)
435 duk_push_buffer_object(thr, -1, 0, final_len, DUK_BUFOBJ_UINT8ARRAY);
436 #endif
437 return 1;
438 }
439
duk_bi_textdecoder_constructor(duk_hthread * thr)440 DUK_INTERNAL duk_ret_t duk_bi_textdecoder_constructor(duk_hthread *thr) {
441 duk__decode_context *dec_ctx;
442 duk_bool_t fatal = 0;
443 duk_bool_t ignore_bom = 0;
444
445 DUK_ASSERT_TOP(thr, 2);
446 duk_require_constructor_call(thr);
447 if (!duk_is_undefined(thr, 0)) {
448 /* XXX: For now ignore 'label' (encoding identifier). */
449 duk_to_string(thr, 0);
450 }
451 if (!duk_is_null_or_undefined(thr, 1)) {
452 if (duk_get_prop_literal(thr, 1, "fatal")) {
453 fatal = duk_to_boolean(thr, -1);
454 }
455 if (duk_get_prop_literal(thr, 1, "ignoreBOM")) {
456 ignore_bom = duk_to_boolean(thr, -1);
457 }
458 }
459
460 duk_push_this(thr);
461
462 /* The decode context is not assumed to be zeroed; all fields are
463 * initialized explicitly.
464 */
465 dec_ctx = (duk__decode_context *) duk_push_fixed_buffer(thr, sizeof(duk__decode_context));
466 dec_ctx->fatal = (duk_uint8_t) fatal;
467 dec_ctx->ignore_bom = (duk_uint8_t) ignore_bom;
468 duk__utf8_decode_init(dec_ctx); /* Initializes remaining fields. */
469
470 duk_put_prop_literal(thr, -2, DUK_INTERNAL_SYMBOL("Context"));
471 return 0;
472 }
473
474 /* Get TextDecoder context from 'this'; leaves garbage on stack. */
duk__get_textdecoder_context(duk_hthread * thr)475 DUK_LOCAL duk__decode_context *duk__get_textdecoder_context(duk_hthread *thr) {
476 duk__decode_context *dec_ctx;
477 duk_push_this(thr);
478 duk_get_prop_literal(thr, -1, DUK_INTERNAL_SYMBOL("Context"));
479 dec_ctx = (duk__decode_context *) duk_require_buffer(thr, -1, NULL);
480 DUK_ASSERT(dec_ctx != NULL);
481 return dec_ctx;
482 }
483
duk_bi_textdecoder_prototype_shared_getter(duk_hthread * thr)484 DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_shared_getter(duk_hthread *thr) {
485 duk__decode_context *dec_ctx;
486 duk_int_t magic;
487
488 dec_ctx = duk__get_textdecoder_context(thr);
489 magic = duk_get_current_magic(thr);
490 switch (magic) {
491 case 0:
492 /* Encoding is now fixed, so _Context lookup is only needed to
493 * validate the 'this' binding (TypeError if not TextDecoder-like).
494 */
495 duk_push_literal(thr, "utf-8");
496 break;
497 case 1:
498 duk_push_boolean(thr, dec_ctx->fatal);
499 break;
500 default:
501 duk_push_boolean(thr, dec_ctx->ignore_bom);
502 break;
503 }
504
505 return 1;
506 }
507
duk_bi_textdecoder_prototype_decode(duk_hthread * thr)508 DUK_INTERNAL duk_ret_t duk_bi_textdecoder_prototype_decode(duk_hthread *thr) {
509 duk__decode_context *dec_ctx;
510
511 dec_ctx = duk__get_textdecoder_context(thr);
512 return duk__decode_helper(thr, dec_ctx);
513 }
514 #endif /* DUK_USE_ENCODING_BUILTINS */
515
516 /*
517 * Internal helper for Node.js Buffer
518 */
519
520 /* Internal helper used for Node.js Buffer .toString(). Value stack convention
521 * is currently odd: it mimics TextDecoder .decode() so that argument must be at
522 * index 0, and decode options (not present for Buffer) at index 1. Return value
523 * is a Duktape/C function return value.
524 */
duk_textdecoder_decode_utf8_nodejs(duk_hthread * thr)525 DUK_INTERNAL duk_ret_t duk_textdecoder_decode_utf8_nodejs(duk_hthread *thr) {
526 duk__decode_context dec_ctx;
527
528 dec_ctx.fatal = 0; /* use replacement chars */
529 dec_ctx.ignore_bom = 1; /* ignore BOMs (matches Node.js Buffer .toString()) */
530 duk__utf8_decode_init(&dec_ctx);
531
532 return duk__decode_helper(thr, &dec_ctx);
533 }
534