1 #ifndef ENCODE_H
2 #define ENCODE_H
3
4 #ifndef H_PERL
5 /* check whether we're "in perl" so that we can do data parts without
6 getting extern references to the code parts
7 */
8 typedef unsigned char U8;
9 #endif
10
11 typedef struct encpage_s encpage_t;
12
13 struct encpage_s
14 {
15 /* fields ordered to pack nicely on 32-bit machines */
16 const U8 *const seq; /* Packed output sequences we generate
17 if we match */
18 const encpage_t *const next; /* Page to go to if we match */
19 const U8 min; /* Min value of octet to match this entry */
20 const U8 max; /* Max value of octet to match this entry */
21 const U8 dlen; /* destination length -
22 size of entries in seq */
23 const U8 slen; /* source length -
24 number of source octets needed */
25 };
26
27 /*
28 At any point in a translation there is a page pointer which points
29 at an array of the above structures.
30
31 Basic operation :
32 get octet from source stream.
33 if (octet >= min && octet < max) {
34 if slen is 0 then we cannot represent this character.
35 if we have less than slen octets (including this one) then
36 we have a partial character.
37 otherwise
38 copy dlen octets from seq + dlen*(octet-min) to output
39 (dlen may be zero if we don't know yet.)
40 load page pointer with next to continue.
41 (is slen is one this is end of a character)
42 get next octet.
43 }
44 else {
45 increment the page pointer to look at next slot in the array
46 }
47
48 arrays SHALL be constructed so there is an entry which matches
49 ..0xFF at the end, and either maps it or indicates no
50 representation.
51
52 if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
53
54 */
55
56
57 typedef struct encode_s encode_t;
58 struct encode_s
59 {
60 const encpage_t *const t_utf8; /* Starting table for translation from
61 the encoding to UTF-8 form */
62 const encpage_t *const f_utf8; /* Starting table for translation
63 from UTF-8 to the encoding */
64 const U8 *const rep; /* Replacement character in this
65 encoding e.g. "?" */
66 int replen; /* Number of octets in rep */
67 U8 min_el; /* Minimum octets to represent a
68 character */
69 U8 max_el; /* Maximum octets to represent a
70 character */
71 const char *const name[2]; /* name(s) of this encoding */
72 };
73
74 #ifdef H_PERL
75 /* See comment at top of file for deviousness */
76
77 extern int do_encode(const encpage_t *enc, const U8 *src, STRLEN *slen,
78 U8 *dst, STRLEN dlen, STRLEN *dout, int approx,
79 const U8 *term, STRLEN tlen);
80
81 extern void Encode_DefineEncoding(encode_t *enc);
82
83 #endif /* H_PERL */
84
85 #define ENCODE_NOSPACE 1
86 #define ENCODE_PARTIAL 2
87 #define ENCODE_NOREP 3
88 #define ENCODE_FALLBACK 4
89 #define ENCODE_FOUND_TERM 5
90
91 /* Use the perl core value if available; it is portable to EBCDIC */
92 #ifdef REPLACEMENT_CHARACTER_UTF8
93 # define FBCHAR_UTF8 REPLACEMENT_CHARACTER_UTF8
94 #else
95 # define FBCHAR_UTF8 "\xEF\xBF\xBD"
96 #endif
97
98 #define ENCODE_DIE_ON_ERR 0x0001 /* croaks immediately */
99 #define ENCODE_WARN_ON_ERR 0x0002 /* warn on error; may proceed */
100 #define ENCODE_RETURN_ON_ERR 0x0004 /* immediately returns on NOREP */
101 #define ENCODE_LEAVE_SRC 0x0008 /* $src updated unless set */
102 #define ENCODE_ONLY_PRAGMA_WARNINGS 0x0010 /* when enabled report only warnings configured by pragma warnings, otherwise report all warnings; no effect without ENCODE_WARN_ON_ERR */
103 #define ENCODE_PERLQQ 0x0100 /* perlqq fallback string */
104 #define ENCODE_HTMLCREF 0x0200 /* HTML character ref. fb mode */
105 #define ENCODE_XMLCREF 0x0400 /* XML character ref. fb mode */
106 #define ENCODE_STOP_AT_PARTIAL 0x0800 /* stop at partial explicitly */
107
108 #define ENCODE_FB_DEFAULT 0x0000
109 #define ENCODE_FB_CROAK 0x0001
110 #define ENCODE_FB_QUIET ENCODE_RETURN_ON_ERR
111 #define ENCODE_FB_WARN (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR)
112 #define ENCODE_FB_PERLQQ (ENCODE_PERLQQ|ENCODE_LEAVE_SRC)
113 #define ENCODE_FB_HTMLCREF (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
114 #define ENCODE_FB_XMLCREF (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
115
116 #define encode_ckWARN(c, w) ((c & ENCODE_WARN_ON_ERR) \
117 && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || ckWARN(w)))
118
119 #ifdef UTF8SKIP
120 # ifdef EBCDIC /* The value on early perls is wrong */
121 # undef UTF8_MAXBYTES
122 # define UTF8_MAXBYTES 14
123 # endif
124 # ifndef UNLIKELY
125 # define UNLIKELY(x) (x)
126 # endif
127 # ifndef LIKELY
128 # define LIKELY(x) (x)
129 # endif
130
131 /* EBCDIC requires a later perl to work, so the next two definitions are for
132 * ASCII machines only */
133 # ifndef NATIVE_UTF8_TO_I8
134 # define NATIVE_UTF8_TO_I8(x) (x)
135 # endif
136 # ifndef I8_TO_NATIVE_UTF8
137 # define I8_TO_NATIVE_UTF8(x) (x)
138 # endif
139 # ifndef OFFUNISKIP
140 # define OFFUNISKIP(x) UNISKIP(x)
141 # endif
142 # ifndef uvoffuni_to_utf8_flags
143 # define uvoffuni_to_utf8_flags(a,b,c) uvuni_to_utf8_flags(a,b,c)
144 # endif
145 # ifndef WARN_SURROGATE /* Use the overarching category if these
146 subcategories are missing */
147 # define WARN_SURROGATE WARN_UTF8
148 # define WARN_NONCHAR WARN_UTF8
149 # define WARN_NON_UNICODE WARN_UTF8
150 /* If there's only one possible category, then packing is a no-op */
151 # define encode_ckWARN_packed(c, w) encode_ckWARN(c, w)
152 # else
153 # define encode_ckWARN_packed(c, w) \
154 ((c & ENCODE_WARN_ON_ERR) \
155 && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || Perl_ckwarn(aTHX_ w)))
156 # endif
157
158 /* All these formats take a single UV code point argument */
159 static const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
160 static const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
161 " is not recommended for open interchange";
162 static const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
163 " may not be portable";
164
165 /* If the perl doesn't have the 5.28 functions, this file includes
166 * stripped-down versions of them but containing enough functionality to be
167 * suitable for Encode's needs. Many of the comments have been removed. But
168 * you can inspect the 5.28 source if you get stuck.
169 *
170 * These could be put in Devel::PPPort, but Encode is likely the only user */
171
172 #if (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS)) \
173 && (! defined(utf8n_to_uvchr_msgs) && ! defined(uvchr_to_utf8_flags_msgs))
174
175 # ifndef hv_stores
176 # define hv_stores(hv, key, val) hv_store((hv), ("" key ""), (sizeof(key)-1), (val), 0)
177 # endif
178
179 static HV *
S_new_msg_hv(const char * const message,U32 categories)180 S_new_msg_hv(const char * const message, /* The message text */
181 U32 categories) /* Packed warning categories */
182 {
183 /* Creates, populates, and returns an HV* that describes an error message
184 * for the translators between UTF8 and code point */
185
186 dTHX;
187 SV* msg_sv = newSVpv(message, 0);
188 SV* category_sv = newSVuv(categories);
189
190 HV* msg_hv = newHV();
191
192 (void) hv_stores(msg_hv, "text", msg_sv);
193 (void) hv_stores(msg_hv, "warn_categories", category_sv);
194
195 return msg_hv;
196 }
197
198 #endif
199
200 #if ! defined(utf8n_to_uvchr_msgs) \
201 && (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))
202
203 # undef utf8n_to_uvchr /* Don't use an earlier version: use the version
204 defined in this file */
205 # define utf8n_to_uvchr(a,b,c,d) utf8n_to_uvchr_msgs(a, b, c, d, 0, NULL)
206
207 # undef UTF8_IS_START /* Early perls wrongly accepted C0 and C1 */
208 # define UTF8_IS_START(c) (((U8)(c)) >= 0xc2)
209 # ifndef isUTF8_POSSIBLY_PROBLEMATIC
210 # ifdef EBCDIC
211 # define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c > ' ')
212 # else
213 # define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
214 # endif
215 # endif
216 # ifndef UTF8_ALLOW_OVERFLOW
217 # define UTF8_ALLOW_OVERFLOW (1U<<31) /* Choose highest bit to avoid
218 potential conflicts */
219 # define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
220 # endif
221 # undef UTF8_ALLOW_ANY /* Early perl definitions don't work properly with
222 the code in this file */
223 # define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
224 |UTF8_ALLOW_NON_CONTINUATION \
225 |UTF8_ALLOW_SHORT \
226 |UTF8_ALLOW_LONG \
227 |UTF8_ALLOW_OVERFLOW)
228
229 /* The meanings of these were complemented at some point, but the functions
230 * bundled in this file use the complemented meanings */
231 # ifndef UTF8_DISALLOW_SURROGATE
232 # define UTF8_DISALLOW_SURROGATE UTF8_ALLOW_SURROGATE
233 # define UTF8_DISALLOW_NONCHAR UTF8_ALLOW_FFFF
234 # define UTF8_DISALLOW_SUPER UTF8_ALLOW_FE_FF
235
236 /* In the stripped-down implementation in this file, disallowing is not
237 * independent of warning */
238 # define UTF8_WARN_SURROGATE UTF8_DISALLOW_SURROGATE
239 # define UTF8_WARN_NONCHAR UTF8_DISALLOW_NONCHAR
240 # define UTF8_WARN_SUPER UTF8_DISALLOW_SUPER
241 # endif
242 # ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
243 # define UTF8_DISALLOW_ILLEGAL_INTERCHANGE \
244 (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_NONCHAR)
245 # endif
246 # ifndef UTF8_WARN_ILLEGAL_INTERCHANGE
247 # define UTF8_WARN_ILLEGAL_INTERCHANGE \
248 (UTF8_WARN_SUPER|UTF8_WARN_SURROGATE|UTF8_WARN_NONCHAR)
249 # endif
250 # ifndef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
251 # ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
252 # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
253 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
254
255 # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
256 && ((s1) & 0xFE ) == 0xB6)
257 # else
258 # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
259 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
260 # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
261 # endif
262 # ifndef HIGHEST_REPRESENTABLE_UTF8
263 # if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
264 # ifdef EBCDIC /* Actually is I8 */
265 # define HIGHEST_REPRESENTABLE_UTF8 \
266 "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
267 # else
268 # define HIGHEST_REPRESENTABLE_UTF8 \
269 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
270 # endif
271 # endif
272 # endif
273 # endif
274
275 # ifndef Newx
276 # define Newx(v,n,t) New(0,v,n,t)
277 # endif
278
279 # ifndef PERL_UNUSED_ARG
280 # define PERL_UNUSED_ARG(x) ((void)x)
281 # endif
282
283 # ifndef memGT
284 # define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
285 # endif
286
287 # ifndef MIN
288 # define MIN(a,b) ((a) < (b) ? (a) : (b))
289 # endif
290
291 static const char malformed_text[] = "Malformed UTF-8 character";
292
293 static char *
_byte_dump_string(const U8 * const start,const STRLEN len)294 _byte_dump_string(const U8 * const start, const STRLEN len)
295 {
296 /* Returns a mortalized C string that is a displayable copy of the 'len' */
297
298 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
299 trailing NUL */
300 const U8 * s = start;
301 const U8 * const e = start + len;
302 char * output;
303 char * d;
304 dTHX;
305
306 Newx(output, output_len, char);
307 SAVEFREEPV(output);
308
309 d = output;
310 for (s = start; s < e; s++) {
311 const unsigned high_nibble = (*s & 0xF0) >> 4;
312 const unsigned low_nibble = (*s & 0x0F);
313
314 *d++ = '\\';
315 *d++ = 'x';
316
317 if (high_nibble < 10) {
318 *d++ = high_nibble + '0';
319 }
320 else {
321 *d++ = high_nibble - 10 + 'a';
322 }
323
324 if (low_nibble < 10) {
325 *d++ = low_nibble + '0';
326 }
327 else {
328 *d++ = low_nibble - 10 + 'a';
329 }
330 }
331
332 *d = '\0';
333 return output;
334 }
335
336 static char *
S_unexpected_non_continuation_text(const U8 * const s,STRLEN print_len,const STRLEN non_cont_byte_pos,const STRLEN expect_len)337 S_unexpected_non_continuation_text(const U8 * const s,
338
339 /* Max number of bytes to print */
340 STRLEN print_len,
341
342 /* Which one is the non-continuation */
343 const STRLEN non_cont_byte_pos,
344
345 /* How many bytes should there be? */
346 const STRLEN expect_len)
347 {
348 /* Return the malformation warning text for an unexpected continuation
349 * byte. */
350
351 dTHX;
352 const char * const where = (non_cont_byte_pos == 1)
353 ? "immediately"
354 : Perl_form(aTHX_ "%d bytes",
355 (int) non_cont_byte_pos);
356 const U8 * x = s + non_cont_byte_pos;
357 const U8 * e = s + print_len;
358
359 /* We don't need to pass this parameter, but since it has already been
360 * calculated, it's likely faster to pass it; verify under DEBUGGING */
361 assert(expect_len == UTF8SKIP(s));
362
363 /* As a defensive coding measure, don't output anything past a NUL. Such
364 * bytes shouldn't be in the middle of a malformation, and could mark the
365 * end of the allocated string, and what comes after is undefined */
366 for (; x < e; x++) {
367 if (*x == '\0') {
368 x++; /* Output this particular NUL */
369 break;
370 }
371 }
372
373 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
374 " %s after start byte 0x%02x; need %d bytes, got %d)",
375 malformed_text,
376 _byte_dump_string(s, x - s),
377 *(s + non_cont_byte_pos),
378 where,
379 *s,
380 (int) expect_len,
381 (int) non_cont_byte_pos);
382 }
383
384 static int
385 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len);
386
387 static int
S_does_utf8_overflow(const U8 * const s,const U8 * e,const bool consider_overlongs)388 S_does_utf8_overflow(const U8 * const s,
389 const U8 * e,
390 const bool consider_overlongs)
391 {
392 /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
393 * 'e' - 1 would overflow an IV on this platform. */
394
395 # if ! defined(UV_IS_QUAD)
396
397 const STRLEN len = e - s;
398 int is_overlong;
399
400 assert(s <= e && s + UTF8SKIP(s) >= e);
401 assert(! UTF8_IS_INVARIANT(*s) && e > s);
402
403 # ifdef EBCDIC
404
405 PERL_UNUSED_ARG(consider_overlongs);
406
407 if (*s != 0xFE) {
408 return 0;
409 }
410
411 if (len == 1) {
412 return -1;
413 }
414
415 # else
416
417 if (LIKELY(*s < 0xFE)) {
418 return 0;
419 }
420
421 if (! consider_overlongs) {
422 return 1;
423 }
424
425 if (len == 1) {
426 return -1;
427 }
428
429 is_overlong = S_is_utf8_overlong_given_start_byte_ok(s, len);
430
431 if (is_overlong == 0) {
432 return 1;
433 }
434
435 if (is_overlong < 0) {
436 return -1;
437 }
438
439 if (*s == 0xFE) {
440 return 0;
441 }
442
443 # endif
444
445 /* Here, ASCII and EBCDIC rejoin:
446 * On ASCII: We have an overlong sequence starting with FF
447 * On EBCDIC: We have a sequence starting with FE. */
448
449 { /* For C89, use a block so the declaration can be close to its use */
450
451 # ifdef EBCDIC
452 const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
453 # else
454 const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
455 # endif
456 const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
457 const STRLEN cmp_len = MIN(conts_len, len - 1);
458
459 if (cmp_len >= conts_len || memNE(s + 1,
460 conts_for_highest_30_bit,
461 cmp_len))
462 {
463 return memGT(s + 1, conts_for_highest_30_bit, cmp_len);
464 }
465
466 return -1;
467 }
468
469 # else /* Below is 64-bit word */
470
471 PERL_UNUSED_ARG(consider_overlongs);
472
473 {
474 const STRLEN len = e - s;
475 const U8 *x;
476 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
477
478 for (x = s; x < e; x++, y++) {
479
480 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
481 continue;
482 }
483 return NATIVE_UTF8_TO_I8(*x) > *y;
484 }
485
486 if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
487 return -1;
488 }
489
490 return 0;
491 }
492
493 # endif
494
495 }
496
497 static int
498 S_isFF_OVERLONG(const U8 * const s, const STRLEN len);
499
500 static int
S_is_utf8_overlong_given_start_byte_ok(const U8 * const s,const STRLEN len)501 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
502 {
503 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
504 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
505
506 assert(len > 1 && UTF8_IS_START(*s));
507
508 # ifdef EBCDIC
509 # define F0_ABOVE_OVERLONG 0xB0
510 # define F8_ABOVE_OVERLONG 0xA8
511 # define FC_ABOVE_OVERLONG 0xA4
512 # define FE_ABOVE_OVERLONG 0xA2
513 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
514 # else
515
516 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
517 return 1;
518 }
519
520 # define F0_ABOVE_OVERLONG 0x90
521 # define F8_ABOVE_OVERLONG 0x88
522 # define FC_ABOVE_OVERLONG 0x84
523 # define FE_ABOVE_OVERLONG 0x82
524 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
525 # endif
526
527 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
528 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
529 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
530 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
531 {
532 return 1;
533 }
534
535 /* Check for the FF overlong */
536 return S_isFF_OVERLONG(s, len);
537 }
538
539 int
S_isFF_OVERLONG(const U8 * const s,const STRLEN len)540 S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
541 {
542 if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
543 MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
544 {
545 return 0;
546 }
547
548 if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
549 return 1;
550 }
551
552 return -1;
553 }
554
555 # ifndef UTF8_GOT_CONTINUATION
556 # define UTF8_GOT_CONTINUATION UTF8_ALLOW_CONTINUATION
557 # define UTF8_GOT_EMPTY UTF8_ALLOW_EMPTY
558 # define UTF8_GOT_LONG UTF8_ALLOW_LONG
559 # define UTF8_GOT_NON_CONTINUATION UTF8_ALLOW_NON_CONTINUATION
560 # define UTF8_GOT_SHORT UTF8_ALLOW_SHORT
561 # define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
562 # define UTF8_GOT_NONCHAR UTF8_DISALLOW_NONCHAR
563 # define UTF8_GOT_SUPER UTF8_DISALLOW_SUPER
564 # endif
565
566 # ifndef UNICODE_IS_SUPER
567 # define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
568 # endif
569 # ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
570 # define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
571 && (UV) (uv) <= 0xFDEF)
572 # endif
573 # ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
574 # define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
575 (((UV) (uv) & 0xFFFE) == 0xFFFE)
576 # endif
577 # ifndef is_NONCHAR_utf8_safe
578 # define is_NONCHAR_utf8_safe(s,e) /*** GENERATED CODE ***/ \
579 ( ( ( LIKELY((e) > (s)) ) && ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ) ? ( ( 0xEF == ((const U8*)s)[0] ) ?\
580 ( ( 0xB7 == ((const U8*)s)[1] ) ? \
581 ( ( 0x90 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xAF ) ? 3 : 0 )\
582 : ( ( 0xBF == ((const U8*)s)[1] ) && ( ( ((const U8*)s)[2] & 0xFE ) == 0xBE ) ) ? 3 : 0 )\
583 : ( 0xF0 == ((const U8*)s)[0] ) ? \
584 ( ( ( ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
585 : ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ? \
586 ( ( ( ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
587 : ( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( 0x8F == ((const U8*)s)[1] ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 ) : 0 )
588 # endif
589
590 # ifndef UTF8_IS_NONCHAR
591 # define UTF8_IS_NONCHAR(s, e) (is_NONCHAR_utf8_safe(s,e) > 0)
592 # endif
593 # ifndef UNICODE_IS_NONCHAR
594 # define UNICODE_IS_NONCHAR(uv) \
595 ( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
596 || ( LIKELY( ! UNICODE_IS_SUPER(uv)) \
597 && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
598 # endif
599
600 # ifndef UTF8_MAXBYTES
601 # define UTF8_MAXBYTES UTF8_MAXLEN
602 # endif
603
604 static UV
utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)605 utf8n_to_uvchr_msgs(const U8 *s,
606 STRLEN curlen,
607 STRLEN *retlen,
608 const U32 flags,
609 U32 * errors,
610 AV ** msgs)
611 {
612 const U8 * const s0 = s;
613 const U8 * send = NULL;
614 U32 possible_problems = 0;
615 UV uv = *s;
616 STRLEN expectlen = 0;
617 U8 * adjusted_s0 = (U8 *) s0;
618 U8 temp_char_buf[UTF8_MAXBYTES + 1];
619 UV uv_so_far = 0;
620 dTHX;
621
622 assert(errors == NULL); /* This functionality has been stripped */
623
624 if (UNLIKELY(curlen == 0)) {
625 possible_problems |= UTF8_GOT_EMPTY;
626 curlen = 0;
627 uv = UNICODE_REPLACEMENT;
628 goto ready_to_handle_errors;
629 }
630
631 expectlen = UTF8SKIP(s);
632
633 if (retlen) {
634 *retlen = expectlen;
635 }
636
637 if (UTF8_IS_INVARIANT(uv)) {
638 return uv;
639 }
640
641 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
642 possible_problems |= UTF8_GOT_CONTINUATION;
643 curlen = 1;
644 uv = UNICODE_REPLACEMENT;
645 goto ready_to_handle_errors;
646 }
647
648 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
649
650 send = (U8*) s0;
651 if (UNLIKELY(curlen < expectlen)) {
652 possible_problems |= UTF8_GOT_SHORT;
653 send += curlen;
654 }
655 else {
656 send += expectlen;
657 }
658
659 for (s = s0 + 1; s < send; s++) {
660 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
661 uv = UTF8_ACCUMULATE(uv, *s);
662 continue;
663 }
664
665 possible_problems |= UTF8_GOT_NON_CONTINUATION;
666 break;
667 } /* End of loop through the character's bytes */
668
669 curlen = s - s0;
670
671 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
672
673 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
674 uv_so_far = uv;
675 uv = UNICODE_REPLACEMENT;
676 }
677
678 if (UNLIKELY(0 < S_does_utf8_overflow(s0, s, 1))) {
679 possible_problems |= UTF8_GOT_OVERFLOW;
680 uv = UNICODE_REPLACEMENT;
681 }
682
683 if ( ( LIKELY(! possible_problems)
684 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
685 || ( UNLIKELY(possible_problems)
686 && ( UNLIKELY(! UTF8_IS_START(*s0))
687 || ( curlen > 1
688 && UNLIKELY(0 < S_is_utf8_overlong_given_start_byte_ok(s0,
689 s - s0))))))
690 {
691 possible_problems |= UTF8_GOT_LONG;
692
693 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
694 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
695 {
696 UV min_uv = uv_so_far;
697 STRLEN i;
698
699 for (i = curlen; i < expectlen; i++) {
700 min_uv = UTF8_ACCUMULATE(min_uv,
701 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
702 }
703
704 adjusted_s0 = temp_char_buf;
705 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
706 }
707 }
708
709 /* Here, we have found all the possible problems, except for when the input
710 * is for a problematic code point not allowed by the input parameters. */
711
712 /* uv is valid for overlongs */
713 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
714 && uv >= UNICODE_SURROGATE_FIRST)
715 || ( UNLIKELY(possible_problems)
716 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
717 && ((flags & ( UTF8_DISALLOW_NONCHAR
718 |UTF8_DISALLOW_SURROGATE
719 |UTF8_DISALLOW_SUPER))))
720 {
721 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
722 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
723 possible_problems |= UTF8_GOT_SURROGATE;
724 }
725 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
726 possible_problems |= UTF8_GOT_SUPER;
727 }
728 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
729 possible_problems |= UTF8_GOT_NONCHAR;
730 }
731 }
732 else {
733 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
734 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
735 {
736 possible_problems |= UTF8_GOT_SUPER;
737 }
738 else if (curlen > 1) {
739 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
740 NATIVE_UTF8_TO_I8(*adjusted_s0),
741 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
742 {
743 possible_problems |= UTF8_GOT_SUPER;
744 }
745 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
746 NATIVE_UTF8_TO_I8(*adjusted_s0),
747 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
748 {
749 possible_problems |= UTF8_GOT_SURROGATE;
750 }
751 }
752 }
753 }
754
755 ready_to_handle_errors:
756
757 if (UNLIKELY(possible_problems)) {
758 bool disallowed = FALSE;
759 const U32 orig_problems = possible_problems;
760
761 if (msgs) {
762 *msgs = NULL;
763 }
764
765 while (possible_problems) { /* Handle each possible problem */
766 UV pack_warn = 0;
767 char * message = NULL;
768 U32 this_flag_bit = 0;
769
770 /* Each 'if' clause handles one problem. They are ordered so that
771 * the first ones' messages will be displayed before the later
772 * ones; this is kinda in decreasing severity order. But the
773 * overlong must come last, as it changes 'uv' looked at by the
774 * others */
775 if (possible_problems & UTF8_GOT_OVERFLOW) {
776
777 /* Overflow means also got a super; we handle both here */
778 possible_problems
779 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER);
780
781 /* Disallow if any of the categories say to */
782 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
783 || (flags & UTF8_DISALLOW_SUPER))
784 {
785 disallowed = TRUE;
786 }
787
788 /* Likewise, warn if any say to */
789 if ( ! (flags & UTF8_ALLOW_OVERFLOW)) {
790
791 /* The warnings code explicitly says it doesn't handle the
792 * case of packWARN2 and two categories which have
793 * parent-child relationship. Even if it works now to
794 * raise the warning if either is enabled, it wouldn't
795 * necessarily do so in the future. We output (only) the
796 * most dire warning */
797 if (! (flags & UTF8_CHECK_ONLY)) {
798 if (msgs || ckWARN_d(WARN_UTF8)) {
799 pack_warn = packWARN(WARN_UTF8);
800 }
801 else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
802 pack_warn = packWARN(WARN_NON_UNICODE);
803 }
804 if (pack_warn) {
805 message = Perl_form(aTHX_ "%s: %s (overflows)",
806 malformed_text,
807 _byte_dump_string(s0, curlen));
808 this_flag_bit = UTF8_GOT_OVERFLOW;
809 }
810 }
811 }
812 }
813 else if (possible_problems & UTF8_GOT_EMPTY) {
814 possible_problems &= ~UTF8_GOT_EMPTY;
815
816 if (! (flags & UTF8_ALLOW_EMPTY)) {
817 disallowed = TRUE;
818 if ( (msgs
819 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
820 {
821 pack_warn = packWARN(WARN_UTF8);
822 message = Perl_form(aTHX_ "%s (empty string)",
823 malformed_text);
824 this_flag_bit = UTF8_GOT_EMPTY;
825 }
826 }
827 }
828 else if (possible_problems & UTF8_GOT_CONTINUATION) {
829 possible_problems &= ~UTF8_GOT_CONTINUATION;
830
831 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
832 disallowed = TRUE;
833 if (( msgs
834 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
835 {
836 pack_warn = packWARN(WARN_UTF8);
837 message = Perl_form(aTHX_
838 "%s: %s (unexpected continuation byte 0x%02x,"
839 " with no preceding start byte)",
840 malformed_text,
841 _byte_dump_string(s0, 1), *s0);
842 this_flag_bit = UTF8_GOT_CONTINUATION;
843 }
844 }
845 }
846 else if (possible_problems & UTF8_GOT_SHORT) {
847 possible_problems &= ~UTF8_GOT_SHORT;
848
849 if (! (flags & UTF8_ALLOW_SHORT)) {
850 disallowed = TRUE;
851 if (( msgs
852 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
853 {
854 pack_warn = packWARN(WARN_UTF8);
855 message = Perl_form(aTHX_
856 "%s: %s (too short; %d byte%s available, need %d)",
857 malformed_text,
858 _byte_dump_string(s0, send - s0),
859 (int)curlen,
860 curlen == 1 ? "" : "s",
861 (int)expectlen);
862 this_flag_bit = UTF8_GOT_SHORT;
863 }
864 }
865
866 }
867 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
868 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
869
870 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
871 disallowed = TRUE;
872 if (( msgs
873 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
874 {
875 int printlen = s - s0;
876 pack_warn = packWARN(WARN_UTF8);
877 message = Perl_form(aTHX_ "%s",
878 S_unexpected_non_continuation_text(s0,
879 printlen,
880 s - s0,
881 (int) expectlen));
882 this_flag_bit = UTF8_GOT_NON_CONTINUATION;
883 }
884 }
885 }
886 else if (possible_problems & UTF8_GOT_SURROGATE) {
887 possible_problems &= ~UTF8_GOT_SURROGATE;
888
889 if (flags & UTF8_WARN_SURROGATE) {
890
891 if ( ! (flags & UTF8_CHECK_ONLY)
892 && (msgs || ckWARN_d(WARN_SURROGATE)))
893 {
894 pack_warn = packWARN(WARN_SURROGATE);
895
896 /* These are the only errors that can occur with a
897 * surrogate when the 'uv' isn't valid */
898 if (orig_problems & UTF8_GOT_TOO_SHORT) {
899 message = Perl_form(aTHX_
900 "UTF-16 surrogate (any UTF-8 sequence that"
901 " starts with \"%s\" is for a surrogate)",
902 _byte_dump_string(s0, curlen));
903 }
904 else {
905 message = Perl_form(aTHX_ surrogate_cp_format, uv);
906 }
907 this_flag_bit = UTF8_GOT_SURROGATE;
908 }
909 }
910
911 if (flags & UTF8_DISALLOW_SURROGATE) {
912 disallowed = TRUE;
913 }
914 }
915 else if (possible_problems & UTF8_GOT_SUPER) {
916 possible_problems &= ~UTF8_GOT_SUPER;
917
918 if (flags & UTF8_WARN_SUPER) {
919
920 if ( ! (flags & UTF8_CHECK_ONLY)
921 && (msgs || ckWARN_d(WARN_NON_UNICODE)))
922 {
923 pack_warn = packWARN(WARN_NON_UNICODE);
924
925 if (orig_problems & UTF8_GOT_TOO_SHORT) {
926 message = Perl_form(aTHX_
927 "Any UTF-8 sequence that starts with"
928 " \"%s\" is for a non-Unicode code point,"
929 " may not be portable",
930 _byte_dump_string(s0, curlen));
931 }
932 else {
933 message = Perl_form(aTHX_ super_cp_format, uv);
934 }
935 this_flag_bit = UTF8_GOT_SUPER;
936 }
937 }
938
939 if (flags & UTF8_DISALLOW_SUPER) {
940 disallowed = TRUE;
941 }
942 }
943 else if (possible_problems & UTF8_GOT_NONCHAR) {
944 possible_problems &= ~UTF8_GOT_NONCHAR;
945
946 if (flags & UTF8_WARN_NONCHAR) {
947
948 if ( ! (flags & UTF8_CHECK_ONLY)
949 && (msgs || ckWARN_d(WARN_NONCHAR)))
950 {
951 /* The code above should have guaranteed that we don't
952 * get here with errors other than overlong */
953 assert (! (orig_problems
954 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
955
956 pack_warn = packWARN(WARN_NONCHAR);
957 message = Perl_form(aTHX_ nonchar_cp_format, uv);
958 this_flag_bit = UTF8_GOT_NONCHAR;
959 }
960 }
961
962 if (flags & UTF8_DISALLOW_NONCHAR) {
963 disallowed = TRUE;
964 }
965 }
966 else if (possible_problems & UTF8_GOT_LONG) {
967 possible_problems &= ~UTF8_GOT_LONG;
968
969 if (flags & UTF8_ALLOW_LONG) {
970 uv = UNICODE_REPLACEMENT;
971 }
972 else {
973 disallowed = TRUE;
974
975 if (( msgs
976 || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
977 {
978 pack_warn = packWARN(WARN_UTF8);
979
980 /* These error types cause 'uv' to be something that
981 * isn't what was intended, so can't use it in the
982 * message. The other error types either can't
983 * generate an overlong, or else the 'uv' is valid */
984 if (orig_problems &
985 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
986 {
987 message = Perl_form(aTHX_
988 "%s: %s (any UTF-8 sequence that starts"
989 " with \"%s\" is overlong which can and"
990 " should be represented with a"
991 " different, shorter sequence)",
992 malformed_text,
993 _byte_dump_string(s0, send - s0),
994 _byte_dump_string(s0, curlen));
995 }
996 else {
997 U8 tmpbuf[UTF8_MAXBYTES+1];
998 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
999 uv, 0);
1000 /* Don't use U+ for non-Unicode code points, which
1001 * includes those in the Latin1 range */
1002 const char * preface = ( uv > PERL_UNICODE_MAX
1003 # ifdef EBCDIC
1004 || uv <= 0xFF
1005 # endif
1006 )
1007 ? "0x"
1008 : "U+";
1009 message = Perl_form(aTHX_
1010 "%s: %s (overlong; instead use %s to represent"
1011 " %s%0*" UVXf ")",
1012 malformed_text,
1013 _byte_dump_string(s0, send - s0),
1014 _byte_dump_string(tmpbuf, e - tmpbuf),
1015 preface,
1016 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1017 small code points */
1018 UNI_TO_NATIVE(uv));
1019 }
1020 this_flag_bit = UTF8_GOT_LONG;
1021 }
1022 }
1023 } /* End of looking through the possible flags */
1024
1025 /* Display the message (if any) for the problem being handled in
1026 * this iteration of the loop */
1027 if (message) {
1028 if (msgs) {
1029 assert(this_flag_bit);
1030
1031 if (*msgs == NULL) {
1032 *msgs = newAV();
1033 }
1034
1035 av_push(*msgs, newRV_noinc((SV*) S_new_msg_hv(message,
1036 pack_warn)));
1037 }
1038 else if (PL_op)
1039 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1040 OP_DESC(PL_op));
1041 else
1042 Perl_warner(aTHX_ pack_warn, "%s", message);
1043 }
1044 } /* End of 'while (possible_problems)' */
1045
1046 if (retlen) {
1047 *retlen = curlen;
1048 }
1049
1050 if (disallowed) {
1051 if (flags & UTF8_CHECK_ONLY && retlen) {
1052 *retlen = ((STRLEN) -1);
1053 }
1054 return 0;
1055 }
1056 }
1057
1058 return UNI_TO_NATIVE(uv);
1059 }
1060
1061 static STRLEN
S_is_utf8_char_helper(const U8 * const s,const U8 * e,const U32 flags)1062 S_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
1063 {
1064 STRLEN len;
1065 const U8 *x;
1066
1067 assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
1068 assert(! UTF8_IS_INVARIANT(*s));
1069
1070 if (UNLIKELY(! UTF8_IS_START(*s))) {
1071 return 0;
1072 }
1073
1074 /* Examine a maximum of a single whole code point */
1075 if (e - s > UTF8SKIP(s)) {
1076 e = s + UTF8SKIP(s);
1077 }
1078
1079 len = e - s;
1080
1081 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
1082 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
1083
1084 if ( (flags & UTF8_DISALLOW_SUPER)
1085 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1086 {
1087 return 0; /* Above Unicode */
1088 }
1089
1090 if (len > 1) {
1091 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
1092
1093 if ( (flags & UTF8_DISALLOW_SUPER)
1094 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
1095 {
1096 return 0; /* Above Unicode */
1097 }
1098
1099 if ( (flags & UTF8_DISALLOW_SURROGATE)
1100 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
1101 {
1102 return 0; /* Surrogate */
1103 }
1104
1105 if ( (flags & UTF8_DISALLOW_NONCHAR)
1106 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
1107 {
1108 return 0; /* Noncharacter code point */
1109 }
1110 }
1111 }
1112
1113 for (x = s + 1; x < e; x++) {
1114 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
1115 return 0;
1116 }
1117 }
1118
1119 if (len > 1 && S_is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
1120 return 0;
1121 }
1122
1123 if (0 < S_does_utf8_overflow(s, e, 0)) {
1124 return 0;
1125 }
1126
1127 return UTF8SKIP(s);
1128 }
1129
1130 # undef is_utf8_valid_partial_char_flags
1131
1132 static bool
is_utf8_valid_partial_char_flags(const U8 * const s,const U8 * const e,const U32 flags)1133 is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1134 {
1135
1136 return S_is_utf8_char_helper(s, e, flags) > 0;
1137 }
1138
1139 # undef is_utf8_string_loc_flags
1140
1141 static bool
is_utf8_string_loc_flags(const U8 * s,STRLEN len,const U8 ** ep,const U32 flags)1142 is_utf8_string_loc_flags(const U8 *s, STRLEN len, const U8 **ep, const U32 flags)
1143 {
1144 const U8* send = s + len;
1145
1146 assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
1147
1148 while (s < send) {
1149 if (UTF8_IS_INVARIANT(*s)) {
1150 s++;
1151 }
1152 else if ( UNLIKELY(send - s < UTF8SKIP(s))
1153 || ! S_is_utf8_char_helper(s, send, flags))
1154 {
1155 *ep = s;
1156 return 0;
1157 }
1158 else {
1159 s += UTF8SKIP(s);
1160 }
1161 }
1162
1163 *ep = send;
1164
1165 return 1;
1166 }
1167
1168 #endif
1169
1170 #if defined(IN_UNICODE_XS) && ! defined(uvchr_to_utf8_flags_msgs)
1171
1172 # define MY_SHIFT UTF_ACCUMULATION_SHIFT
1173 # define MY_MARK UTF_CONTINUATION_MARK
1174 # define MY_MASK UTF_CONTINUATION_MASK
1175
1176 static const char cp_above_legal_max[] =
1177 "Use of code point 0x%" UVXf " is not allowed; the"
1178 " permissible max is 0x%" UVXf;
1179
1180 /* These two can be dummys, as they are not looked at by the function, which
1181 * has hard-coded into it what flags it is expecting are */
1182 # ifndef UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
1183 # define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 0
1184 # endif
1185 # ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
1186 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
1187 # endif
1188
1189 # ifndef OFFUNI_IS_INVARIANT
1190 # define OFFUNI_IS_INVARIANT(cp) UNI_IS_INVARIANT(cp)
1191 # endif
1192 # ifndef MAX_EXTERNALLY_LEGAL_CP
1193 # define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
1194 # endif
1195 # ifndef LATIN1_TO_NATIVE
1196 # define LATIN1_TO_NATIVE(a) ASCII_TO_NATIVE(a)
1197 # endif
1198 # ifndef I8_TO_NATIVE_UTF8
1199 # define I8_TO_NATIVE_UTF8(a) NATIVE_TO_UTF(a)
1200 # endif
1201 # ifndef MAX_UTF8_TWO_BYTE
1202 # define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
1203 # endif
1204 # ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
1205 # define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) ((UV) (uv) >= 0xFDD0 \
1206 && (UV) (uv) <= 0xFDEF)
1207 # endif
1208 # ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
1209 # define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
1210 (((UV) (uv) & 0xFFFE) == 0xFFFE)
1211 # endif
1212 # ifndef UNICODE_IS_SUPER
1213 # define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
1214 # endif
1215 # ifndef OFFUNISKIP
1216 # define OFFUNISKIP(cp) UNISKIP(NATIVE_TO_UNI(cp))
1217 # endif
1218
1219 # define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
1220 STMT_START { \
1221 U32 category = packWARN(WARN_SURROGATE); \
1222 const char * format = surrogate_cp_format; \
1223 *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), \
1224 category); \
1225 return NULL; \
1226 } STMT_END;
1227
1228 # define HANDLE_UNICODE_NONCHAR(uv, flags, msgs) \
1229 STMT_START { \
1230 U32 category = packWARN(WARN_NONCHAR); \
1231 const char * format = nonchar_cp_format; \
1232 *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), \
1233 category); \
1234 return NULL; \
1235 } STMT_END;
1236
1237 static U8 *
uvchr_to_utf8_flags_msgs(U8 * d,UV uv,const UV flags,HV ** msgs)1238 uvchr_to_utf8_flags_msgs(U8 *d, UV uv, const UV flags, HV** msgs)
1239 {
1240 dTHX;
1241
1242 assert(msgs);
1243
1244 PERL_UNUSED_ARG(flags);
1245
1246 uv = NATIVE_TO_UNI(uv);
1247
1248 *msgs = NULL;
1249
1250 if (OFFUNI_IS_INVARIANT(uv)) {
1251 *d++ = LATIN1_TO_NATIVE(uv);
1252 return d;
1253 }
1254
1255 if (uv <= MAX_UTF8_TWO_BYTE) {
1256 *d++ = I8_TO_NATIVE_UTF8(( uv >> MY_SHIFT) | UTF_START_MARK(2));
1257 *d++ = I8_TO_NATIVE_UTF8(( uv & MY_MASK) | MY_MARK);
1258 return d;
1259 }
1260
1261 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
1262 * below, the 16 is for start bytes E0-EF (which are all the possible ones
1263 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
1264 * contribute MY_SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
1265 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
1266 * 0x800-0xFFFF on ASCII */
1267 if (uv < (16 * (1U << (2 * MY_SHIFT)))) {
1268 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * MY_SHIFT)) | UTF_START_MARK(3));
1269 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1270 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MY_MASK) | MY_MARK);
1271
1272 #ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
1273 aren't tested here */
1274 /* The most likely code points in this range are below the surrogates.
1275 * Do an extra test to quickly exclude those. */
1276 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
1277 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
1278 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
1279 {
1280 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1281 }
1282 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1283 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
1284 }
1285 }
1286 #endif
1287 return d;
1288 }
1289
1290 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
1291 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
1292 * happen starting with 4-byte characters on ASCII platforms. We unify the
1293 * code for these with EBCDIC, even though some of them require 5-bytes on
1294 * those, because khw believes the code saving is worth the very slight
1295 * performance hit on these high EBCDIC code points. */
1296
1297 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1298 const char * format = super_cp_format;
1299 U32 category = packWARN(WARN_NON_UNICODE);
1300 if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
1301 Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
1302 }
1303 *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), category);
1304 return NULL;
1305 }
1306 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
1307 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1308 }
1309
1310 /* Test for and handle 4-byte result. In the test immediately below, the
1311 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
1312 * characters). The 3 is for 3 continuation bytes; these each contribute
1313 * MY_SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
1314 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
1315 * 0x1_0000-0x1F_FFFF on ASCII */
1316 if (uv < (8 * (1U << (3 * MY_SHIFT)))) {
1317 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * MY_SHIFT)) | UTF_START_MARK(4));
1318 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1319 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1320 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MY_MASK) | MY_MARK);
1321
1322 #ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
1323 characters. The end-plane non-characters for EBCDIC were
1324 handled just above */
1325 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
1326 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1327 }
1328 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1329 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
1330 }
1331 #endif
1332
1333 return d;
1334 }
1335
1336 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
1337 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
1338 * format. The unrolled version above turns out to not save all that much
1339 * time, and at these high code points (well above the legal Unicode range
1340 * on ASCII platforms, and well above anything in common use in EBCDIC),
1341 * khw believes that less code outweighs slight performance gains. */
1342
1343 {
1344 STRLEN len = OFFUNISKIP(uv);
1345 U8 *p = d+len-1;
1346 while (p > d) {
1347 *p-- = I8_TO_NATIVE_UTF8((uv & MY_MASK) | MY_MARK);
1348 uv >>= MY_SHIFT;
1349 }
1350 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1351 return d+len;
1352 }
1353 }
1354
1355 #endif /* End of defining our own uvchr_to_utf8_flags_msgs() */
1356 #endif /* End of UTF8SKIP */
1357
1358 #endif /* ENCODE_H */
1359