1 #ifndef ENCODE_H
2 #define ENCODE_H
3 
4 #ifndef H_PERL
5 /* check whether we're "in perl" so that we can do data parts without
6    getting extern references to the code parts
7 */
8 typedef unsigned char U8;
9 #endif
10 
11 typedef struct encpage_s encpage_t;
12 
13 struct encpage_s
14 {
15     /* fields ordered to pack nicely on 32-bit machines */
16     const U8 *const seq;   /* Packed output sequences we generate
17                   if we match */
18     const encpage_t *const next;      /* Page to go to if we match */
19     const U8   min;        /* Min value of octet to match this entry */
20     const U8   max;        /* Max value of octet to match this entry */
21     const U8   dlen;       /* destination length -
22                   size of entries in seq */
23     const U8   slen;       /* source length -
24                   number of source octets needed */
25 };
26 
27 /*
28   At any point in a translation there is a page pointer which points
29   at an array of the above structures.
30 
31   Basic operation :
32   get octet from source stream.
33   if (octet >= min && octet < max) {
34     if slen is 0 then we cannot represent this character.
35     if we have less than slen octets (including this one) then
36       we have a partial character.
37     otherwise
38       copy dlen octets from seq + dlen*(octet-min) to output
39       (dlen may be zero if we don't know yet.)
40       load page pointer with next to continue.
41       (is slen is one this is end of a character)
42       get next octet.
43   }
44   else {
45     increment the page pointer to look at next slot in the array
46   }
47 
48   arrays SHALL be constructed so there is an entry which matches
49   ..0xFF at the end, and either maps it or indicates no
50   representation.
51 
52   if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
53 
54 */
55 
56 
57 typedef struct encode_s encode_t;
58 struct encode_s
59 {
60     const encpage_t *const t_utf8;  /* Starting table for translation from
61                        the encoding to UTF-8 form */
62     const encpage_t *const f_utf8;  /* Starting table for translation
63                        from UTF-8 to the encoding */
64     const U8 *const rep;            /* Replacement character in this
65                        encoding e.g. "?" */
66     int        replen;              /* Number of octets in rep */
67     U8         min_el;              /* Minimum octets to represent a
68                        character */
69     U8         max_el;              /* Maximum octets to represent a
70                        character */
71     const char *const name[2];      /* name(s) of this encoding */
72 };
73 
74 #ifdef H_PERL
75 /* See comment at top of file for deviousness */
76 
77 extern int do_encode(const encpage_t *enc, const U8 *src, STRLEN *slen,
78                      U8 *dst, STRLEN dlen, STRLEN *dout, int approx,
79              const U8 *term, STRLEN tlen);
80 
81 extern void Encode_DefineEncoding(encode_t *enc);
82 
83 #endif /* H_PERL */
84 
85 #define ENCODE_NOSPACE  1
86 #define ENCODE_PARTIAL  2
87 #define ENCODE_NOREP    3
88 #define ENCODE_FALLBACK 4
89 #define ENCODE_FOUND_TERM 5
90 
91 /* Use the perl core value if available; it is portable to EBCDIC */
92 #ifdef REPLACEMENT_CHARACTER_UTF8
93 #  define FBCHAR_UTF8		REPLACEMENT_CHARACTER_UTF8
94 #else
95 #  define FBCHAR_UTF8           "\xEF\xBF\xBD"
96 #endif
97 
98 #define  ENCODE_DIE_ON_ERR     0x0001 /* croaks immediately */
99 #define  ENCODE_WARN_ON_ERR    0x0002 /* warn on error; may proceed */
100 #define  ENCODE_RETURN_ON_ERR  0x0004 /* immediately returns on NOREP */
101 #define  ENCODE_LEAVE_SRC      0x0008 /* $src updated unless set */
102 #define  ENCODE_ONLY_PRAGMA_WARNINGS 0x0010 /* when enabled report only warnings configured by pragma warnings, otherwise report all warnings; no effect without ENCODE_WARN_ON_ERR */
103 #define  ENCODE_PERLQQ         0x0100 /* perlqq fallback string */
104 #define  ENCODE_HTMLCREF       0x0200 /* HTML character ref. fb mode */
105 #define  ENCODE_XMLCREF        0x0400 /* XML  character ref. fb mode */
106 #define  ENCODE_STOP_AT_PARTIAL 0x0800 /* stop at partial explicitly */
107 
108 #define  ENCODE_FB_DEFAULT     0x0000
109 #define  ENCODE_FB_CROAK       0x0001
110 #define  ENCODE_FB_QUIET       ENCODE_RETURN_ON_ERR
111 #define  ENCODE_FB_WARN        (ENCODE_RETURN_ON_ERR|ENCODE_WARN_ON_ERR)
112 #define  ENCODE_FB_PERLQQ      (ENCODE_PERLQQ|ENCODE_LEAVE_SRC)
113 #define  ENCODE_FB_HTMLCREF    (ENCODE_HTMLCREF|ENCODE_LEAVE_SRC)
114 #define  ENCODE_FB_XMLCREF     (ENCODE_XMLCREF|ENCODE_LEAVE_SRC)
115 
116 #define encode_ckWARN(c, w) ((c & ENCODE_WARN_ON_ERR)                         \
117                         && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || ckWARN(w)))
118 
119 #ifdef UTF8SKIP
120 #  ifdef EBCDIC   /* The value on early perls is wrong */
121 #    undef UTF8_MAXBYTES
122 #    define UTF8_MAXBYTES 14
123 #  endif
124 #  ifndef UNLIKELY
125 #    define UNLIKELY(x) (x)
126 #  endif
127 #  ifndef LIKELY
128 #    define LIKELY(x) (x)
129 #  endif
130 
131 /* EBCDIC requires a later perl to work, so the next two definitions are for
132  * ASCII machines only */
133 #  ifndef NATIVE_UTF8_TO_I8
134 #    define NATIVE_UTF8_TO_I8(x) (x)
135 #  endif
136 #  ifndef I8_TO_NATIVE_UTF8
137 #    define I8_TO_NATIVE_UTF8(x)  (x)
138 #  endif
139 #  ifndef OFFUNISKIP
140 #    define OFFUNISKIP(x)  UNISKIP(x)
141 #  endif
142 #  ifndef uvoffuni_to_utf8_flags
143 #    define uvoffuni_to_utf8_flags(a,b,c) uvuni_to_utf8_flags(a,b,c)
144 #  endif
145 #  ifndef WARN_SURROGATE    /* Use the overarching category if these
146                                subcategories are missing */
147 #    define WARN_SURROGATE WARN_UTF8
148 #    define WARN_NONCHAR WARN_UTF8
149 #    define WARN_NON_UNICODE WARN_UTF8
150      /* If there's only one possible category, then packing is a no-op */
151 #    define encode_ckWARN_packed(c, w) encode_ckWARN(c, w)
152 #  else
153 #    define encode_ckWARN_packed(c, w)                                      \
154             ((c & ENCODE_WARN_ON_ERR)                                       \
155         && (!(c & ENCODE_ONLY_PRAGMA_WARNINGS) || Perl_ckwarn(aTHX_ w)))
156 #  endif
157 
158 /* All these formats take a single UV code point argument */
159 static const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
160 static const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
161                                    " is not recommended for open interchange";
162 static const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
163                                    " may not be portable";
164 
165 /* If the perl doesn't have the 5.28 functions, this file includes
166  * stripped-down versions of them but containing enough functionality to be
167  * suitable for Encode's needs.  Many of the comments have been removed.  But
168  * you can inspect the 5.28 source if you get stuck.
169  *
170  * These could be put in Devel::PPPort, but Encode is likely the only user */
171 
172 #if    (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))                     \
173   && (! defined(utf8n_to_uvchr_msgs) && ! defined(uvchr_to_utf8_flags_msgs))
174 
175 #  ifndef hv_stores
176 #    define hv_stores(hv, key, val) hv_store((hv), ("" key ""), (sizeof(key)-1), (val), 0)
177 #  endif
178 
179 static HV *
S_new_msg_hv(const char * const message,U32 categories)180 S_new_msg_hv(const char * const message, /* The message text */
181                    U32 categories)  /* Packed warning categories */
182 {
183     /* Creates, populates, and returns an HV* that describes an error message
184      * for the translators between UTF8 and code point */
185 
186     dTHX;
187     SV* msg_sv = newSVpv(message, 0);
188     SV* category_sv = newSVuv(categories);
189 
190     HV* msg_hv = newHV();
191 
192     (void) hv_stores(msg_hv, "text", msg_sv);
193     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
194 
195     return msg_hv;
196 }
197 
198 #endif
199 
200 #if ! defined(utf8n_to_uvchr_msgs)                      \
201   && (defined(IN_ENCODE_XS) || defined(IN_UNICODE_XS))
202 
203 #  undef utf8n_to_uvchr     /* Don't use an earlier version: use the version
204                                defined in this file */
205 #  define utf8n_to_uvchr(a,b,c,d) utf8n_to_uvchr_msgs(a, b, c, d, 0, NULL)
206 
207 #  undef UTF8_IS_START      /* Early perls wrongly accepted C0 and C1 */
208 #  define UTF8_IS_START(c)  (((U8)(c)) >= 0xc2)
209 #  ifndef isUTF8_POSSIBLY_PROBLEMATIC
210 #    ifdef EBCDIC
211 #      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c > ' ')
212 #    else
213 #      define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
214 #    endif
215 #  endif
216 #  ifndef UTF8_ALLOW_OVERFLOW
217 #    define UTF8_ALLOW_OVERFLOW (1U<<31)    /* Choose highest bit to avoid
218                                                potential conflicts */
219 #    define UTF8_GOT_OVERFLOW           UTF8_ALLOW_OVERFLOW
220 #  endif
221 #  undef UTF8_ALLOW_ANY     /* Early perl definitions don't work properly with
222                                the code in this file */
223 #  define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                              \
224                           |UTF8_ALLOW_NON_CONTINUATION                          \
225                           |UTF8_ALLOW_SHORT                                     \
226                           |UTF8_ALLOW_LONG                                      \
227                           |UTF8_ALLOW_OVERFLOW)
228 
229 /* The meanings of these were complemented at some point, but the functions
230  * bundled in this file use the complemented meanings */
231 #  ifndef UTF8_DISALLOW_SURROGATE
232 #    define UTF8_DISALLOW_SURROGATE     UTF8_ALLOW_SURROGATE
233 #    define UTF8_DISALLOW_NONCHAR       UTF8_ALLOW_FFFF
234 #    define UTF8_DISALLOW_SUPER         UTF8_ALLOW_FE_FF
235 
236      /* In the stripped-down implementation in this file, disallowing is not
237       * independent of warning */
238 #    define UTF8_WARN_SURROGATE     UTF8_DISALLOW_SURROGATE
239 #    define UTF8_WARN_NONCHAR       UTF8_DISALLOW_NONCHAR
240 #    define UTF8_WARN_SUPER         UTF8_DISALLOW_SUPER
241 #  endif
242 #  ifndef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
243 #    define UTF8_DISALLOW_ILLEGAL_INTERCHANGE                                   \
244      (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_NONCHAR)
245 #  endif
246 #  ifndef UTF8_WARN_ILLEGAL_INTERCHANGE
247 #    define UTF8_WARN_ILLEGAL_INTERCHANGE                                       \
248          (UTF8_WARN_SUPER|UTF8_WARN_SURROGATE|UTF8_WARN_NONCHAR)
249 #  endif
250 #  ifndef FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER
251 #    ifdef EBCDIC   /* On EBCDIC, these are actually I8 bytes */
252 #      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xFA
253 #      define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
254 
255 #      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)     ((s0) == 0xF1            \
256                                               && ((s1) & 0xFE ) == 0xB6)
257 #    else
258 #      define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER  0xF5
259 #      define IS_UTF8_2_BYTE_SUPER(s0, s1)       ((s0) == 0xF4 && (s1) >= 0x90)
260 #      define IS_UTF8_2_BYTE_SURROGATE(s0, s1)   ((s0) == 0xED && (s1) >= 0xA0)
261 #    endif
262 #    ifndef HIGHEST_REPRESENTABLE_UTF8
263 #      if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1 */
264 #        ifdef EBCDIC     /* Actually is I8 */
265 #          define HIGHEST_REPRESENTABLE_UTF8                                    \
266                    "\xFF\xA7\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
267 #        else
268 #          define HIGHEST_REPRESENTABLE_UTF8                                    \
269                    "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
270 #        endif
271 #      endif
272 #    endif
273 #  endif
274 
275 #  ifndef Newx
276 #    define Newx(v,n,t) New(0,v,n,t)
277 #  endif
278 
279 #  ifndef PERL_UNUSED_ARG
280 #    define PERL_UNUSED_ARG(x) ((void)x)
281 #  endif
282 
283 #  ifndef memGT
284 #    define memGT(s1,s2,l) (memcmp(s1,s2,l) > 0)
285 #  endif
286 
287 #  ifndef MIN
288 #    define MIN(a,b) ((a) < (b) ? (a) : (b))
289 #  endif
290 
291 static const char malformed_text[] = "Malformed UTF-8 character";
292 
293 static char *
_byte_dump_string(const U8 * const start,const STRLEN len)294 _byte_dump_string(const U8 * const start, const STRLEN len)
295 {
296     /* Returns a mortalized C string that is a displayable copy of the 'len' */
297 
298     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
299                                                trailing NUL */
300     const U8 * s = start;
301     const U8 * const e = start + len;
302     char * output;
303     char * d;
304     dTHX;
305 
306     Newx(output, output_len, char);
307     SAVEFREEPV(output);
308 
309     d = output;
310     for (s = start; s < e; s++) {
311         const unsigned high_nibble = (*s & 0xF0) >> 4;
312         const unsigned low_nibble =  (*s & 0x0F);
313 
314         *d++ = '\\';
315         *d++ = 'x';
316 
317         if (high_nibble < 10) {
318             *d++ = high_nibble + '0';
319         }
320         else {
321             *d++ = high_nibble - 10 + 'a';
322         }
323 
324         if (low_nibble < 10) {
325             *d++ = low_nibble + '0';
326         }
327         else {
328             *d++ = low_nibble - 10 + 'a';
329         }
330     }
331 
332     *d = '\0';
333     return output;
334 }
335 
336 static char *
S_unexpected_non_continuation_text(const U8 * const s,STRLEN print_len,const STRLEN non_cont_byte_pos,const STRLEN expect_len)337 S_unexpected_non_continuation_text(const U8 * const s,
338 
339                                          /* Max number of bytes to print */
340                                          STRLEN print_len,
341 
342                                          /* Which one is the non-continuation */
343                                          const STRLEN non_cont_byte_pos,
344 
345                                          /* How many bytes should there be? */
346                                          const STRLEN expect_len)
347 {
348     /* Return the malformation warning text for an unexpected continuation
349      * byte. */
350 
351     dTHX;
352     const char * const where = (non_cont_byte_pos == 1)
353                                ? "immediately"
354                                : Perl_form(aTHX_ "%d bytes",
355                                                  (int) non_cont_byte_pos);
356     const U8 * x = s + non_cont_byte_pos;
357     const U8 * e = s + print_len;
358 
359     /* We don't need to pass this parameter, but since it has already been
360      * calculated, it's likely faster to pass it; verify under DEBUGGING */
361     assert(expect_len == UTF8SKIP(s));
362 
363     /* As a defensive coding measure, don't output anything past a NUL.  Such
364      * bytes shouldn't be in the middle of a malformation, and could mark the
365      * end of the allocated string, and what comes after is undefined */
366     for (; x < e; x++) {
367         if (*x == '\0') {
368             x++;            /* Output this particular NUL */
369             break;
370         }
371     }
372 
373     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
374                            " %s after start byte 0x%02x; need %d bytes, got %d)",
375                            malformed_text,
376                            _byte_dump_string(s, x - s),
377                            *(s + non_cont_byte_pos),
378                            where,
379                            *s,
380                            (int) expect_len,
381                            (int) non_cont_byte_pos);
382 }
383 
384 static int
385 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len);
386 
387 static int
S_does_utf8_overflow(const U8 * const s,const U8 * e,const bool consider_overlongs)388 S_does_utf8_overflow(const U8 * const s,
389                        const U8 * e,
390                        const bool consider_overlongs)
391 {
392     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
393      * 'e' - 1 would overflow an IV on this platform. */
394 
395 #  if ! defined(UV_IS_QUAD)
396 
397     const STRLEN len = e - s;
398     int is_overlong;
399 
400     assert(s <= e && s + UTF8SKIP(s) >= e);
401     assert(! UTF8_IS_INVARIANT(*s) && e > s);
402 
403 #    ifdef EBCDIC
404 
405     PERL_UNUSED_ARG(consider_overlongs);
406 
407     if (*s != 0xFE) {
408         return 0;
409     }
410 
411     if (len == 1) {
412         return -1;
413     }
414 
415 #    else
416 
417     if (LIKELY(*s < 0xFE)) {
418         return 0;
419     }
420 
421     if (! consider_overlongs) {
422         return 1;
423     }
424 
425     if (len == 1) {
426         return -1;
427     }
428 
429     is_overlong = S_is_utf8_overlong_given_start_byte_ok(s, len);
430 
431     if (is_overlong == 0) {
432         return 1;
433     }
434 
435     if (is_overlong < 0) {
436         return -1;
437     }
438 
439     if (*s == 0xFE) {
440         return 0;
441     }
442 
443 #    endif
444 
445     /* Here, ASCII and EBCDIC rejoin:
446     *  On ASCII:   We have an overlong sequence starting with FF
447     *  On EBCDIC:  We have a sequence starting with FE. */
448 
449     {   /* For C89, use a block so the declaration can be close to its use */
450 
451 #    ifdef EBCDIC
452         const U8 conts_for_highest_30_bit[] = "\x41\x41\x41\x41\x41\x41\x42";
453 #    else
454         const U8 conts_for_highest_30_bit[] = "\x80\x80\x80\x80\x80\x80\x81";
455 #    endif
456         const STRLEN conts_len = sizeof(conts_for_highest_30_bit) - 1;
457         const STRLEN cmp_len = MIN(conts_len, len - 1);
458 
459         if (cmp_len >= conts_len || memNE(s + 1,
460                                           conts_for_highest_30_bit,
461                                           cmp_len))
462         {
463             return memGT(s + 1, conts_for_highest_30_bit, cmp_len);
464         }
465 
466         return -1;
467     }
468 
469 #  else /* Below is 64-bit word */
470 
471     PERL_UNUSED_ARG(consider_overlongs);
472 
473     {
474         const STRLEN len = e - s;
475         const U8 *x;
476         const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
477 
478         for (x = s; x < e; x++, y++) {
479 
480             if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
481                 continue;
482             }
483             return NATIVE_UTF8_TO_I8(*x) > *y;
484         }
485 
486         if (len < sizeof(HIGHEST_REPRESENTABLE_UTF8) - 1) {
487             return -1;
488         }
489 
490         return 0;
491     }
492 
493 #  endif
494 
495 }
496 
497 static int
498 S_isFF_OVERLONG(const U8 * const s, const STRLEN len);
499 
500 static int
S_is_utf8_overlong_given_start_byte_ok(const U8 * const s,const STRLEN len)501 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
502 {
503     const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
504     const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
505 
506     assert(len > 1 && UTF8_IS_START(*s));
507 
508 #         ifdef EBCDIC
509 #             define F0_ABOVE_OVERLONG 0xB0
510 #             define F8_ABOVE_OVERLONG 0xA8
511 #             define FC_ABOVE_OVERLONG 0xA4
512 #             define FE_ABOVE_OVERLONG 0xA2
513 #             define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
514 #         else
515 
516     if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
517         return 1;
518     }
519 
520 #             define F0_ABOVE_OVERLONG 0x90
521 #             define F8_ABOVE_OVERLONG 0x88
522 #             define FC_ABOVE_OVERLONG 0x84
523 #             define FE_ABOVE_OVERLONG 0x82
524 #             define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
525 #         endif
526 
527     if (   (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
528         || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
529         || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
530         || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
531     {
532         return 1;
533     }
534 
535     /* Check for the FF overlong */
536     return S_isFF_OVERLONG(s, len);
537 }
538 
539 int
S_isFF_OVERLONG(const U8 * const s,const STRLEN len)540 S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
541 {
542     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
543                      MIN(len, sizeof(FF_OVERLONG_PREFIX) - 1))))
544     {
545         return 0;
546     }
547 
548     if (len >= sizeof(FF_OVERLONG_PREFIX) - 1) {
549         return 1;
550     }
551 
552     return -1;
553 }
554 
555 #  ifndef UTF8_GOT_CONTINUATION
556 #    define UTF8_GOT_CONTINUATION       UTF8_ALLOW_CONTINUATION
557 #    define UTF8_GOT_EMPTY              UTF8_ALLOW_EMPTY
558 #    define UTF8_GOT_LONG               UTF8_ALLOW_LONG
559 #    define UTF8_GOT_NON_CONTINUATION   UTF8_ALLOW_NON_CONTINUATION
560 #    define UTF8_GOT_SHORT              UTF8_ALLOW_SHORT
561 #    define UTF8_GOT_SURROGATE          UTF8_DISALLOW_SURROGATE
562 #    define UTF8_GOT_NONCHAR            UTF8_DISALLOW_NONCHAR
563 #    define UTF8_GOT_SUPER              UTF8_DISALLOW_SUPER
564 #  endif
565 
566 #  ifndef UNICODE_IS_SUPER
567 #    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
568 #  endif
569 #  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
570 #    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)      ((UV) (uv) >= 0xFDD0   \
571                                                    && (UV) (uv) <= 0xFDEF)
572 #  endif
573 #  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
574 #    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                  \
575                                           (((UV) (uv) & 0xFFFE) == 0xFFFE)
576 #  endif
577 #  ifndef is_NONCHAR_utf8_safe
578 #    define is_NONCHAR_utf8_safe(s,e)     /*** GENERATED CODE ***/            \
579 ( ( ( LIKELY((e) > (s)) ) && ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ) ? ( ( 0xEF == ((const U8*)s)[0] ) ?\
580 	    ( ( 0xB7 == ((const U8*)s)[1] ) ?                               \
581 		( ( 0x90 <= ((const U8*)s)[2] && ((const U8*)s)[2] <= 0xAF ) ? 3 : 0 )\
582 	    : ( ( 0xBF == ((const U8*)s)[1] ) && ( ( ((const U8*)s)[2] & 0xFE ) == 0xBE ) ) ? 3 : 0 )\
583 	: ( 0xF0 == ((const U8*)s)[0] ) ?                                   \
584 	    ( ( ( ( ((const U8*)s)[1] == 0x9F || ( ( ((const U8*)s)[1] & 0xEF ) == 0xAF ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
585 	: ( 0xF1 <= ((const U8*)s)[0] && ((const U8*)s)[0] <= 0xF3 ) ?      \
586 	    ( ( ( ( ( ((const U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 )\
587 	: ( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( 0x8F == ((const U8*)s)[1] ) ) && ( 0xBF == ((const U8*)s)[2] ) ) && ( ( ((const U8*)s)[3] & 0xFE ) == 0xBE ) ) ? 4 : 0 ) : 0 )
588 #  endif
589 
590 #  ifndef UTF8_IS_NONCHAR
591 #    define UTF8_IS_NONCHAR(s, e) (is_NONCHAR_utf8_safe(s,e) > 0)
592 #  endif
593 #  ifndef UNICODE_IS_NONCHAR
594 #    define UNICODE_IS_NONCHAR(uv)                                    \
595     (   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)                       \
596      || (   LIKELY( ! UNICODE_IS_SUPER(uv))                         \
597          && UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
598 #  endif
599 
600 #  ifndef UTF8_MAXBYTES
601 #    define UTF8_MAXBYTES UTF8_MAXLEN
602 #  endif
603 
604 static UV
utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)605 utf8n_to_uvchr_msgs(const U8 *s,
606                     STRLEN curlen,
607                     STRLEN *retlen,
608                     const U32 flags,
609                     U32 * errors,
610                     AV ** msgs)
611 {
612     const U8 * const s0 = s;
613     const U8 * send = NULL;
614     U32 possible_problems = 0;
615     UV uv = *s;
616     STRLEN expectlen   = 0;
617     U8 * adjusted_s0 = (U8 *) s0;
618     U8 temp_char_buf[UTF8_MAXBYTES + 1];
619     UV uv_so_far = 0;
620     dTHX;
621 
622     assert(errors == NULL); /* This functionality has been stripped */
623 
624     if (UNLIKELY(curlen == 0)) {
625         possible_problems |= UTF8_GOT_EMPTY;
626         curlen = 0;
627         uv = UNICODE_REPLACEMENT;
628 	goto ready_to_handle_errors;
629     }
630 
631     expectlen = UTF8SKIP(s);
632 
633     if (retlen) {
634 	*retlen = expectlen;
635     }
636 
637     if (UTF8_IS_INVARIANT(uv)) {
638 	return uv;
639     }
640 
641     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
642 	possible_problems |= UTF8_GOT_CONTINUATION;
643         curlen = 1;
644         uv = UNICODE_REPLACEMENT;
645 	goto ready_to_handle_errors;
646     }
647 
648     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
649 
650     send = (U8*) s0;
651     if (UNLIKELY(curlen < expectlen)) {
652         possible_problems |= UTF8_GOT_SHORT;
653         send += curlen;
654     }
655     else {
656         send += expectlen;
657     }
658 
659     for (s = s0 + 1; s < send; s++) {
660 	if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
661 	    uv = UTF8_ACCUMULATE(uv, *s);
662             continue;
663         }
664 
665         possible_problems |= UTF8_GOT_NON_CONTINUATION;
666         break;
667     } /* End of loop through the character's bytes */
668 
669     curlen = s - s0;
670 
671 #     define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
672 
673     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
674         uv_so_far = uv;
675         uv = UNICODE_REPLACEMENT;
676     }
677 
678     if (UNLIKELY(0 < S_does_utf8_overflow(s0, s, 1))) {
679         possible_problems |= UTF8_GOT_OVERFLOW;
680         uv = UNICODE_REPLACEMENT;
681     }
682 
683     if (     (   LIKELY(! possible_problems)
684               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
685         || (       UNLIKELY(possible_problems)
686             && (   UNLIKELY(! UTF8_IS_START(*s0))
687                 || (   curlen > 1
688                     && UNLIKELY(0 < S_is_utf8_overlong_given_start_byte_ok(s0,
689                                                                 s - s0))))))
690     {
691         possible_problems |= UTF8_GOT_LONG;
692 
693         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
694             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
695         {
696             UV min_uv = uv_so_far;
697             STRLEN i;
698 
699             for (i = curlen; i < expectlen; i++) {
700                 min_uv = UTF8_ACCUMULATE(min_uv,
701                                      I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
702             }
703 
704             adjusted_s0 = temp_char_buf;
705             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
706         }
707     }
708 
709     /* Here, we have found all the possible problems, except for when the input
710      * is for a problematic code point not allowed by the input parameters. */
711 
712                                 /* uv is valid for overlongs */
713     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
714                    && uv >= UNICODE_SURROGATE_FIRST)
715             || (   UNLIKELY(possible_problems)
716                 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
717 	&& ((flags & ( UTF8_DISALLOW_NONCHAR
718                       |UTF8_DISALLOW_SURROGATE
719                       |UTF8_DISALLOW_SUPER))))
720     {
721         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
722             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
723                 possible_problems |= UTF8_GOT_SURROGATE;
724             }
725             else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
726                 possible_problems |= UTF8_GOT_SUPER;
727             }
728             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
729                 possible_problems |= UTF8_GOT_NONCHAR;
730             }
731         }
732         else {
733             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
734                                 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
735             {
736                 possible_problems |= UTF8_GOT_SUPER;
737             }
738             else if (curlen > 1) {
739                 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
740                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
741                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
742                 {
743                     possible_problems |= UTF8_GOT_SUPER;
744                 }
745                 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
746                                       NATIVE_UTF8_TO_I8(*adjusted_s0),
747                                       NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
748                 {
749                     possible_problems |= UTF8_GOT_SURROGATE;
750                 }
751             }
752         }
753     }
754 
755   ready_to_handle_errors:
756 
757     if (UNLIKELY(possible_problems)) {
758         bool disallowed = FALSE;
759         const U32 orig_problems = possible_problems;
760 
761         if (msgs) {
762             *msgs = NULL;
763         }
764 
765         while (possible_problems) { /* Handle each possible problem */
766             UV pack_warn = 0;
767             char * message = NULL;
768             U32 this_flag_bit = 0;
769 
770             /* Each 'if' clause handles one problem.  They are ordered so that
771              * the first ones' messages will be displayed before the later
772              * ones; this is kinda in decreasing severity order.  But the
773              * overlong must come last, as it changes 'uv' looked at by the
774              * others */
775             if (possible_problems & UTF8_GOT_OVERFLOW) {
776 
777                 /* Overflow means also got a super; we handle both here */
778                 possible_problems
779                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER);
780 
781                 /* Disallow if any of the categories say to */
782                 if ( ! (flags &  UTF8_ALLOW_OVERFLOW)
783                     || (flags &  UTF8_DISALLOW_SUPER))
784                 {
785                     disallowed = TRUE;
786                 }
787 
788                 /* Likewise, warn if any say to */
789                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)) {
790 
791                     /* The warnings code explicitly says it doesn't handle the
792                      * case of packWARN2 and two categories which have
793                      * parent-child relationship.  Even if it works now to
794                      * raise the warning if either is enabled, it wouldn't
795                      * necessarily do so in the future.  We output (only) the
796                      * most dire warning */
797                     if (! (flags & UTF8_CHECK_ONLY)) {
798                         if (msgs || ckWARN_d(WARN_UTF8)) {
799                             pack_warn = packWARN(WARN_UTF8);
800                         }
801                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
802                             pack_warn = packWARN(WARN_NON_UNICODE);
803                         }
804                         if (pack_warn) {
805                             message = Perl_form(aTHX_ "%s: %s (overflows)",
806                                             malformed_text,
807                                             _byte_dump_string(s0, curlen));
808                             this_flag_bit = UTF8_GOT_OVERFLOW;
809                         }
810                     }
811                 }
812             }
813             else if (possible_problems & UTF8_GOT_EMPTY) {
814                 possible_problems &= ~UTF8_GOT_EMPTY;
815 
816                 if (! (flags & UTF8_ALLOW_EMPTY)) {
817                     disallowed = TRUE;
818                     if (  (msgs
819                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
820                     {
821                         pack_warn = packWARN(WARN_UTF8);
822                         message = Perl_form(aTHX_ "%s (empty string)",
823                                                    malformed_text);
824                         this_flag_bit = UTF8_GOT_EMPTY;
825                     }
826                 }
827             }
828             else if (possible_problems & UTF8_GOT_CONTINUATION) {
829                 possible_problems &= ~UTF8_GOT_CONTINUATION;
830 
831                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
832                     disallowed = TRUE;
833                     if ((   msgs
834                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
835                     {
836                         pack_warn = packWARN(WARN_UTF8);
837                         message = Perl_form(aTHX_
838                                 "%s: %s (unexpected continuation byte 0x%02x,"
839                                 " with no preceding start byte)",
840                                 malformed_text,
841                                 _byte_dump_string(s0, 1), *s0);
842                         this_flag_bit = UTF8_GOT_CONTINUATION;
843                     }
844                 }
845             }
846             else if (possible_problems & UTF8_GOT_SHORT) {
847                 possible_problems &= ~UTF8_GOT_SHORT;
848 
849                 if (! (flags & UTF8_ALLOW_SHORT)) {
850                     disallowed = TRUE;
851                     if ((   msgs
852                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
853                     {
854                         pack_warn = packWARN(WARN_UTF8);
855                         message = Perl_form(aTHX_
856                              "%s: %s (too short; %d byte%s available, need %d)",
857                              malformed_text,
858                              _byte_dump_string(s0, send - s0),
859                              (int)curlen,
860                              curlen == 1 ? "" : "s",
861                              (int)expectlen);
862                         this_flag_bit = UTF8_GOT_SHORT;
863                     }
864                 }
865 
866             }
867             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
868                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
869 
870                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
871                     disallowed = TRUE;
872                     if ((   msgs
873                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
874                     {
875                         int printlen = s - s0;
876                         pack_warn = packWARN(WARN_UTF8);
877                         message = Perl_form(aTHX_ "%s",
878                             S_unexpected_non_continuation_text(s0,
879                                                             printlen,
880                                                             s - s0,
881                                                             (int) expectlen));
882                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
883                     }
884                 }
885             }
886             else if (possible_problems & UTF8_GOT_SURROGATE) {
887                 possible_problems &= ~UTF8_GOT_SURROGATE;
888 
889                 if (flags & UTF8_WARN_SURROGATE) {
890 
891                     if (   ! (flags & UTF8_CHECK_ONLY)
892                         && (msgs || ckWARN_d(WARN_SURROGATE)))
893                     {
894                         pack_warn = packWARN(WARN_SURROGATE);
895 
896                         /* These are the only errors that can occur with a
897                         * surrogate when the 'uv' isn't valid */
898                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
899                             message = Perl_form(aTHX_
900                                     "UTF-16 surrogate (any UTF-8 sequence that"
901                                     " starts with \"%s\" is for a surrogate)",
902                                     _byte_dump_string(s0, curlen));
903                         }
904                         else {
905                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
906                         }
907                         this_flag_bit = UTF8_GOT_SURROGATE;
908                     }
909                 }
910 
911                 if (flags & UTF8_DISALLOW_SURROGATE) {
912                     disallowed = TRUE;
913                 }
914             }
915             else if (possible_problems & UTF8_GOT_SUPER) {
916                 possible_problems &= ~UTF8_GOT_SUPER;
917 
918                 if (flags & UTF8_WARN_SUPER) {
919 
920                     if (   ! (flags & UTF8_CHECK_ONLY)
921                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
922                     {
923                         pack_warn = packWARN(WARN_NON_UNICODE);
924 
925                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
926                             message = Perl_form(aTHX_
927                                     "Any UTF-8 sequence that starts with"
928                                     " \"%s\" is for a non-Unicode code point,"
929                                     " may not be portable",
930                                     _byte_dump_string(s0, curlen));
931                         }
932                         else {
933                             message = Perl_form(aTHX_ super_cp_format, uv);
934                         }
935                         this_flag_bit = UTF8_GOT_SUPER;
936                     }
937                 }
938 
939                 if (flags & UTF8_DISALLOW_SUPER) {
940                     disallowed = TRUE;
941                 }
942             }
943             else if (possible_problems & UTF8_GOT_NONCHAR) {
944                 possible_problems &= ~UTF8_GOT_NONCHAR;
945 
946                 if (flags & UTF8_WARN_NONCHAR) {
947 
948                     if (  ! (flags & UTF8_CHECK_ONLY)
949                         && (msgs || ckWARN_d(WARN_NONCHAR)))
950                     {
951                         /* The code above should have guaranteed that we don't
952                          * get here with errors other than overlong */
953                         assert (! (orig_problems
954                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
955 
956                         pack_warn = packWARN(WARN_NONCHAR);
957                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
958                         this_flag_bit = UTF8_GOT_NONCHAR;
959                     }
960                 }
961 
962                 if (flags & UTF8_DISALLOW_NONCHAR) {
963                     disallowed = TRUE;
964                 }
965             }
966             else if (possible_problems & UTF8_GOT_LONG) {
967                 possible_problems &= ~UTF8_GOT_LONG;
968 
969                 if (flags & UTF8_ALLOW_LONG) {
970                     uv = UNICODE_REPLACEMENT;
971                 }
972                 else {
973                     disallowed = TRUE;
974 
975                     if ((   msgs
976                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
977                     {
978                         pack_warn = packWARN(WARN_UTF8);
979 
980                         /* These error types cause 'uv' to be something that
981                          * isn't what was intended, so can't use it in the
982                          * message.  The other error types either can't
983                          * generate an overlong, or else the 'uv' is valid */
984                         if (orig_problems &
985                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
986                         {
987                             message = Perl_form(aTHX_
988                                     "%s: %s (any UTF-8 sequence that starts"
989                                     " with \"%s\" is overlong which can and"
990                                     " should be represented with a"
991                                     " different, shorter sequence)",
992                                     malformed_text,
993                                     _byte_dump_string(s0, send - s0),
994                                     _byte_dump_string(s0, curlen));
995                         }
996                         else {
997                             U8 tmpbuf[UTF8_MAXBYTES+1];
998                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
999                                                                         uv, 0);
1000                             /* Don't use U+ for non-Unicode code points, which
1001                              * includes those in the Latin1 range */
1002                             const char * preface = (    uv > PERL_UNICODE_MAX
1003 #  ifdef EBCDIC
1004                                                      || uv <= 0xFF
1005 #  endif
1006                                                     )
1007                                                    ? "0x"
1008                                                    : "U+";
1009                             message = Perl_form(aTHX_
1010                                 "%s: %s (overlong; instead use %s to represent"
1011                                 " %s%0*" UVXf ")",
1012                                 malformed_text,
1013                                 _byte_dump_string(s0, send - s0),
1014                                 _byte_dump_string(tmpbuf, e - tmpbuf),
1015                                 preface,
1016                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1017                                                          small code points */
1018                                 UNI_TO_NATIVE(uv));
1019                         }
1020                         this_flag_bit = UTF8_GOT_LONG;
1021                     }
1022                 }
1023             } /* End of looking through the possible flags */
1024 
1025             /* Display the message (if any) for the problem being handled in
1026              * this iteration of the loop */
1027             if (message) {
1028                 if (msgs) {
1029                     assert(this_flag_bit);
1030 
1031                     if (*msgs == NULL) {
1032                         *msgs = newAV();
1033                     }
1034 
1035                     av_push(*msgs, newRV_noinc((SV*) S_new_msg_hv(message,
1036                                                                 pack_warn)));
1037                 }
1038                 else if (PL_op)
1039                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1040                                                  OP_DESC(PL_op));
1041                 else
1042                     Perl_warner(aTHX_ pack_warn, "%s", message);
1043             }
1044         }   /* End of 'while (possible_problems)' */
1045 
1046         if (retlen) {
1047             *retlen = curlen;
1048         }
1049 
1050         if (disallowed) {
1051             if (flags & UTF8_CHECK_ONLY && retlen) {
1052                 *retlen = ((STRLEN) -1);
1053             }
1054             return 0;
1055         }
1056     }
1057 
1058     return UNI_TO_NATIVE(uv);
1059 }
1060 
1061 static STRLEN
S_is_utf8_char_helper(const U8 * const s,const U8 * e,const U32 flags)1062 S_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
1063 {
1064     STRLEN len;
1065     const U8 *x;
1066 
1067     assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
1068     assert(! UTF8_IS_INVARIANT(*s));
1069 
1070     if (UNLIKELY(! UTF8_IS_START(*s))) {
1071         return 0;
1072     }
1073 
1074     /* Examine a maximum of a single whole code point */
1075     if (e - s > UTF8SKIP(s)) {
1076         e = s + UTF8SKIP(s);
1077     }
1078 
1079     len = e - s;
1080 
1081     if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
1082         const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
1083 
1084         if (  (flags & UTF8_DISALLOW_SUPER)
1085             && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1086         {
1087             return 0;           /* Above Unicode */
1088         }
1089 
1090         if (len > 1) {
1091             const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
1092 
1093             if (   (flags & UTF8_DISALLOW_SUPER)
1094                 &&  UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
1095             {
1096                 return 0;       /* Above Unicode */
1097             }
1098 
1099             if (   (flags & UTF8_DISALLOW_SURROGATE)
1100                 &&  UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
1101             {
1102                 return 0;       /* Surrogate */
1103             }
1104 
1105             if (  (flags & UTF8_DISALLOW_NONCHAR)
1106                 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
1107             {
1108                 return 0;       /* Noncharacter code point */
1109             }
1110         }
1111     }
1112 
1113     for (x = s + 1; x < e; x++) {
1114         if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
1115             return 0;
1116         }
1117     }
1118 
1119     if (len > 1 && S_is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
1120         return 0;
1121     }
1122 
1123     if (0 < S_does_utf8_overflow(s, e, 0)) {
1124         return 0;
1125     }
1126 
1127     return UTF8SKIP(s);
1128 }
1129 
1130 #  undef is_utf8_valid_partial_char_flags
1131 
1132 static bool
is_utf8_valid_partial_char_flags(const U8 * const s,const U8 * const e,const U32 flags)1133 is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1134 {
1135 
1136     return S_is_utf8_char_helper(s, e, flags) > 0;
1137 }
1138 
1139 #  undef is_utf8_string_loc_flags
1140 
1141 static bool
is_utf8_string_loc_flags(const U8 * s,STRLEN len,const U8 ** ep,const U32 flags)1142 is_utf8_string_loc_flags(const U8 *s, STRLEN len, const U8 **ep, const U32 flags)
1143 {
1144     const U8* send = s + len;
1145 
1146     assert(0 == (flags & ~UTF8_DISALLOW_ILLEGAL_INTERCHANGE));
1147 
1148     while (s < send) {
1149         if (UTF8_IS_INVARIANT(*s)) {
1150             s++;
1151         }
1152         else if (     UNLIKELY(send - s < UTF8SKIP(s))
1153                  || ! S_is_utf8_char_helper(s, send, flags))
1154         {
1155             *ep = s;
1156             return 0;
1157         }
1158         else {
1159             s += UTF8SKIP(s);
1160         }
1161     }
1162 
1163     *ep = send;
1164 
1165     return 1;
1166 }
1167 
1168 #endif
1169 
1170 #if defined(IN_UNICODE_XS) && ! defined(uvchr_to_utf8_flags_msgs)
1171 
1172 #  define MY_SHIFT   UTF_ACCUMULATION_SHIFT
1173 #  define MY_MARK    UTF_CONTINUATION_MARK
1174 #  define MY_MASK    UTF_CONTINUATION_MASK
1175 
1176 static const char cp_above_legal_max[] =
1177                         "Use of code point 0x%" UVXf " is not allowed; the"
1178                         " permissible max is 0x%" UVXf;
1179 
1180 /* These two can be dummys, as they are not looked at by the function, which
1181  * has hard-coded into it what flags it is expecting are */
1182 #  ifndef UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
1183 #    define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE 0
1184 #  endif
1185 #  ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
1186 #    define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
1187 #  endif
1188 
1189 #  ifndef OFFUNI_IS_INVARIANT
1190 #    define OFFUNI_IS_INVARIANT(cp) UNI_IS_INVARIANT(cp)
1191 #  endif
1192 #  ifndef MAX_EXTERNALLY_LEGAL_CP
1193 #    define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
1194 #  endif
1195 #  ifndef LATIN1_TO_NATIVE
1196 #    define LATIN1_TO_NATIVE(a) ASCII_TO_NATIVE(a)
1197 #  endif
1198 #  ifndef I8_TO_NATIVE_UTF8
1199 #    define I8_TO_NATIVE_UTF8(a) NATIVE_TO_UTF(a)
1200 #  endif
1201 #  ifndef MAX_UTF8_TWO_BYTE
1202 #    define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
1203 #  endif
1204 #  ifndef UNICODE_IS_32_CONTIGUOUS_NONCHARS
1205 #    define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)    ((UV) (uv) >= 0xFDD0   \
1206                                                  && (UV) (uv) <= 0xFDEF)
1207 #  endif
1208 #  ifndef UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER
1209 #    define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)                \
1210                                           (((UV) (uv) & 0xFFFE) == 0xFFFE)
1211 #  endif
1212 #  ifndef UNICODE_IS_SUPER
1213 #    define UNICODE_IS_SUPER(uv)    ((UV) (uv) > PERL_UNICODE_MAX)
1214 #  endif
1215 #  ifndef OFFUNISKIP
1216 #    define OFFUNISKIP(cp)    UNISKIP(NATIVE_TO_UNI(cp))
1217 #  endif
1218 
1219 #  define HANDLE_UNICODE_SURROGATE(uv, flags, msgs)                 \
1220     STMT_START {                                                    \
1221         U32 category = packWARN(WARN_SURROGATE);                    \
1222         const char * format = surrogate_cp_format;                  \
1223         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
1224                                  category);                         \
1225         return NULL;                                                \
1226     } STMT_END;
1227 
1228 #  define HANDLE_UNICODE_NONCHAR(uv, flags, msgs)                   \
1229     STMT_START {                                                    \
1230         U32 category = packWARN(WARN_NONCHAR);                      \
1231         const char * format = nonchar_cp_format;                    \
1232         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv),           \
1233                                  category);                         \
1234         return NULL;                                                \
1235     } STMT_END;
1236 
1237 static U8 *
uvchr_to_utf8_flags_msgs(U8 * d,UV uv,const UV flags,HV ** msgs)1238 uvchr_to_utf8_flags_msgs(U8 *d, UV uv, const UV flags, HV** msgs)
1239 {
1240     dTHX;
1241 
1242     assert(msgs);
1243 
1244     PERL_UNUSED_ARG(flags);
1245 
1246     uv = NATIVE_TO_UNI(uv);
1247 
1248     *msgs = NULL;
1249 
1250     if (OFFUNI_IS_INVARIANT(uv)) {
1251 	*d++ = LATIN1_TO_NATIVE(uv);
1252 	return d;
1253     }
1254 
1255     if (uv <= MAX_UTF8_TWO_BYTE) {
1256         *d++ = I8_TO_NATIVE_UTF8(( uv >> MY_SHIFT) | UTF_START_MARK(2));
1257         *d++ = I8_TO_NATIVE_UTF8(( uv   & MY_MASK) | MY_MARK);
1258         return d;
1259     }
1260 
1261     /* Not 2-byte; test for and handle 3-byte result.   In the test immediately
1262      * below, the 16 is for start bytes E0-EF (which are all the possible ones
1263      * for 3 byte characters).  The 2 is for 2 continuation bytes; these each
1264      * contribute MY_SHIFT bits.  This yields 0x4000 on EBCDIC platforms, 0x1_0000
1265      * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
1266      * 0x800-0xFFFF on ASCII */
1267     if (uv < (16 * (1U << (2 * MY_SHIFT)))) {
1268 	*d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * MY_SHIFT)) | UTF_START_MARK(3));
1269 	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1270 	*d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
1271 
1272 #ifndef EBCDIC  /* These problematic code points are 4 bytes on EBCDIC, so
1273                    aren't tested here */
1274         /* The most likely code points in this range are below the surrogates.
1275          * Do an extra test to quickly exclude those. */
1276         if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
1277             if (UNLIKELY(   UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
1278                          || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
1279             {
1280                 HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1281             }
1282             else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1283                 HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
1284             }
1285         }
1286 #endif
1287 	return d;
1288     }
1289 
1290     /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
1291      * platforms, and 0x4000 on EBCDIC.  There are problematic cases that can
1292      * happen starting with 4-byte characters on ASCII platforms.  We unify the
1293      * code for these with EBCDIC, even though some of them require 5-bytes on
1294      * those, because khw believes the code saving is worth the very slight
1295      * performance hit on these high EBCDIC code points. */
1296 
1297     if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1298         const char * format = super_cp_format;
1299         U32 category = packWARN(WARN_NON_UNICODE);
1300         if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
1301             Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
1302         }
1303         *msgs = S_new_msg_hv(Perl_form(aTHX_ format, uv), category);
1304         return NULL;
1305     }
1306     else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
1307         HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1308     }
1309 
1310     /* Test for and handle 4-byte result.   In the test immediately below, the
1311      * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
1312      * characters).  The 3 is for 3 continuation bytes; these each contribute
1313      * MY_SHIFT bits.  This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
1314      * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
1315      * 0x1_0000-0x1F_FFFF on ASCII */
1316     if (uv < (8 * (1U << (3 * MY_SHIFT)))) {
1317 	*d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * MY_SHIFT)) | UTF_START_MARK(4));
1318 	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1319 	*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * MY_SHIFT)) & MY_MASK) | MY_MARK);
1320 	*d++ = I8_TO_NATIVE_UTF8(( uv  /* (1 - 1) */           & MY_MASK) | MY_MARK);
1321 
1322 #ifdef EBCDIC   /* These were handled on ASCII platforms in the code for 3-byte
1323                    characters.  The end-plane non-characters for EBCDIC were
1324                    handled just above */
1325         if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
1326             HANDLE_UNICODE_NONCHAR(uv, flags, msgs);
1327         }
1328         else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1329             HANDLE_UNICODE_SURROGATE(uv, flags, msgs);
1330         }
1331 #endif
1332 
1333 	return d;
1334     }
1335 
1336     /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
1337      * platforms, and 0x4000 on EBCDIC.  At this point we switch to a loop
1338      * format.  The unrolled version above turns out to not save all that much
1339      * time, and at these high code points (well above the legal Unicode range
1340      * on ASCII platforms, and well above anything in common use in EBCDIC),
1341      * khw believes that less code outweighs slight performance gains. */
1342 
1343     {
1344 	STRLEN len  = OFFUNISKIP(uv);
1345 	U8 *p = d+len-1;
1346 	while (p > d) {
1347 	    *p-- = I8_TO_NATIVE_UTF8((uv & MY_MASK) | MY_MARK);
1348 	    uv >>= MY_SHIFT;
1349 	}
1350 	*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1351 	return d+len;
1352     }
1353 }
1354 
1355 #endif  /* End of defining our own uvchr_to_utf8_flags_msgs() */
1356 #endif  /* End of UTF8SKIP */
1357 
1358 #endif /* ENCODE_H */
1359