xref: /openbsd/gnu/usr.bin/perl/utf8.c (revision 3d61058a)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Théoden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 #include "invlist_inline.h"
35 
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38                         "Malformed UTF-8 character (unexpected end of string)";
39 
40 /*
41 These are various utility functions for manipulating UTF8-encoded
42 strings.  For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
46 */
47 
48 void
Perl__force_out_malformed_utf8_message(pTHX_ const U8 * const p,const U8 * const e,const U32 flags,const bool die_here)49 Perl__force_out_malformed_utf8_message(pTHX_
50             const U8 *const p,      /* First byte in UTF-8 sequence */
51             const U8 * const e,     /* Final byte in sequence (may include
52                                        multiple chars */
53             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
54                                        usually 0, or some DISALLOW flags */
55             const bool die_here)    /* If TRUE, this function does not return */
56 {
57     /* This core-only function is to be called when a malformed UTF-8 character
58      * is found, in order to output the detailed information about the
59      * malformation before dieing.  The reason it exists is for the occasions
60      * when such a malformation is fatal, but warnings might be turned off, so
61      * that normally they would not be actually output.  This ensures that they
62      * do get output.  Because a sequence may be malformed in more than one
63      * way, multiple messages may be generated, so we can't make them fatal, as
64      * that would cause the first one to die.
65      *
66      * Instead we pretend -W was passed to perl, then die afterwards.  The
67      * flexibility is here to return to the caller so they can finish up and
68      * die themselves */
69     U32 errors;
70 
71     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
72 
73     ENTER;
74     SAVEI8(PL_dowarn);
75     SAVESPTR(PL_curcop);
76 
77     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
78     if (PL_curcop) {
79         SAVECURCOPWARNINGS();
80         PL_curcop->cop_warnings = pWARN_ALL;
81     }
82 
83     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
84 
85     LEAVE;
86 
87     if (! errors) {
88         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
89                          " be called only when there are errors found");
90     }
91 
92     if (die_here) {
93         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
94     }
95 }
96 
97 STATIC HV *
S_new_msg_hv(pTHX_ const char * const message,U32 categories,U32 flag)98 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
99                    U32 categories,  /* Packed warning categories */
100                    U32 flag)        /* Flag associated with this message */
101 {
102     /* Creates, populates, and returns an HV* that describes an error message
103      * for the translators between UTF8 and code point */
104 
105     SV* msg_sv = newSVpv(message, 0);
106     SV* category_sv = newSVuv(categories);
107     SV* flag_bit_sv = newSVuv(flag);
108 
109     HV* msg_hv = newHV();
110 
111     PERL_ARGS_ASSERT_NEW_MSG_HV;
112 
113     (void) hv_stores(msg_hv, "text", msg_sv);
114     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
115     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
116 
117     return msg_hv;
118 }
119 
120 /*
121 =for apidoc uvoffuni_to_utf8_flags
122 
123 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
124 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
125 L<perlapi/uvchr_to_utf8_flags>>.
126 
127 This function is like them, but the input is a strict Unicode
128 (as opposed to native) code point.  Only in very rare circumstances should code
129 not be using the native code point.
130 
131 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
132 
133 =cut
134 */
135 
136 U8 *
Perl_uvoffuni_to_utf8_flags(pTHX_ U8 * d,UV uv,const UV flags)137 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
138 {
139     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
140 
141     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
142 }
143 
144 /* All these formats take a single UV code point argument */
145 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
146 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
147                                    " is not recommended for open interchange";
148 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
149                                    " may not be portable";
150 
151 /*  Use shorter names internally in this file */
152 #define SHIFT   UTF_ACCUMULATION_SHIFT
153 #undef  MARK
154 #define MARK    UTF_CONTINUATION_MARK
155 #define MASK    UTF_CONTINUATION_MASK
156 
157 /*
158 =for apidoc uvchr_to_utf8_flags_msgs
159 
160 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
161 
162 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
163 
164 This function is for code that wants any warning and/or error messages to be
165 returned to the caller rather than be displayed.  All messages that would have
166 been displayed if all lexical warnings are enabled will be returned.
167 
168 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
169 placed after all the others, C<msgs>.  If this parameter is 0, this function
170 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
171 be a pointer to an C<HV *> variable, in which this function creates a new HV to
172 contain any appropriate messages.  The hash has three key-value pairs, as
173 follows:
174 
175 =over 4
176 
177 =item C<text>
178 
179 The text of the message as a C<SVpv>.
180 
181 =item C<warn_categories>
182 
183 The warning category (or categories) packed into a C<SVuv>.
184 
185 =item C<flag>
186 
187 A single flag bit associated with this message, in a C<SVuv>.
188 The bit corresponds to some bit in the C<*errors> return value,
189 such as C<UNICODE_GOT_SURROGATE>.
190 
191 =back
192 
193 It's important to note that specifying this parameter as non-null will cause
194 any warnings this function would otherwise generate to be suppressed, and
195 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
196 (or not) when choosing what to do with the returned messages.
197 
198 The caller, of course, is responsible for freeing any returned HV.
199 
200 =cut
201 */
202 
203 /* Undocumented; we don't want people using this.  Instead they should use
204  * uvchr_to_utf8_flags_msgs() */
205 U8 *
Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 * d,UV input_uv,UV flags,HV ** msgs)206 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
207 {
208     U8 *p;
209     UV shifted_uv = input_uv;
210     STRLEN utf8_skip = OFFUNISKIP(input_uv);
211 
212     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
213 
214     if (msgs) {
215         *msgs = NULL;
216     }
217 
218     switch (utf8_skip) {
219       case 1:
220         *d++ = LATIN1_TO_NATIVE(input_uv);
221         return d;
222 
223       default:
224         if (   UNLIKELY(input_uv > MAX_LEGAL_CP
225             && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
226         {
227             Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
228                                                          NULL, 0, input_uv));
229         }
230 
231         if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
232             U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
233             const char * format = PL_extended_cp_format;
234             if (msgs) {
235                 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
236                                    category,
237                                    UNICODE_GOT_PERL_EXTENDED);
238             }
239             else {
240                 Perl_ck_warner_d(aTHX_ category, format, input_uv);
241             }
242 
243             /* Don't output a 2nd msg */
244             flags &= ~UNICODE_WARN_SUPER;
245         }
246 
247         if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
248             return NULL;
249         }
250 
251         p = d + utf8_skip - 1;
252         while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
253             *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
254             shifted_uv >>= SHIFT;
255         }
256 
257         /* FALLTHROUGH */
258 
259       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
260         d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
261                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
262         shifted_uv >>= SHIFT;
263         /* FALLTHROUGH */
264 
265       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
266         d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
267                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
268         shifted_uv >>= SHIFT;
269         /* FALLTHROUGH */
270 
271       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
272         if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
273             if (flags & UNICODE_WARN_SUPER) {
274                 U32 category = packWARN(WARN_NON_UNICODE);
275                 const char * format = super_cp_format;
276 
277                 if (msgs) {
278                     *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
279                                        category,
280                                        UNICODE_GOT_SUPER);
281                 }
282                 else {
283                     Perl_ck_warner_d(aTHX_ category, format, input_uv);
284                 }
285 
286                 if (flags & UNICODE_DISALLOW_SUPER) {
287                     return NULL;
288                 }
289             }
290             if (       (flags & UNICODE_DISALLOW_SUPER)
291                 || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
292                     &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
293             {
294                 return NULL;
295             }
296         }
297 
298         d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
299                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
300         shifted_uv >>= SHIFT;
301         /* FALLTHROUGH */
302 
303       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
304         if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
305             if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
306                 if (flags & UNICODE_WARN_NONCHAR) {
307                     U32 category = packWARN(WARN_NONCHAR);
308                     const char * format = nonchar_cp_format;
309                     if (msgs) {
310                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
311                                            category,
312                                            UNICODE_GOT_NONCHAR);
313                     }
314                     else {
315                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
316                     }
317                 }
318                 if (flags & UNICODE_DISALLOW_NONCHAR) {
319                     return NULL;
320                 }
321             }
322             else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
323                 if (flags & UNICODE_WARN_SURROGATE) {
324                     U32 category = packWARN(WARN_SURROGATE);
325                     const char * format = surrogate_cp_format;
326                     if (msgs) {
327                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
328                                            category,
329                                            UNICODE_GOT_SURROGATE);
330                     }
331                     else {
332                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
333                     }
334                 }
335                 if (flags & UNICODE_DISALLOW_SURROGATE) {
336                     return NULL;
337                 }
338             }
339         }
340 
341         d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
342                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
343         shifted_uv >>= SHIFT;
344         /* FALLTHROUGH */
345 
346 #ifdef EBCDIC
347 
348       case 3:
349         d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
350         shifted_uv >>= SHIFT;
351         /* FALLTHROUGH */
352 
353 #endif
354 
355         /* FALLTHROUGH */
356       case 2:
357         d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
358         shifted_uv >>= SHIFT;
359         d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
360                                              | UTF_START_MARK(utf8_skip));
361         break;
362     }
363 
364     return d + utf8_skip;
365 }
366 
367 /*
368 =for apidoc uvchr_to_utf8
369 
370 Adds the UTF-8 representation of the native code point C<uv> to the end
371 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
372 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
373 the byte after the end of the new character.  In other words,
374 
375     d = uvchr_to_utf8(d, uv);
376 
377 is the recommended wide native character-aware way of saying
378 
379     *(d++) = uv;
380 
381 This function accepts any code point from 0..C<IV_MAX> as input.
382 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
383 
384 It is possible to forbid or warn on non-Unicode code points, or those that may
385 be problematic by using L</uvchr_to_utf8_flags>.
386 
387 =cut
388 */
389 
390 /* This is also a macro */
391 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
392 
393 U8 *
Perl_uvchr_to_utf8(pTHX_ U8 * d,UV uv)394 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
395 {
396     return uvchr_to_utf8(d, uv);
397 }
398 
399 /*
400 =for apidoc uvchr_to_utf8_flags
401 
402 Adds the UTF-8 representation of the native code point C<uv> to the end
403 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
404 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
405 the byte after the end of the new character.  In other words,
406 
407     d = uvchr_to_utf8_flags(d, uv, flags);
408 
409 or, in most cases,
410 
411     d = uvchr_to_utf8_flags(d, uv, 0);
412 
413 This is the Unicode-aware way of saying
414 
415     *(d++) = uv;
416 
417 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
418 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
419 
420 Specifying C<flags> can further restrict what is allowed and not warned on, as
421 follows:
422 
423 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
424 the function will raise a warning, provided UTF8 warnings are enabled.  If
425 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
426 NULL.  If both flags are set, the function will both warn and return NULL.
427 
428 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
429 affect how the function handles a Unicode non-character.
430 
431 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
432 affect the handling of code points that are above the Unicode maximum of
433 0x10FFFF.  Languages other than Perl may not be able to accept files that
434 contain these.
435 
436 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
437 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
438 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
439 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
440 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
441 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
442 above-Unicode and surrogate flags, but not the non-character ones, as
443 defined in
444 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
445 See L<perlunicode/Noncharacter code points>.
446 
447 Extremely high code points were never specified in any standard, and require an
448 extension to UTF-8 to express, which Perl does.  It is likely that programs
449 written in something other than Perl would not be able to read files that
450 contain these; nor would Perl understand files written by something that uses a
451 different extension.  For these reasons, there is a separate set of flags that
452 can warn and/or disallow these extremely high code points, even if other
453 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
454 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
455 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
456 treat all above-Unicode code points, including these, as malformations.  (Note
457 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
458 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
459 
460 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
461 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
462 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
463 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
464 platforms,these flags can apply to code points that actually do fit in 31 bits.
465 The new names accurately describe the situation in all cases.
466 
467 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
468 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
469 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
470 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
471 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
472 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
473 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
474 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
475 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
476 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
477 =for apidoc Amnh||UNICODE_WARN_NONCHAR
478 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
479 =for apidoc Amnh||UNICODE_WARN_SUPER
480 =for apidoc Amnh||UNICODE_WARN_SURROGATE
481 
482 =cut
483 */
484 
485 /* This is also a macro */
486 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
487 
488 U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)489 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
490 {
491     return uvchr_to_utf8_flags(d, uv, flags);
492 }
493 
494 PERL_STATIC_INLINE int
S_is_utf8_overlong(const U8 * const s,const STRLEN len)495 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
496 {
497     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
498      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
499      * it isn't, and -1 if there isn't enough information to tell.  This last
500      * return value can happen if the sequence is incomplete, missing some
501      * trailing bytes that would form a complete character.  If there are
502      * enough bytes to make a definitive decision, this function does so.
503      * Usually 2 bytes are sufficient.
504      *
505      * Overlongs can occur whenever the number of continuation bytes changes.
506      * That means whenever the number of leading 1 bits in a start byte
507      * increases from the next lower start byte.  That happens for start bytes
508      * C0, E0, F0, F8, FC, FE, and FF.
509      */
510 
511     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
512 
513     /* Each platform has overlongs after the start bytes given above (expressed
514      * in I8 for EBCDIC).  The values below were found by manually inspecting
515      * the UTF-8 patterns.  See the tables in utf8.h and utfebcdic.h. */
516 
517     switch (NATIVE_UTF8_TO_I8(s[0])) {
518       default:
519         assert(UTF8_IS_START(s[0]));
520         return 0;
521 
522       case 0xC0:
523       case 0xC1:
524         return 1;
525 
526 #ifdef EBCDIC
527 
528       case 0xC2:
529       case 0xC3:
530       case 0xC4:
531       case 0xE0:
532         return 1;
533 #else
534       case 0xE0:
535         return (len < 2) ? -1 : s[1] < 0xA0;
536 #endif
537 
538       case 0xF0:
539         return (len < 2)
540                ? -1
541                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
542       case 0xF8:
543         return (len < 2)
544                ? -1
545                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
546       case 0xFC:
547         return (len < 2)
548                ? -1
549                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
550       case 0xFE:
551         return (len < 2)
552                ? -1
553                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
554       case 0xFF:
555         return isFF_overlong(s, len);
556     }
557 }
558 
559 PERL_STATIC_INLINE int
S_isFF_overlong(const U8 * const s,const STRLEN len)560 S_isFF_overlong(const U8 * const s, const STRLEN len)
561 {
562     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
563      * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
564      * it isn't, and -1 if there isn't enough information to tell.  This last
565      * return value can happen if the sequence is incomplete, missing some
566      * trailing bytes that would form a complete character.  If there are
567      * enough bytes to make a definitive decision, this function does so. */
568 
569     PERL_ARGS_ASSERT_ISFF_OVERLONG;
570 
571 #ifdef EBCDIC
572     /* This works on all three EBCDIC code pages traditionally supported by
573      * perl */
574 #  define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
575 #else
576 #  define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
577 #endif
578 
579     /* To be an FF overlong, all the available bytes must match */
580     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
581                      MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
582     {
583         return 0;
584     }
585 
586     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
587      * be there; what comes after them doesn't matter.  See tables in utf8.h,
588      * utfebcdic.h. */
589     if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
590         return 1;
591     }
592 
593     /* The missing bytes could cause the result to go one way or the other, so
594      * the result is indeterminate */
595     return -1;
596 }
597 
598 /* At some point we may want to allow core to use up to UV_MAX */
599 
600 #ifdef EBCDIC     /* Actually is I8 */
601 #  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
602 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA7"
603                               /* UV_MAX "\xFF\xAF" */
604 #  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
605 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
606                               /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
607 #  endif
608 #else
609 #  if defined(UV_IS_QUAD)
610 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\x80\x87"
611                               /* UV_MAX "\xFF\x80" */
612 #  else
613 #    define HIGHEST_REPRESENTABLE_UTF  "\xFD"
614                               /* UV_MAX "\xFE\x83" */
615 #  endif
616 #endif
617 
618 PERL_STATIC_INLINE int
S_does_utf8_overflow(const U8 * const s,const U8 * e,const bool consider_overlongs)619 S_does_utf8_overflow(const U8 * const s,
620                      const U8 * e,
621                      const bool consider_overlongs)
622 {
623     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
624      * 'e' - 1 would overflow an IV on this platform; that is if it represents
625      * a code point larger than the highest representable code point.  It
626      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
627      * enough information to tell.  This last return value can happen if the
628      * sequence is incomplete, missing some trailing bytes that would form a
629      * complete character.  If there are enough bytes to make a definitive
630      * decision, this function does so.
631      *
632      * If 'consider_overlongs' is TRUE, the function checks for the possibility
633      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
634      * assumes the sequence is not an overlong.  This can give different
635      * results only on ASCII 32-bit platforms.
636      *
637      * (For ASCII platforms, we could use memcmp() because we don't have to
638      * convert each byte to I8, but it's very rare input indeed that would
639      * approach overflow, so the loop below will likely only get executed once.)
640      *
641      */
642     const STRLEN len = e - s;
643     const U8 *x;
644     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
645     int is_overlong = 0;
646 
647     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
648 
649     for (x = s; x < e; x++, y++) {
650 
651         /* 'y' is set up to not include the trailing bytes that are all the
652          * maximum possible continuation byte.  So when we reach the end of 'y'
653          * (known to be NUL terminated), it is impossible for 'x' to contain
654          * bytes larger than those omitted bytes, and therefore 'x' can't
655          * overflow */
656         if (*y == '\0') {
657             return 0;
658         }
659 
660         /* If this byte is less than the corresponding highest non-overflowing
661          * UTF-8, the sequence doesn't overflow */
662         if (NATIVE_UTF8_TO_I8(*x) < *y) {
663             return 0;
664         }
665 
666         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
667             goto overflows_if_not_overlong;
668         }
669     }
670 
671     /* Got to the end, and all bytes are the same.  If the input is a whole
672      * character, it doesn't overflow.  And if it is a partial character,
673      * there's not enough information to tell */
674     return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
675 
676   overflows_if_not_overlong:
677 
678     /* Here, a well-formed sequence overflows.  If we are assuming
679      * well-formedness, return that it overflows. */
680     if (! consider_overlongs) {
681         return 1;
682     }
683 
684     /* Here, it could be the overlong malformation, and might not actually
685      * overflow if you were to calculate it out.
686      *
687      * See if it actually is overlong */
688     is_overlong = is_utf8_overlong(s, len);
689 
690     /* If it isn't overlong, is well-formed, so overflows */
691     if (is_overlong == 0) {
692         return 1;
693     }
694 
695     /* Not long enough to determine */
696     if (is_overlong < 0) {
697         return -1;
698     }
699 
700     /* Here, it appears to overflow, but it is also overlong */
701 
702 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
703 
704     /* On many platforms, it is impossible for an overlong to overflow.  For
705      * these, no further work is necessary: we can return immediately that this
706      * overlong that is an apparent overflow actually isn't
707      *
708      * To see why, note that a length_N sequence can represent as overlongs all
709      * the code points representable by shorter length sequences, but no
710      * higher.  If it could represent a higher code point without being an
711      * overlong, we wouldn't have had to increase the sequence length!
712      *
713      * The highest possible start byte is FF; the next highest is FE.  The
714      * highest code point representable as an overlong on the platform is thus
715      * the highest code point representable by a non-overlong sequence whose
716      * start byte is FE.  If that value doesn't overflow the platform's word
717      * size, overlongs can't overflow.
718      *
719      * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
720      * information (the high 7 bits, all ones, say that the sequence is 7 bytes
721      * long, and the bottom, zero, bit is s placeholder. That leaves the 6
722      * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
723       If that number of bits doesn't exceed the word size, it can't overflow. */
724 
725     return 0;
726 
727 #else
728 
729     /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
730      * represent, as an overlong, the highest code point representable by an FD
731      * start byte, which is 5*6 continuation bytes of info plus one bit from
732      * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
733      * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
734      *
735      * That means only the FF start byte can have an overflowing overlong. */
736     if (*s < 0xFF) {
737         return 0;
738     }
739 
740     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
741      * evaluates to 2**31, so overflows an IV.  For a UV it's
742      *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
743 #  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
744 
745     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
746          return -1;
747     }
748 
749 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
750 
751     return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
752 
753 #endif
754 
755 }
756 
757 STRLEN
Perl_is_utf8_char_helper_(const U8 * const s,const U8 * e,const U32 flags)758 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
759 {
760     SSize_t len, full_len;
761 
762     /* An internal helper function.
763      *
764      * On input:
765      *  's' is a string, which is known to be syntactically valid UTF-8 as far
766      *      as (e - 1); e > s must hold.
767      *  'e' This function is allowed to look at any byte from 's'...'e-1', but
768      *      nowhere else.  The function has to cope as best it can if that
769      *      sequence does not form a full character.
770      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
771      *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
772      *      0 if it determines the input will match something disallowed.
773      * On output:
774      *  The return is the number of bytes required to represent the code point
775      *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
776      *  input is for a partial character, a successful return will be larger
777      *  than 'e - s'.
778      *
779      *  If *s..*(e-1) is only for a partial character, the function will return
780      *  non-zero if there is any sequence of well-formed UTF-8 that, when
781      *  appended to the input sequence, could result in an allowed code point;
782      *  otherwise it returns 0.  Non characters cannot be determined based on
783      *  partial character input.  But many  of the other excluded types can be
784      *  determined with just the first one or two bytes.
785      *
786      */
787 
788     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
789 
790     assert(e > s);
791     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
792                           |UTF8_DISALLOW_PERL_EXTENDED)));
793 
794     full_len = UTF8SKIP(s);
795 
796     len = e - s;
797     if (len > full_len) {
798         e = s + full_len;
799         len = full_len;
800     }
801 
802     switch (full_len) {
803         bool is_super;
804 
805       default: /* Extended */
806         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
807             return 0;
808         }
809 
810         /* FALLTHROUGH */
811 
812       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
813       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
814 
815         if (flags & UTF8_DISALLOW_SUPER) {
816             return 0;                       /* Above Unicode */
817         }
818 
819         return full_len;
820 
821       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
822         is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
823                     || (   len > 1
824                         && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
825                         && NATIVE_UTF8_TO_I8(s[1])
826                                                 >= UTF_FIRST_CONT_BYTE_110000_));
827         if (is_super) {
828             if (flags & UTF8_DISALLOW_SUPER) {
829                 return 0;
830             }
831         }
832         else if (   (flags & UTF8_DISALLOW_NONCHAR)
833                  && len == full_len
834                  && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
835         {
836             return 0;
837         }
838 
839         return full_len;
840 
841       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
842 
843         if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
844             return full_len;
845         }
846 
847         if (   (flags & UTF8_DISALLOW_SURROGATE)
848             &&  UNLIKELY(is_SURROGATE_utf8(s)))
849         {
850             return 0;       /* Surrogate */
851         }
852 
853         if (  (flags & UTF8_DISALLOW_NONCHAR)
854             && len == full_len
855             && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
856         {
857             return 0;
858         }
859 
860         return full_len;
861 
862       /* The lower code points don't have any disallowable characters */
863 #ifdef EBCDIC
864       case 3:
865         return full_len;
866 #endif
867 
868       case 2:
869       case 1:
870         return full_len;
871     }
872 }
873 
874 Size_t
Perl_is_utf8_FF_helper_(const U8 * const s0,const U8 * const e,const bool require_partial)875 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
876                         const bool require_partial)
877 {
878     /* This is called to determine if the UTF-8 sequence starting at s0 and
879      * continuing for up to one full character of bytes, but looking no further
880      * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
881      * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
882      * for the largest code points recognized by Perl, the ones that require
883      * the most UTF-8 bytes per character to represent (somewhat less than
884      * twice the size of the next longest kind).  This sequence will only ever
885      * be Perl extended UTF-8.
886      *
887      * The routine returns 0 if the sequence is not fully valid, syntactically
888      * or semantically.  That means it checks that everything following the
889      * start byte is a continuation byte, and that it doesn't overflow, nor is
890      * an overlong representation.
891      *
892      * If 'require_partial' is FALSE, the routine returns non-zero only if the
893      * input (as far as 'e-1') is a full character.  The return is the count of
894      * the bytes in the character.
895      *
896      * If 'require_partial' is TRUE, the routine returns non-zero only if the
897      * input as far as 'e-1' is a partial, not full character, with no
898      * malformations found before position 'e'.  The return is either just
899      * FALSE, or TRUE.  */
900 
901     const U8 *s = s0 + 1;
902     const U8 *send = e;
903 
904     PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
905 
906     assert(s0 < e);
907     assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
908 
909     send = s + MIN(UTF8_MAXBYTES - 1, e - s);
910     while (s < send) {
911         if (! UTF8_IS_CONTINUATION(*s)) {
912             return 0;
913         }
914 
915         s++;
916     }
917 
918     if (0 < does_utf8_overflow(s0, e,
919                                FALSE /* Don't consider_overlongs */
920     )) {
921         return 0;
922     }
923 
924     if (0 < isFF_overlong(s0, e - s0)) {
925         return 0;
926     }
927 
928     /* Here, the character is valid as far as it got.  Check if got a partial
929      * character */
930     if (s - s0 < UTF8_MAXBYTES) {
931         return (require_partial) ? 1 : 0;
932     }
933 
934     /* Here, got a full character */
935     return (require_partial) ? 0 : UTF8_MAXBYTES;
936 }
937 
938 const char *
Perl__byte_dump_string(pTHX_ const U8 * const start,const STRLEN len,const bool format)939 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
940 {
941     /* Returns a mortalized C string that is a displayable copy of the 'len'
942      * bytes starting at 'start'.  'format' gives how to display each byte.
943      * Currently, there are only two formats, so it is currently a bool:
944      *      0   \xab
945      *      1    ab         (that is a space between two hex digit bytes)
946      */
947 
948     if (start == NULL) {
949         return "(nil)";
950     }
951 
952     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
953                                                trailing NUL */
954     const U8 * s = start;
955     const U8 * const e = start + len;
956     char * output;
957     char * d;
958 
959     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
960 
961     Newx(output, output_len, char);
962     SAVEFREEPV(output);
963 
964     d = output;
965     for (s = start; s < e; s++) {
966         const unsigned high_nibble = (*s & 0xF0) >> 4;
967         const unsigned low_nibble =  (*s & 0x0F);
968 
969         if (format) {
970             if (s > start) {
971                 *d++ = ' ';
972             }
973         }
974         else {
975             *d++ = '\\';
976             *d++ = 'x';
977         }
978 
979         if (high_nibble < 10) {
980             *d++ = high_nibble + '0';
981         }
982         else {
983             *d++ = high_nibble - 10 + 'a';
984         }
985 
986         if (low_nibble < 10) {
987             *d++ = low_nibble + '0';
988         }
989         else {
990             *d++ = low_nibble - 10 + 'a';
991         }
992     }
993 
994     *d = '\0';
995     return output;
996 }
997 
998 PERL_STATIC_INLINE char *
S_unexpected_non_continuation_text(pTHX_ const U8 * const s,STRLEN print_len,const STRLEN non_cont_byte_pos,const STRLEN expect_len)999 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
1000 
1001                                          /* Max number of bytes to print */
1002                                          STRLEN print_len,
1003 
1004                                          /* Which one is the non-continuation */
1005                                          const STRLEN non_cont_byte_pos,
1006 
1007                                          /* How many bytes should there be? */
1008                                          const STRLEN expect_len)
1009 {
1010     /* Return the malformation warning text for an unexpected continuation
1011      * byte. */
1012 
1013     const char * const where = (non_cont_byte_pos == 1)
1014                                ? "immediately"
1015                                : Perl_form(aTHX_ "%d bytes",
1016                                                  (int) non_cont_byte_pos);
1017     const U8 * x = s + non_cont_byte_pos;
1018     const U8 * e = s + print_len;
1019 
1020     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1021 
1022     /* We don't need to pass this parameter, but since it has already been
1023      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1024     assert(expect_len == UTF8SKIP(s));
1025 
1026     /* As a defensive coding measure, don't output anything past a NUL.  Such
1027      * bytes shouldn't be in the middle of a malformation, and could mark the
1028      * end of the allocated string, and what comes after is undefined */
1029     for (; x < e; x++) {
1030         if (*x == '\0') {
1031             x++;            /* Output this particular NUL */
1032             break;
1033         }
1034     }
1035 
1036     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1037                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1038                            malformed_text,
1039                            _byte_dump_string(s, x - s, 0),
1040                            *(s + non_cont_byte_pos),
1041                            where,
1042                            *s,
1043                            (int) expect_len,
1044                            (int) non_cont_byte_pos);
1045 }
1046 
1047 /*
1048 
1049 =for apidoc utf8n_to_uvchr
1050 
1051 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1052 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1053 directly.
1054 
1055 Bottom level UTF-8 decode routine.
1056 Returns the native code point value of the first character in the string C<s>,
1057 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1058 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1059 the length, in bytes, of that character.
1060 
1061 The value of C<flags> determines the behavior when C<s> does not point to a
1062 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1063 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1064 is the next possible position in C<s> that could begin a non-malformed
1065 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1066 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1067 This function tries to find every possible one in each call, so multiple
1068 warnings can be raised for the same sequence.
1069 
1070 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1071 individual types of malformations, such as the sequence being overlong (that
1072 is, when there is a shorter sequence that can express the same code point;
1073 overlong sequences are expressly forbidden in the UTF-8 standard due to
1074 potential security issues).  Another malformation example is the first byte of
1075 a character not being a legal first byte.  See F<utf8.h> for the list of such
1076 flags.  Even if allowed, this function generally returns the Unicode
1077 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1078 F<utf8.h> to override this behavior for the overlong malformations, but don't
1079 do that except for very specialized purposes.
1080 
1081 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1082 flags) malformation is found.  If this flag is set, the routine assumes that
1083 the caller will raise a warning, and this function will silently just set
1084 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1085 
1086 Note that this API requires disambiguation between successful decoding a C<NUL>
1087 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1088 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1089 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1090 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1091 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1092 
1093 Certain code points are considered problematic.  These are Unicode surrogates,
1094 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1095 By default these are considered regular code points, but certain situations
1096 warrant special handling for them, which can be specified using the C<flags>
1097 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1098 three classes are treated as malformations and handled as such.  The flags
1099 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1100 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1101 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1102 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1103 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1104 definition given by
1105 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1106 The difference between traditional strictness and C9 strictness is that the
1107 latter does not forbid non-character code points.  (They are still discouraged,
1108 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1109 
1110 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1111 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1112 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1113 raised for their respective categories, but otherwise the code points are
1114 considered valid (not malformations).  To get a category to both be treated as
1115 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1116 (But note that warnings are not raised if lexically disabled nor if
1117 C<UTF8_CHECK_ONLY> is also specified.)
1118 
1119 Extremely high code points were never specified in any standard, and require an
1120 extension to UTF-8 to express, which Perl does.  It is likely that programs
1121 written in something other than Perl would not be able to read files that
1122 contain these; nor would Perl understand files written by something that uses a
1123 different extension.  For these reasons, there is a separate set of flags that
1124 can warn and/or disallow these extremely high code points, even if other
1125 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1126 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1127 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1128 above-Unicode code points, including these, as malformations.
1129 (Note that the Unicode standard considers anything above 0x10FFFF to be
1130 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1131 (2**31 -1))
1132 
1133 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1134 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1135 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1136 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1137 can apply to code points that actually do fit in 31 bits.  This happens on
1138 EBCDIC platforms, and sometimes when the L<overlong
1139 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1140 describe the situation in all cases.
1141 
1142 
1143 All other code points corresponding to Unicode characters, including private
1144 use and those yet to be assigned, are never considered malformed and never
1145 warn.
1146 
1147 =for apidoc Amnh||UTF8_CHECK_ONLY
1148 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1149 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1150 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1151 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1152 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1153 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1154 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1155 =for apidoc Amnh||UTF8_WARN_SURROGATE
1156 =for apidoc Amnh||UTF8_WARN_NONCHAR
1157 =for apidoc Amnh||UTF8_WARN_SUPER
1158 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1159 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1160 
1161 =cut
1162 
1163 Also implemented as a macro in utf8.h
1164 */
1165 
1166 UV
Perl_utf8n_to_uvchr(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags)1167 Perl_utf8n_to_uvchr(const U8 *s,
1168                     STRLEN curlen,
1169                     STRLEN *retlen,
1170                     const U32 flags)
1171 {
1172     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1173 
1174     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1175 }
1176 
1177 /*
1178 
1179 =for apidoc utf8n_to_uvchr_error
1180 
1181 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1182 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1183 directly.
1184 
1185 This function is for code that needs to know what the precise malformation(s)
1186 are when an error is found.  If you also need to know the generated warning
1187 messages, use L</utf8n_to_uvchr_msgs>() instead.
1188 
1189 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1190 all the others, C<errors>.  If this parameter is 0, this function behaves
1191 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1192 to a C<U32> variable, which this function sets to indicate any errors found.
1193 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1194 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1195 of these bits will be set if a malformation is found, even if the input
1196 C<flags> parameter indicates that the given malformation is allowed; those
1197 exceptions are noted:
1198 
1199 =over 4
1200 
1201 =item C<UTF8_GOT_PERL_EXTENDED>
1202 
1203 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1204 set only if the input C<flags> parameter contains either the
1205 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1206 
1207 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1208 and so some extension must be used to express them.  Perl uses a natural
1209 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1210 extension to represent even higher ones, so that any code point that fits in a
1211 64-bit word can be represented.  Text using these extensions is not likely to
1212 be portable to non-Perl code.  We lump both of these extensions together and
1213 refer to them as Perl extended UTF-8.  There exist other extensions that people
1214 have invented, incompatible with Perl's.
1215 
1216 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1217 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1218 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1219 unrepresentable, and a different, incompatible method was used to represent
1220 code points between 2**30 and 2**31 - 1.
1221 
1222 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1223 Perl extended UTF-8 is used.
1224 
1225 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1226 may use for backward compatibility.  That name is misleading, as this flag may
1227 be set when the code point actually does fit in 31 bits.  This happens on
1228 EBCDIC platforms, and sometimes when the L<overlong
1229 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1230 describes the situation in all cases.
1231 
1232 =item C<UTF8_GOT_CONTINUATION>
1233 
1234 The input sequence was malformed in that the first byte was a UTF-8
1235 continuation byte.
1236 
1237 =item C<UTF8_GOT_EMPTY>
1238 
1239 The input C<curlen> parameter was 0.
1240 
1241 =item C<UTF8_GOT_LONG>
1242 
1243 The input sequence was malformed in that there is some other sequence that
1244 evaluates to the same code point, but that sequence is shorter than this one.
1245 
1246 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1247 it was discovered that this created security issues.
1248 
1249 =item C<UTF8_GOT_NONCHAR>
1250 
1251 The code point represented by the input UTF-8 sequence is for a Unicode
1252 non-character code point.
1253 This bit is set only if the input C<flags> parameter contains either the
1254 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1255 
1256 =item C<UTF8_GOT_NON_CONTINUATION>
1257 
1258 The input sequence was malformed in that a non-continuation type byte was found
1259 in a position where only a continuation type one should be.  See also
1260 C<L</UTF8_GOT_SHORT>>.
1261 
1262 =item C<UTF8_GOT_OVERFLOW>
1263 
1264 The input sequence was malformed in that it is for a code point that is not
1265 representable in the number of bits available in an IV on the current platform.
1266 
1267 =item C<UTF8_GOT_SHORT>
1268 
1269 The input sequence was malformed in that C<curlen> is smaller than required for
1270 a complete sequence.  In other words, the input is for a partial character
1271 sequence.
1272 
1273 
1274 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1275 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1276 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1277 sequence was looked at.   If no other flags are present, it means that the
1278 sequence was valid as far as it went.  Depending on the application, this could
1279 mean one of three things:
1280 
1281 =over
1282 
1283 =item *
1284 
1285 The C<curlen> length parameter passed in was too small, and the function was
1286 prevented from examining all the necessary bytes.
1287 
1288 =item *
1289 
1290 The buffer being looked at is based on reading data, and the data received so
1291 far stopped in the middle of a character, so that the next read will
1292 read the remainder of this character.  (It is up to the caller to deal with the
1293 split bytes somehow.)
1294 
1295 =item *
1296 
1297 This is a real error, and the partial sequence is all we're going to get.
1298 
1299 =back
1300 
1301 =item C<UTF8_GOT_SUPER>
1302 
1303 The input sequence was malformed in that it is for a non-Unicode code point;
1304 that is, one above the legal Unicode maximum.
1305 This bit is set only if the input C<flags> parameter contains either the
1306 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1307 
1308 =item C<UTF8_GOT_SURROGATE>
1309 
1310 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1311 code point.
1312 This bit is set only if the input C<flags> parameter contains either the
1313 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1314 
1315 =back
1316 
1317 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1318 flag to suppress any warnings, and then examine the C<*errors> return.
1319 
1320 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1321 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1322 =for apidoc Amnh||UTF8_GOT_EMPTY
1323 =for apidoc Amnh||UTF8_GOT_LONG
1324 =for apidoc Amnh||UTF8_GOT_NONCHAR
1325 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1326 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1327 =for apidoc Amnh||UTF8_GOT_SHORT
1328 =for apidoc Amnh||UTF8_GOT_SUPER
1329 =for apidoc Amnh||UTF8_GOT_SURROGATE
1330 
1331 =cut
1332 
1333 Also implemented as a macro in utf8.h
1334 */
1335 
1336 UV
Perl_utf8n_to_uvchr_error(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors)1337 Perl_utf8n_to_uvchr_error(const U8 *s,
1338                           STRLEN curlen,
1339                           STRLEN *retlen,
1340                           const U32 flags,
1341                           U32 * errors)
1342 {
1343     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1344 
1345     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1346 }
1347 
1348 /*
1349 
1350 =for apidoc utf8n_to_uvchr_msgs
1351 
1352 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1353 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1354 directly.
1355 
1356 This function is for code that needs to know what the precise malformation(s)
1357 are when an error is found, and wants the corresponding warning and/or error
1358 messages to be returned to the caller rather than be displayed.  All messages
1359 that would have been displayed if all lexical warnings are enabled will be
1360 returned.
1361 
1362 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1363 placed after all the others, C<msgs>.  If this parameter is 0, this function
1364 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1365 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1366 contain any appropriate messages.  The elements of the array are ordered so
1367 that the first message that would have been displayed is in the 0th element,
1368 and so on.  Each element is a hash with three key-value pairs, as follows:
1369 
1370 =over 4
1371 
1372 =item C<text>
1373 
1374 The text of the message as a C<SVpv>.
1375 
1376 =item C<warn_categories>
1377 
1378 The warning category (or categories) packed into a C<SVuv>.
1379 
1380 =item C<flag>
1381 
1382 A single flag bit associated with this message, in a C<SVuv>.
1383 The bit corresponds to some bit in the C<*errors> return value,
1384 such as C<UTF8_GOT_LONG>.
1385 
1386 =back
1387 
1388 It's important to note that specifying this parameter as non-null will cause
1389 any warnings this function would otherwise generate to be suppressed, and
1390 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1391 (or not) when choosing what to do with the returned messages.
1392 
1393 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1394 no AV is created.
1395 
1396 The caller, of course, is responsible for freeing any returned AV.
1397 
1398 =cut
1399 */
1400 
1401 UV
Perl__utf8n_to_uvchr_msgs_helper(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)1402 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1403                                STRLEN curlen,
1404                                STRLEN *retlen,
1405                                const U32 flags,
1406                                U32 * errors,
1407                                AV ** msgs)
1408 {
1409     const U8 * const s0 = s;
1410     const U8 * send = s0 + curlen;
1411     U32 possible_problems;  /* A bit is set here for each potential problem
1412                                found as we go along */
1413     UV uv;
1414     STRLEN expectlen;     /* How long should this sequence be? */
1415     STRLEN avail_len;     /* When input is too short, gives what that is */
1416     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1417                              gets set and discarded */
1418 
1419     /* The below are used only if there is both an overlong malformation and a
1420      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1421      * the third not used at all */
1422     U8 * adjusted_s0;
1423     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1424                                             routine; see [perl #130921] */
1425     UV uv_so_far;
1426     dTHX;
1427 
1428     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1429 
1430     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1431      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1432      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1433      * final case. */
1434 
1435     /* Each of the affected Hanguls starts with \xED */
1436 
1437     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1438         if (retlen) {
1439             *retlen = 3;
1440         }
1441         if (errors) {
1442             *errors = 0;
1443         }
1444         if (msgs) {
1445             *msgs = NULL;
1446         }
1447 
1448         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1449              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1450              |  (s0[2] & UTF_CONTINUATION_MASK);
1451     }
1452 
1453     /* In conjunction with the exhaustive tests that can be enabled in
1454      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1455      * what it is intended to do, and that no flaws in it are masked by
1456      * dropping down and executing the code below
1457     assert(! isUTF8_CHAR(s0, send)
1458           || UTF8_IS_SURROGATE(s0, send)
1459           || UTF8_IS_SUPER(s0, send)
1460           || UTF8_IS_NONCHAR(s0,send));
1461     */
1462 
1463     s = s0;
1464     possible_problems = 0;
1465     expectlen = 0;
1466     avail_len = 0;
1467     discard_errors = 0;
1468     adjusted_s0 = (U8 *) s0;
1469     uv_so_far = 0;
1470 
1471     if (errors) {
1472         *errors = 0;
1473     }
1474     else {
1475         errors = &discard_errors;
1476     }
1477 
1478     /* The order of malformation tests here is important.  We should consume as
1479      * few bytes as possible in order to not skip any valid character.  This is
1480      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1481      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1482      * example, once we've done a UTF8SKIP, we can tell the expected number of
1483      * bytes, and could fail right off the bat if the input parameters indicate
1484      * that there are too few available.  But it could be that just that first
1485      * byte is garbled, and the intended character occupies fewer bytes.  If we
1486      * blindly assumed that the first byte is correct, and skipped based on
1487      * that number, we could skip over a valid input character.  So instead, we
1488      * always examine the sequence byte-by-byte.
1489      *
1490      * We also should not consume too few bytes, otherwise someone could inject
1491      * things.  For example, an input could be deliberately designed to
1492      * overflow, and if this code bailed out immediately upon discovering that,
1493      * returning to the caller C<*retlen> pointing to the very next byte (one
1494      * which is actually part of the overflowing sequence), that could look
1495      * legitimate to the caller, which could discard the initial partial
1496      * sequence and process the rest, inappropriately.
1497      *
1498      * Some possible input sequences are malformed in more than one way.  This
1499      * function goes to lengths to try to find all of them.  This is necessary
1500      * for correctness, as the inputs may allow one malformation but not
1501      * another, and if we abandon searching for others after finding the
1502      * allowed one, we could allow in something that shouldn't have been.
1503      */
1504 
1505     if (UNLIKELY(curlen == 0)) {
1506         possible_problems |= UTF8_GOT_EMPTY;
1507         curlen = 0;
1508         uv = UNICODE_REPLACEMENT;
1509         goto ready_to_handle_errors;
1510     }
1511 
1512     /* We now know we can examine the first byte of the input */
1513     expectlen = UTF8SKIP(s);
1514     uv = *s;
1515 
1516     /* A well-formed UTF-8 character, as the vast majority of calls to this
1517      * function will be for, has this expected length.  For efficiency, set
1518      * things up here to return it.  It will be overridden only in those rare
1519      * cases where a malformation is found */
1520     if (retlen) {
1521         *retlen = expectlen;
1522     }
1523 
1524     /* A continuation character can't start a valid sequence */
1525     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1526         possible_problems |= UTF8_GOT_CONTINUATION;
1527         curlen = 1;
1528         uv = UNICODE_REPLACEMENT;
1529         goto ready_to_handle_errors;
1530     }
1531 
1532     /* Here is not a continuation byte, nor an invariant.  The only thing left
1533      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1534      * because it excludes start bytes like \xC0 that always lead to
1535      * overlongs.) */
1536 
1537     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1538      * that indicate the number of bytes in the character's whole UTF-8
1539      * sequence, leaving just the bits that are part of the value.  */
1540     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1541 
1542     /* Setup the loop end point, making sure to not look past the end of the
1543      * input string, and flag it as too short if the size isn't big enough. */
1544     if (UNLIKELY(curlen < expectlen)) {
1545         possible_problems |= UTF8_GOT_SHORT;
1546         avail_len = curlen;
1547     }
1548     else {
1549         send = (U8*) s0 + expectlen;
1550     }
1551 
1552     /* Now, loop through the remaining bytes in the character's sequence,
1553      * accumulating each into the working value as we go. */
1554     for (s = s0 + 1; s < send; s++) {
1555         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1556             uv = UTF8_ACCUMULATE(uv, *s);
1557             continue;
1558         }
1559 
1560         /* Here, found a non-continuation before processing all expected bytes.
1561          * This byte indicates the beginning of a new character, so quit, even
1562          * if allowing this malformation. */
1563         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1564         break;
1565     } /* End of loop through the character's bytes */
1566 
1567     /* Save how many bytes were actually in the character */
1568     curlen = s - s0;
1569 
1570     /* Note that there are two types of too-short malformation.  One is when
1571      * there is actual wrong data before the normal termination of the
1572      * sequence.  The other is that the sequence wasn't complete before the end
1573      * of the data we are allowed to look at, based on the input 'curlen'.
1574      * This means that we were passed data for a partial character, but it is
1575      * valid as far as we saw.  The other is definitely invalid.  This
1576      * distinction could be important to a caller, so the two types are kept
1577      * separate.
1578      *
1579      * A convenience macro that matches either of the too-short conditions.  */
1580 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1581 
1582     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1583         uv_so_far = uv;
1584         uv = UNICODE_REPLACEMENT;
1585     }
1586 
1587     /* Check for overflow.  The algorithm requires us to not look past the end
1588      * of the current character, even if partial, so the upper limit is 's' */
1589     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1590                                          1 /* Do consider overlongs */
1591                                         )))
1592     {
1593         possible_problems |= UTF8_GOT_OVERFLOW;
1594         uv = UNICODE_REPLACEMENT;
1595     }
1596 
1597     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1598      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1599      * we must look at the UTF-8 byte sequence itself to see if it is for an
1600      * overlong */
1601     if (     (   LIKELY(! possible_problems)
1602               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1603         || (       UNLIKELY(possible_problems)
1604             && (   UNLIKELY(! UTF8_IS_START(*s0))
1605                 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1606     {
1607         possible_problems |= UTF8_GOT_LONG;
1608 
1609         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1610 
1611                           /* The calculation in the 'true' branch of this 'if'
1612                            * below won't work if overflows, and isn't needed
1613                            * anyway.  Further below we handle all overflow
1614                            * cases */
1615             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1616         {
1617             UV min_uv = uv_so_far;
1618             STRLEN i;
1619 
1620             /* Here, the input is both overlong and is missing some trailing
1621              * bytes.  There is no single code point it could be for, but there
1622              * may be enough information present to determine if what we have
1623              * so far is for an unallowed code point, such as for a surrogate.
1624              * The code further below has the intelligence to determine this,
1625              * but just for non-overlong UTF-8 sequences.  What we do here is
1626              * calculate the smallest code point the input could represent if
1627              * there were no too short malformation.  Then we compute and save
1628              * the UTF-8 for that, which is what the code below looks at
1629              * instead of the raw input.  It turns out that the smallest such
1630              * code point is all we need. */
1631             for (i = curlen; i < expectlen; i++) {
1632                 min_uv = UTF8_ACCUMULATE(min_uv,
1633                                 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1634             }
1635 
1636             adjusted_s0 = temp_char_buf;
1637             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1638         }
1639     }
1640 
1641     /* Here, we have found all the possible problems, except for when the input
1642      * is for a problematic code point not allowed by the input parameters. */
1643 
1644                                 /* uv is valid for overlongs */
1645     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1646                    && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1647             || (   UNLIKELY(possible_problems)
1648 
1649                           /* if overflow, we know without looking further
1650                            * precisely which of the problematic types it is,
1651                            * and we deal with those in the overflow handling
1652                            * code */
1653                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1654                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1655                     || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1656         && ((flags & ( UTF8_DISALLOW_NONCHAR
1657                       |UTF8_DISALLOW_SURROGATE
1658                       |UTF8_DISALLOW_SUPER
1659                       |UTF8_DISALLOW_PERL_EXTENDED
1660                       |UTF8_WARN_NONCHAR
1661                       |UTF8_WARN_SURROGATE
1662                       |UTF8_WARN_SUPER
1663                       |UTF8_WARN_PERL_EXTENDED))))
1664     {
1665         /* If there were no malformations, or the only malformation is an
1666          * overlong, 'uv' is valid */
1667         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1668             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1669                 possible_problems |= UTF8_GOT_SURROGATE;
1670             }
1671             else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1672                 possible_problems |= UTF8_GOT_SUPER;
1673             }
1674             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1675                 possible_problems |= UTF8_GOT_NONCHAR;
1676             }
1677         }
1678         else {  /* Otherwise, need to look at the source UTF-8, possibly
1679                    adjusted to be non-overlong */
1680 
1681             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1682                                                     > UTF_START_BYTE_110000_))
1683             {
1684                 possible_problems |= UTF8_GOT_SUPER;
1685             }
1686             else if (curlen > 1) {
1687                 if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
1688                                                 == UTF_START_BYTE_110000_
1689                              && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1690                                                 >= UTF_FIRST_CONT_BYTE_110000_))
1691                 {
1692                     possible_problems |= UTF8_GOT_SUPER;
1693                 }
1694                 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1695                     possible_problems |= UTF8_GOT_SURROGATE;
1696                 }
1697             }
1698 
1699             /* We need a complete well-formed UTF-8 character to discern
1700              * non-characters, so can't look for them here */
1701         }
1702     }
1703 
1704   ready_to_handle_errors:
1705 
1706     /* At this point:
1707      * curlen               contains the number of bytes in the sequence that
1708      *                      this call should advance the input by.
1709      * avail_len            gives the available number of bytes passed in, but
1710      *                      only if this is less than the expected number of
1711      *                      bytes, based on the code point's start byte.
1712      * possible_problems    is 0 if there weren't any problems; otherwise a bit
1713      *                      is set in it for each potential problem found.
1714      * uv                   contains the code point the input sequence
1715      *                      represents; or if there is a problem that prevents
1716      *                      a well-defined value from being computed, it is
1717      *                      some substitute value, typically the REPLACEMENT
1718      *                      CHARACTER.
1719      * s0                   points to the first byte of the character
1720      * s                    points to just after where we left off processing
1721      *                      the character
1722      * send                 points to just after where that character should
1723      *                      end, based on how many bytes the start byte tells
1724      *                      us should be in it, but no further than s0 +
1725      *                      avail_len
1726      */
1727 
1728     if (UNLIKELY(possible_problems)) {
1729         bool disallowed = FALSE;
1730         const U32 orig_problems = possible_problems;
1731 
1732         if (msgs) {
1733             *msgs = NULL;
1734         }
1735 
1736         while (possible_problems) { /* Handle each possible problem */
1737             U32 pack_warn = 0;
1738             char * message = NULL;
1739             U32 this_flag_bit = 0;
1740 
1741             /* Each 'if' clause handles one problem.  They are ordered so that
1742              * the first ones' messages will be displayed before the later
1743              * ones; this is kinda in decreasing severity order.  But the
1744              * overlong must come last, as it changes 'uv' looked at by the
1745              * others */
1746             if (possible_problems & UTF8_GOT_OVERFLOW) {
1747 
1748                 /* Overflow means also got a super and are using Perl's
1749                  * extended UTF-8, but we handle all three cases here */
1750                 possible_problems
1751                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1752                 *errors |= UTF8_GOT_OVERFLOW;
1753 
1754                 /* But the API says we flag all errors found */
1755                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1756                     *errors |= UTF8_GOT_SUPER;
1757                 }
1758                 if (flags
1759                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1760                 {
1761                     *errors |= UTF8_GOT_PERL_EXTENDED;
1762                 }
1763 
1764                 /* Disallow if any of the three categories say to */
1765                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1766                     || (flags & ( UTF8_DISALLOW_SUPER
1767                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1768                 {
1769                     disallowed = TRUE;
1770                 }
1771 
1772                 /* Likewise, warn if any say to */
1773                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1774                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1775                 {
1776 
1777                     /* The warnings code explicitly says it doesn't handle the
1778                      * case of packWARN2 and two categories which have
1779                      * parent-child relationship.  Even if it works now to
1780                      * raise the warning if either is enabled, it wouldn't
1781                      * necessarily do so in the future.  We output (only) the
1782                      * most dire warning */
1783                     if (! (flags & UTF8_CHECK_ONLY)) {
1784                         if (msgs || ckWARN_d(WARN_UTF8)) {
1785                             pack_warn = packWARN(WARN_UTF8);
1786                         }
1787                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1788                             pack_warn = packWARN(WARN_NON_UNICODE);
1789                         }
1790                         if (pack_warn) {
1791                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1792                                             malformed_text,
1793                                             _byte_dump_string(s0, curlen, 0));
1794                             this_flag_bit = UTF8_GOT_OVERFLOW;
1795                         }
1796                     }
1797                 }
1798             }
1799             else if (possible_problems & UTF8_GOT_EMPTY) {
1800                 possible_problems &= ~UTF8_GOT_EMPTY;
1801                 *errors |= UTF8_GOT_EMPTY;
1802 
1803                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1804 
1805                     /* This so-called malformation is now treated as a bug in
1806                      * the caller.  If you have nothing to decode, skip calling
1807                      * this function */
1808                     assert(0);
1809 
1810                     disallowed = TRUE;
1811                     if (  (msgs
1812                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1813                     {
1814                         pack_warn = packWARN(WARN_UTF8);
1815                         message = Perl_form(aTHX_ "%s (empty string)",
1816                                                    malformed_text);
1817                         this_flag_bit = UTF8_GOT_EMPTY;
1818                     }
1819                 }
1820             }
1821             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1822                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1823                 *errors |= UTF8_GOT_CONTINUATION;
1824 
1825                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1826                     disallowed = TRUE;
1827                     if ((   msgs
1828                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1829                     {
1830                         pack_warn = packWARN(WARN_UTF8);
1831                         message = Perl_form(aTHX_
1832                                 "%s: %s (unexpected continuation byte 0x%02x,"
1833                                 " with no preceding start byte)",
1834                                 malformed_text,
1835                                 _byte_dump_string(s0, 1, 0), *s0);
1836                         this_flag_bit = UTF8_GOT_CONTINUATION;
1837                     }
1838                 }
1839             }
1840             else if (possible_problems & UTF8_GOT_SHORT) {
1841                 possible_problems &= ~UTF8_GOT_SHORT;
1842                 *errors |= UTF8_GOT_SHORT;
1843 
1844                 if (! (flags & UTF8_ALLOW_SHORT)) {
1845                     disallowed = TRUE;
1846                     if ((   msgs
1847                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1848                     {
1849                         pack_warn = packWARN(WARN_UTF8);
1850                         message = Perl_form(aTHX_
1851                              "%s: %s (too short; %d byte%s available, need %d)",
1852                              malformed_text,
1853                              _byte_dump_string(s0, send - s0, 0),
1854                              (int)avail_len,
1855                              avail_len == 1 ? "" : "s",
1856                              (int)expectlen);
1857                         this_flag_bit = UTF8_GOT_SHORT;
1858                     }
1859                 }
1860 
1861             }
1862             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1863                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1864                 *errors |= UTF8_GOT_NON_CONTINUATION;
1865 
1866                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1867                     disallowed = TRUE;
1868                     if ((   msgs
1869                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1870                     {
1871 
1872                         /* If we don't know for sure that the input length is
1873                          * valid, avoid as much as possible reading past the
1874                          * end of the buffer */
1875                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1876                                        ? (int) (s - s0)
1877                                        : (int) (send - s0);
1878                         pack_warn = packWARN(WARN_UTF8);
1879                         message = Perl_form(aTHX_ "%s",
1880                             unexpected_non_continuation_text(s0,
1881                                                             printlen,
1882                                                             s - s0,
1883                                                             (int) expectlen));
1884                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1885                     }
1886                 }
1887             }
1888             else if (possible_problems & UTF8_GOT_SURROGATE) {
1889                 possible_problems &= ~UTF8_GOT_SURROGATE;
1890 
1891                 if (flags & UTF8_WARN_SURROGATE) {
1892                     *errors |= UTF8_GOT_SURROGATE;
1893 
1894                     if (   ! (flags & UTF8_CHECK_ONLY)
1895                         && (msgs || ckWARN_d(WARN_SURROGATE)))
1896                     {
1897                         pack_warn = packWARN(WARN_SURROGATE);
1898 
1899                         /* These are the only errors that can occur with a
1900                         * surrogate when the 'uv' isn't valid */
1901                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1902                             message = Perl_form(aTHX_
1903                                     "UTF-16 surrogate (any UTF-8 sequence that"
1904                                     " starts with \"%s\" is for a surrogate)",
1905                                     _byte_dump_string(s0, curlen, 0));
1906                         }
1907                         else {
1908                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
1909                         }
1910                         this_flag_bit = UTF8_GOT_SURROGATE;
1911                     }
1912                 }
1913 
1914                 if (flags & UTF8_DISALLOW_SURROGATE) {
1915                     disallowed = TRUE;
1916                     *errors |= UTF8_GOT_SURROGATE;
1917                 }
1918             }
1919             else if (possible_problems & UTF8_GOT_SUPER) {
1920                 possible_problems &= ~UTF8_GOT_SUPER;
1921 
1922                 if (flags & UTF8_WARN_SUPER) {
1923                     *errors |= UTF8_GOT_SUPER;
1924 
1925                     if (   ! (flags & UTF8_CHECK_ONLY)
1926                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1927                     {
1928                         pack_warn = packWARN(WARN_NON_UNICODE);
1929 
1930                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1931                             message = Perl_form(aTHX_
1932                                     "Any UTF-8 sequence that starts with"
1933                                     " \"%s\" is for a non-Unicode code point,"
1934                                     " may not be portable",
1935                                     _byte_dump_string(s0, curlen, 0));
1936                         }
1937                         else {
1938                             message = Perl_form(aTHX_ super_cp_format, uv);
1939                         }
1940                         this_flag_bit = UTF8_GOT_SUPER;
1941                     }
1942                 }
1943 
1944                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1945                  * and before possibly bailing out, so that the more dire
1946                  * warning will override the regular one. */
1947                 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1948                     if (  ! (flags & UTF8_CHECK_ONLY)
1949                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1950                         &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
1951                                       || ckWARN(WARN_PORTABLE))))
1952                     {
1953                         pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1954 
1955                         /* If it is an overlong that evaluates to a code point
1956                          * that doesn't have to use the Perl extended UTF-8, it
1957                          * still used it, and so we output a message that
1958                          * doesn't refer to the code point.  The same is true
1959                          * if there was a SHORT malformation where the code
1960                          * point is not valid.  In that case, 'uv' will have
1961                          * been set to the REPLACEMENT CHAR, and the message
1962                          * below without the code point in it will be selected
1963                          * */
1964                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
1965                             message = Perl_form(aTHX_
1966                                             PL_extended_cp_format, uv);
1967                         }
1968                         else {
1969                             message = Perl_form(aTHX_
1970                                         "Any UTF-8 sequence that starts with"
1971                                         " \"%s\" is a Perl extension, and"
1972                                         " so is not portable",
1973                                         _byte_dump_string(s0, curlen, 0));
1974                         }
1975                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1976                     }
1977 
1978                     if (flags & ( UTF8_WARN_PERL_EXTENDED
1979                                  |UTF8_DISALLOW_PERL_EXTENDED))
1980                     {
1981                         *errors |= UTF8_GOT_PERL_EXTENDED;
1982 
1983                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1984                             disallowed = TRUE;
1985                         }
1986                     }
1987                 }
1988 
1989                 if (flags & UTF8_DISALLOW_SUPER) {
1990                     *errors |= UTF8_GOT_SUPER;
1991                     disallowed = TRUE;
1992                 }
1993             }
1994             else if (possible_problems & UTF8_GOT_NONCHAR) {
1995                 possible_problems &= ~UTF8_GOT_NONCHAR;
1996 
1997                 if (flags & UTF8_WARN_NONCHAR) {
1998                     *errors |= UTF8_GOT_NONCHAR;
1999 
2000                     if (  ! (flags & UTF8_CHECK_ONLY)
2001                         && (msgs || ckWARN_d(WARN_NONCHAR)))
2002                     {
2003                         /* The code above should have guaranteed that we don't
2004                          * get here with errors other than overlong */
2005                         assert (! (orig_problems
2006                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2007 
2008                         pack_warn = packWARN(WARN_NONCHAR);
2009                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2010                         this_flag_bit = UTF8_GOT_NONCHAR;
2011                     }
2012                 }
2013 
2014                 if (flags & UTF8_DISALLOW_NONCHAR) {
2015                     disallowed = TRUE;
2016                     *errors |= UTF8_GOT_NONCHAR;
2017                 }
2018             }
2019             else if (possible_problems & UTF8_GOT_LONG) {
2020                 possible_problems &= ~UTF8_GOT_LONG;
2021                 *errors |= UTF8_GOT_LONG;
2022 
2023                 if (flags & UTF8_ALLOW_LONG) {
2024 
2025                     /* We don't allow the actual overlong value, unless the
2026                      * special extra bit is also set */
2027                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2028                                     & ~UTF8_ALLOW_LONG)))
2029                     {
2030                         uv = UNICODE_REPLACEMENT;
2031                     }
2032                 }
2033                 else {
2034                     disallowed = TRUE;
2035 
2036                     if ((   msgs
2037                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2038                     {
2039                         pack_warn = packWARN(WARN_UTF8);
2040 
2041                         /* These error types cause 'uv' to be something that
2042                          * isn't what was intended, so can't use it in the
2043                          * message.  The other error types either can't
2044                          * generate an overlong, or else the 'uv' is valid */
2045                         if (orig_problems &
2046                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2047                         {
2048                             message = Perl_form(aTHX_
2049                                     "%s: %s (any UTF-8 sequence that starts"
2050                                     " with \"%s\" is overlong which can and"
2051                                     " should be represented with a"
2052                                     " different, shorter sequence)",
2053                                     malformed_text,
2054                                     _byte_dump_string(s0, send - s0, 0),
2055                                     _byte_dump_string(s0, curlen, 0));
2056                         }
2057                         else {
2058                             U8 tmpbuf[UTF8_MAXBYTES+1];
2059                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2060                                                                         uv, 0);
2061                             /* Don't use U+ for non-Unicode code points, which
2062                              * includes those in the Latin1 range */
2063                             const char * preface = (   UNICODE_IS_SUPER(uv)
2064 #ifdef EBCDIC
2065                                                     || uv <= 0xFF
2066 #endif
2067                                                    )
2068                                                    ? "0x"
2069                                                    : "U+";
2070                             message = Perl_form(aTHX_
2071                                 "%s: %s (overlong; instead use %s to represent"
2072                                 " %s%0*" UVXf ")",
2073                                 malformed_text,
2074                                 _byte_dump_string(s0, send - s0, 0),
2075                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2076                                 preface,
2077                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2078                                                          small code points */
2079                                 UNI_TO_NATIVE(uv));
2080                         }
2081                         this_flag_bit = UTF8_GOT_LONG;
2082                     }
2083                 }
2084             } /* End of looking through the possible flags */
2085 
2086             /* Display the message (if any) for the problem being handled in
2087              * this iteration of the loop */
2088             if (message) {
2089                 if (msgs) {
2090                     assert(this_flag_bit);
2091 
2092                     if (*msgs == NULL) {
2093                         *msgs = newAV();
2094                     }
2095 
2096                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2097                                                                 pack_warn,
2098                                                                 this_flag_bit)));
2099                 }
2100                 else if (PL_op)
2101                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2102                                                  OP_DESC(PL_op));
2103                 else
2104                     Perl_warner(aTHX_ pack_warn, "%s", message);
2105             }
2106         }   /* End of 'while (possible_problems)' */
2107 
2108         /* Since there was a possible problem, the returned length may need to
2109          * be changed from the one stored at the beginning of this function.
2110          * Instead of trying to figure out if it has changed, just do it. */
2111         if (retlen) {
2112             *retlen = curlen;
2113         }
2114 
2115         if (disallowed) {
2116             if (flags & UTF8_CHECK_ONLY && retlen) {
2117                 *retlen = ((STRLEN) -1);
2118             }
2119             return 0;
2120         }
2121     }
2122 
2123     return UNI_TO_NATIVE(uv);
2124 }
2125 
2126 /*
2127 =for apidoc utf8_to_uvchr_buf
2128 
2129 Returns the native code point of the first character in the string C<s> which
2130 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2131 C<*retlen> will be set to the length, in bytes, of that character.
2132 
2133 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2134 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2135 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2136 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2137 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2138 the next possible position in C<s> that could begin a non-malformed character.
2139 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2140 returned.
2141 
2142 =cut
2143 
2144 Also implemented as a macro in utf8.h
2145 
2146 */
2147 
2148 
2149 UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)2150 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2151 {
2152     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2153 
2154     return utf8_to_uvchr_buf_helper(s, send, retlen);
2155 }
2156 
2157 /*
2158 =for apidoc utf8_length
2159 
2160 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2161 at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
2162 same place, it returns 0 with no warning raised.
2163 
2164 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2165 and returns the number of valid characters.
2166 
2167 =cut
2168 
2169     For long strings we process the input word-at-a-time, and count
2170     continuations, instead of otherwise counting characters and using UTF8SKIP
2171     to find the next one.  If our input were 13-byte characters, the per-word
2172     would be a loser, as we would be doing things in 8 byte chunks (or 4 on a
2173     32-bit platform).  But the maximum legal Unicode code point is 4 bytes, and
2174     most text will have a significant number of 1 and 2 byte characters, so the
2175     per-word is generally a winner.
2176 
2177     There are start-up and finish costs with the per-word method, so we use the
2178     standard method unless the input has a relatively large length.
2179 */
2180 
2181 STRLEN
Perl_utf8_length(pTHX_ const U8 * const s0,const U8 * const e)2182 Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
2183 {
2184     STRLEN continuations = 0;
2185     STRLEN len = 0;
2186     const U8 * s = s0;
2187 
2188     PERL_ARGS_ASSERT_UTF8_LENGTH;
2189 
2190     /* For EBCDCIC and short strings, we count the characters.  The boundary
2191      * was determined by eyeballing the output of Porting/bench.pl and
2192      * choosing a number where the continuations method gave better results (on
2193      * a 64 bit system, khw not having access to a 32 bit system with
2194      * cachegrind).  The number isn't critical, as at these sizes, the total
2195      * time spent isn't large either way */
2196 
2197 #ifndef EBCDIC
2198 
2199     if (e - s0 < 96)
2200 
2201 #endif
2202 
2203     {
2204         while (s < e) { /* Count characters directly */
2205 
2206             /* Take extra care to not exceed 'e' (which would be undefined
2207              * behavior) should the input be malformed, with a partial
2208              * character at the end */
2209             Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2210             if (UNLIKELY(e - s  < expected_byte_count)) {
2211                 goto warn_and_return;
2212             }
2213 
2214             len++;
2215             s += expected_byte_count;
2216         }
2217 
2218         if (LIKELY(e == s)) {
2219             return len;
2220         }
2221 
2222       warn_and_return:
2223         if (ckWARN_d(WARN_UTF8)) {
2224             if (PL_op)
2225                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
2226                              "%s in %s", unees, OP_DESC(PL_op));
2227             else
2228                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2229         }
2230 
2231         return s - s0;
2232     }
2233 
2234 #ifndef EBCDIC
2235 
2236     /* Count continuations, word-at-a-time.
2237      *
2238      * We need to stop before the final start character in order to
2239      * preserve the limited error checking that's always been done */
2240     const U8 * e_limit = e - UTF8_MAXBYTES;
2241 
2242     /* Points to the first byte >=s which is positioned at a word boundary.  If
2243      * s is on a word boundary, it is s, otherwise it is to the next word. */
2244     const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2245                                     - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2246 
2247     /* Process up to a full word boundary. */
2248     while (s < partial_word_end) {
2249         const Size_t skip = UTF8SKIP(s);
2250 
2251         continuations += skip - 1;
2252         s += skip;
2253     }
2254 
2255     /* Adjust back down any overshoot */
2256     continuations -= s - partial_word_end;
2257     s = partial_word_end;
2258 
2259     do { /* Process per-word */
2260 
2261         /* The idea for counting continuation bytes came from
2262          * https://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
2263          * One thing it does that this doesn't is to prefetch the buffer
2264          *      __builtin_prefetch(&s[256], 0, 0);
2265          *
2266          * A continuation byte has the upper 2 bits be '10', and the rest
2267          * dont-cares.  The VARIANTS mask zeroes out all but the upper bit of
2268          * each byte in the word.  That gets shifted to the byte's lowest bit,
2269          * and 'anded' with the complement of the 2nd highest bit of the byte,
2270          * which has also been shifted to that position.  Hence the bit in that
2271          * position will be 1 iff the upper bit is 1 and the next one is 0.  We
2272          * then use the same integer multiplcation and shifting that are used
2273          * in variant_under_utf8_count() to count how many of those are set in
2274          * the word. */
2275 
2276         continuations += (((((* (const PERL_UINTMAX_T *) s)
2277                                             & PERL_VARIANTS_WORD_MASK) >> 7)
2278                       & (((~ (* (const PERL_UINTMAX_T *) s))) >> 6))
2279                   * PERL_COUNT_MULTIPLIER)
2280                 >> ((PERL_WORDSIZE - 1) * CHARBITS);
2281         s += PERL_WORDSIZE;
2282     } while (s + PERL_WORDSIZE <= e_limit);
2283 
2284     /* Process remainder per-byte */
2285     while (s < e) {
2286 	if (UTF8_IS_CONTINUATION(*s)) {
2287             continuations++;
2288             s++;
2289             continue;
2290         }
2291 
2292         /* Here is a starter byte.  Use UTF8SKIP from now on */
2293         do {
2294             Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2295             if (UNLIKELY(e - s  < expected_byte_count)) {
2296                 break;
2297             }
2298 
2299             continuations += expected_byte_count- 1;
2300             s += expected_byte_count;
2301         } while (s < e);
2302 
2303         break;
2304     }
2305 
2306 #  endif
2307 
2308     if (LIKELY(e == s)) {
2309         return s - s0 - continuations;
2310     }
2311 
2312     /* Convert to characters */
2313     s -= continuations;
2314 
2315     goto warn_and_return;
2316 }
2317 
2318 /*
2319 =for apidoc bytes_cmp_utf8
2320 
2321 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2322 sequence of characters (stored as UTF-8)
2323 in C<u>, C<ulen>.  Returns 0 if they are
2324 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2325 if the first string is greater than the second string.
2326 
2327 -1 or +1 is returned if the shorter string was identical to the start of the
2328 longer string.  -2 or +2 is returned if
2329 there was a difference between characters
2330 within the strings.
2331 
2332 =cut
2333 */
2334 
2335 int
Perl_bytes_cmp_utf8(pTHX_ const U8 * b,STRLEN blen,const U8 * u,STRLEN ulen)2336 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2337 {
2338     const U8 *const bend = b + blen;
2339     const U8 *const uend = u + ulen;
2340 
2341     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2342 
2343     while (b < bend && u < uend) {
2344         U8 c = *u++;
2345         if (!UTF8_IS_INVARIANT(c)) {
2346             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2347                 if (u < uend) {
2348                     U8 c1 = *u++;
2349                     if (UTF8_IS_CONTINUATION(c1)) {
2350                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2351                     } else {
2352                         /* diag_listed_as: Malformed UTF-8 character%s */
2353                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2354                               "%s %s%s",
2355                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2356                               PL_op ? " in " : "",
2357                               PL_op ? OP_DESC(PL_op) : "");
2358                         return -2;
2359                     }
2360                 } else {
2361                     if (PL_op)
2362                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2363                                          "%s in %s", unees, OP_DESC(PL_op));
2364                     else
2365                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2366                     return -2; /* Really want to return undef :-)  */
2367                 }
2368             } else {
2369                 return -2;
2370             }
2371         }
2372         if (*b != c) {
2373             return *b < c ? -2 : +2;
2374         }
2375         ++b;
2376     }
2377 
2378     if (b == bend && u == uend)
2379         return 0;
2380 
2381     return b < bend ? +1 : -1;
2382 }
2383 
2384 /*
2385 =for apidoc utf8_to_bytes
2386 
2387 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2388 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2389 updates C<*lenp> to contain the new length.
2390 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2391 
2392 Upon successful return, the number of variants in the string can be computed by
2393 having saved the value of C<*lenp> before the call, and subtracting the
2394 after-call value of C<*lenp> from it.
2395 
2396 If you need a copy of the string, see L</bytes_from_utf8>.
2397 
2398 =cut
2399 */
2400 
2401 U8 *
Perl_utf8_to_bytes(pTHX_ U8 * s,STRLEN * lenp)2402 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2403 {
2404     U8 * first_variant;
2405 
2406     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2407     PERL_UNUSED_CONTEXT;
2408 
2409     /* This is a no-op if no variants at all in the input */
2410     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2411         return s;
2412     }
2413 
2414     /* Nothing before 'first_variant' needs to be changed, so start the real
2415      * work there */
2416 
2417     U8 * const save = s;
2418     U8 * const send = s + *lenp;
2419     U8 * d;
2420 
2421 #ifndef EBCDIC      /* The below relies on the bit patterns of UTF-8 */
2422 
2423     /* There is some start-up/tear-down overhead with this, so no real gain
2424      * unless the string is long enough.  The current value is just a
2425      * guess. */
2426     if (*lenp > 5 * PERL_WORDSIZE) {
2427 
2428         /* First, go through the string a word at-a-time to verify that it is
2429          * downgradable.  If it contains any start byte besides C2 and C3, then
2430          * it isn't. */
2431 
2432         const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
2433         const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
2434         const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
2435 
2436         /* Points to the first byte >=s which is positioned at a word boundary.
2437          * If s is on a word boundary, it is s, otherwise it is the first byte
2438          * of the next word. */
2439         U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2440                                 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2441 
2442         /* Here there is at least a full word beyond the first word boundary.
2443          * Process up to that boundary. */
2444         while (s < partial_word_end) {
2445             if (! UTF8_IS_INVARIANT(*s)) {
2446                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2447                     *lenp = ((STRLEN) -1);
2448                     return NULL;
2449                 }
2450                 s++;
2451             }
2452             s++;
2453         }
2454 
2455         /* Adjust back down any overshoot */
2456         s = partial_word_end;
2457 
2458         /* Process per-word */
2459         do {
2460 
2461             PERL_UINTMAX_T C2_C3_start_bytes;
2462 
2463             /* First find the bytes that are start bytes.  ANDing with
2464              * C0C0...C0 causes any start byte to become C0; any other byte
2465              * becomes something else.  Then XORing with C0 causes any start
2466              * byte to become 0; all other bytes non-zero. */
2467             PERL_UINTMAX_T start_bytes
2468                           = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
2469 
2470             /* These shifts causes the most significant bit to be set to 1 for
2471              * any bytes in the word that aren't completely 0.  Hence after
2472              * these, only the start bytes have 0 in their msb */
2473             start_bytes |= start_bytes << 1;
2474             start_bytes |= start_bytes << 2;
2475             start_bytes |= start_bytes << 4;
2476 
2477             /* When we complement, then AND with 8080...80, the start bytes
2478              * will have 1 in their msb, and all other bits are 0 */
2479             start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
2480 
2481             /* Now repeat the procedure, but look for bytes that match only
2482              * C2-C3. */
2483             C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
2484                                                                 ^ C2_mask;
2485             C2_C3_start_bytes |= C2_C3_start_bytes << 1;
2486             C2_C3_start_bytes |= C2_C3_start_bytes << 2;
2487             C2_C3_start_bytes |= C2_C3_start_bytes << 4;
2488             C2_C3_start_bytes = ~ C2_C3_start_bytes
2489                                 & PERL_VARIANTS_WORD_MASK;
2490 
2491             /* Here, start_bytes has a 1 in the msb of each byte that has a
2492              *                                              start_byte; And
2493              * C2_C3_start_bytes has a 1 in the msb of each byte that has a
2494              *                                       start_byte of C2 or C3
2495              * If they're not equal, there are start bytes that aren't C2
2496              * nor C3, hence this is not downgradable */
2497             if (start_bytes != C2_C3_start_bytes) {
2498                 *lenp = ((STRLEN) -1);
2499                 return NULL;
2500             }
2501 
2502             s += PERL_WORDSIZE;
2503         } while (s + PERL_WORDSIZE <= send);
2504 
2505         /* If the final byte was a start byte, it means that the character
2506          * straddles two words, so back off one to start looking below at the
2507          * first byte of the character  */
2508         if (s > first_variant && UTF8_IS_START(*(s-1))) {
2509             s--;
2510         }
2511     }
2512 
2513 #endif
2514 
2515     /* Do the straggler bytes beyond the final word boundary (or all bytes
2516      * in the case of EBCDIC) */
2517     while (s < send) {
2518         if (! UTF8_IS_INVARIANT(*s)) {
2519             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2520                 *lenp = ((STRLEN) -1);
2521                 return NULL;
2522             }
2523             s++;
2524         }
2525         s++;
2526     }
2527 
2528     /* Here, we passed the tests above.  For the EBCDIC case, everything
2529      * was well-formed and can be downgraded to non-UTF8.  For non-EBCDIC,
2530      * it means only that all start bytes were C2 or C3, hence any
2531      * well-formed sequences are downgradable.  But we didn't test, for
2532      * example, that there weren't two C2's in a row.  That means that in
2533      * the loop below, we have to be sure things are well-formed.  Because
2534      * this is very very likely, and we don't care about having speedy
2535      * handling of malformed input, the loop proceeds as if well formed,
2536      * and should a malformed one come along, it undoes what it already has
2537      * done */
2538 
2539     d = s = first_variant;
2540 
2541     while (s < send) {
2542         U8 * s1;
2543 
2544         if (UVCHR_IS_INVARIANT(*s)) {
2545             *d++ = *s++;
2546             continue;
2547         }
2548 
2549         /* Here it is two-byte encoded. */
2550         if (   LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
2551             && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
2552         {
2553             U8 first_byte = *s++;
2554             *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
2555             s++;
2556             continue;
2557         }
2558 
2559         /* Here, it is malformed.  This shouldn't happen on EBCDIC, and on
2560          * ASCII platforms, we know that the only start bytes in the text
2561          * are C2 and C3, and the code above has made sure that it doesn't
2562          * end with a start byte.  That means the only malformations that
2563          * are possible are a start byte without a continuation (either
2564          * followed by another start byte or an invariant) or an unexpected
2565          * continuation.
2566          *
2567          * We have to undo all we've done before, back down to the first
2568          * UTF-8 variant.  Note that each 2-byte variant we've done so far
2569          * (converted to single byte) slides things to the left one byte,
2570          * and so we have bytes that haven't been written over.
2571          *
2572          * Here, 'd' points to the next position to overwrite, and 's'
2573          * points to the first invalid byte.  That means 'd's contents
2574          * haven't been changed yet, nor has anything else beyond it in the
2575          * string.  In restoring to the original contents, we don't need to
2576          * do anything past (d-1).
2577          *
2578          * In particular, the bytes from 'd' to 's' have not been changed.
2579          * This loop uses a new variable 's1' (to avoid confusing 'source'
2580          * and 'destination') set to 'd',  and moves 's' and 's1' in lock
2581          * step back so that afterwards, 's1' points to the first changed
2582          * byte that will be the source for the first byte (or bytes) at
2583          * 's' that need to be changed back.  Note that s1 can expand to
2584          * two bytes */
2585         s1 = d;
2586         while (s >= d) {
2587             s--;
2588             if (! UVCHR_IS_INVARIANT(*s1)) {
2589                 s--;
2590             }
2591             s1--;
2592         }
2593 
2594         /* Do the changing back */
2595         while (s1 >= first_variant) {
2596             if (UVCHR_IS_INVARIANT(*s1)) {
2597                 *s-- = *s1--;
2598             }
2599             else {
2600                 *s-- = UTF8_EIGHT_BIT_LO(*s1);
2601                 *s-- = UTF8_EIGHT_BIT_HI(*s1);
2602                 s1--;
2603             }
2604         }
2605 
2606         *lenp = ((STRLEN) -1);
2607         return NULL;
2608     }
2609 
2610     /* Success! */
2611     *d = '\0';
2612     *lenp = d - save;
2613 
2614     return save;
2615 }
2616 
2617 /*
2618 =for apidoc bytes_from_utf8
2619 
2620 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2621 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2622 actually encoded in UTF-8.
2623 
2624 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2625 the input string.
2626 
2627 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2628 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2629 C<*lenp> are unchanged, and the return value is the original C<s>.
2630 
2631 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2632 newly created string containing a downgraded copy of C<s>, and whose length is
2633 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2634 caller is responsible for arranging for the memory used by this string to get
2635 freed.
2636 
2637 Upon successful return, the number of variants in the string can be computed by
2638 having saved the value of C<*lenp> before the call, and subtracting the
2639 after-call value of C<*lenp> from it.
2640 
2641 =cut
2642 
2643 There is a macro that avoids this function call, but this is retained for
2644 anyone who calls it with the Perl_ prefix */
2645 
2646 U8 *
Perl_bytes_from_utf8(pTHX_ const U8 * s,STRLEN * lenp,bool * is_utf8p)2647 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2648 {
2649     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2650     PERL_UNUSED_CONTEXT;
2651 
2652     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2653 }
2654 
2655 /*
2656 =for apidoc bytes_from_utf8_loc
2657 
2658 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2659 to where to store the location of the first character in C<"s"> that cannot be
2660 converted to non-UTF8.
2661 
2662 If that parameter is C<NULL>, this function behaves identically to
2663 C<bytes_from_utf8>.
2664 
2665 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2666 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2667 
2668 Otherwise, the function returns a newly created C<NUL>-terminated string
2669 containing the non-UTF8 equivalent of the convertible first portion of
2670 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2671 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2672 and C<*first_non_downgradable> is set to C<NULL>.
2673 
2674 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2675 first character in the original string that wasn't converted.  C<*is_utf8p> is
2676 unchanged.  Note that the new string may have length 0.
2677 
2678 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2679 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2680 converts as many characters in it as possible stopping at the first one it
2681 finds that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
2682 set to point to that.  The function returns the portion that could be converted
2683 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2684 not including the terminating C<NUL>.  If the very first character in the
2685 original could not be converted, C<*lenp> will be 0, and the new string will
2686 contain just a single C<NUL>.  If the entire input string was converted,
2687 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2688 
2689 Upon successful return, the number of variants in the converted portion of the
2690 string can be computed by having saved the value of C<*lenp> before the call,
2691 and subtracting the after-call value of C<*lenp> from it.
2692 
2693 =cut
2694 
2695 
2696 */
2697 
2698 U8 *
Perl_bytes_from_utf8_loc(const U8 * s,STRLEN * lenp,bool * is_utf8p,const U8 ** first_unconverted)2699 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2700 {
2701     U8 *d;
2702     const U8 *original = s;
2703     U8 *converted_start;
2704     const U8 *send = s + *lenp;
2705 
2706     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2707 
2708     if (! *is_utf8p) {
2709         if (first_unconverted) {
2710             *first_unconverted = NULL;
2711         }
2712 
2713         return (U8 *) original;
2714     }
2715 
2716     Newx(d, (*lenp) + 1, U8);
2717 
2718     converted_start = d;
2719     while (s < send) {
2720         U8 c = *s++;
2721         if (! UTF8_IS_INVARIANT(c)) {
2722 
2723             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2724              * have to stop now */
2725             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2726                 if (first_unconverted) {
2727                     *first_unconverted = s - 1;
2728                     goto finish_and_return;
2729                 }
2730                 else {
2731                     Safefree(converted_start);
2732                     return (U8 *) original;
2733                 }
2734             }
2735 
2736             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2737             s++;
2738         }
2739         *d++ = c;
2740     }
2741 
2742     /* Here, converted the whole of the input */
2743     *is_utf8p = FALSE;
2744     if (first_unconverted) {
2745         *first_unconverted = NULL;
2746     }
2747 
2748   finish_and_return:
2749     *d = '\0';
2750     *lenp = d - converted_start;
2751 
2752     /* Trim unused space */
2753     Renew(converted_start, *lenp + 1, U8);
2754 
2755     return converted_start;
2756 }
2757 
2758 /*
2759 =for apidoc bytes_to_utf8
2760 
2761 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2762 UTF-8.
2763 Returns a pointer to the newly-created string, and sets C<*lenp> to
2764 reflect the new length in bytes.  The caller is responsible for arranging for
2765 the memory used by this string to get freed.
2766 
2767 Upon successful return, the number of variants in the string can be computed by
2768 having saved the value of C<*lenp> before the call, and subtracting it from the
2769 after-call value of C<*lenp>.
2770 
2771 A C<NUL> character will be written after the end of the string.
2772 
2773 If you want to convert to UTF-8 from encodings other than
2774 the native (Latin1 or EBCDIC),
2775 see L</sv_recode_to_utf8>().
2776 
2777 =cut
2778 */
2779 
2780 U8*
Perl_bytes_to_utf8(pTHX_ const U8 * s,STRLEN * lenp)2781 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2782 {
2783     const U8 * const send = s + (*lenp);
2784     U8 *d;
2785     U8 *dst;
2786 
2787     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2788     PERL_UNUSED_CONTEXT;
2789 
2790     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2791     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2792     dst = d;
2793 
2794     while (s < send) {
2795         append_utf8_from_native_byte(*s, &d);
2796         s++;
2797     }
2798 
2799     *d = '\0';
2800     *lenp = d-dst;
2801 
2802     return dst;
2803 }
2804 
2805 /*
2806  * Convert native UTF-16 to UTF-8. Called via the more public functions
2807  * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2808  * little-endian,
2809  *
2810  * 'p' is the UTF-16 input string, passed as a pointer to U8.
2811  * 'bytelen' is its length (must be even)
2812  * 'd' is the pointer to the destination buffer.  The caller must ensure that
2813  *     the space is large enough.  The maximum expansion factor is 2 times
2814  *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
2815  * '*newlen' will contain the number of bytes this function filled of 'd'.
2816  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2817  * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
2818  *
2819  * The expansion factor is because UTF-16 requires 2 bytes for every code point
2820  * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
2821  * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
2822  * every code point below 0x1000; otherwise 4-5 bytes.
2823  *
2824  * The worst case is where every code point is below U+10000, hence requiring 2
2825  * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2826  * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2827  *
2828  * Do not use in-place. */
2829 
2830 U8*
Perl_utf16_to_utf8_base(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen,const bool high_byte,const bool low_byte)2831 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2832                               const bool high_byte, /* Which of next two bytes is
2833                                                   high order */
2834                               const bool low_byte)
2835 {
2836     U8* pend;
2837     U8* dstart = d;
2838 
2839     PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2840 
2841     if (bytelen & 1)
2842         Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2843                 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2844     pend = p + bytelen;
2845 
2846     while (p < pend) {
2847 
2848         /* Next 16 bits is what we want.  (The bool is cast to U8 because on
2849          * platforms where a bool is implemented as a signed char, a compiler
2850          * warning may be generated) */
2851         U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2852         p += 2;
2853 
2854         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2855          * */
2856         if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2857 
2858 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2859 #define LAST_HIGH_SURROGATE  0xDBFF
2860 #define FIRST_LOW_SURROGATE  0xDC00
2861 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2862 #define FIRST_IN_PLANE1      0x10000
2863 
2864             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2865                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2866             }
2867             else {
2868                 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2869                 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2870                                                        LAST_LOW_SURROGATE)))
2871                 {
2872                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2873                 }
2874 
2875                 p += 2;
2876 
2877                 /* Here uv is the high surrogate.  Combine with low surrogate
2878                  * just computed to form the actual U32 code point.
2879                  *
2880                  * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2881                 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2882                                      + low_surrogate - FIRST_LOW_SURROGATE;
2883             }
2884         }
2885 
2886         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2887         d = uvchr_to_utf8(d, uv);
2888     }
2889 
2890     *newlen = d - dstart;
2891     return d;
2892 }
2893 
2894 U8*
Perl_utf16_to_utf8(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen)2895 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2896 {
2897     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2898 
2899     return utf16_to_utf8(p, d, bytelen, newlen);
2900 }
2901 
2902 U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen)2903 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2904 {
2905     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2906 
2907     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2908 }
2909 
2910 /*
2911  * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2912  * big-endian and utf8_to_utf16_reversed() for little-endian,
2913  *
2914  * 's' is the UTF-8 input string, passed as a pointer to U8.
2915  * 'bytelen' is its length
2916  * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
2917  *     caller must ensure that the space is large enough.  The maximum
2918  *     expansion factor is 2 times 'bytelen'.  This happens when the input is
2919  *     entirely single-byte ASCII, expanding to two-byte UTF-16.
2920  * '*newlen' will contain the number of bytes this function filled of 'd'.
2921  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2922  * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
2923  *
2924  * Do not use in-place. */
2925 U8*
Perl_utf8_to_utf16_base(pTHX_ U8 * s,U8 * d,Size_t bytelen,Size_t * newlen,const bool high_byte,const bool low_byte)2926 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2927                               const bool high_byte, /* Which of next two bytes
2928                                                        is high order */
2929                               const bool low_byte)
2930 {
2931     U8* send;
2932     U8* dstart = d;
2933 
2934     PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2935 
2936     send = s + bytelen;
2937 
2938     while (s < send) {
2939         STRLEN retlen;
2940         UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2941                                /* No surrogates nor above-Unicode */
2942                                UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2943 
2944         /* The modern method is to keep going with malformed input,
2945          * substituting the REPLACEMENT CHARACTER */
2946         if (UNLIKELY(uv == 0 && *s != '\0')) {
2947             uv = UNICODE_REPLACEMENT;
2948         }
2949 
2950         if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
2951 
2952             /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2953             U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2954                                + FIRST_HIGH_SURROGATE;
2955 
2956             /* (The bool is cast to U8 because on platforms where a bool is
2957              * implemented as a signed char, a compiler warning may be
2958              * generated) */
2959             d[(U8) high_byte] = high_surrogate >> 8;
2960             d[(U8) low_byte]  = high_surrogate & nBIT_MASK(8);
2961             d += 2;
2962 
2963             /* The low surrogate is the lower 10 bits plus the offset */
2964             uv &= nBIT_MASK(10);
2965             uv += FIRST_LOW_SURROGATE;
2966 
2967             /* Drop down to output the low surrogate like it were a
2968              * non-surrogate */
2969         }
2970 
2971         d[(U8) high_byte] = uv >> 8;
2972         d[(U8) low_byte] = uv & nBIT_MASK(8);
2973         d += 2;
2974 
2975         s += retlen;
2976     }
2977 
2978     *newlen = d - dstart;
2979     return d;
2980 }
2981 
2982 bool
Perl__is_uni_FOO(pTHX_ const U8 classnum,const UV c)2983 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2984 {
2985     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2986 }
2987 
2988 bool
Perl__is_uni_perl_idcont(pTHX_ UV c)2989 Perl__is_uni_perl_idcont(pTHX_ UV c)
2990 {
2991     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2992 }
2993 
2994 bool
Perl__is_uni_perl_idstart(pTHX_ UV c)2995 Perl__is_uni_perl_idstart(pTHX_ UV c)
2996 {
2997     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2998 }
2999 
3000 UV
Perl__to_upper_title_latin1(pTHX_ const U8 c,U8 * p,STRLEN * lenp,const char S_or_s)3001 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
3002                                   const char S_or_s)
3003 {
3004     /* We have the latin1-range values compiled into the core, so just use
3005      * those, converting the result to UTF-8.  The only difference between upper
3006      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
3007      * either "SS" or "Ss".  Which one to use is passed into the routine in
3008      * 'S_or_s' to avoid a test */
3009 
3010     UV converted = toUPPER_LATIN1_MOD(c);
3011 
3012     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
3013 
3014     assert(S_or_s == 'S' || S_or_s == 's');
3015 
3016     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
3017                                              characters in this range */
3018         *p = (U8) converted;
3019         *lenp = 1;
3020         return converted;
3021     }
3022 
3023     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
3024      * which it maps to one of them, so as to only have to have one check for
3025      * it in the main case */
3026     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3027         switch (c) {
3028             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
3029                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3030                 break;
3031             case MICRO_SIGN:
3032                 converted = GREEK_CAPITAL_LETTER_MU;
3033                 break;
3034 #if    UNICODE_MAJOR_VERSION > 2                                        \
3035    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
3036                                   && UNICODE_DOT_DOT_VERSION >= 8)
3037             case LATIN_SMALL_LETTER_SHARP_S:
3038                 *(p)++ = 'S';
3039                 *p = S_or_s;
3040                 *lenp = 2;
3041                 return 'S';
3042 #endif
3043             default:
3044                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
3045                                  " '%c' to map to '%c'",
3046                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
3047                 NOT_REACHED; /* NOTREACHED */
3048         }
3049     }
3050 
3051     *(p)++ = UTF8_TWO_BYTE_HI(converted);
3052     *p = UTF8_TWO_BYTE_LO(converted);
3053     *lenp = 2;
3054 
3055     return converted;
3056 }
3057 
3058 /* If compiled on an early Unicode version, there may not be auxiliary tables
3059  * */
3060 #ifndef HAS_UC_AUX_TABLES
3061 #  define UC_AUX_TABLE_ptrs     NULL
3062 #  define UC_AUX_TABLE_lengths  NULL
3063 #endif
3064 #ifndef HAS_TC_AUX_TABLES
3065 #  define TC_AUX_TABLE_ptrs     NULL
3066 #  define TC_AUX_TABLE_lengths  NULL
3067 #endif
3068 #ifndef HAS_LC_AUX_TABLES
3069 #  define LC_AUX_TABLE_ptrs     NULL
3070 #  define LC_AUX_TABLE_lengths  NULL
3071 #endif
3072 #ifndef HAS_CF_AUX_TABLES
3073 #  define CF_AUX_TABLE_ptrs     NULL
3074 #  define CF_AUX_TABLE_lengths  NULL
3075 #endif
3076 
3077 /* Call the function to convert a UTF-8 encoded character to the specified case.
3078  * Note that there may be more than one character in the result.
3079  * 's' is a pointer to the first byte of the input character
3080  * 'd' will be set to the first byte of the string of changed characters.  It
3081  *	needs to have space for UTF8_MAXBYTES_CASE+1 bytes
3082  * 'lenp' will be set to the length in bytes of the string of changed characters
3083  *
3084  * The functions return the ordinal of the first character in the string of
3085  * 'd' */
3086 #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
3087                 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
3088                                               Uppercase_Mapping_invmap,     \
3089                                               UC_AUX_TABLE_ptrs,            \
3090                                               UC_AUX_TABLE_lengths,         \
3091                                               "uppercase")
3092 #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
3093                 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
3094                                               Titlecase_Mapping_invmap,     \
3095                                               TC_AUX_TABLE_ptrs,            \
3096                                               TC_AUX_TABLE_lengths,         \
3097                                               "titlecase")
3098 #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
3099                 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
3100                                               Lowercase_Mapping_invmap,     \
3101                                               LC_AUX_TABLE_ptrs,            \
3102                                               LC_AUX_TABLE_lengths,         \
3103                                               "lowercase")
3104 
3105 
3106 /* This additionally has the input parameter 'specials', which if non-zero will
3107  * cause this to use the specials hash for folding (meaning get full case
3108  * folding); otherwise, when zero, this implies a simple case fold */
3109 #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
3110         (specials)                                                          \
3111         ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
3112                                           Case_Folding_invmap,              \
3113                                           CF_AUX_TABLE_ptrs,                \
3114                                           CF_AUX_TABLE_lengths,             \
3115                                           "foldcase")                       \
3116         : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
3117                                          Simple_Case_Folding_invmap,        \
3118                                          NULL, NULL,                        \
3119                                          "foldcase")
3120 
3121 UV
Perl_to_uni_upper(pTHX_ UV c,U8 * p,STRLEN * lenp)3122 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
3123 {
3124     /* Convert the Unicode character whose ordinal is <c> to its uppercase
3125      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
3126      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
3127      * the changed version may be longer than the original character.
3128      *
3129      * The ordinal of the first character of the changed version is returned
3130      * (but note, as explained above, that there may be more.) */
3131 
3132     PERL_ARGS_ASSERT_TO_UNI_UPPER;
3133 
3134     if (c < 256) {
3135         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
3136     }
3137 
3138     return CALL_UPPER_CASE(c, NULL, p, lenp);
3139 }
3140 
3141 UV
Perl_to_uni_title(pTHX_ UV c,U8 * p,STRLEN * lenp)3142 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
3143 {
3144     PERL_ARGS_ASSERT_TO_UNI_TITLE;
3145 
3146     if (c < 256) {
3147         return _to_upper_title_latin1((U8) c, p, lenp, 's');
3148     }
3149 
3150     return CALL_TITLE_CASE(c, NULL, p, lenp);
3151 }
3152 
3153 STATIC U8
S_to_lower_latin1(const U8 c,U8 * p,STRLEN * lenp,const char dummy)3154 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
3155 {
3156     /* We have the latin1-range values compiled into the core, so just use
3157      * those, converting the result to UTF-8.  Since the result is always just
3158      * one character, we allow <p> to be NULL */
3159 
3160     U8 converted = toLOWER_LATIN1(c);
3161 
3162     PERL_UNUSED_ARG(dummy);
3163 
3164     if (p != NULL) {
3165         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
3166             *p = converted;
3167             *lenp = 1;
3168         }
3169         else {
3170             /* Result is known to always be < 256, so can use the EIGHT_BIT
3171              * macros */
3172             *p = UTF8_EIGHT_BIT_HI(converted);
3173             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
3174             *lenp = 2;
3175         }
3176     }
3177     return converted;
3178 }
3179 
3180 UV
Perl_to_uni_lower(pTHX_ UV c,U8 * p,STRLEN * lenp)3181 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
3182 {
3183     PERL_ARGS_ASSERT_TO_UNI_LOWER;
3184 
3185     if (c < 256) {
3186         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
3187     }
3188 
3189     return CALL_LOWER_CASE(c, NULL, p, lenp);
3190 }
3191 
3192 UV
Perl__to_fold_latin1(const U8 c,U8 * p,STRLEN * lenp,const unsigned int flags)3193 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
3194 {
3195     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
3196      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3197      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
3198      *
3199      *	Not to be used for locale folds
3200      */
3201 
3202     UV converted;
3203 
3204     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3205 
3206     assert (! (flags & FOLD_FLAGS_LOCALE));
3207 
3208     if (UNLIKELY(c == MICRO_SIGN)) {
3209         converted = GREEK_SMALL_LETTER_MU;
3210     }
3211 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3212    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3213                                       || UNICODE_DOT_DOT_VERSION > 0)
3214     else if (   (flags & FOLD_FLAGS_FULL)
3215              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3216     {
3217         /* If can't cross 127/128 boundary, can't return "ss"; instead return
3218          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3219          * under those circumstances. */
3220         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3221             *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3222             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3223                  p, *lenp, U8);
3224             return LATIN_SMALL_LETTER_LONG_S;
3225         }
3226         else {
3227             *(p)++ = 's';
3228             *p = 's';
3229             *lenp = 2;
3230             return 's';
3231         }
3232     }
3233 #endif
3234     else { /* In this range the fold of all other characters is their lower
3235               case */
3236         converted = toLOWER_LATIN1(c);
3237     }
3238 
3239     if (UVCHR_IS_INVARIANT(converted)) {
3240         *p = (U8) converted;
3241         *lenp = 1;
3242     }
3243     else {
3244         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3245         *p = UTF8_TWO_BYTE_LO(converted);
3246         *lenp = 2;
3247     }
3248 
3249     return converted;
3250 }
3251 
3252 UV
Perl__to_uni_fold_flags(pTHX_ UV c,U8 * p,STRLEN * lenp,U8 flags)3253 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3254 {
3255 
3256     /* Not currently externally documented, and subject to change
3257      *  <flags> bits meanings:
3258      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
3259      *	    FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3260      *	                      locale are to be used.
3261      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3262      */
3263 
3264     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3265 
3266     if (flags & FOLD_FLAGS_LOCALE) {
3267         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3268          * except for potentially warning */
3269         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3270         if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) {
3271             flags &= ~FOLD_FLAGS_LOCALE;
3272         }
3273         else {
3274             goto needs_full_generality;
3275         }
3276     }
3277 
3278     if (c < 256) {
3279         return _to_fold_latin1((U8) c, p, lenp,
3280                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3281     }
3282 
3283     /* Here, above 255.  If no special needs, just use the macro */
3284     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3285         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3286     }
3287     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3288                the special flags. */
3289         U8 utf8_c[UTF8_MAXBYTES + 1];
3290 
3291       needs_full_generality:
3292         uvchr_to_utf8(utf8_c, c);
3293         return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3294                                   p, lenp, flags);
3295     }
3296 }
3297 
3298 PERL_STATIC_INLINE bool
S_is_utf8_common(pTHX_ const U8 * const p,const U8 * const e,SV * const invlist)3299 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3300                        SV* const invlist)
3301 {
3302     /* returns a boolean giving whether or not the UTF8-encoded character that
3303      * starts at <p>, and extending no further than <e - 1> is in the inversion
3304      * list <invlist>. */
3305 
3306     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3307 
3308     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3309 
3310     if (cp == 0 && (p >= e || *p != '\0')) {
3311         _force_out_malformed_utf8_message(p, e, 0, 1);
3312         NOT_REACHED; /* NOTREACHED */
3313     }
3314 
3315     assert(invlist);
3316     return _invlist_contains_cp(invlist, cp);
3317 }
3318 
3319 #if 0	/* Not currently used, but may be needed in the future */
PERLVAR(I,seen_deprecated_macro,HV *)3320 PERLVAR(I, seen_deprecated_macro, HV *)
3321 
3322 STATIC void
3323 S_warn_on_first_deprecated_use(pTHX_ U32 category,
3324                                      const char * const name,
3325                                      const char * const alternative,
3326                                      const bool use_locale,
3327                                      const char * const file,
3328                                      const unsigned line)
3329 {
3330     const char * key;
3331 
3332     PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3333 
3334     if (ckWARN_d(category)) {
3335 
3336         key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3337         if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3338             if (! PL_seen_deprecated_macro) {
3339                 PL_seen_deprecated_macro = newHV();
3340             }
3341             if (! hv_store(PL_seen_deprecated_macro, key,
3342                            strlen(key), &PL_sv_undef, 0))
3343             {
3344                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3345             }
3346 
3347             if (instr(file, "mathoms.c")) {
3348                 Perl_warner(aTHX_ category,
3349                             "In %s, line %d, starting in Perl v5.32, %s()"
3350                             " will be removed.  Avoid this message by"
3351                             " converting to use %s().\n",
3352                             file, line, name, alternative);
3353             }
3354             else {
3355                 Perl_warner(aTHX_ category,
3356                             "In %s, line %d, starting in Perl v5.32, %s() will"
3357                             " require an additional parameter.  Avoid this"
3358                             " message by converting to use %s().\n",
3359                             file, line, name, alternative);
3360             }
3361         }
3362     }
3363 }
3364 #endif
3365 
3366 bool
Perl__is_utf8_FOO(pTHX_ const U8 classnum,const U8 * p,const U8 * const e)3367 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3368 {
3369     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3370 
3371     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3372 }
3373 
3374 bool
Perl__is_utf8_perl_idstart(pTHX_ const U8 * p,const U8 * const e)3375 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3376 {
3377     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3378 
3379     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3380 }
3381 
3382 bool
Perl__is_utf8_perl_idcont(pTHX_ const U8 * p,const U8 * const e)3383 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3384 {
3385     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3386 
3387     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3388 }
3389 
3390 STATIC UV
S_to_case_cp_list(pTHX_ const UV original,const U32 ** const remaining_list,Size_t * remaining_count,SV * invlist,const I32 * const invmap,const U32 * const * const aux_tables,const U8 * const aux_table_lengths,const char * const normal)3391 S_to_case_cp_list(pTHX_
3392                   const UV original,
3393                   const U32 ** const remaining_list,
3394                   Size_t * remaining_count,
3395                   SV *invlist, const I32 * const invmap,
3396                   const U32 * const * const aux_tables,
3397                   const U8 * const aux_table_lengths,
3398                   const char * const normal)
3399 {
3400     SSize_t index;
3401     I32 base;
3402 
3403     /* Calculate the changed case of code point 'original'.  The first code
3404      * point of the changed case is returned.
3405      *
3406      * If 'remaining_count' is not NULL, *remaining_count will be set to how
3407      * many *other* code points are in the changed case.  If non-zero and
3408      * 'remaining_list' is also not NULL, *remaining_list will be set to point
3409      * to a non-modifiable array containing the second and potentially third
3410      * code points in the changed case.  (Unicode guarantees a maximum of 3.)
3411      * Note that this means that *remaining_list is undefined unless there are
3412      * multiple code points, and the caller has chosen to find out how many by
3413      * making 'remaining_count' not NULL.
3414      *
3415      * 'normal' is a string to use to name the new case in any generated
3416      * messages, as a fallback if the operation being used is not available.
3417      *
3418      * The casing to use is given by the data structures in the remaining
3419      * arguments.
3420      */
3421 
3422     PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3423 
3424     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3425      * that covers all possible inputs.  See [perl #133365] */
3426     index = _invlist_search(invlist, original);
3427     base = invmap[index];
3428 
3429     /* Most likely, the case change will contain just a single code point */
3430     if (remaining_count) {
3431         *remaining_count = 0;
3432     }
3433 
3434     if (LIKELY(base == 0)) {    /* 0 => original was unchanged by casing */
3435 
3436         /* At this bottom level routine is where we warn about illegal code
3437          * points */
3438         if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3439             if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3440                 if (ckWARN_d(WARN_SURROGATE)) {
3441                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3442                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3443                         "Operation \"%s\" returns its argument for"
3444                         " UTF-16 surrogate U+%04" UVXf, desc, original);
3445                 }
3446             }
3447             else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3448                 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3449                     Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3450                 }
3451                 if (ckWARN_d(WARN_NON_UNICODE)) {
3452                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3453                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3454                         "Operation \"%s\" returns its argument for"
3455                         " non-Unicode code point 0x%04" UVXf, desc, original);
3456                 }
3457             }
3458 
3459             /* Note that non-characters are perfectly legal, so no warning
3460              * should be given. */
3461         }
3462 
3463         return original;
3464     }
3465 
3466     if (LIKELY(base > 0)) {  /* means original mapped to a single code point,
3467                                 different from itself */
3468         return base + original - invlist_array(invlist)[index];
3469     }
3470 
3471     /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3472      * requires an auxiliary table look up.  abs(base) gives the index into a
3473      * list of such tables which points to the proper aux table.  And a
3474      * parallel list gives the length of each corresponding aux table.  Skip
3475      * the first entry in the *remaining returns, as it is returned by the
3476      * function. */
3477     base = -base;
3478     if (remaining_count) {
3479         *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3480 
3481         if (remaining_list) {
3482             *remaining_list  = aux_tables[base] + 1;
3483         }
3484     }
3485 
3486     return (UV) aux_tables[base][0];
3487 }
3488 
3489 STATIC UV
S__to_utf8_case(pTHX_ const UV original,const U8 * p,U8 * ustrp,STRLEN * lenp,SV * invlist,const I32 * const invmap,const U32 * const * const aux_tables,const U8 * const aux_table_lengths,const char * const normal)3490 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3491                       U8* ustrp, STRLEN *lenp,
3492                       SV *invlist, const I32 * const invmap,
3493                       const U32 * const * const aux_tables,
3494                       const U8 * const aux_table_lengths,
3495                       const char * const normal)
3496 {
3497     /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
3498      * the beginning of the (assumed to be valid) UTF-8 representation of
3499      * 'original'.  'normal' is a string to use to name the new case in any
3500      * generated messages, as a fallback if the operation being used is not
3501      * available.  The new case is given by the data structures in the
3502      * remaining arguments.
3503      *
3504      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3505      * entire changed case string, and the return value is the first code point
3506      * in that string
3507      *
3508      * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3509      * since the changed version may be longer than the original character. */
3510 
3511     const U32 * remaining_list;
3512     Size_t remaining_count;
3513     UV first = to_case_cp_list(original,
3514                                &remaining_list, &remaining_count,
3515                                invlist, invmap,
3516                                aux_tables, aux_table_lengths,
3517                                normal);
3518 
3519     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3520 
3521     /* If the code point maps to itself and we already have its representation,
3522      * copy it instead of recalculating */
3523     if (original == first && p) {
3524         *lenp = UTF8SKIP(p);
3525 
3526         if (p != ustrp) {   /* Don't copy onto itself */
3527             Copy(p, ustrp, *lenp, U8);
3528         }
3529     }
3530     else {
3531         U8 * d = ustrp;
3532         Size_t i;
3533 
3534         d = uvchr_to_utf8(d, first);
3535 
3536         for (i = 0; i < remaining_count; i++) {
3537             d = uvchr_to_utf8(d, remaining_list[i]);
3538         }
3539 
3540         *d = '\0';
3541         *lenp = d - ustrp;
3542     }
3543 
3544     return first;
3545 }
3546 
3547 Size_t
Perl__inverse_folds(pTHX_ const UV cp,U32 * first_folds_to,const U32 ** remaining_folds_to)3548 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3549                           const U32 ** remaining_folds_to)
3550 {
3551     /* Returns the count of the number of code points that fold to the input
3552      * 'cp' (besides itself).
3553      *
3554      * If the return is 0, there is nothing else that folds to it, and
3555      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3556      *
3557      * If the return is 1, '*first_folds_to' is set to the single code point,
3558      * and '*remaining_folds_to' is set to NULL.
3559      *
3560      * Otherwise, '*first_folds_to' is set to a code point, and
3561      * '*remaining_fold_to' is set to an array that contains the others.  The
3562      * length of this array is the returned count minus 1.
3563      *
3564      * The reason for this convolution is to avoid having to deal with
3565      * allocating and freeing memory.  The lists are already constructed, so
3566      * the return can point to them, but single code points aren't, so would
3567      * need to be constructed if we didn't employ something like this API
3568      *
3569      * The code points returned by this function are all legal Unicode, which
3570      * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3571      * constructed with this size (to save space and memory), and we return
3572      * pointers, so they must be this size */
3573 
3574     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3575      * that covers all possible inputs.  See [perl #133365] */
3576     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3577     I32 base = _Perl_IVCF_invmap[index];
3578 
3579     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3580 
3581     if (base == 0) {            /* No fold */
3582         *first_folds_to = 0;
3583         *remaining_folds_to = NULL;
3584         return 0;
3585     }
3586 
3587 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3588 
3589     assert(base > 0);
3590 
3591 #else
3592 
3593     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3594 
3595         /* The data structure is set up so that the absolute value of 'base' is
3596          * an index into a table of pointers to arrays, with the array
3597          * corresponding to the index being the list of code points that fold
3598          * to 'cp', and the parallel array containing the length of the list
3599          * array */
3600         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3601         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3602                                                 /* +1 excludes first_folds_to */
3603         return IVCF_AUX_TABLE_lengths[-base];
3604     }
3605 
3606 #endif
3607 
3608     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3609     *first_folds_to = (U32) (base + cp
3610                                   - invlist_array(PL_utf8_foldclosures)[index]);
3611     *remaining_folds_to = NULL;
3612     return 1;
3613 }
3614 
3615 STATIC UV
S_check_locale_boundary_crossing(pTHX_ const U8 * const p,const UV result,U8 * const ustrp,STRLEN * lenp)3616 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3617                                        U8* const ustrp, STRLEN *lenp)
3618 {
3619     /* This is called when changing the case of a UTF-8-encoded character above
3620      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3621      * result contains a character that crosses the 255/256 boundary, disallow
3622      * the change, and return the original code point.  See L<perlfunc/lc> for
3623      * why;
3624      *
3625      * p	points to the original string whose case was changed; assumed
3626      *          by this routine to be well-formed
3627      * result	the code point of the first character in the changed-case string
3628      * ustrp	points to the changed-case string (<result> represents its
3629      *          first char)
3630      * lenp	points to the length of <ustrp> */
3631 
3632     UV original;    /* To store the first code point of <p> */
3633 
3634     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3635 
3636     assert(UTF8_IS_ABOVE_LATIN1(*p));
3637 
3638     /* We know immediately if the first character in the string crosses the
3639      * boundary, so can skip testing */
3640     if (result > 255) {
3641 
3642         /* Look at every character in the result; if any cross the
3643         * boundary, the whole thing is disallowed */
3644         U8* s = ustrp + UTF8SKIP(ustrp);
3645         U8* e = ustrp + *lenp;
3646         while (s < e) {
3647             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3648                 goto bad_crossing;
3649             }
3650             s += UTF8SKIP(s);
3651         }
3652 
3653         /* Here, no characters crossed, result is ok as-is, but we warn. */
3654         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3655         return result;
3656     }
3657 
3658   bad_crossing:
3659 
3660     /* Failed, have to return the original */
3661     original = valid_utf8_to_uvchr(p, lenp);
3662 
3663     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3664     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3665                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3666                            " locale; resolved to \"\\x{%" UVXf "}\".",
3667                            OP_DESC(PL_op),
3668                            original,
3669                            original);
3670     Copy(p, ustrp, *lenp, char);
3671     return original;
3672 }
3673 
3674 STATIC UV
S_turkic_fc(pTHX_ const U8 * const p,const U8 * const e,U8 * ustrp,STRLEN * lenp)3675 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3676                         U8 * ustrp, STRLEN *lenp)
3677 {
3678     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3679      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3680      * Otherwise, it returns the first code point of the Turkic foldcased
3681      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3682      * contain *lenp bytes
3683      *
3684      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3685      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3686      * DOTLESS I */
3687 
3688     PERL_ARGS_ASSERT_TURKIC_FC;
3689     assert(e > p);
3690 
3691     if (UNLIKELY(*p == 'I')) {
3692         *lenp = 2;
3693         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3694         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3695         return LATIN_SMALL_LETTER_DOTLESS_I;
3696     }
3697 
3698     if (UNLIKELY(memBEGINs(p, e - p,
3699                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3700     {
3701         *lenp = 1;
3702         *ustrp = 'i';
3703         return 'i';
3704     }
3705 
3706     return 0;
3707 }
3708 
3709 STATIC UV
S_turkic_lc(pTHX_ const U8 * const p0,const U8 * const e,U8 * ustrp,STRLEN * lenp)3710 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3711                         U8 * ustrp, STRLEN *lenp)
3712 {
3713     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3714      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3715      * Otherwise, it returns the first code point of the Turkic lowercased
3716      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3717      * contain *lenp bytes */
3718 
3719     PERL_ARGS_ASSERT_TURKIC_LC;
3720     assert(e > p0);
3721 
3722     /* A 'I' requires context as to what to do */
3723     if (UNLIKELY(*p0 == 'I')) {
3724         const U8 * p = p0 + 1;
3725 
3726         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3727          * modified by a dot above lowercases to 'i' even in turkic locales. */
3728         while (p < e) {
3729             UV cp;
3730 
3731             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3732                 ustrp[0] = 'i';
3733                 *lenp = 1;
3734                 return 'i';
3735             }
3736 
3737             /* For the dot above to modify the 'I', it must be part of a
3738              * combining sequence immediately following the 'I', and no other
3739              * modifier with a ccc of 230 may intervene */
3740             cp = utf8_to_uvchr_buf(p, e, NULL);
3741             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3742                 break;
3743             }
3744 
3745             /* Here the combining sequence continues */
3746             p += UTF8SKIP(p);
3747         }
3748     }
3749 
3750     /* In all other cases the lc is the same as the fold */
3751     return turkic_fc(p0, e, ustrp, lenp);
3752 }
3753 
3754 STATIC UV
S_turkic_uc(pTHX_ const U8 * const p,const U8 * const e,U8 * ustrp,STRLEN * lenp)3755 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3756                         U8 * ustrp, STRLEN *lenp)
3757 {
3758     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3759      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3760      * Otherwise, it returns the first code point of the Turkic upper or
3761      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3762      * ustrp will contain *lenp bytes
3763      *
3764      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3765      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3766      * DOTLESS I */
3767 
3768     PERL_ARGS_ASSERT_TURKIC_UC;
3769     assert(e > p);
3770 
3771     if (*p == 'i') {
3772         *lenp = 2;
3773         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3774         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3775         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3776     }
3777 
3778     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3779         *lenp = 1;
3780         *ustrp = 'I';
3781         return 'I';
3782     }
3783 
3784     return 0;
3785 }
3786 
3787 /* The process for changing the case is essentially the same for the four case
3788  * change types, except there are complications for folding.  Otherwise the
3789  * difference is only which case to change to.  To make sure that they all do
3790  * the same thing, the bodies of the functions are extracted out into the
3791  * following two macros.  The functions are written with the same variable
3792  * names, and these are known and used inside these macros.  It would be
3793  * better, of course, to have inline functions to do it, but since different
3794  * macros are called, depending on which case is being changed to, this is not
3795  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3796  * function can start with the common start macro, then finish with its special
3797  * handling; while the other three cases can just use the common end macro.
3798  *
3799  * The algorithm is to use the proper (passed in) macro or function to change
3800  * the case for code points that are below 256.  The macro is used if using
3801  * locale rules for the case change; the function if not.  If the code point is
3802  * above 255, it is computed from the input UTF-8, and another macro is called
3803  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3804  * using a locale, we have to check that the change did not cross the 255/256
3805  * boundary, see check_locale_boundary_crossing() for further details.
3806  *
3807  * The macros are split with the correct case change for the below-256 case
3808  * stored into 'result', and in the middle of an else clause for the above-255
3809  * case.  At that point in the 'else', 'result' is not the final result, but is
3810  * the input code point calculated from the UTF-8.  The fold code needs to
3811  * realize all this and take it from there.
3812  *
3813  * To deal with Turkic locales, the function specified by the parameter
3814  * 'turkic' is called when appropriate.
3815  *
3816  * If you read the two macros as sequential, it's easier to understand what's
3817  * going on. */
3818 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func,  \
3819                                L1_func_extra_param, turkic)                  \
3820                                                                              \
3821     if (flags & (locale_flags)) {                                            \
3822         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                  \
3823         if (IN_UTF8_CTYPE_LOCALE) {                                          \
3824             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {                           \
3825                 UV ret = turkic(p, e, ustrp, lenp);                          \
3826                 if (ret) return ret;                                         \
3827             }                                                                \
3828                                                                              \
3829             /* Otherwise, treat a UTF-8 locale as not being in locale at     \
3830              * all */                                                        \
3831             flags &= ~(locale_flags);                                        \
3832         }                                                                    \
3833     }                                                                        \
3834                                                                              \
3835     if (UTF8_IS_INVARIANT(*p)) {                                             \
3836         if (flags & (locale_flags)) {                                        \
3837             result = libc_change_function(*p);                               \
3838         }                                                                    \
3839         else {                                                               \
3840             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
3841         }                                                                    \
3842     }                                                                        \
3843     else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
3844         U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
3845         if (flags & (locale_flags)) {                                        \
3846             result = libc_change_function(c);                                \
3847         }                                                                    \
3848         else {                                                               \
3849             return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
3850         }                                                                    \
3851     }                                                                        \
3852     else {  /* malformed UTF-8 or ord above 255 */                           \
3853         STRLEN len_result;                                                   \
3854         result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
3855         if (len_result == (STRLEN) -1) {                                     \
3856             _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
3857         }
3858 
3859 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
3860         result = change_macro(result, p, ustrp, lenp);                       \
3861                                                                              \
3862         if (flags & (locale_flags)) {                                        \
3863             result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3864         }                                                                    \
3865         return result;                                                       \
3866     }                                                                        \
3867                                                                              \
3868     /* Here, used locale rules.  Convert back to UTF-8 */                    \
3869     if (UTF8_IS_INVARIANT(result)) {                                         \
3870         *ustrp = (U8) result;                                                \
3871         *lenp = 1;                                                           \
3872     }                                                                        \
3873     else {                                                                   \
3874         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                             \
3875         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
3876         *lenp = 2;                                                           \
3877     }                                                                        \
3878                                                                              \
3879     return result;
3880 
3881 /* Not currently externally documented, and subject to change:
3882  * <flags> is set iff the rules from the current underlying locale are to
3883  *         be used. */
3884 
3885 UV
Perl__to_utf8_upper_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3886 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3887                                 const U8 *e,
3888                                 U8* ustrp,
3889                                 STRLEN *lenp,
3890                                 bool flags)
3891 {
3892     UV result;
3893 
3894     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3895 
3896     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3897     /* 2nd char of uc(U+DF) is 'S' */
3898     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3899                                                                     turkic_uc);
3900     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3901 }
3902 
3903 /* Not currently externally documented, and subject to change:
3904  * <flags> is set iff the rules from the current underlying locale are to be
3905  *         used.  Since titlecase is not defined in POSIX, for other than a
3906  *         UTF-8 locale, uppercase is used instead for code points < 256.
3907  */
3908 
3909 UV
Perl__to_utf8_title_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3910 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3911                                 const U8 *e,
3912                                 U8* ustrp,
3913                                 STRLEN *lenp,
3914                                 bool flags)
3915 {
3916     UV result;
3917 
3918     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3919 
3920     /* 2nd char of ucfirst(U+DF) is 's' */
3921     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3922                                                                     turkic_uc);
3923     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3924 }
3925 
3926 /* Not currently externally documented, and subject to change:
3927  * <flags> is set iff the rules from the current underlying locale are to
3928  *         be used.
3929  */
3930 
3931 UV
Perl__to_utf8_lower_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3932 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3933                                 const U8 *e,
3934                                 U8* ustrp,
3935                                 STRLEN *lenp,
3936                                 bool flags)
3937 {
3938     UV result;
3939 
3940     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3941 
3942     CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3943                                                                     turkic_lc);
3944     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3945 }
3946 
3947 /* Not currently externally documented, and subject to change,
3948  * in <flags>
3949  *	bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3950  *	                      locale are to be used.
3951  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3952  *			      otherwise simple folds
3953  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3954  *			      prohibited
3955  */
3956 
3957 UV
Perl__to_utf8_fold_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,U8 flags)3958 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3959                                const U8 *e,
3960                                U8* ustrp,
3961                                STRLEN *lenp,
3962                                U8 flags)
3963 {
3964     UV result;
3965 
3966     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3967 
3968     /* These are mutually exclusive */
3969     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3970 
3971     assert(p != ustrp); /* Otherwise overwrites */
3972 
3973     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3974                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3975                                                                     turkic_fc);
3976 
3977         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3978 
3979         if (flags & FOLD_FLAGS_LOCALE) {
3980 
3981 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3982 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3983 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3984 
3985             /* Special case these two characters, as what normally gets
3986              * returned under locale doesn't work */
3987             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3988             {
3989                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3990                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3991                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3992                               "resolved to \"\\x{17F}\\x{17F}\".");
3993                 goto return_long_s;
3994             }
3995             else
3996 #endif
3997                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3998             {
3999                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
4000                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4001                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
4002                               "resolved to \"\\x{FB06}\".");
4003                 goto return_ligature_st;
4004             }
4005 
4006 #if    UNICODE_MAJOR_VERSION   == 3         \
4007     && UNICODE_DOT_VERSION     == 0         \
4008     && UNICODE_DOT_DOT_VERSION == 1
4009 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
4010 
4011             /* And special case this on this Unicode version only, for the same
4012              * reaons the other two are special cased.  They would cross the
4013              * 255/256 boundary which is forbidden under /l, and so the code
4014              * wouldn't catch that they are equivalent (which they are only in
4015              * this release) */
4016             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
4017                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
4018                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4019                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
4020                               "resolved to \"\\x{0131}\".");
4021                 goto return_dotless_i;
4022             }
4023 #endif
4024 
4025             return check_locale_boundary_crossing(p, result, ustrp, lenp);
4026         }
4027         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
4028             return result;
4029         }
4030         else {
4031             /* This is called when changing the case of a UTF-8-encoded
4032              * character above the ASCII range, and the result should not
4033              * contain an ASCII character. */
4034 
4035             UV original;    /* To store the first code point of <p> */
4036 
4037             /* Look at every character in the result; if any cross the
4038             * boundary, the whole thing is disallowed */
4039             U8* s = ustrp;
4040             U8* send = ustrp + *lenp;
4041             while (s < send) {
4042                 if (isASCII(*s)) {
4043                     /* Crossed, have to return the original */
4044                     original = valid_utf8_to_uvchr(p, lenp);
4045 
4046                     /* But in these instances, there is an alternative we can
4047                      * return that is valid */
4048                     if (original == LATIN_SMALL_LETTER_SHARP_S
4049 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
4050                         || original == LATIN_CAPITAL_LETTER_SHARP_S
4051 #endif
4052                     ) {
4053                         goto return_long_s;
4054                     }
4055                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
4056                         goto return_ligature_st;
4057                     }
4058 #if    UNICODE_MAJOR_VERSION   == 3         \
4059     && UNICODE_DOT_VERSION     == 0         \
4060     && UNICODE_DOT_DOT_VERSION == 1
4061 
4062                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4063                         goto return_dotless_i;
4064                     }
4065 #endif
4066                     Copy(p, ustrp, *lenp, char);
4067                     return original;
4068                 }
4069                 s += UTF8SKIP(s);
4070             }
4071 
4072             /* Here, no characters crossed, result is ok as-is */
4073             return result;
4074         }
4075     }
4076 
4077     /* Here, used locale rules.  Convert back to UTF-8 */
4078     if (UTF8_IS_INVARIANT(result)) {
4079         *ustrp = (U8) result;
4080         *lenp = 1;
4081     }
4082     else {
4083         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
4084         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
4085         *lenp = 2;
4086     }
4087 
4088     return result;
4089 
4090   return_long_s:
4091     /* Certain folds to 'ss' are prohibited by the options, but they do allow
4092      * folds to a string of two of these characters.  By returning this
4093      * instead, then, e.g.,
4094      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
4095      * works. */
4096 
4097     *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
4098     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
4099         ustrp, *lenp, U8);
4100     return LATIN_SMALL_LETTER_LONG_S;
4101 
4102   return_ligature_st:
4103     /* Two folds to 'st' are prohibited by the options; instead we pick one and
4104      * have the other one fold to it */
4105 
4106     *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
4107     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
4108     return LATIN_SMALL_LIGATURE_ST;
4109 
4110 #if    UNICODE_MAJOR_VERSION   == 3         \
4111     && UNICODE_DOT_VERSION     == 0         \
4112     && UNICODE_DOT_DOT_VERSION == 1
4113 
4114   return_dotless_i:
4115     *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
4116     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
4117     return LATIN_SMALL_LETTER_DOTLESS_I;
4118 
4119 #endif
4120 
4121 }
4122 
4123 bool
4124 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4125 {
4126     /* May change: warns if surrogates, non-character code points, or
4127      * non-Unicode code points are in 's' which has length 'len' bytes.
4128      * Returns TRUE if none found; FALSE otherwise.  The only other validity
4129      * check is to make sure that this won't exceed the string's length nor
4130      * overflow */
4131 
4132     const U8* const e = s + len;
4133     bool ok = TRUE;
4134 
4135     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4136 
4137     while (s < e) {
4138         if (UTF8SKIP(s) > len) {
4139             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4140                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4141             return FALSE;
4142         }
4143         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4144             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
4145                 if (   ckWARN_d(WARN_NON_UNICODE)
4146                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
4147                                                0 /* Don't consider overlongs */
4148                                                )))
4149                 {
4150                     /* A side effect of this function will be to warn */
4151                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
4152                     ok = FALSE;
4153                 }
4154             }
4155             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4156                 if (ckWARN_d(WARN_SURROGATE)) {
4157                     /* This has a different warning than the one the called
4158                      * function would output, so can't just call it, unlike we
4159                      * do for the non-chars and above-unicodes */
4160                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
4161                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4162                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
4163                                              uv);
4164                     ok = FALSE;
4165                 }
4166             }
4167             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
4168                      && (ckWARN_d(WARN_NONCHAR)))
4169             {
4170                 /* A side effect of this function will be to warn */
4171                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
4172                 ok = FALSE;
4173             }
4174         }
4175         s += UTF8SKIP(s);
4176     }
4177 
4178     return ok;
4179 }
4180 
4181 /*
4182 =for apidoc pv_uni_display
4183 
4184 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
4185 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
4186 long (if longer, the rest is truncated and C<"..."> will be appended).
4187 
4188 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4189 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4190 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4191 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4192 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4193 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4194 
4195 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
4196 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
4197 
4198 The pointer to the PV of the C<dsv> is returned.
4199 
4200 See also L</sv_uni_display>.
4201 
4202 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
4203 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
4204 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
4205 =for apidoc Amnh||UNI_DISPLAY_QQ
4206 =for apidoc Amnh||UNI_DISPLAY_REGEX
4207 =cut
4208 */
4209 char *
4210 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4211                           UV flags)
4212 {
4213     int truncated = 0;
4214     const char *s, *e;
4215 
4216     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4217 
4218     SvPVCLEAR(dsv);
4219     SvUTF8_off(dsv);
4220     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4221          UV u;
4222          bool ok = 0;
4223 
4224          if (pvlim && SvCUR(dsv) >= pvlim) {
4225               truncated++;
4226               break;
4227          }
4228          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4229          if (u < 256) {
4230              const U8 c = (U8) u;
4231              if (flags & UNI_DISPLAY_BACKSLASH) {
4232                  if (    isMNEMONIC_CNTRL(c)
4233                      && (   c != '\b'
4234                          || (flags & UNI_DISPLAY_BACKSPACE)))
4235                  {
4236                     const char * mnemonic = cntrl_to_mnemonic(c);
4237                     sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4238                     ok = 1;
4239                  }
4240                  else if (c == '\\') {
4241                     sv_catpvs(dsv, "\\\\");
4242                     ok = 1;
4243                  }
4244              }
4245              /* isPRINT() is the locale-blind version. */
4246              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4247                  const char string = c;
4248                  sv_catpvn(dsv, &string, 1);
4249                  ok = 1;
4250              }
4251          }
4252          if (!ok)
4253              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4254     }
4255     if (truncated)
4256          sv_catpvs(dsv, "...");
4257 
4258     return SvPVX(dsv);
4259 }
4260 
4261 /*
4262 =for apidoc sv_uni_display
4263 
4264 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4265 the displayable version being at most C<pvlim> bytes long
4266 (if longer, the rest is truncated and "..." will be appended).
4267 
4268 The C<flags> argument is as in L</pv_uni_display>().
4269 
4270 The pointer to the PV of the C<dsv> is returned.
4271 
4272 =cut
4273 */
4274 char *
4275 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4276 {
4277     const char * const ptr =
4278         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4279 
4280     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4281 
4282     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4283                                 SvCUR(ssv), pvlim, flags);
4284 }
4285 
4286 /*
4287 =for apidoc foldEQ_utf8
4288 
4289 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4290 both of which may be in UTF-8) are the same case-insensitively; false
4291 otherwise.  How far into the strings to compare is determined by other input
4292 parameters.
4293 
4294 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4295 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4296 C<u2> with respect to C<s2>.
4297 
4298 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4299 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4300 The scan will not be considered to be a match unless the goal is reached, and
4301 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4302 to C<s2>.
4303 
4304 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4305 pointer is considered an end pointer to the position 1 byte past the maximum
4306 point in C<s1> beyond which scanning will not continue under any circumstances.
4307 (This routine assumes that UTF-8 encoded input strings are not malformed;
4308 malformed input can cause it to read past C<pe1>).  This means that if both
4309 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4310 will never be successful because it can never
4311 get as far as its goal (and in fact is asserted against).  Correspondingly for
4312 C<pe2> with respect to C<s2>.
4313 
4314 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4315 C<l2> must be non-zero), and if both do, both have to be
4316 reached for a successful match.   Also, if the fold of a character is multiple
4317 characters, all of them must be matched (see tr21 reference below for
4318 'folding').
4319 
4320 Upon a successful match, if C<pe1> is non-C<NULL>,
4321 it will be set to point to the beginning of the I<next> character of C<s1>
4322 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4323 
4324 For case-insensitiveness, the "casefolding" of Unicode is used
4325 instead of upper/lowercasing both the characters, see
4326 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4327 
4328 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4329 =for apidoc Cmnh||FOLDEQ_LOCALE
4330 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4331 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4332 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4333 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4334 
4335 =cut */
4336 
4337 /* A flags parameter has been added which may change, and hence isn't
4338  * externally documented.  Currently it is:
4339  *  0 for as-documented above
4340  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4341                             ASCII one, to not match
4342  *  FOLDEQ_LOCALE	    is set iff the rules from the current underlying
4343  *	                    locale are to be used.
4344  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4345  *                          routine.  This allows that step to be skipped.
4346  *                          Currently, this requires s1 to be encoded as UTF-8
4347  *                          (u1 must be true), which is asserted for.
4348  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4349  *                          cross certain boundaries.  Hence, the caller should
4350  *                          let this function do the folding instead of
4351  *                          pre-folding.  This code contains an assertion to
4352  *                          that effect.  However, if the caller knows what
4353  *                          it's doing, it can pass this flag to indicate that,
4354  *                          and the assertion is skipped.
4355  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4356  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4357  *                          This introduces an asymmetry to save a few branches
4358  *                          in a loop.  Currently, this is not a problem, as
4359  *                          never are both inputs pre-folded.  Simply call this
4360  *                          function with the pre-folded one as the second
4361  *                          string.
4362  *  FOLDEQ_S2_FOLDS_SANE
4363  */
4364 
4365 I32
4366 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4367                              const char *s2, char **pe2, UV l2, bool u2,
4368                              U32 flags)
4369 {
4370     const U8 *p1  = (const U8*)s1; /* Point to current char */
4371     const U8 *p2  = (const U8*)s2;
4372     const U8 *g1 = NULL;       /* goal for s1 */
4373     const U8 *g2 = NULL;
4374     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4375     U8 *f1 = NULL;             /* Point to current folded */
4376     const U8 *e2 = NULL;
4377     U8 *f2 = NULL;
4378     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4379     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4380     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4381     U8 flags_for_folder = FOLD_FLAGS_FULL;
4382 
4383     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4384 
4385     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4386                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4387                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4388                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4389                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4390     /* The algorithm is to trial the folds without regard to the flags on
4391      * the first line of the above assert(), and then see if the result
4392      * violates them.  This means that the inputs can't be pre-folded to a
4393      * violating result, hence the assert.  This could be changed, with the
4394      * addition of extra tests here for the already-folded case, which would
4395      * slow it down.  That cost is more than any possible gain for when these
4396      * flags are specified, as the flags indicate /il or /iaa matching which
4397      * is less common than /iu, and I (khw) also believe that real-world /il
4398      * and /iaa matches are most likely to involve code points 0-255, and this
4399      * function only under rare conditions gets called for 0-255. */
4400 
4401     if (flags & FOLDEQ_LOCALE) {
4402         if (IN_UTF8_CTYPE_LOCALE) {
4403             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {
4404                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4405             }
4406             else {
4407                 flags &= ~FOLDEQ_LOCALE;
4408             }
4409         }
4410         else {
4411             flags_for_folder |= FOLD_FLAGS_LOCALE;
4412         }
4413     }
4414     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4415         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4416     }
4417 
4418     if (pe1) {
4419         e1 = *(U8**)pe1;
4420     }
4421 
4422     if (l1) {
4423         g1 = (const U8*)s1 + l1;
4424     }
4425 
4426     if (pe2) {
4427         e2 = *(U8**)pe2;
4428     }
4429 
4430     if (l2) {
4431         g2 = (const U8*)s2 + l2;
4432     }
4433 
4434     /* Must have at least one goal */
4435     assert(g1 || g2);
4436 
4437     if (g1) {
4438 
4439         /* Will never match if goal is out-of-bounds */
4440         assert(! e1  || e1 >= g1);
4441 
4442         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4443         * only go as far as the goal */
4444         e1 = g1;
4445     }
4446     else {
4447         assert(e1);    /* Must have an end for looking at s1 */
4448     }
4449 
4450     /* Same for goal for s2 */
4451     if (g2) {
4452         assert(! e2  || e2 >= g2);
4453         e2 = g2;
4454     }
4455     else {
4456         assert(e2);
4457     }
4458 
4459     /* If both operands are already folded, we could just do a memEQ on the
4460      * whole strings at once, but it would be better if the caller realized
4461      * this and didn't even call us */
4462 
4463     /* Look through both strings, a character at a time */
4464     while (p1 < e1 && p2 < e2) {
4465 
4466         /* If at the beginning of a new character in s1, get its fold to use
4467          * and the length of the fold. */
4468         if (n1 == 0) {
4469             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4470                 f1 = (U8 *) p1;
4471                 assert(u1);
4472                 n1 = UTF8SKIP(f1);
4473             }
4474             else {
4475                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4476 
4477                     /* We have to forbid mixing ASCII with non-ASCII if the
4478                      * flags so indicate.  And, we can short circuit having to
4479                      * call the general functions for this common ASCII case,
4480                      * all of whose non-locale folds are also ASCII, and hence
4481                      * UTF-8 invariants, so the UTF8ness of the strings is not
4482                      * relevant. */
4483                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4484                         return 0;
4485                     }
4486                     n1 = 1;
4487                     *foldbuf1 = toFOLD(*p1);
4488                 }
4489                 else if (u1) {
4490                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4491                 }
4492                 else {  /* Not UTF-8, get UTF-8 fold */
4493                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4494                 }
4495                 f1 = foldbuf1;
4496             }
4497         }
4498 
4499         if (n2 == 0) {    /* Same for s2 */
4500             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4501 
4502                 /* Point to the already-folded character.  But for non-UTF-8
4503                  * variants, convert to UTF-8 for the algorithm below */
4504                 if (UTF8_IS_INVARIANT(*p2)) {
4505                     f2 = (U8 *) p2;
4506                     n2 = 1;
4507                 }
4508                 else if (u2) {
4509                     f2 = (U8 *) p2;
4510                     n2 = UTF8SKIP(f2);
4511                 }
4512                 else {
4513                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4514                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4515                     f2 = foldbuf2;
4516                     n2 = 2;
4517                 }
4518             }
4519             else {
4520                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4521                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4522                         return 0;
4523                     }
4524                     n2 = 1;
4525                     *foldbuf2 = toFOLD(*p2);
4526                 }
4527                 else if (u2) {
4528                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4529                 }
4530                 else {
4531                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4532                 }
4533                 f2 = foldbuf2;
4534             }
4535         }
4536 
4537         /* Here f1 and f2 point to the beginning of the strings to compare.
4538          * These strings are the folds of the next character from each input
4539          * string, stored in UTF-8. */
4540 
4541         /* While there is more to look for in both folds, see if they
4542         * continue to match */
4543         while (n1 && n2) {
4544             U8 fold_length = UTF8SKIP(f1);
4545             if (fold_length != UTF8SKIP(f2)
4546                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4547                                                        function call for single
4548                                                        byte */
4549                 || memNE((char*)f1, (char*)f2, fold_length))
4550             {
4551                 return 0; /* mismatch */
4552             }
4553 
4554             /* Here, they matched, advance past them */
4555             n1 -= fold_length;
4556             f1 += fold_length;
4557             n2 -= fold_length;
4558             f2 += fold_length;
4559         }
4560 
4561         /* When reach the end of any fold, advance the input past it */
4562         if (n1 == 0) {
4563             p1 += u1 ? UTF8SKIP(p1) : 1;
4564         }
4565         if (n2 == 0) {
4566             p2 += u2 ? UTF8SKIP(p2) : 1;
4567         }
4568     } /* End of loop through both strings */
4569 
4570     /* A match is defined by each scan that specified an explicit length
4571     * reaching its final goal, and the other not having matched a partial
4572     * character (which can happen when the fold of a character is more than one
4573     * character). */
4574     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4575         return 0;
4576     }
4577 
4578     /* Successful match.  Set output pointers */
4579     if (pe1) {
4580         *pe1 = (char*)p1;
4581     }
4582     if (pe2) {
4583         *pe2 = (char*)p2;
4584     }
4585     return 1;
4586 }
4587 
4588 /*
4589  * ex: set ts=8 sts=4 sw=4 et:
4590  */
4591