xref: /openbsd/gnu/usr.bin/perl/utf8.c (revision e0680481)
1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam.  'That's the one place in all the lands we've ever
13  *  heard of that we don't want to see any closer; and that's the one place
14  *  we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  *     [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17  *
18  * 'Well do I understand your speech,' he answered in the same language;
19  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
20  *  as is the custom in the West, if you wish to be answered?'
21  *                           --Gandalf, addressing Théoden's door wardens
22  *
23  *     [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24  *
25  * ...the travellers perceived that the floor was paved with stones of many
26  * hues; branching runes and strange devices intertwined beneath their feet.
27  *
28  *     [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29  */
30 
31 #include "EXTERN.h"
32 #define PERL_IN_UTF8_C
33 #include "perl.h"
34 #include "invlist_inline.h"
35 
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38                         "Malformed UTF-8 character (unexpected end of string)";
39 
40 /*
41 These are various utility functions for manipulating UTF8-encoded
42 strings.  For the uninitiated, this is a method of representing arbitrary
43 Unicode characters as a variable number of bytes, in such a way that
44 characters in the ASCII range are unmodified, and a zero byte never appears
45 within non-zero characters.
46 */
47 
48 void
Perl__force_out_malformed_utf8_message(pTHX_ const U8 * const p,const U8 * const e,const U32 flags,const bool die_here)49 Perl__force_out_malformed_utf8_message(pTHX_
50             const U8 *const p,      /* First byte in UTF-8 sequence */
51             const U8 * const e,     /* Final byte in sequence (may include
52                                        multiple chars */
53             const U32 flags,        /* Flags to pass to utf8n_to_uvchr(),
54                                        usually 0, or some DISALLOW flags */
55             const bool die_here)    /* If TRUE, this function does not return */
56 {
57     /* This core-only function is to be called when a malformed UTF-8 character
58      * is found, in order to output the detailed information about the
59      * malformation before dieing.  The reason it exists is for the occasions
60      * when such a malformation is fatal, but warnings might be turned off, so
61      * that normally they would not be actually output.  This ensures that they
62      * do get output.  Because a sequence may be malformed in more than one
63      * way, multiple messages may be generated, so we can't make them fatal, as
64      * that would cause the first one to die.
65      *
66      * Instead we pretend -W was passed to perl, then die afterwards.  The
67      * flexibility is here to return to the caller so they can finish up and
68      * die themselves */
69     U32 errors;
70 
71     PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
72 
73     ENTER;
74     SAVEI8(PL_dowarn);
75     SAVESPTR(PL_curcop);
76 
77     PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
78     if (PL_curcop) {
79         SAVECURCOPWARNINGS();
80         PL_curcop->cop_warnings = pWARN_ALL;
81     }
82 
83     (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
84 
85     LEAVE;
86 
87     if (! errors) {
88         Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
89                          " be called only when there are errors found");
90     }
91 
92     if (die_here) {
93         Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
94     }
95 }
96 
97 STATIC HV *
S_new_msg_hv(pTHX_ const char * const message,U32 categories,U32 flag)98 S_new_msg_hv(pTHX_ const char * const message, /* The message text */
99                    U32 categories,  /* Packed warning categories */
100                    U32 flag)        /* Flag associated with this message */
101 {
102     /* Creates, populates, and returns an HV* that describes an error message
103      * for the translators between UTF8 and code point */
104 
105     SV* msg_sv = newSVpv(message, 0);
106     SV* category_sv = newSVuv(categories);
107     SV* flag_bit_sv = newSVuv(flag);
108 
109     HV* msg_hv = newHV();
110 
111     PERL_ARGS_ASSERT_NEW_MSG_HV;
112 
113     (void) hv_stores(msg_hv, "text", msg_sv);
114     (void) hv_stores(msg_hv, "warn_categories",  category_sv);
115     (void) hv_stores(msg_hv, "flag_bit", flag_bit_sv);
116 
117     return msg_hv;
118 }
119 
120 /*
121 =for apidoc uvoffuni_to_utf8_flags
122 
123 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
124 Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
125 L<perlapi/uvchr_to_utf8_flags>>.
126 
127 This function is like them, but the input is a strict Unicode
128 (as opposed to native) code point.  Only in very rare circumstances should code
129 not be using the native code point.
130 
131 For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
132 
133 =cut
134 */
135 
136 U8 *
Perl_uvoffuni_to_utf8_flags(pTHX_ U8 * d,UV uv,const UV flags)137 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
138 {
139     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
140 
141     return uvoffuni_to_utf8_flags_msgs(d, uv, flags, NULL);
142 }
143 
144 /* All these formats take a single UV code point argument */
145 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
146 const char nonchar_cp_format[]   = "Unicode non-character U+%04" UVXf
147                                    " is not recommended for open interchange";
148 const char super_cp_format[]     = "Code point 0x%" UVXf " is not Unicode,"
149                                    " may not be portable";
150 
151 /*  Use shorter names internally in this file */
152 #define SHIFT   UTF_ACCUMULATION_SHIFT
153 #undef  MARK
154 #define MARK    UTF_CONTINUATION_MARK
155 #define MASK    UTF_CONTINUATION_MASK
156 
157 /*
158 =for apidoc uvchr_to_utf8_flags_msgs
159 
160 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
161 
162 Most code should use C<L</uvchr_to_utf8_flags>()> rather than call this directly.
163 
164 This function is for code that wants any warning and/or error messages to be
165 returned to the caller rather than be displayed.  All messages that would have
166 been displayed if all lexical warnings are enabled will be returned.
167 
168 It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
169 placed after all the others, C<msgs>.  If this parameter is 0, this function
170 behaves identically to C<L</uvchr_to_utf8_flags>>.  Otherwise, C<msgs> should
171 be a pointer to an C<HV *> variable, in which this function creates a new HV to
172 contain any appropriate messages.  The hash has three key-value pairs, as
173 follows:
174 
175 =over 4
176 
177 =item C<text>
178 
179 The text of the message as a C<SVpv>.
180 
181 =item C<warn_categories>
182 
183 The warning category (or categories) packed into a C<SVuv>.
184 
185 =item C<flag>
186 
187 A single flag bit associated with this message, in a C<SVuv>.
188 The bit corresponds to some bit in the C<*errors> return value,
189 such as C<UNICODE_GOT_SURROGATE>.
190 
191 =back
192 
193 It's important to note that specifying this parameter as non-null will cause
194 any warnings this function would otherwise generate to be suppressed, and
195 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
196 (or not) when choosing what to do with the returned messages.
197 
198 The caller, of course, is responsible for freeing any returned HV.
199 
200 =cut
201 */
202 
203 /* Undocumented; we don't want people using this.  Instead they should use
204  * uvchr_to_utf8_flags_msgs() */
205 U8 *
Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 * d,UV input_uv,UV flags,HV ** msgs)206 Perl_uvoffuni_to_utf8_flags_msgs(pTHX_ U8 *d, UV input_uv, UV flags, HV** msgs)
207 {
208     U8 *p;
209     UV shifted_uv = input_uv;
210     STRLEN utf8_skip = OFFUNISKIP(input_uv);
211 
212     PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS_MSGS;
213 
214     if (msgs) {
215         *msgs = NULL;
216     }
217 
218     switch (utf8_skip) {
219       case 1:
220         *d++ = LATIN1_TO_NATIVE(input_uv);
221         return d;
222 
223       default:
224         if (   UNLIKELY(input_uv > MAX_LEGAL_CP
225             && UNLIKELY(! (flags & UNICODE_ALLOW_ABOVE_IV_MAX))))
226         {
227             Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, /* Hex output */
228                                                          NULL, 0, input_uv));
229         }
230 
231         if ((flags & (UNICODE_WARN_PERL_EXTENDED|UNICODE_WARN_SUPER))) {
232             U32 category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
233             const char * format = PL_extended_cp_format;
234             if (msgs) {
235                 *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
236                                    category,
237                                    UNICODE_GOT_PERL_EXTENDED);
238             }
239             else {
240                 Perl_ck_warner_d(aTHX_ category, format, input_uv);
241             }
242 
243             /* Don't output a 2nd msg */
244             flags &= ~UNICODE_WARN_SUPER;
245         }
246 
247         if (flags & UNICODE_DISALLOW_PERL_EXTENDED) {
248             return NULL;
249         }
250 
251         p = d + utf8_skip - 1;
252         while (p >= d + 6 + ONE_IF_EBCDIC_ZERO_IF_NOT) {
253             *p-- = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
254             shifted_uv >>= SHIFT;
255         }
256 
257         /* FALLTHROUGH */
258 
259       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:
260         d[5 + ONE_IF_EBCDIC_ZERO_IF_NOT]
261                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
262         shifted_uv >>= SHIFT;
263         /* FALLTHROUGH */
264 
265       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:
266         d[4 + ONE_IF_EBCDIC_ZERO_IF_NOT]
267                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
268         shifted_uv >>= SHIFT;
269         /* FALLTHROUGH */
270 
271       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
272         if (UNLIKELY(UNICODE_IS_SUPER(input_uv))) {
273             if (flags & UNICODE_WARN_SUPER) {
274                 U32 category = packWARN(WARN_NON_UNICODE);
275                 const char * format = super_cp_format;
276 
277                 if (msgs) {
278                     *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
279                                        category,
280                                        UNICODE_GOT_SUPER);
281                 }
282                 else {
283                     Perl_ck_warner_d(aTHX_ category, format, input_uv);
284                 }
285 
286                 if (flags & UNICODE_DISALLOW_SUPER) {
287                     return NULL;
288                 }
289             }
290             if (       (flags & UNICODE_DISALLOW_SUPER)
291                 || (   (flags & UNICODE_DISALLOW_PERL_EXTENDED)
292                     &&  UNICODE_IS_PERL_EXTENDED(input_uv)))
293             {
294                 return NULL;
295             }
296         }
297 
298         d[3 + ONE_IF_EBCDIC_ZERO_IF_NOT]
299                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
300         shifted_uv >>= SHIFT;
301         /* FALLTHROUGH */
302 
303       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
304         if (isUNICODE_POSSIBLY_PROBLEMATIC(input_uv)) {
305             if (UNLIKELY(UNICODE_IS_NONCHAR(input_uv))) {
306                 if (flags & UNICODE_WARN_NONCHAR) {
307                     U32 category = packWARN(WARN_NONCHAR);
308                     const char * format = nonchar_cp_format;
309                     if (msgs) {
310                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
311                                            category,
312                                            UNICODE_GOT_NONCHAR);
313                     }
314                     else {
315                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
316                     }
317                 }
318                 if (flags & UNICODE_DISALLOW_NONCHAR) {
319                     return NULL;
320                 }
321             }
322             else if (UNLIKELY(UNICODE_IS_SURROGATE(input_uv))) {
323                 if (flags & UNICODE_WARN_SURROGATE) {
324                     U32 category = packWARN(WARN_SURROGATE);
325                     const char * format = surrogate_cp_format;
326                     if (msgs) {
327                         *msgs = new_msg_hv(Perl_form(aTHX_ format, input_uv),
328                                            category,
329                                            UNICODE_GOT_SURROGATE);
330                     }
331                     else {
332                         Perl_ck_warner_d(aTHX_ category, format, input_uv);
333                     }
334                 }
335                 if (flags & UNICODE_DISALLOW_SURROGATE) {
336                     return NULL;
337                 }
338             }
339         }
340 
341         d[2 + ONE_IF_EBCDIC_ZERO_IF_NOT]
342                                 = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
343         shifted_uv >>= SHIFT;
344         /* FALLTHROUGH */
345 
346 #ifdef EBCDIC
347 
348       case 3:
349         d[2] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
350         shifted_uv >>= SHIFT;
351         /* FALLTHROUGH */
352 
353 #endif
354 
355         /* FALLTHROUGH */
356       case 2:
357         d[1] = I8_TO_NATIVE_UTF8((shifted_uv & MASK) | MARK);
358         shifted_uv >>= SHIFT;
359         d[0] = I8_TO_NATIVE_UTF8((shifted_uv & UTF_START_MASK(utf8_skip))
360                                              | UTF_START_MARK(utf8_skip));
361         break;
362     }
363 
364     return d + utf8_skip;
365 }
366 
367 /*
368 =for apidoc uvchr_to_utf8
369 
370 Adds the UTF-8 representation of the native code point C<uv> to the end
371 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
372 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
373 the byte after the end of the new character.  In other words,
374 
375     d = uvchr_to_utf8(d, uv);
376 
377 is the recommended wide native character-aware way of saying
378 
379     *(d++) = uv;
380 
381 This function accepts any code point from 0..C<IV_MAX> as input.
382 C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
383 
384 It is possible to forbid or warn on non-Unicode code points, or those that may
385 be problematic by using L</uvchr_to_utf8_flags>.
386 
387 =cut
388 */
389 
390 /* This is also a macro */
391 PERL_CALLCONV U8*       Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
392 
393 U8 *
Perl_uvchr_to_utf8(pTHX_ U8 * d,UV uv)394 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
395 {
396     return uvchr_to_utf8(d, uv);
397 }
398 
399 /*
400 =for apidoc uvchr_to_utf8_flags
401 
402 Adds the UTF-8 representation of the native code point C<uv> to the end
403 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
404 C<UTF8_MAXBYTES+1>) free bytes available.  The return value is the pointer to
405 the byte after the end of the new character.  In other words,
406 
407     d = uvchr_to_utf8_flags(d, uv, flags);
408 
409 or, in most cases,
410 
411     d = uvchr_to_utf8_flags(d, uv, 0);
412 
413 This is the Unicode-aware way of saying
414 
415     *(d++) = uv;
416 
417 If C<flags> is 0, this function accepts any code point from 0..C<IV_MAX> as
418 input.  C<IV_MAX> is typically 0x7FFF_FFFF in a 32-bit word.
419 
420 Specifying C<flags> can further restrict what is allowed and not warned on, as
421 follows:
422 
423 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
424 the function will raise a warning, provided UTF8 warnings are enabled.  If
425 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
426 NULL.  If both flags are set, the function will both warn and return NULL.
427 
428 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
429 affect how the function handles a Unicode non-character.
430 
431 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
432 affect the handling of code points that are above the Unicode maximum of
433 0x10FFFF.  Languages other than Perl may not be able to accept files that
434 contain these.
435 
436 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
437 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
438 three DISALLOW flags.  C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
439 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
440 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
441 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
442 above-Unicode and surrogate flags, but not the non-character ones, as
443 defined in
444 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
445 See L<perlunicode/Noncharacter code points>.
446 
447 Extremely high code points were never specified in any standard, and require an
448 extension to UTF-8 to express, which Perl does.  It is likely that programs
449 written in something other than Perl would not be able to read files that
450 contain these; nor would Perl understand files written by something that uses a
451 different extension.  For these reasons, there is a separate set of flags that
452 can warn and/or disallow these extremely high code points, even if other
453 above-Unicode ones are accepted.  They are the C<UNICODE_WARN_PERL_EXTENDED>
454 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags.  For more information see
455 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UNICODE_DISALLOW_SUPER> will
456 treat all above-Unicode code points, including these, as malformations.  (Note
457 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
458 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
459 
460 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
461 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>.  Similarly,
462 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
463 C<UNICODE_DISALLOW_PERL_EXTENDED>.  The names are misleading because on EBCDIC
464 platforms,these flags can apply to code points that actually do fit in 31 bits.
465 The new names accurately describe the situation in all cases.
466 
467 =for apidoc Amnh||UNICODE_DISALLOW_ABOVE_31_BIT
468 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE
469 =for apidoc Amnh||UNICODE_DISALLOW_ILLEGAL_INTERCHANGE
470 =for apidoc Amnh||UNICODE_DISALLOW_NONCHAR
471 =for apidoc Amnh||UNICODE_DISALLOW_PERL_EXTENDED
472 =for apidoc Amnh||UNICODE_DISALLOW_SUPER
473 =for apidoc Amnh||UNICODE_DISALLOW_SURROGATE
474 =for apidoc Amnh||UNICODE_WARN_ABOVE_31_BIT
475 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_C9_INTERCHANGE
476 =for apidoc Amnh||UNICODE_WARN_ILLEGAL_INTERCHANGE
477 =for apidoc Amnh||UNICODE_WARN_NONCHAR
478 =for apidoc Amnh||UNICODE_WARN_PERL_EXTENDED
479 =for apidoc Amnh||UNICODE_WARN_SUPER
480 =for apidoc Amnh||UNICODE_WARN_SURROGATE
481 
482 =cut
483 */
484 
485 /* This is also a macro */
486 PERL_CALLCONV U8*       Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
487 
488 U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)489 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
490 {
491     return uvchr_to_utf8_flags(d, uv, flags);
492 }
493 
494 PERL_STATIC_INLINE int
S_is_utf8_overlong(const U8 * const s,const STRLEN len)495 S_is_utf8_overlong(const U8 * const s, const STRLEN len)
496 {
497     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
498      * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
499      * it isn't, and -1 if there isn't enough information to tell.  This last
500      * return value can happen if the sequence is incomplete, missing some
501      * trailing bytes that would form a complete character.  If there are
502      * enough bytes to make a definitive decision, this function does so.
503      * Usually 2 bytes are sufficient.
504      *
505      * Overlongs can occur whenever the number of continuation bytes changes.
506      * That means whenever the number of leading 1 bits in a start byte
507      * increases from the next lower start byte.  That happens for start bytes
508      * C0, E0, F0, F8, FC, FE, and FF.
509      */
510 
511     PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
512 
513     /* Each platform has overlongs after the start bytes given above (expressed
514      * in I8 for EBCDIC).  The values below were found by manually inspecting
515      * the UTF-8 patterns.  See the tables in utf8.h and utfebcdic.h. */
516 
517     switch (NATIVE_UTF8_TO_I8(s[0])) {
518       default:
519         assert(UTF8_IS_START(s[0]));
520         return 0;
521 
522       case 0xC0:
523       case 0xC1:
524         return 1;
525 
526 #ifdef EBCDIC
527 
528       case 0xC2:
529       case 0xC3:
530       case 0xC4:
531       case 0xE0:
532         return 1;
533 #else
534       case 0xE0:
535         return (len < 2) ? -1 : s[1] < 0xA0;
536 #endif
537 
538       case 0xF0:
539         return (len < 2)
540                ? -1
541                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
542       case 0xF8:
543         return (len < 2)
544                ? -1
545                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
546       case 0xFC:
547         return (len < 2)
548                ? -1
549                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
550       case 0xFE:
551         return (len < 2)
552                ? -1
553                : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
554       case 0xFF:
555         return isFF_overlong(s, len);
556     }
557 }
558 
559 PERL_STATIC_INLINE int
S_isFF_overlong(const U8 * const s,const STRLEN len)560 S_isFF_overlong(const U8 * const s, const STRLEN len)
561 {
562     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
563      * 'e' - 1 is an overlong beginning with \xFF.  It returns 1 if it is; 0 if
564      * it isn't, and -1 if there isn't enough information to tell.  This last
565      * return value can happen if the sequence is incomplete, missing some
566      * trailing bytes that would form a complete character.  If there are
567      * enough bytes to make a definitive decision, this function does so. */
568 
569     PERL_ARGS_ASSERT_ISFF_OVERLONG;
570 
571 #ifdef EBCDIC
572     /* This works on all three EBCDIC code pages traditionally supported by
573      * perl */
574 #  define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
575 #else
576 #  define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
577 #endif
578 
579     /* To be an FF overlong, all the available bytes must match */
580     if (LIKELY(memNE(s, FF_OVERLONG_PREFIX,
581                      MIN(len, STRLENs(FF_OVERLONG_PREFIX)))))
582     {
583         return 0;
584     }
585 
586     /* To be an FF overlong sequence, all the bytes in FF_OVERLONG_PREFIX must
587      * be there; what comes after them doesn't matter.  See tables in utf8.h,
588      * utfebcdic.h. */
589     if (len >= STRLENs(FF_OVERLONG_PREFIX)) {
590         return 1;
591     }
592 
593     /* The missing bytes could cause the result to go one way or the other, so
594      * the result is indeterminate */
595     return -1;
596 }
597 
598 /* At some point we may want to allow core to use up to UV_MAX */
599 
600 #ifdef EBCDIC     /* Actually is I8 */
601 #  if defined(UV_IS_QUAD) /* These assume IV_MAX is 2**63-1, UV_MAX 2**64-1 */
602 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA7"
603                               /* UV_MAX "\xFF\xAF" */
604 #  else      /* These assume IV_MAX is 2**31-1, UV_MAX 2**32-1 */
605 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1"
606                               /* UV_MAX "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3" */
607 #  endif
608 #else
609 #  if defined(UV_IS_QUAD)
610 #    define HIGHEST_REPRESENTABLE_UTF  "\xFF\x80\x87"
611                               /* UV_MAX "\xFF\x80" */
612 #  else
613 #    define HIGHEST_REPRESENTABLE_UTF  "\xFD"
614                               /* UV_MAX "\xFE\x83" */
615 #  endif
616 #endif
617 
618 PERL_STATIC_INLINE int
S_does_utf8_overflow(const U8 * const s,const U8 * e,const bool consider_overlongs)619 S_does_utf8_overflow(const U8 * const s,
620                      const U8 * e,
621                      const bool consider_overlongs)
622 {
623     /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
624      * 'e' - 1 would overflow an IV on this platform; that is if it represents
625      * a code point larger than the highest representable code point.  It
626      * returns 1 if it does overflow; 0 if it doesn't, and -1 if there isn't
627      * enough information to tell.  This last return value can happen if the
628      * sequence is incomplete, missing some trailing bytes that would form a
629      * complete character.  If there are enough bytes to make a definitive
630      * decision, this function does so.
631      *
632      * If 'consider_overlongs' is TRUE, the function checks for the possibility
633      * that the sequence is an overlong that doesn't overflow.  Otherwise, it
634      * assumes the sequence is not an overlong.  This can give different
635      * results only on ASCII 32-bit platforms.
636      *
637      * (For ASCII platforms, we could use memcmp() because we don't have to
638      * convert each byte to I8, but it's very rare input indeed that would
639      * approach overflow, so the loop below will likely only get executed once.)
640      *
641      */
642     const STRLEN len = e - s;
643     const U8 *x;
644     const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF;
645     int is_overlong = 0;
646 
647     PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
648 
649     for (x = s; x < e; x++, y++) {
650 
651         /* 'y' is set up to not include the trailing bytes that are all the
652          * maximum possible continuation byte.  So when we reach the end of 'y'
653          * (known to be NUL terminated), it is impossible for 'x' to contain
654          * bytes larger than those omitted bytes, and therefore 'x' can't
655          * overflow */
656         if (*y == '\0') {
657             return 0;
658         }
659 
660         /* If this byte is less than the corresponding highest non-overflowing
661          * UTF-8, the sequence doesn't overflow */
662         if (NATIVE_UTF8_TO_I8(*x) < *y) {
663             return 0;
664         }
665 
666         if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
667             goto overflows_if_not_overlong;
668         }
669     }
670 
671     /* Got to the end, and all bytes are the same.  If the input is a whole
672      * character, it doesn't overflow.  And if it is a partial character,
673      * there's not enough information to tell */
674     return (len >= STRLENs(HIGHEST_REPRESENTABLE_UTF)) ? 0 : -1;
675 
676   overflows_if_not_overlong:
677 
678     /* Here, a well-formed sequence overflows.  If we are assuming
679      * well-formedness, return that it overflows. */
680     if (! consider_overlongs) {
681         return 1;
682     }
683 
684     /* Here, it could be the overlong malformation, and might not actually
685      * overflow if you were to calculate it out.
686      *
687      * See if it actually is overlong */
688     is_overlong = is_utf8_overlong(s, len);
689 
690     /* If it isn't overlong, is well-formed, so overflows */
691     if (is_overlong == 0) {
692         return 1;
693     }
694 
695     /* Not long enough to determine */
696     if (is_overlong < 0) {
697         return -1;
698     }
699 
700     /* Here, it appears to overflow, but it is also overlong */
701 
702 #if 6 * UTF_CONTINUATION_BYTE_INFO_BITS <= IVSIZE * CHARBITS
703 
704     /* On many platforms, it is impossible for an overlong to overflow.  For
705      * these, no further work is necessary: we can return immediately that this
706      * overlong that is an apparent overflow actually isn't
707      *
708      * To see why, note that a length_N sequence can represent as overlongs all
709      * the code points representable by shorter length sequences, but no
710      * higher.  If it could represent a higher code point without being an
711      * overlong, we wouldn't have had to increase the sequence length!
712      *
713      * The highest possible start byte is FF; the next highest is FE.  The
714      * highest code point representable as an overlong on the platform is thus
715      * the highest code point representable by a non-overlong sequence whose
716      * start byte is FE.  If that value doesn't overflow the platform's word
717      * size, overlongs can't overflow.
718      *
719      * FE consists of 7 bytes total; the FE start byte contributes 0 bits of
720      * information (the high 7 bits, all ones, say that the sequence is 7 bytes
721      * long, and the bottom, zero, bit is s placeholder. That leaves the 6
722      * continuation bytes to contribute UTF_CONTINUATION_BYTE_INFO_BITS each.
723       If that number of bits doesn't exceed the word size, it can't overflow. */
724 
725     return 0;
726 
727 #else
728 
729     /* In practice, only a 32-bit ASCII box gets here.  The FE start byte can
730      * represent, as an overlong, the highest code point representable by an FD
731      * start byte, which is 5*6 continuation bytes of info plus one bit from
732      * the start byte, or 31 bits.  That doesn't overflow.  More explicitly:
733      * \xFD\xBF\xBF\xBF\xBF\xBF evaluates to 0x7FFFFFFF = 2*31 - 1.
734      *
735      * That means only the FF start byte can have an overflowing overlong. */
736     if (*s < 0xFF) {
737         return 0;
738     }
739 
740     /* The sequence \xff\x80\x80\x80\x80\x80\x80\x82 is an overlong that
741      * evaluates to 2**31, so overflows an IV.  For a UV it's
742      *              \xff\x80\x80\x80\x80\x80\x80\x83 = 2**32 */
743 #  define OVERFLOWS  "\xff\x80\x80\x80\x80\x80\x80\x82"
744 
745     if (e - s < (Ptrdiff_t) STRLENs(OVERFLOWS)) {   /* Not enough info */
746          return -1;
747     }
748 
749 #  define strnGE(s1,s2,l) (strncmp(s1,s2,l) >= 0)
750 
751     return strnGE((const char *) s, OVERFLOWS, STRLENs(OVERFLOWS));
752 
753 #endif
754 
755 }
756 
757 STRLEN
Perl_is_utf8_char_helper_(const U8 * const s,const U8 * e,const U32 flags)758 Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags)
759 {
760     SSize_t len, full_len;
761 
762     /* An internal helper function.
763      *
764      * On input:
765      *  's' is a string, which is known to be syntactically valid UTF-8 as far
766      *      as (e - 1); e > s must hold.
767      *  'e' This function is allowed to look at any byte from 's'...'e-1', but
768      *      nowhere else.  The function has to cope as best it can if that
769      *      sequence does not form a full character.
770      * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
771      *      accepted by L</utf8n_to_uvchr>.  If non-zero, this function returns
772      *      0 if it determines the input will match something disallowed.
773      * On output:
774      *  The return is the number of bytes required to represent the code point
775      *  if it isn't disallowed by 'flags'; 0 otherwise.  Be aware that if the
776      *  input is for a partial character, a successful return will be larger
777      *  than 'e - s'.
778      *
779      *  If *s..*(e-1) is only for a partial character, the function will return
780      *  non-zero if there is any sequence of well-formed UTF-8 that, when
781      *  appended to the input sequence, could result in an allowed code point;
782      *  otherwise it returns 0.  Non characters cannot be determined based on
783      *  partial character input.  But many  of the other excluded types can be
784      *  determined with just the first one or two bytes.
785      *
786      */
787 
788     PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER_;
789 
790     assert(e > s);
791     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
792                           |UTF8_DISALLOW_PERL_EXTENDED)));
793 
794     full_len = UTF8SKIP(s);
795 
796     len = e - s;
797     if (len > full_len) {
798         e = s + full_len;
799         len = full_len;
800     }
801 
802     switch (full_len) {
803         bool is_super;
804 
805       default: /* Extended */
806         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
807             return 0;
808         }
809 
810         /* FALLTHROUGH */
811 
812       case 6 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
813       case 5 + ONE_IF_EBCDIC_ZERO_IF_NOT:   /* above Unicode */
814 
815         if (flags & UTF8_DISALLOW_SUPER) {
816             return 0;                       /* Above Unicode */
817         }
818 
819         return full_len;
820 
821       case 4 + ONE_IF_EBCDIC_ZERO_IF_NOT:
822         is_super = (   UNLIKELY(NATIVE_UTF8_TO_I8(s[0]) > UTF_START_BYTE_110000_)
823                     || (   len > 1
824                         && NATIVE_UTF8_TO_I8(s[0]) == UTF_START_BYTE_110000_
825                         && NATIVE_UTF8_TO_I8(s[1])
826                                                 >= UTF_FIRST_CONT_BYTE_110000_));
827         if (is_super) {
828             if (flags & UTF8_DISALLOW_SUPER) {
829                 return 0;
830             }
831         }
832         else if (   (flags & UTF8_DISALLOW_NONCHAR)
833                  && len == full_len
834                  && UNLIKELY(is_LARGER_NON_CHARS_utf8(s)))
835         {
836             return 0;
837         }
838 
839         return full_len;
840 
841       case 3 + ONE_IF_EBCDIC_ZERO_IF_NOT:
842 
843         if (! isUTF8_POSSIBLY_PROBLEMATIC(s[0]) || len < 2) {
844             return full_len;
845         }
846 
847         if (   (flags & UTF8_DISALLOW_SURROGATE)
848             &&  UNLIKELY(is_SURROGATE_utf8(s)))
849         {
850             return 0;       /* Surrogate */
851         }
852 
853         if (  (flags & UTF8_DISALLOW_NONCHAR)
854             && len == full_len
855             && UNLIKELY(is_SHORTER_NON_CHARS_utf8(s)))
856         {
857             return 0;
858         }
859 
860         return full_len;
861 
862       /* The lower code points don't have any disallowable characters */
863 #ifdef EBCDIC
864       case 3:
865         return full_len;
866 #endif
867 
868       case 2:
869       case 1:
870         return full_len;
871     }
872 }
873 
874 Size_t
Perl_is_utf8_FF_helper_(const U8 * const s0,const U8 * const e,const bool require_partial)875 Perl_is_utf8_FF_helper_(const U8 * const s0, const U8 * const e,
876                         const bool require_partial)
877 {
878     /* This is called to determine if the UTF-8 sequence starting at s0 and
879      * continuing for up to one full character of bytes, but looking no further
880      * than 'e - 1', is legal.  *s0 must be 0xFF (or whatever the native
881      * equivalent of FF in I8 on EBCDIC platforms is).  This marks it as being
882      * for the largest code points recognized by Perl, the ones that require
883      * the most UTF-8 bytes per character to represent (somewhat less than
884      * twice the size of the next longest kind).  This sequence will only ever
885      * be Perl extended UTF-8.
886      *
887      * The routine returns 0 if the sequence is not fully valid, syntactically
888      * or semantically.  That means it checks that everything following the
889      * start byte is a continuation byte, and that it doesn't overflow, nor is
890      * an overlong representation.
891      *
892      * If 'require_partial' is FALSE, the routine returns non-zero only if the
893      * input (as far as 'e-1') is a full character.  The return is the count of
894      * the bytes in the character.
895      *
896      * If 'require_partial' is TRUE, the routine returns non-zero only if the
897      * input as far as 'e-1' is a partial, not full character, with no
898      * malformations found before position 'e'.  The return is either just
899      * FALSE, or TRUE.  */
900 
901     const U8 *s = s0 + 1;
902     const U8 *send = e;
903 
904     PERL_ARGS_ASSERT_IS_UTF8_FF_HELPER_;
905 
906     assert(s0 < e);
907     assert(*s0 == I8_TO_NATIVE_UTF8(0xFF));
908 
909     send = s + MIN(UTF8_MAXBYTES - 1, e - s);
910     while (s < send) {
911         if (! UTF8_IS_CONTINUATION(*s)) {
912             return 0;
913         }
914 
915         s++;
916     }
917 
918     if (0 < does_utf8_overflow(s0, e,
919                                FALSE /* Don't consider_overlongs */
920     )) {
921         return 0;
922     }
923 
924     if (0 < isFF_overlong(s0, e - s0)) {
925         return 0;
926     }
927 
928     /* Here, the character is valid as far as it got.  Check if got a partial
929      * character */
930     if (s - s0 < UTF8_MAXBYTES) {
931         return (require_partial) ? 1 : 0;
932     }
933 
934     /* Here, got a full character */
935     return (require_partial) ? 0 : UTF8_MAXBYTES;
936 }
937 
938 char *
Perl__byte_dump_string(pTHX_ const U8 * const start,const STRLEN len,const bool format)939 Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
940 {
941     /* Returns a mortalized C string that is a displayable copy of the 'len'
942      * bytes starting at 'start'.  'format' gives how to display each byte.
943      * Currently, there are only two formats, so it is currently a bool:
944      *      0   \xab
945      *      1    ab         (that is a space between two hex digit bytes)
946      */
947 
948     const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
949                                                trailing NUL */
950     const U8 * s = start;
951     const U8 * const e = start + len;
952     char * output;
953     char * d;
954 
955     PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
956 
957     Newx(output, output_len, char);
958     SAVEFREEPV(output);
959 
960     d = output;
961     for (s = start; s < e; s++) {
962         const unsigned high_nibble = (*s & 0xF0) >> 4;
963         const unsigned low_nibble =  (*s & 0x0F);
964 
965         if (format) {
966             if (s > start) {
967                 *d++ = ' ';
968             }
969         }
970         else {
971             *d++ = '\\';
972             *d++ = 'x';
973         }
974 
975         if (high_nibble < 10) {
976             *d++ = high_nibble + '0';
977         }
978         else {
979             *d++ = high_nibble - 10 + 'a';
980         }
981 
982         if (low_nibble < 10) {
983             *d++ = low_nibble + '0';
984         }
985         else {
986             *d++ = low_nibble - 10 + 'a';
987         }
988     }
989 
990     *d = '\0';
991     return output;
992 }
993 
994 PERL_STATIC_INLINE char *
S_unexpected_non_continuation_text(pTHX_ const U8 * const s,STRLEN print_len,const STRLEN non_cont_byte_pos,const STRLEN expect_len)995 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
996 
997                                          /* Max number of bytes to print */
998                                          STRLEN print_len,
999 
1000                                          /* Which one is the non-continuation */
1001                                          const STRLEN non_cont_byte_pos,
1002 
1003                                          /* How many bytes should there be? */
1004                                          const STRLEN expect_len)
1005 {
1006     /* Return the malformation warning text for an unexpected continuation
1007      * byte. */
1008 
1009     const char * const where = (non_cont_byte_pos == 1)
1010                                ? "immediately"
1011                                : Perl_form(aTHX_ "%d bytes",
1012                                                  (int) non_cont_byte_pos);
1013     const U8 * x = s + non_cont_byte_pos;
1014     const U8 * e = s + print_len;
1015 
1016     PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
1017 
1018     /* We don't need to pass this parameter, but since it has already been
1019      * calculated, it's likely faster to pass it; verify under DEBUGGING */
1020     assert(expect_len == UTF8SKIP(s));
1021 
1022     /* As a defensive coding measure, don't output anything past a NUL.  Such
1023      * bytes shouldn't be in the middle of a malformation, and could mark the
1024      * end of the allocated string, and what comes after is undefined */
1025     for (; x < e; x++) {
1026         if (*x == '\0') {
1027             x++;            /* Output this particular NUL */
1028             break;
1029         }
1030     }
1031 
1032     return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
1033                            " %s after start byte 0x%02x; need %d bytes, got %d)",
1034                            malformed_text,
1035                            _byte_dump_string(s, x - s, 0),
1036                            *(s + non_cont_byte_pos),
1037                            where,
1038                            *s,
1039                            (int) expect_len,
1040                            (int) non_cont_byte_pos);
1041 }
1042 
1043 /*
1044 
1045 =for apidoc utf8n_to_uvchr
1046 
1047 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1048 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1049 directly.
1050 
1051 Bottom level UTF-8 decode routine.
1052 Returns the native code point value of the first character in the string C<s>,
1053 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
1054 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
1055 the length, in bytes, of that character.
1056 
1057 The value of C<flags> determines the behavior when C<s> does not point to a
1058 well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
1059 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
1060 is the next possible position in C<s> that could begin a non-malformed
1061 character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
1062 is raised.  Some UTF-8 input sequences may contain multiple malformations.
1063 This function tries to find every possible one in each call, so multiple
1064 warnings can be raised for the same sequence.
1065 
1066 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
1067 individual types of malformations, such as the sequence being overlong (that
1068 is, when there is a shorter sequence that can express the same code point;
1069 overlong sequences are expressly forbidden in the UTF-8 standard due to
1070 potential security issues).  Another malformation example is the first byte of
1071 a character not being a legal first byte.  See F<utf8.h> for the list of such
1072 flags.  Even if allowed, this function generally returns the Unicode
1073 REPLACEMENT CHARACTER when it encounters a malformation.  There are flags in
1074 F<utf8.h> to override this behavior for the overlong malformations, but don't
1075 do that except for very specialized purposes.
1076 
1077 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
1078 flags) malformation is found.  If this flag is set, the routine assumes that
1079 the caller will raise a warning, and this function will silently just set
1080 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
1081 
1082 Note that this API requires disambiguation between successful decoding a C<NUL>
1083 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
1084 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
1085 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
1086 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
1087 error.  Or you can use C<L</utf8n_to_uvchr_error>>.
1088 
1089 Certain code points are considered problematic.  These are Unicode surrogates,
1090 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
1091 By default these are considered regular code points, but certain situations
1092 warrant special handling for them, which can be specified using the C<flags>
1093 parameter.  If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
1094 three classes are treated as malformations and handled as such.  The flags
1095 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
1096 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
1097 disallow these categories individually.  C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
1098 restricts the allowed inputs to the strict UTF-8 traditionally defined by
1099 Unicode.  Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
1100 definition given by
1101 L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
1102 The difference between traditional strictness and C9 strictness is that the
1103 latter does not forbid non-character code points.  (They are still discouraged,
1104 however.)  For more discussion see L<perlunicode/Noncharacter code points>.
1105 
1106 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
1107 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
1108 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
1109 raised for their respective categories, but otherwise the code points are
1110 considered valid (not malformations).  To get a category to both be treated as
1111 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
1112 (But note that warnings are not raised if lexically disabled nor if
1113 C<UTF8_CHECK_ONLY> is also specified.)
1114 
1115 Extremely high code points were never specified in any standard, and require an
1116 extension to UTF-8 to express, which Perl does.  It is likely that programs
1117 written in something other than Perl would not be able to read files that
1118 contain these; nor would Perl understand files written by something that uses a
1119 different extension.  For these reasons, there is a separate set of flags that
1120 can warn and/or disallow these extremely high code points, even if other
1121 above-Unicode ones are accepted.  They are the C<UTF8_WARN_PERL_EXTENDED> and
1122 C<UTF8_DISALLOW_PERL_EXTENDED> flags.  For more information see
1123 C<L</UTF8_GOT_PERL_EXTENDED>>.  Of course C<UTF8_DISALLOW_SUPER> will treat all
1124 above-Unicode code points, including these, as malformations.
1125 (Note that the Unicode standard considers anything above 0x10FFFF to be
1126 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
1127 (2**31 -1))
1128 
1129 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
1130 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>.  Similarly,
1131 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
1132 C<UTF8_DISALLOW_PERL_EXTENDED>.  The names are misleading because these flags
1133 can apply to code points that actually do fit in 31 bits.  This happens on
1134 EBCDIC platforms, and sometimes when the L<overlong
1135 malformation|/C<UTF8_GOT_LONG>> is also present.  The new names accurately
1136 describe the situation in all cases.
1137 
1138 
1139 All other code points corresponding to Unicode characters, including private
1140 use and those yet to be assigned, are never considered malformed and never
1141 warn.
1142 
1143 =for apidoc Amnh||UTF8_CHECK_ONLY
1144 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1145 =for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
1146 =for apidoc Amnh||UTF8_DISALLOW_SURROGATE
1147 =for apidoc Amnh||UTF8_DISALLOW_NONCHAR
1148 =for apidoc Amnh||UTF8_DISALLOW_SUPER
1149 =for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
1150 =for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
1151 =for apidoc Amnh||UTF8_WARN_SURROGATE
1152 =for apidoc Amnh||UTF8_WARN_NONCHAR
1153 =for apidoc Amnh||UTF8_WARN_SUPER
1154 =for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
1155 =for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
1156 
1157 =cut
1158 
1159 Also implemented as a macro in utf8.h
1160 */
1161 
1162 UV
Perl_utf8n_to_uvchr(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags)1163 Perl_utf8n_to_uvchr(const U8 *s,
1164                     STRLEN curlen,
1165                     STRLEN *retlen,
1166                     const U32 flags)
1167 {
1168     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1169 
1170     return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1171 }
1172 
1173 /*
1174 
1175 =for apidoc utf8n_to_uvchr_error
1176 
1177 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1178 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1179 directly.
1180 
1181 This function is for code that needs to know what the precise malformation(s)
1182 are when an error is found.  If you also need to know the generated warning
1183 messages, use L</utf8n_to_uvchr_msgs>() instead.
1184 
1185 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1186 all the others, C<errors>.  If this parameter is 0, this function behaves
1187 identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
1188 to a C<U32> variable, which this function sets to indicate any errors found.
1189 Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
1190 C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
1191 of these bits will be set if a malformation is found, even if the input
1192 C<flags> parameter indicates that the given malformation is allowed; those
1193 exceptions are noted:
1194 
1195 =over 4
1196 
1197 =item C<UTF8_GOT_PERL_EXTENDED>
1198 
1199 The input sequence is not standard UTF-8, but a Perl extension.  This bit is
1200 set only if the input C<flags> parameter contains either the
1201 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1202 
1203 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1204 and so some extension must be used to express them.  Perl uses a natural
1205 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1206 extension to represent even higher ones, so that any code point that fits in a
1207 64-bit word can be represented.  Text using these extensions is not likely to
1208 be portable to non-Perl code.  We lump both of these extensions together and
1209 refer to them as Perl extended UTF-8.  There exist other extensions that people
1210 have invented, incompatible with Perl's.
1211 
1212 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1213 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1214 than on ASCII.  Prior to that, code points 2**31 and higher were simply
1215 unrepresentable, and a different, incompatible method was used to represent
1216 code points between 2**30 and 2**31 - 1.
1217 
1218 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1219 Perl extended UTF-8 is used.
1220 
1221 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1222 may use for backward compatibility.  That name is misleading, as this flag may
1223 be set when the code point actually does fit in 31 bits.  This happens on
1224 EBCDIC platforms, and sometimes when the L<overlong
1225 malformation|/C<UTF8_GOT_LONG>> is also present.  The new name accurately
1226 describes the situation in all cases.
1227 
1228 =item C<UTF8_GOT_CONTINUATION>
1229 
1230 The input sequence was malformed in that the first byte was a UTF-8
1231 continuation byte.
1232 
1233 =item C<UTF8_GOT_EMPTY>
1234 
1235 The input C<curlen> parameter was 0.
1236 
1237 =item C<UTF8_GOT_LONG>
1238 
1239 The input sequence was malformed in that there is some other sequence that
1240 evaluates to the same code point, but that sequence is shorter than this one.
1241 
1242 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1243 it was discovered that this created security issues.
1244 
1245 =item C<UTF8_GOT_NONCHAR>
1246 
1247 The code point represented by the input UTF-8 sequence is for a Unicode
1248 non-character code point.
1249 This bit is set only if the input C<flags> parameter contains either the
1250 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1251 
1252 =item C<UTF8_GOT_NON_CONTINUATION>
1253 
1254 The input sequence was malformed in that a non-continuation type byte was found
1255 in a position where only a continuation type one should be.  See also
1256 C<L</UTF8_GOT_SHORT>>.
1257 
1258 =item C<UTF8_GOT_OVERFLOW>
1259 
1260 The input sequence was malformed in that it is for a code point that is not
1261 representable in the number of bits available in an IV on the current platform.
1262 
1263 =item C<UTF8_GOT_SHORT>
1264 
1265 The input sequence was malformed in that C<curlen> is smaller than required for
1266 a complete sequence.  In other words, the input is for a partial character
1267 sequence.
1268 
1269 
1270 C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
1271 sequence.  The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
1272 that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
1273 sequence was looked at.   If no other flags are present, it means that the
1274 sequence was valid as far as it went.  Depending on the application, this could
1275 mean one of three things:
1276 
1277 =over
1278 
1279 =item *
1280 
1281 The C<curlen> length parameter passed in was too small, and the function was
1282 prevented from examining all the necessary bytes.
1283 
1284 =item *
1285 
1286 The buffer being looked at is based on reading data, and the data received so
1287 far stopped in the middle of a character, so that the next read will
1288 read the remainder of this character.  (It is up to the caller to deal with the
1289 split bytes somehow.)
1290 
1291 =item *
1292 
1293 This is a real error, and the partial sequence is all we're going to get.
1294 
1295 =back
1296 
1297 =item C<UTF8_GOT_SUPER>
1298 
1299 The input sequence was malformed in that it is for a non-Unicode code point;
1300 that is, one above the legal Unicode maximum.
1301 This bit is set only if the input C<flags> parameter contains either the
1302 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1303 
1304 =item C<UTF8_GOT_SURROGATE>
1305 
1306 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1307 code point.
1308 This bit is set only if the input C<flags> parameter contains either the
1309 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1310 
1311 =back
1312 
1313 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1314 flag to suppress any warnings, and then examine the C<*errors> return.
1315 
1316 =for apidoc Amnh||UTF8_GOT_PERL_EXTENDED
1317 =for apidoc Amnh||UTF8_GOT_CONTINUATION
1318 =for apidoc Amnh||UTF8_GOT_EMPTY
1319 =for apidoc Amnh||UTF8_GOT_LONG
1320 =for apidoc Amnh||UTF8_GOT_NONCHAR
1321 =for apidoc Amnh||UTF8_GOT_NON_CONTINUATION
1322 =for apidoc Amnh||UTF8_GOT_OVERFLOW
1323 =for apidoc Amnh||UTF8_GOT_SHORT
1324 =for apidoc Amnh||UTF8_GOT_SUPER
1325 =for apidoc Amnh||UTF8_GOT_SURROGATE
1326 
1327 =cut
1328 
1329 Also implemented as a macro in utf8.h
1330 */
1331 
1332 UV
Perl_utf8n_to_uvchr_error(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors)1333 Perl_utf8n_to_uvchr_error(const U8 *s,
1334                           STRLEN curlen,
1335                           STRLEN *retlen,
1336                           const U32 flags,
1337                           U32 * errors)
1338 {
1339     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1340 
1341     return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
1342 }
1343 
1344 /*
1345 
1346 =for apidoc utf8n_to_uvchr_msgs
1347 
1348 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1349 Most code should use L</utf8_to_uvchr_buf>() rather than call this
1350 directly.
1351 
1352 This function is for code that needs to know what the precise malformation(s)
1353 are when an error is found, and wants the corresponding warning and/or error
1354 messages to be returned to the caller rather than be displayed.  All messages
1355 that would have been displayed if all lexical warnings are enabled will be
1356 returned.
1357 
1358 It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
1359 placed after all the others, C<msgs>.  If this parameter is 0, this function
1360 behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
1361 be a pointer to an C<AV *> variable, in which this function creates a new AV to
1362 contain any appropriate messages.  The elements of the array are ordered so
1363 that the first message that would have been displayed is in the 0th element,
1364 and so on.  Each element is a hash with three key-value pairs, as follows:
1365 
1366 =over 4
1367 
1368 =item C<text>
1369 
1370 The text of the message as a C<SVpv>.
1371 
1372 =item C<warn_categories>
1373 
1374 The warning category (or categories) packed into a C<SVuv>.
1375 
1376 =item C<flag>
1377 
1378 A single flag bit associated with this message, in a C<SVuv>.
1379 The bit corresponds to some bit in the C<*errors> return value,
1380 such as C<UTF8_GOT_LONG>.
1381 
1382 =back
1383 
1384 It's important to note that specifying this parameter as non-null will cause
1385 any warnings this function would otherwise generate to be suppressed, and
1386 instead be placed in C<*msgs>.  The caller can check the lexical warnings state
1387 (or not) when choosing what to do with the returned messages.
1388 
1389 If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
1390 no AV is created.
1391 
1392 The caller, of course, is responsible for freeing any returned AV.
1393 
1394 =cut
1395 */
1396 
1397 UV
Perl__utf8n_to_uvchr_msgs_helper(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)1398 Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
1399                                STRLEN curlen,
1400                                STRLEN *retlen,
1401                                const U32 flags,
1402                                U32 * errors,
1403                                AV ** msgs)
1404 {
1405     const U8 * const s0 = s;
1406     const U8 * send = s0 + curlen;
1407     U32 possible_problems;  /* A bit is set here for each potential problem
1408                                found as we go along */
1409     UV uv;
1410     STRLEN expectlen;     /* How long should this sequence be? */
1411     STRLEN avail_len;     /* When input is too short, gives what that is */
1412     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
1413                              gets set and discarded */
1414 
1415     /* The below are used only if there is both an overlong malformation and a
1416      * too short one.  Otherwise the first two are set to 's0' and 'send', and
1417      * the third not used at all */
1418     U8 * adjusted_s0;
1419     U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1420                                             routine; see [perl #130921] */
1421     UV uv_so_far;
1422     dTHX;
1423 
1424     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
1425 
1426     /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
1427      * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
1428      * syllables that the dfa doesn't properly handle.  Quickly dispose of the
1429      * final case. */
1430 
1431     /* Each of the affected Hanguls starts with \xED */
1432 
1433     if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */
1434         if (retlen) {
1435             *retlen = 3;
1436         }
1437         if (errors) {
1438             *errors = 0;
1439         }
1440         if (msgs) {
1441             *msgs = NULL;
1442         }
1443 
1444         return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
1445              | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
1446              |  (s0[2] & UTF_CONTINUATION_MASK);
1447     }
1448 
1449     /* In conjunction with the exhaustive tests that can be enabled in
1450      * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
1451      * what it is intended to do, and that no flaws in it are masked by
1452      * dropping down and executing the code below
1453     assert(! isUTF8_CHAR(s0, send)
1454           || UTF8_IS_SURROGATE(s0, send)
1455           || UTF8_IS_SUPER(s0, send)
1456           || UTF8_IS_NONCHAR(s0,send));
1457     */
1458 
1459     s = s0;
1460     possible_problems = 0;
1461     expectlen = 0;
1462     avail_len = 0;
1463     discard_errors = 0;
1464     adjusted_s0 = (U8 *) s0;
1465     uv_so_far = 0;
1466 
1467     if (errors) {
1468         *errors = 0;
1469     }
1470     else {
1471         errors = &discard_errors;
1472     }
1473 
1474     /* The order of malformation tests here is important.  We should consume as
1475      * few bytes as possible in order to not skip any valid character.  This is
1476      * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1477      * https://unicode.org/reports/tr36 for more discussion as to why.  For
1478      * example, once we've done a UTF8SKIP, we can tell the expected number of
1479      * bytes, and could fail right off the bat if the input parameters indicate
1480      * that there are too few available.  But it could be that just that first
1481      * byte is garbled, and the intended character occupies fewer bytes.  If we
1482      * blindly assumed that the first byte is correct, and skipped based on
1483      * that number, we could skip over a valid input character.  So instead, we
1484      * always examine the sequence byte-by-byte.
1485      *
1486      * We also should not consume too few bytes, otherwise someone could inject
1487      * things.  For example, an input could be deliberately designed to
1488      * overflow, and if this code bailed out immediately upon discovering that,
1489      * returning to the caller C<*retlen> pointing to the very next byte (one
1490      * which is actually part of the overflowing sequence), that could look
1491      * legitimate to the caller, which could discard the initial partial
1492      * sequence and process the rest, inappropriately.
1493      *
1494      * Some possible input sequences are malformed in more than one way.  This
1495      * function goes to lengths to try to find all of them.  This is necessary
1496      * for correctness, as the inputs may allow one malformation but not
1497      * another, and if we abandon searching for others after finding the
1498      * allowed one, we could allow in something that shouldn't have been.
1499      */
1500 
1501     if (UNLIKELY(curlen == 0)) {
1502         possible_problems |= UTF8_GOT_EMPTY;
1503         curlen = 0;
1504         uv = UNICODE_REPLACEMENT;
1505         goto ready_to_handle_errors;
1506     }
1507 
1508     /* We now know we can examine the first byte of the input */
1509     expectlen = UTF8SKIP(s);
1510     uv = *s;
1511 
1512     /* A well-formed UTF-8 character, as the vast majority of calls to this
1513      * function will be for, has this expected length.  For efficiency, set
1514      * things up here to return it.  It will be overridden only in those rare
1515      * cases where a malformation is found */
1516     if (retlen) {
1517         *retlen = expectlen;
1518     }
1519 
1520     /* A continuation character can't start a valid sequence */
1521     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1522         possible_problems |= UTF8_GOT_CONTINUATION;
1523         curlen = 1;
1524         uv = UNICODE_REPLACEMENT;
1525         goto ready_to_handle_errors;
1526     }
1527 
1528     /* Here is not a continuation byte, nor an invariant.  The only thing left
1529      * is a start byte (possibly for an overlong).  (We can't use UTF8_IS_START
1530      * because it excludes start bytes like \xC0 that always lead to
1531      * overlongs.) */
1532 
1533     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1534      * that indicate the number of bytes in the character's whole UTF-8
1535      * sequence, leaving just the bits that are part of the value.  */
1536     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1537 
1538     /* Setup the loop end point, making sure to not look past the end of the
1539      * input string, and flag it as too short if the size isn't big enough. */
1540     if (UNLIKELY(curlen < expectlen)) {
1541         possible_problems |= UTF8_GOT_SHORT;
1542         avail_len = curlen;
1543     }
1544     else {
1545         send = (U8*) s0 + expectlen;
1546     }
1547 
1548     /* Now, loop through the remaining bytes in the character's sequence,
1549      * accumulating each into the working value as we go. */
1550     for (s = s0 + 1; s < send; s++) {
1551         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1552             uv = UTF8_ACCUMULATE(uv, *s);
1553             continue;
1554         }
1555 
1556         /* Here, found a non-continuation before processing all expected bytes.
1557          * This byte indicates the beginning of a new character, so quit, even
1558          * if allowing this malformation. */
1559         possible_problems |= UTF8_GOT_NON_CONTINUATION;
1560         break;
1561     } /* End of loop through the character's bytes */
1562 
1563     /* Save how many bytes were actually in the character */
1564     curlen = s - s0;
1565 
1566     /* Note that there are two types of too-short malformation.  One is when
1567      * there is actual wrong data before the normal termination of the
1568      * sequence.  The other is that the sequence wasn't complete before the end
1569      * of the data we are allowed to look at, based on the input 'curlen'.
1570      * This means that we were passed data for a partial character, but it is
1571      * valid as far as we saw.  The other is definitely invalid.  This
1572      * distinction could be important to a caller, so the two types are kept
1573      * separate.
1574      *
1575      * A convenience macro that matches either of the too-short conditions.  */
1576 #   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1577 
1578     if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1579         uv_so_far = uv;
1580         uv = UNICODE_REPLACEMENT;
1581     }
1582 
1583     /* Check for overflow.  The algorithm requires us to not look past the end
1584      * of the current character, even if partial, so the upper limit is 's' */
1585     if (UNLIKELY(0 < does_utf8_overflow(s0, s,
1586                                          1 /* Do consider overlongs */
1587                                         )))
1588     {
1589         possible_problems |= UTF8_GOT_OVERFLOW;
1590         uv = UNICODE_REPLACEMENT;
1591     }
1592 
1593     /* Check for overlong.  If no problems so far, 'uv' is the correct code
1594      * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
1595      * we must look at the UTF-8 byte sequence itself to see if it is for an
1596      * overlong */
1597     if (     (   LIKELY(! possible_problems)
1598               && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1599         || (       UNLIKELY(possible_problems)
1600             && (   UNLIKELY(! UTF8_IS_START(*s0))
1601                 || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
1602     {
1603         possible_problems |= UTF8_GOT_LONG;
1604 
1605         if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
1606 
1607                           /* The calculation in the 'true' branch of this 'if'
1608                            * below won't work if overflows, and isn't needed
1609                            * anyway.  Further below we handle all overflow
1610                            * cases */
1611             &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1612         {
1613             UV min_uv = uv_so_far;
1614             STRLEN i;
1615 
1616             /* Here, the input is both overlong and is missing some trailing
1617              * bytes.  There is no single code point it could be for, but there
1618              * may be enough information present to determine if what we have
1619              * so far is for an unallowed code point, such as for a surrogate.
1620              * The code further below has the intelligence to determine this,
1621              * but just for non-overlong UTF-8 sequences.  What we do here is
1622              * calculate the smallest code point the input could represent if
1623              * there were no too short malformation.  Then we compute and save
1624              * the UTF-8 for that, which is what the code below looks at
1625              * instead of the raw input.  It turns out that the smallest such
1626              * code point is all we need. */
1627             for (i = curlen; i < expectlen; i++) {
1628                 min_uv = UTF8_ACCUMULATE(min_uv,
1629                                 I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
1630             }
1631 
1632             adjusted_s0 = temp_char_buf;
1633             (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1634         }
1635     }
1636 
1637     /* Here, we have found all the possible problems, except for when the input
1638      * is for a problematic code point not allowed by the input parameters. */
1639 
1640                                 /* uv is valid for overlongs */
1641     if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1642                    && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
1643             || (   UNLIKELY(possible_problems)
1644 
1645                           /* if overflow, we know without looking further
1646                            * precisely which of the problematic types it is,
1647                            * and we deal with those in the overflow handling
1648                            * code */
1649                 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1650                 && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1651                     || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
1652         && ((flags & ( UTF8_DISALLOW_NONCHAR
1653                       |UTF8_DISALLOW_SURROGATE
1654                       |UTF8_DISALLOW_SUPER
1655                       |UTF8_DISALLOW_PERL_EXTENDED
1656                       |UTF8_WARN_NONCHAR
1657                       |UTF8_WARN_SURROGATE
1658                       |UTF8_WARN_SUPER
1659                       |UTF8_WARN_PERL_EXTENDED))))
1660     {
1661         /* If there were no malformations, or the only malformation is an
1662          * overlong, 'uv' is valid */
1663         if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1664             if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1665                 possible_problems |= UTF8_GOT_SURROGATE;
1666             }
1667             else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
1668                 possible_problems |= UTF8_GOT_SUPER;
1669             }
1670             else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1671                 possible_problems |= UTF8_GOT_NONCHAR;
1672             }
1673         }
1674         else {  /* Otherwise, need to look at the source UTF-8, possibly
1675                    adjusted to be non-overlong */
1676 
1677             if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1678                                                     > UTF_START_BYTE_110000_))
1679             {
1680                 possible_problems |= UTF8_GOT_SUPER;
1681             }
1682             else if (curlen > 1) {
1683                 if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
1684                                                 == UTF_START_BYTE_110000_
1685                              && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
1686                                                 >= UTF_FIRST_CONT_BYTE_110000_))
1687                 {
1688                     possible_problems |= UTF8_GOT_SUPER;
1689                 }
1690                 else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
1691                     possible_problems |= UTF8_GOT_SURROGATE;
1692                 }
1693             }
1694 
1695             /* We need a complete well-formed UTF-8 character to discern
1696              * non-characters, so can't look for them here */
1697         }
1698     }
1699 
1700   ready_to_handle_errors:
1701 
1702     /* At this point:
1703      * curlen               contains the number of bytes in the sequence that
1704      *                      this call should advance the input by.
1705      * avail_len            gives the available number of bytes passed in, but
1706      *                      only if this is less than the expected number of
1707      *                      bytes, based on the code point's start byte.
1708      * possible_problems    is 0 if there weren't any problems; otherwise a bit
1709      *                      is set in it for each potential problem found.
1710      * uv                   contains the code point the input sequence
1711      *                      represents; or if there is a problem that prevents
1712      *                      a well-defined value from being computed, it is
1713      *                      some substitute value, typically the REPLACEMENT
1714      *                      CHARACTER.
1715      * s0                   points to the first byte of the character
1716      * s                    points to just after where we left off processing
1717      *                      the character
1718      * send                 points to just after where that character should
1719      *                      end, based on how many bytes the start byte tells
1720      *                      us should be in it, but no further than s0 +
1721      *                      avail_len
1722      */
1723 
1724     if (UNLIKELY(possible_problems)) {
1725         bool disallowed = FALSE;
1726         const U32 orig_problems = possible_problems;
1727 
1728         if (msgs) {
1729             *msgs = NULL;
1730         }
1731 
1732         while (possible_problems) { /* Handle each possible problem */
1733             U32 pack_warn = 0;
1734             char * message = NULL;
1735             U32 this_flag_bit = 0;
1736 
1737             /* Each 'if' clause handles one problem.  They are ordered so that
1738              * the first ones' messages will be displayed before the later
1739              * ones; this is kinda in decreasing severity order.  But the
1740              * overlong must come last, as it changes 'uv' looked at by the
1741              * others */
1742             if (possible_problems & UTF8_GOT_OVERFLOW) {
1743 
1744                 /* Overflow means also got a super and are using Perl's
1745                  * extended UTF-8, but we handle all three cases here */
1746                 possible_problems
1747                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1748                 *errors |= UTF8_GOT_OVERFLOW;
1749 
1750                 /* But the API says we flag all errors found */
1751                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1752                     *errors |= UTF8_GOT_SUPER;
1753                 }
1754                 if (flags
1755                         & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1756                 {
1757                     *errors |= UTF8_GOT_PERL_EXTENDED;
1758                 }
1759 
1760                 /* Disallow if any of the three categories say to */
1761                 if ( ! (flags &   UTF8_ALLOW_OVERFLOW)
1762                     || (flags & ( UTF8_DISALLOW_SUPER
1763                                  |UTF8_DISALLOW_PERL_EXTENDED)))
1764                 {
1765                     disallowed = TRUE;
1766                 }
1767 
1768                 /* Likewise, warn if any say to */
1769                 if (  ! (flags & UTF8_ALLOW_OVERFLOW)
1770                     ||  (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1771                 {
1772 
1773                     /* The warnings code explicitly says it doesn't handle the
1774                      * case of packWARN2 and two categories which have
1775                      * parent-child relationship.  Even if it works now to
1776                      * raise the warning if either is enabled, it wouldn't
1777                      * necessarily do so in the future.  We output (only) the
1778                      * most dire warning */
1779                     if (! (flags & UTF8_CHECK_ONLY)) {
1780                         if (msgs || ckWARN_d(WARN_UTF8)) {
1781                             pack_warn = packWARN(WARN_UTF8);
1782                         }
1783                         else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
1784                             pack_warn = packWARN(WARN_NON_UNICODE);
1785                         }
1786                         if (pack_warn) {
1787                             message = Perl_form(aTHX_ "%s: %s (overflows)",
1788                                             malformed_text,
1789                                             _byte_dump_string(s0, curlen, 0));
1790                             this_flag_bit = UTF8_GOT_OVERFLOW;
1791                         }
1792                     }
1793                 }
1794             }
1795             else if (possible_problems & UTF8_GOT_EMPTY) {
1796                 possible_problems &= ~UTF8_GOT_EMPTY;
1797                 *errors |= UTF8_GOT_EMPTY;
1798 
1799                 if (! (flags & UTF8_ALLOW_EMPTY)) {
1800 
1801                     /* This so-called malformation is now treated as a bug in
1802                      * the caller.  If you have nothing to decode, skip calling
1803                      * this function */
1804                     assert(0);
1805 
1806                     disallowed = TRUE;
1807                     if (  (msgs
1808                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1809                     {
1810                         pack_warn = packWARN(WARN_UTF8);
1811                         message = Perl_form(aTHX_ "%s (empty string)",
1812                                                    malformed_text);
1813                         this_flag_bit = UTF8_GOT_EMPTY;
1814                     }
1815                 }
1816             }
1817             else if (possible_problems & UTF8_GOT_CONTINUATION) {
1818                 possible_problems &= ~UTF8_GOT_CONTINUATION;
1819                 *errors |= UTF8_GOT_CONTINUATION;
1820 
1821                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1822                     disallowed = TRUE;
1823                     if ((   msgs
1824                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1825                     {
1826                         pack_warn = packWARN(WARN_UTF8);
1827                         message = Perl_form(aTHX_
1828                                 "%s: %s (unexpected continuation byte 0x%02x,"
1829                                 " with no preceding start byte)",
1830                                 malformed_text,
1831                                 _byte_dump_string(s0, 1, 0), *s0);
1832                         this_flag_bit = UTF8_GOT_CONTINUATION;
1833                     }
1834                 }
1835             }
1836             else if (possible_problems & UTF8_GOT_SHORT) {
1837                 possible_problems &= ~UTF8_GOT_SHORT;
1838                 *errors |= UTF8_GOT_SHORT;
1839 
1840                 if (! (flags & UTF8_ALLOW_SHORT)) {
1841                     disallowed = TRUE;
1842                     if ((   msgs
1843                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1844                     {
1845                         pack_warn = packWARN(WARN_UTF8);
1846                         message = Perl_form(aTHX_
1847                              "%s: %s (too short; %d byte%s available, need %d)",
1848                              malformed_text,
1849                              _byte_dump_string(s0, send - s0, 0),
1850                              (int)avail_len,
1851                              avail_len == 1 ? "" : "s",
1852                              (int)expectlen);
1853                         this_flag_bit = UTF8_GOT_SHORT;
1854                     }
1855                 }
1856 
1857             }
1858             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1859                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1860                 *errors |= UTF8_GOT_NON_CONTINUATION;
1861 
1862                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1863                     disallowed = TRUE;
1864                     if ((   msgs
1865                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
1866                     {
1867 
1868                         /* If we don't know for sure that the input length is
1869                          * valid, avoid as much as possible reading past the
1870                          * end of the buffer */
1871                         int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1872                                        ? (int) (s - s0)
1873                                        : (int) (send - s0);
1874                         pack_warn = packWARN(WARN_UTF8);
1875                         message = Perl_form(aTHX_ "%s",
1876                             unexpected_non_continuation_text(s0,
1877                                                             printlen,
1878                                                             s - s0,
1879                                                             (int) expectlen));
1880                         this_flag_bit = UTF8_GOT_NON_CONTINUATION;
1881                     }
1882                 }
1883             }
1884             else if (possible_problems & UTF8_GOT_SURROGATE) {
1885                 possible_problems &= ~UTF8_GOT_SURROGATE;
1886 
1887                 if (flags & UTF8_WARN_SURROGATE) {
1888                     *errors |= UTF8_GOT_SURROGATE;
1889 
1890                     if (   ! (flags & UTF8_CHECK_ONLY)
1891                         && (msgs || ckWARN_d(WARN_SURROGATE)))
1892                     {
1893                         pack_warn = packWARN(WARN_SURROGATE);
1894 
1895                         /* These are the only errors that can occur with a
1896                         * surrogate when the 'uv' isn't valid */
1897                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1898                             message = Perl_form(aTHX_
1899                                     "UTF-16 surrogate (any UTF-8 sequence that"
1900                                     " starts with \"%s\" is for a surrogate)",
1901                                     _byte_dump_string(s0, curlen, 0));
1902                         }
1903                         else {
1904                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
1905                         }
1906                         this_flag_bit = UTF8_GOT_SURROGATE;
1907                     }
1908                 }
1909 
1910                 if (flags & UTF8_DISALLOW_SURROGATE) {
1911                     disallowed = TRUE;
1912                     *errors |= UTF8_GOT_SURROGATE;
1913                 }
1914             }
1915             else if (possible_problems & UTF8_GOT_SUPER) {
1916                 possible_problems &= ~UTF8_GOT_SUPER;
1917 
1918                 if (flags & UTF8_WARN_SUPER) {
1919                     *errors |= UTF8_GOT_SUPER;
1920 
1921                     if (   ! (flags & UTF8_CHECK_ONLY)
1922                         && (msgs || ckWARN_d(WARN_NON_UNICODE)))
1923                     {
1924                         pack_warn = packWARN(WARN_NON_UNICODE);
1925 
1926                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
1927                             message = Perl_form(aTHX_
1928                                     "Any UTF-8 sequence that starts with"
1929                                     " \"%s\" is for a non-Unicode code point,"
1930                                     " may not be portable",
1931                                     _byte_dump_string(s0, curlen, 0));
1932                         }
1933                         else {
1934                             message = Perl_form(aTHX_ super_cp_format, uv);
1935                         }
1936                         this_flag_bit = UTF8_GOT_SUPER;
1937                     }
1938                 }
1939 
1940                 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1941                  * and before possibly bailing out, so that the more dire
1942                  * warning will override the regular one. */
1943                 if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
1944                     if (  ! (flags & UTF8_CHECK_ONLY)
1945                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1946                         &&  (msgs || (   ckWARN_d(WARN_NON_UNICODE)
1947                                       || ckWARN(WARN_PORTABLE))))
1948                     {
1949                         pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
1950 
1951                         /* If it is an overlong that evaluates to a code point
1952                          * that doesn't have to use the Perl extended UTF-8, it
1953                          * still used it, and so we output a message that
1954                          * doesn't refer to the code point.  The same is true
1955                          * if there was a SHORT malformation where the code
1956                          * point is not valid.  In that case, 'uv' will have
1957                          * been set to the REPLACEMENT CHAR, and the message
1958                          * below without the code point in it will be selected
1959                          * */
1960                         if (UNICODE_IS_PERL_EXTENDED(uv)) {
1961                             message = Perl_form(aTHX_
1962                                             PL_extended_cp_format, uv);
1963                         }
1964                         else {
1965                             message = Perl_form(aTHX_
1966                                         "Any UTF-8 sequence that starts with"
1967                                         " \"%s\" is a Perl extension, and"
1968                                         " so is not portable",
1969                                         _byte_dump_string(s0, curlen, 0));
1970                         }
1971                         this_flag_bit = UTF8_GOT_PERL_EXTENDED;
1972                     }
1973 
1974                     if (flags & ( UTF8_WARN_PERL_EXTENDED
1975                                  |UTF8_DISALLOW_PERL_EXTENDED))
1976                     {
1977                         *errors |= UTF8_GOT_PERL_EXTENDED;
1978 
1979                         if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1980                             disallowed = TRUE;
1981                         }
1982                     }
1983                 }
1984 
1985                 if (flags & UTF8_DISALLOW_SUPER) {
1986                     *errors |= UTF8_GOT_SUPER;
1987                     disallowed = TRUE;
1988                 }
1989             }
1990             else if (possible_problems & UTF8_GOT_NONCHAR) {
1991                 possible_problems &= ~UTF8_GOT_NONCHAR;
1992 
1993                 if (flags & UTF8_WARN_NONCHAR) {
1994                     *errors |= UTF8_GOT_NONCHAR;
1995 
1996                     if (  ! (flags & UTF8_CHECK_ONLY)
1997                         && (msgs || ckWARN_d(WARN_NONCHAR)))
1998                     {
1999                         /* The code above should have guaranteed that we don't
2000                          * get here with errors other than overlong */
2001                         assert (! (orig_problems
2002                                         & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
2003 
2004                         pack_warn = packWARN(WARN_NONCHAR);
2005                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
2006                         this_flag_bit = UTF8_GOT_NONCHAR;
2007                     }
2008                 }
2009 
2010                 if (flags & UTF8_DISALLOW_NONCHAR) {
2011                     disallowed = TRUE;
2012                     *errors |= UTF8_GOT_NONCHAR;
2013                 }
2014             }
2015             else if (possible_problems & UTF8_GOT_LONG) {
2016                 possible_problems &= ~UTF8_GOT_LONG;
2017                 *errors |= UTF8_GOT_LONG;
2018 
2019                 if (flags & UTF8_ALLOW_LONG) {
2020 
2021                     /* We don't allow the actual overlong value, unless the
2022                      * special extra bit is also set */
2023                     if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
2024                                     & ~UTF8_ALLOW_LONG)))
2025                     {
2026                         uv = UNICODE_REPLACEMENT;
2027                     }
2028                 }
2029                 else {
2030                     disallowed = TRUE;
2031 
2032                     if ((   msgs
2033                          || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
2034                     {
2035                         pack_warn = packWARN(WARN_UTF8);
2036 
2037                         /* These error types cause 'uv' to be something that
2038                          * isn't what was intended, so can't use it in the
2039                          * message.  The other error types either can't
2040                          * generate an overlong, or else the 'uv' is valid */
2041                         if (orig_problems &
2042                                         (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
2043                         {
2044                             message = Perl_form(aTHX_
2045                                     "%s: %s (any UTF-8 sequence that starts"
2046                                     " with \"%s\" is overlong which can and"
2047                                     " should be represented with a"
2048                                     " different, shorter sequence)",
2049                                     malformed_text,
2050                                     _byte_dump_string(s0, send - s0, 0),
2051                                     _byte_dump_string(s0, curlen, 0));
2052                         }
2053                         else {
2054                             U8 tmpbuf[UTF8_MAXBYTES+1];
2055                             const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
2056                                                                         uv, 0);
2057                             /* Don't use U+ for non-Unicode code points, which
2058                              * includes those in the Latin1 range */
2059                             const char * preface = (   UNICODE_IS_SUPER(uv)
2060 #ifdef EBCDIC
2061                                                     || uv <= 0xFF
2062 #endif
2063                                                    )
2064                                                    ? "0x"
2065                                                    : "U+";
2066                             message = Perl_form(aTHX_
2067                                 "%s: %s (overlong; instead use %s to represent"
2068                                 " %s%0*" UVXf ")",
2069                                 malformed_text,
2070                                 _byte_dump_string(s0, send - s0, 0),
2071                                 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
2072                                 preface,
2073                                 ((uv < 256) ? 2 : 4), /* Field width of 2 for
2074                                                          small code points */
2075                                 UNI_TO_NATIVE(uv));
2076                         }
2077                         this_flag_bit = UTF8_GOT_LONG;
2078                     }
2079                 }
2080             } /* End of looking through the possible flags */
2081 
2082             /* Display the message (if any) for the problem being handled in
2083              * this iteration of the loop */
2084             if (message) {
2085                 if (msgs) {
2086                     assert(this_flag_bit);
2087 
2088                     if (*msgs == NULL) {
2089                         *msgs = newAV();
2090                     }
2091 
2092                     av_push(*msgs, newRV_noinc((SV*) new_msg_hv(message,
2093                                                                 pack_warn,
2094                                                                 this_flag_bit)));
2095                 }
2096                 else if (PL_op)
2097                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
2098                                                  OP_DESC(PL_op));
2099                 else
2100                     Perl_warner(aTHX_ pack_warn, "%s", message);
2101             }
2102         }   /* End of 'while (possible_problems)' */
2103 
2104         /* Since there was a possible problem, the returned length may need to
2105          * be changed from the one stored at the beginning of this function.
2106          * Instead of trying to figure out if it has changed, just do it. */
2107         if (retlen) {
2108             *retlen = curlen;
2109         }
2110 
2111         if (disallowed) {
2112             if (flags & UTF8_CHECK_ONLY && retlen) {
2113                 *retlen = ((STRLEN) -1);
2114             }
2115             return 0;
2116         }
2117     }
2118 
2119     return UNI_TO_NATIVE(uv);
2120 }
2121 
2122 /*
2123 =for apidoc utf8_to_uvchr_buf
2124 
2125 Returns the native code point of the first character in the string C<s> which
2126 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
2127 C<*retlen> will be set to the length, in bytes, of that character.
2128 
2129 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
2130 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
2131 C<NULL>) to -1.  If those warnings are off, the computed value, if well-defined
2132 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
2133 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
2134 the next possible position in C<s> that could begin a non-malformed character.
2135 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
2136 returned.
2137 
2138 =cut
2139 
2140 Also implemented as a macro in utf8.h
2141 
2142 */
2143 
2144 
2145 UV
Perl_utf8_to_uvchr_buf(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)2146 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2147 {
2148     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
2149 
2150     return utf8_to_uvchr_buf_helper(s, send, retlen);
2151 }
2152 
2153 /*
2154 =for apidoc utf8_length
2155 
2156 Returns the number of characters in the sequence of UTF-8-encoded bytes starting
2157 at C<s> and ending at the byte just before C<e>.  If <s> and <e> point to the
2158 same place, it returns 0 with no warning raised.
2159 
2160 If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
2161 and returns the number of valid characters.
2162 
2163 =cut
2164 
2165     For long strings we process the input word-at-a-time, and count
2166     continuations, instead of otherwise counting characters and using UTF8SKIP
2167     to find the next one.  If our input were 13-byte characters, the per-word
2168     would be a loser, as we would be doing things in 8 byte chunks (or 4 on a
2169     32-bit platform).  But the maximum legal Unicode code point is 4 bytes, and
2170     most text will have a significant number of 1 and 2 byte characters, so the
2171     per-word is generally a winner.
2172 
2173     There are start-up and finish costs with the per-word method, so we use the
2174     standard method unless the input has a relatively large length.
2175 */
2176 
2177 STRLEN
Perl_utf8_length(pTHX_ const U8 * const s0,const U8 * const e)2178 Perl_utf8_length(pTHX_ const U8 * const s0, const U8 * const e)
2179 {
2180     STRLEN continuations = 0;
2181     STRLEN len = 0;
2182     const U8 * s = s0;
2183 
2184     PERL_ARGS_ASSERT_UTF8_LENGTH;
2185 
2186     /* For EBCDCIC and short strings, we count the characters.  The boundary
2187      * was determined by eyeballing the output of Porting/bench.pl and
2188      * choosing a number where the continuations method gave better results (on
2189      * a 64 bit system, khw not having access to a 32 bit system with
2190      * cachegrind).  The number isn't critical, as at these sizes, the total
2191      * time spent isn't large either way */
2192 
2193 #ifndef EBCDIC
2194 
2195     if (e - s0 < 96)
2196 
2197 #endif
2198 
2199     {
2200         while (s < e) { /* Count characters directly */
2201 
2202             /* Take extra care to not exceed 'e' (which would be undefined
2203              * behavior) should the input be malformed, with a partial
2204              * character at the end */
2205             Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2206             if (UNLIKELY(e - s  < expected_byte_count)) {
2207                 goto warn_and_return;
2208             }
2209 
2210             len++;
2211             s += expected_byte_count;
2212         }
2213 
2214         if (LIKELY(e == s)) {
2215             return len;
2216         }
2217 
2218       warn_and_return:
2219         if (ckWARN_d(WARN_UTF8)) {
2220             if (PL_op)
2221                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
2222                              "%s in %s", unees, OP_DESC(PL_op));
2223             else
2224                 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2225         }
2226 
2227         return s - s0;
2228     }
2229 
2230 #ifndef EBCDIC
2231 
2232     /* Count continuations, word-at-a-time.
2233      *
2234      * We need to stop before the final start character in order to
2235      * preserve the limited error checking that's always been done */
2236     const U8 * e_limit = e - UTF8_MAXBYTES;
2237 
2238     /* Points to the first byte >=s which is positioned at a word boundary.  If
2239      * s is on a word boundary, it is s, otherwise it is to the next word. */
2240     const U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2241                                     - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2242 
2243     /* Process up to a full word boundary. */
2244     while (s < partial_word_end) {
2245         const Size_t skip = UTF8SKIP(s);
2246 
2247         continuations += skip - 1;
2248         s += skip;
2249     }
2250 
2251     /* Adjust back down any overshoot */
2252     continuations -= s - partial_word_end;
2253     s = partial_word_end;
2254 
2255     do { /* Process per-word */
2256 
2257         /* The idea for counting continuation bytes came from
2258          * http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
2259          * One thing it does that this doesn't is to prefetch the buffer
2260          *      __builtin_prefetch(&s[256], 0, 0);
2261          *
2262          * A continuation byte has the upper 2 bits be '10', and the rest
2263          * dont-cares.  The VARIANTS mask zeroes out all but the upper bit of
2264          * each byte in the word.  That gets shifted to the byte's lowest bit,
2265          * and 'anded' with the complement of the 2nd highest bit of the byte,
2266          * which has also been shifted to that position.  Hence the bit in that
2267          * position will be 1 iff the upper bit is 1 and the next one is 0.  We
2268          * then use the same integer multiplcation and shifting that are used
2269          * in variant_under_utf8_count() to count how many of those are set in
2270          * the word. */
2271 
2272         continuations += (((((* (const PERL_UINTMAX_T *) s)
2273                                             & PERL_VARIANTS_WORD_MASK) >> 7)
2274                       & (((~ (* (const PERL_UINTMAX_T *) s))) >> 6))
2275                   * PERL_COUNT_MULTIPLIER)
2276                 >> ((PERL_WORDSIZE - 1) * CHARBITS);
2277         s += PERL_WORDSIZE;
2278     } while (s + PERL_WORDSIZE <= e_limit);
2279 
2280     /* Process remainder per-byte */
2281     while (s < e) {
2282 	if (UTF8_IS_CONTINUATION(*s)) {
2283             continuations++;
2284             s++;
2285             continue;
2286         }
2287 
2288         /* Here is a starter byte.  Use UTF8SKIP from now on */
2289         do {
2290             Ptrdiff_t expected_byte_count = UTF8SKIP(s);
2291             if (UNLIKELY(e - s  < expected_byte_count)) {
2292                 break;
2293             }
2294 
2295             continuations += expected_byte_count- 1;
2296             s += expected_byte_count;
2297         } while (s < e);
2298 
2299         break;
2300     }
2301 
2302 #  endif
2303 
2304     if (LIKELY(e == s)) {
2305         return s - s0 - continuations;
2306     }
2307 
2308     /* Convert to characters */
2309     s -= continuations;
2310 
2311     goto warn_and_return;
2312 }
2313 
2314 /*
2315 =for apidoc bytes_cmp_utf8
2316 
2317 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
2318 sequence of characters (stored as UTF-8)
2319 in C<u>, C<ulen>.  Returns 0 if they are
2320 equal, -1 or -2 if the first string is less than the second string, +1 or +2
2321 if the first string is greater than the second string.
2322 
2323 -1 or +1 is returned if the shorter string was identical to the start of the
2324 longer string.  -2 or +2 is returned if
2325 there was a difference between characters
2326 within the strings.
2327 
2328 =cut
2329 */
2330 
2331 int
Perl_bytes_cmp_utf8(pTHX_ const U8 * b,STRLEN blen,const U8 * u,STRLEN ulen)2332 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
2333 {
2334     const U8 *const bend = b + blen;
2335     const U8 *const uend = u + ulen;
2336 
2337     PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
2338 
2339     while (b < bend && u < uend) {
2340         U8 c = *u++;
2341         if (!UTF8_IS_INVARIANT(c)) {
2342             if (UTF8_IS_DOWNGRADEABLE_START(c)) {
2343                 if (u < uend) {
2344                     U8 c1 = *u++;
2345                     if (UTF8_IS_CONTINUATION(c1)) {
2346                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
2347                     } else {
2348                         /* diag_listed_as: Malformed UTF-8 character%s */
2349                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2350                               "%s %s%s",
2351                               unexpected_non_continuation_text(u - 2, 2, 1, 2),
2352                               PL_op ? " in " : "",
2353                               PL_op ? OP_DESC(PL_op) : "");
2354                         return -2;
2355                     }
2356                 } else {
2357                     if (PL_op)
2358                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
2359                                          "%s in %s", unees, OP_DESC(PL_op));
2360                     else
2361                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
2362                     return -2; /* Really want to return undef :-)  */
2363                 }
2364             } else {
2365                 return -2;
2366             }
2367         }
2368         if (*b != c) {
2369             return *b < c ? -2 : +2;
2370         }
2371         ++b;
2372     }
2373 
2374     if (b == bend && u == uend)
2375         return 0;
2376 
2377     return b < bend ? +1 : -1;
2378 }
2379 
2380 /*
2381 =for apidoc utf8_to_bytes
2382 
2383 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
2384 Unlike L</bytes_to_utf8>, this over-writes the original string, and
2385 updates C<*lenp> to contain the new length.
2386 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
2387 
2388 Upon successful return, the number of variants in the string can be computed by
2389 having saved the value of C<*lenp> before the call, and subtracting the
2390 after-call value of C<*lenp> from it.
2391 
2392 If you need a copy of the string, see L</bytes_from_utf8>.
2393 
2394 =cut
2395 */
2396 
2397 U8 *
Perl_utf8_to_bytes(pTHX_ U8 * s,STRLEN * lenp)2398 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2399 {
2400     U8 * first_variant;
2401 
2402     PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2403     PERL_UNUSED_CONTEXT;
2404 
2405     /* This is a no-op if no variants at all in the input */
2406     if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2407         return s;
2408     }
2409 
2410     /* Nothing before 'first_variant' needs to be changed, so start the real
2411      * work there */
2412 
2413     U8 * const save = s;
2414     U8 * const send = s + *lenp;
2415     U8 * d;
2416 
2417 #ifndef EBCDIC      /* The below relies on the bit patterns of UTF-8 */
2418 
2419     /* There is some start-up/tear-down overhead with this, so no real gain
2420      * unless the string is long enough.  The current value is just a
2421      * guess. */
2422     if (*lenp > 5 * PERL_WORDSIZE) {
2423 
2424         /* First, go through the string a word at-a-time to verify that it is
2425          * downgradable.  If it contains any start byte besides C2 and C3, then
2426          * it isn't. */
2427 
2428         const PERL_UINTMAX_T C0_mask = PERL_COUNT_MULTIPLIER * 0xC0;
2429         const PERL_UINTMAX_T C2_mask = PERL_COUNT_MULTIPLIER * 0xC2;
2430         const PERL_UINTMAX_T FE_mask = PERL_COUNT_MULTIPLIER * 0xFE;
2431 
2432         /* Points to the first byte >=s which is positioned at a word boundary.
2433          * If s is on a word boundary, it is s, otherwise it is the first byte
2434          * of the next word. */
2435         U8 * partial_word_end = s + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
2436                                 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK);
2437 
2438         /* Here there is at least a full word beyond the first word boundary.
2439          * Process up to that boundary. */
2440         while (s < partial_word_end) {
2441             if (! UTF8_IS_INVARIANT(*s)) {
2442                 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2443                     *lenp = ((STRLEN) -1);
2444                     return NULL;
2445                 }
2446                 s++;
2447             }
2448             s++;
2449         }
2450 
2451         /* Adjust back down any overshoot */
2452         s = partial_word_end;
2453 
2454         /* Process per-word */
2455         do {
2456 
2457             PERL_UINTMAX_T C2_C3_start_bytes;
2458 
2459             /* First find the bytes that are start bytes.  ANDing with
2460              * C0C0...C0 causes any start byte to become C0; any other byte
2461              * becomes something else.  Then XORing with C0 causes any start
2462              * byte to become 0; all other bytes non-zero. */
2463             PERL_UINTMAX_T start_bytes
2464                           = ((* (PERL_UINTMAX_T *) s) & C0_mask) ^ C0_mask;
2465 
2466             /* These shifts causes the most significant bit to be set to 1 for
2467              * any bytes in the word that aren't completely 0.  Hence after
2468              * these, only the start bytes have 0 in their msb */
2469             start_bytes |= start_bytes << 1;
2470             start_bytes |= start_bytes << 2;
2471             start_bytes |= start_bytes << 4;
2472 
2473             /* When we complement, then AND with 8080...80, the start bytes
2474              * will have 1 in their msb, and all other bits are 0 */
2475             start_bytes = ~ start_bytes & PERL_VARIANTS_WORD_MASK;
2476 
2477             /* Now repeat the procedure, but look for bytes that match only
2478              * C2-C3. */
2479             C2_C3_start_bytes = ((* (PERL_UINTMAX_T *) s) & FE_mask)
2480                                                                 ^ C2_mask;
2481             C2_C3_start_bytes |= C2_C3_start_bytes << 1;
2482             C2_C3_start_bytes |= C2_C3_start_bytes << 2;
2483             C2_C3_start_bytes |= C2_C3_start_bytes << 4;
2484             C2_C3_start_bytes = ~ C2_C3_start_bytes
2485                                 & PERL_VARIANTS_WORD_MASK;
2486 
2487             /* Here, start_bytes has a 1 in the msb of each byte that has a
2488              *                                              start_byte; And
2489              * C2_C3_start_bytes has a 1 in the msb of each byte that has a
2490              *                                       start_byte of C2 or C3
2491              * If they're not equal, there are start bytes that aren't C2
2492              * nor C3, hence this is not downgradable */
2493             if (start_bytes != C2_C3_start_bytes) {
2494                 *lenp = ((STRLEN) -1);
2495                 return NULL;
2496             }
2497 
2498             s += PERL_WORDSIZE;
2499         } while (s + PERL_WORDSIZE <= send);
2500 
2501         /* If the final byte was a start byte, it means that the character
2502          * straddles two words, so back off one to start looking below at the
2503          * first byte of the character  */
2504         if (s > first_variant && UTF8_IS_START(*(s-1))) {
2505             s--;
2506         }
2507     }
2508 
2509 #endif
2510 
2511     /* Do the straggler bytes beyond the final word boundary (or all bytes
2512      * in the case of EBCDIC) */
2513     while (s < send) {
2514         if (! UTF8_IS_INVARIANT(*s)) {
2515             if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2516                 *lenp = ((STRLEN) -1);
2517                 return NULL;
2518             }
2519             s++;
2520         }
2521         s++;
2522     }
2523 
2524     /* Here, we passed the tests above.  For the EBCDIC case, everything
2525      * was well-formed and can be downgraded to non-UTF8.  For non-EBCDIC,
2526      * it means only that all start bytes were C2 or C3, hence any
2527      * well-formed sequences are downgradable.  But we didn't test, for
2528      * example, that there weren't two C2's in a row.  That means that in
2529      * the loop below, we have to be sure things are well-formed.  Because
2530      * this is very very likely, and we don't care about having speedy
2531      * handling of malformed input, the loop proceeds as if well formed,
2532      * and should a malformed one come along, it undoes what it already has
2533      * done */
2534 
2535     d = s = first_variant;
2536 
2537     while (s < send) {
2538         U8 * s1;
2539 
2540         if (UVCHR_IS_INVARIANT(*s)) {
2541             *d++ = *s++;
2542             continue;
2543         }
2544 
2545         /* Here it is two-byte encoded. */
2546         if (   LIKELY(UTF8_IS_DOWNGRADEABLE_START(*s))
2547             && LIKELY(UTF8_IS_CONTINUATION((s[1]))))
2548         {
2549             U8 first_byte = *s++;
2550             *d++ = EIGHT_BIT_UTF8_TO_NATIVE(first_byte, *s);
2551             s++;
2552             continue;
2553         }
2554 
2555         /* Here, it is malformed.  This shouldn't happen on EBCDIC, and on
2556          * ASCII platforms, we know that the only start bytes in the text
2557          * are C2 and C3, and the code above has made sure that it doesn't
2558          * end with a start byte.  That means the only malformations that
2559          * are possible are a start byte without a continuation (either
2560          * followed by another start byte or an invariant) or an unexpected
2561          * continuation.
2562          *
2563          * We have to undo all we've done before, back down to the first
2564          * UTF-8 variant.  Note that each 2-byte variant we've done so far
2565          * (converted to single byte) slides things to the left one byte,
2566          * and so we have bytes that haven't been written over.
2567          *
2568          * Here, 'd' points to the next position to overwrite, and 's'
2569          * points to the first invalid byte.  That means 'd's contents
2570          * haven't been changed yet, nor has anything else beyond it in the
2571          * string.  In restoring to the original contents, we don't need to
2572          * do anything past (d-1).
2573          *
2574          * In particular, the bytes from 'd' to 's' have not been changed.
2575          * This loop uses a new variable 's1' (to avoid confusing 'source'
2576          * and 'destination') set to 'd',  and moves 's' and 's1' in lock
2577          * step back so that afterwards, 's1' points to the first changed
2578          * byte that will be the source for the first byte (or bytes) at
2579          * 's' that need to be changed back.  Note that s1 can expand to
2580          * two bytes */
2581         s1 = d;
2582         while (s >= d) {
2583             s--;
2584             if (! UVCHR_IS_INVARIANT(*s1)) {
2585                 s--;
2586             }
2587             s1--;
2588         }
2589 
2590         /* Do the changing back */
2591         while (s1 >= first_variant) {
2592             if (UVCHR_IS_INVARIANT(*s1)) {
2593                 *s-- = *s1--;
2594             }
2595             else {
2596                 *s-- = UTF8_EIGHT_BIT_LO(*s1);
2597                 *s-- = UTF8_EIGHT_BIT_HI(*s1);
2598                 s1--;
2599             }
2600         }
2601 
2602         *lenp = ((STRLEN) -1);
2603         return NULL;
2604     }
2605 
2606     /* Success! */
2607     *d = '\0';
2608     *lenp = d - save;
2609 
2610     return save;
2611 }
2612 
2613 /*
2614 =for apidoc bytes_from_utf8
2615 
2616 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2617 byte encoding.  On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2618 actually encoded in UTF-8.
2619 
2620 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2621 the input string.
2622 
2623 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2624 not expressible in native byte encoding.  In these cases, C<*is_utf8p> and
2625 C<*lenp> are unchanged, and the return value is the original C<s>.
2626 
2627 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2628 newly created string containing a downgraded copy of C<s>, and whose length is
2629 returned in C<*lenp>, updated.  The new string is C<NUL>-terminated.  The
2630 caller is responsible for arranging for the memory used by this string to get
2631 freed.
2632 
2633 Upon successful return, the number of variants in the string can be computed by
2634 having saved the value of C<*lenp> before the call, and subtracting the
2635 after-call value of C<*lenp> from it.
2636 
2637 =cut
2638 
2639 There is a macro that avoids this function call, but this is retained for
2640 anyone who calls it with the Perl_ prefix */
2641 
2642 U8 *
Perl_bytes_from_utf8(pTHX_ const U8 * s,STRLEN * lenp,bool * is_utf8p)2643 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2644 {
2645     PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2646     PERL_UNUSED_CONTEXT;
2647 
2648     return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2649 }
2650 
2651 /*
2652 =for apidoc bytes_from_utf8_loc
2653 
2654 Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
2655 to where to store the location of the first character in C<"s"> that cannot be
2656 converted to non-UTF8.
2657 
2658 If that parameter is C<NULL>, this function behaves identically to
2659 C<bytes_from_utf8>.
2660 
2661 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2662 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2663 
2664 Otherwise, the function returns a newly created C<NUL>-terminated string
2665 containing the non-UTF8 equivalent of the convertible first portion of
2666 C<"s">.  C<*lenp> is set to its length, not including the terminating C<NUL>.
2667 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2668 and C<*first_non_downgradable> is set to C<NULL>.
2669 
2670 Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
2671 first character in the original string that wasn't converted.  C<*is_utf8p> is
2672 unchanged.  Note that the new string may have length 0.
2673 
2674 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2675 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2676 converts as many characters in it as possible stopping at the first one it
2677 finds that can't be converted to non-UTF-8.  C<*first_non_downgradable> is
2678 set to point to that.  The function returns the portion that could be converted
2679 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2680 not including the terminating C<NUL>.  If the very first character in the
2681 original could not be converted, C<*lenp> will be 0, and the new string will
2682 contain just a single C<NUL>.  If the entire input string was converted,
2683 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2684 
2685 Upon successful return, the number of variants in the converted portion of the
2686 string can be computed by having saved the value of C<*lenp> before the call,
2687 and subtracting the after-call value of C<*lenp> from it.
2688 
2689 =cut
2690 
2691 
2692 */
2693 
2694 U8 *
Perl_bytes_from_utf8_loc(const U8 * s,STRLEN * lenp,bool * is_utf8p,const U8 ** first_unconverted)2695 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2696 {
2697     U8 *d;
2698     const U8 *original = s;
2699     U8 *converted_start;
2700     const U8 *send = s + *lenp;
2701 
2702     PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2703 
2704     if (! *is_utf8p) {
2705         if (first_unconverted) {
2706             *first_unconverted = NULL;
2707         }
2708 
2709         return (U8 *) original;
2710     }
2711 
2712     Newx(d, (*lenp) + 1, U8);
2713 
2714     converted_start = d;
2715     while (s < send) {
2716         U8 c = *s++;
2717         if (! UTF8_IS_INVARIANT(c)) {
2718 
2719             /* Then it is multi-byte encoded.  If the code point is above 0xFF,
2720              * have to stop now */
2721             if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2722                 if (first_unconverted) {
2723                     *first_unconverted = s - 1;
2724                     goto finish_and_return;
2725                 }
2726                 else {
2727                     Safefree(converted_start);
2728                     return (U8 *) original;
2729                 }
2730             }
2731 
2732             c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2733             s++;
2734         }
2735         *d++ = c;
2736     }
2737 
2738     /* Here, converted the whole of the input */
2739     *is_utf8p = FALSE;
2740     if (first_unconverted) {
2741         *first_unconverted = NULL;
2742     }
2743 
2744   finish_and_return:
2745     *d = '\0';
2746     *lenp = d - converted_start;
2747 
2748     /* Trim unused space */
2749     Renew(converted_start, *lenp + 1, U8);
2750 
2751     return converted_start;
2752 }
2753 
2754 /*
2755 =for apidoc bytes_to_utf8
2756 
2757 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2758 UTF-8.
2759 Returns a pointer to the newly-created string, and sets C<*lenp> to
2760 reflect the new length in bytes.  The caller is responsible for arranging for
2761 the memory used by this string to get freed.
2762 
2763 Upon successful return, the number of variants in the string can be computed by
2764 having saved the value of C<*lenp> before the call, and subtracting it from the
2765 after-call value of C<*lenp>.
2766 
2767 A C<NUL> character will be written after the end of the string.
2768 
2769 If you want to convert to UTF-8 from encodings other than
2770 the native (Latin1 or EBCDIC),
2771 see L</sv_recode_to_utf8>().
2772 
2773 =cut
2774 */
2775 
2776 U8*
Perl_bytes_to_utf8(pTHX_ const U8 * s,STRLEN * lenp)2777 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2778 {
2779     const U8 * const send = s + (*lenp);
2780     U8 *d;
2781     U8 *dst;
2782 
2783     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2784     PERL_UNUSED_CONTEXT;
2785 
2786     /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
2787     Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
2788     dst = d;
2789 
2790     while (s < send) {
2791         append_utf8_from_native_byte(*s, &d);
2792         s++;
2793     }
2794 
2795     *d = '\0';
2796     *lenp = d-dst;
2797 
2798     return dst;
2799 }
2800 
2801 /*
2802  * Convert native UTF-16 to UTF-8. Called via the more public functions
2803  * utf16_to_utf8() for big-endian and utf16_to_utf8_reversed() for
2804  * little-endian,
2805  *
2806  * 'p' is the UTF-16 input string, passed as a pointer to U8.
2807  * 'bytelen' is its length (must be even)
2808  * 'd' is the pointer to the destination buffer.  The caller must ensure that
2809  *     the space is large enough.  The maximum expansion factor is 2 times
2810  *     'bytelen'.  1.5 if never going to run on an EBCDIC box.
2811  * '*newlen' will contain the number of bytes this function filled of 'd'.
2812  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2813  * 'low_byte' is 1  if UTF-16BE; 0 if UTF-16LE
2814  *
2815  * The expansion factor is because UTF-16 requires 2 bytes for every code point
2816  * below 0x10000; otherwise 4 bytes.  UTF-8 requires 1-3 bytes for every code
2817  * point below 0x1000; otherwise 4 bytes.  UTF-EBCDIC requires 1-4 bytes for
2818  * every code point below 0x1000; otherwise 4-5 bytes.
2819  *
2820  * The worst case is where every code point is below U+10000, hence requiring 2
2821  * UTF-16 bytes, but is U+0800 or higher on ASCII platforms, requiring 3 UTF-8
2822  * bytes; or >= U+4000 on EBCDIC requiring 4 UTF-8 bytes.
2823  *
2824  * Do not use in-place. */
2825 
2826 U8*
Perl_utf16_to_utf8_base(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen,const bool high_byte,const bool low_byte)2827 Perl_utf16_to_utf8_base(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen,
2828                               const bool high_byte, /* Which of next two bytes is
2829                                                   high order */
2830                               const bool low_byte)
2831 {
2832     U8* pend;
2833     U8* dstart = d;
2834 
2835     PERL_ARGS_ASSERT_UTF16_TO_UTF8_BASE;
2836 
2837     if (bytelen & 1)
2838         Perl_croak(aTHX_ "panic: utf16_to_utf8%s: odd bytelen %" UVuf,
2839                 ((high_byte == 0) ? "" : "_reversed"), (UV)bytelen);
2840     pend = p + bytelen;
2841 
2842     while (p < pend) {
2843 
2844         /* Next 16 bits is what we want.  (The bool is cast to U8 because on
2845          * platforms where a bool is implemented as a signed char, a compiler
2846          * warning may be generated) */
2847         U32 uv = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2848         p += 2;
2849 
2850         /* If it's a surrogate, we find the uv that the surrogate pair encodes.
2851          * */
2852         if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
2853 
2854 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2855 #define LAST_HIGH_SURROGATE  0xDBFF
2856 #define FIRST_LOW_SURROGATE  0xDC00
2857 #define LAST_LOW_SURROGATE   UNICODE_SURROGATE_LAST
2858 #define FIRST_IN_PLANE1      0x10000
2859 
2860             if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2861                 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2862             }
2863             else {
2864                 U32 low_surrogate = (p[(U8) high_byte] << 8) + p[(U8) low_byte];
2865                 if (UNLIKELY(! inRANGE(low_surrogate, FIRST_LOW_SURROGATE,
2866                                                        LAST_LOW_SURROGATE)))
2867                 {
2868                     Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2869                 }
2870 
2871                 p += 2;
2872 
2873                 /* Here uv is the high surrogate.  Combine with low surrogate
2874                  * just computed to form the actual U32 code point.
2875                  *
2876                  * From https://unicode.org/faq/utf_bom.html#utf16-4 */
2877                 uv = FIRST_IN_PLANE1 + (uv << 10) - (FIRST_HIGH_SURROGATE << 10)
2878                                      + low_surrogate - FIRST_LOW_SURROGATE;
2879             }
2880         }
2881 
2882         /* Here, 'uv' is the real U32 we want to find the UTF-8 of */
2883         d = uvchr_to_utf8(d, uv);
2884     }
2885 
2886     *newlen = d - dstart;
2887     return d;
2888 }
2889 
2890 U8*
Perl_utf16_to_utf8(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen)2891 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2892 {
2893     PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2894 
2895     return utf16_to_utf8(p, d, bytelen, newlen);
2896 }
2897 
2898 U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8 * p,U8 * d,Size_t bytelen,Size_t * newlen)2899 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
2900 {
2901     PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2902 
2903     return utf16_to_utf8_reversed(p, d, bytelen, newlen);
2904 }
2905 
2906 /*
2907  * Convert UTF-8 to native UTF-16. Called via the macros utf8_to_utf16() for
2908  * big-endian and utf8_to_utf16_reversed() for little-endian,
2909  *
2910  * 's' is the UTF-8 input string, passed as a pointer to U8.
2911  * 'bytelen' is its length
2912  * 'd' is the pointer to the destination buffer, currently passed as U8 *.  The
2913  *     caller must ensure that the space is large enough.  The maximum
2914  *     expansion factor is 2 times 'bytelen'.  This happens when the input is
2915  *     entirely single-byte ASCII, expanding to two-byte UTF-16.
2916  * '*newlen' will contain the number of bytes this function filled of 'd'.
2917  * 'high_byte' is 0 if UTF-16BE; 1 if UTF-16LE
2918  * 'low_byte'  is 1 if UTF-16BE; 0 if UTF-16LE
2919  *
2920  * Do not use in-place. */
2921 U8*
Perl_utf8_to_utf16_base(pTHX_ U8 * s,U8 * d,Size_t bytelen,Size_t * newlen,const bool high_byte,const bool low_byte)2922 Perl_utf8_to_utf16_base(pTHX_ U8* s, U8* d, Size_t bytelen, Size_t *newlen,
2923                               const bool high_byte, /* Which of next two bytes
2924                                                        is high order */
2925                               const bool low_byte)
2926 {
2927     U8* send;
2928     U8* dstart = d;
2929 
2930     PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE;
2931 
2932     send = s + bytelen;
2933 
2934     while (s < send) {
2935         STRLEN retlen;
2936         UV uv = utf8n_to_uvchr(s, send - s, &retlen,
2937                                /* No surrogates nor above-Unicode */
2938                                UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
2939 
2940         /* The modern method is to keep going with malformed input,
2941          * substituting the REPLACEMENT CHARACTER */
2942         if (UNLIKELY(uv == 0 && *s != '\0')) {
2943             uv = UNICODE_REPLACEMENT;
2944         }
2945 
2946         if (uv >= FIRST_IN_PLANE1) {    /* Requires a surrogate pair */
2947 
2948             /* From https://unicode.org/faq/utf_bom.html#utf16-4 */
2949             U32 high_surrogate = (uv >> 10) - (FIRST_IN_PLANE1 >> 10)
2950                                + FIRST_HIGH_SURROGATE;
2951 
2952             /* (The bool is cast to U8 because on platforms where a bool is
2953              * implemented as a signed char, a compiler warning may be
2954              * generated) */
2955             d[(U8) high_byte] = high_surrogate >> 8;
2956             d[(U8) low_byte]  = high_surrogate & nBIT_MASK(8);
2957             d += 2;
2958 
2959             /* The low surrogate is the lower 10 bits plus the offset */
2960             uv &= nBIT_MASK(10);
2961             uv += FIRST_LOW_SURROGATE;
2962 
2963             /* Drop down to output the low surrogate like it were a
2964              * non-surrogate */
2965         }
2966 
2967         d[(U8) high_byte] = uv >> 8;
2968         d[(U8) low_byte] = uv & nBIT_MASK(8);
2969         d += 2;
2970 
2971         s += retlen;
2972     }
2973 
2974     *newlen = d - dstart;
2975     return d;
2976 }
2977 
2978 bool
Perl__is_uni_FOO(pTHX_ const U8 classnum,const UV c)2979 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2980 {
2981     return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
2982 }
2983 
2984 bool
Perl__is_uni_perl_idcont(pTHX_ UV c)2985 Perl__is_uni_perl_idcont(pTHX_ UV c)
2986 {
2987     return _invlist_contains_cp(PL_utf8_perl_idcont, c);
2988 }
2989 
2990 bool
Perl__is_uni_perl_idstart(pTHX_ UV c)2991 Perl__is_uni_perl_idstart(pTHX_ UV c)
2992 {
2993     return _invlist_contains_cp(PL_utf8_perl_idstart, c);
2994 }
2995 
2996 UV
Perl__to_upper_title_latin1(pTHX_ const U8 c,U8 * p,STRLEN * lenp,const char S_or_s)2997 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2998                                   const char S_or_s)
2999 {
3000     /* We have the latin1-range values compiled into the core, so just use
3001      * those, converting the result to UTF-8.  The only difference between upper
3002      * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
3003      * either "SS" or "Ss".  Which one to use is passed into the routine in
3004      * 'S_or_s' to avoid a test */
3005 
3006     UV converted = toUPPER_LATIN1_MOD(c);
3007 
3008     PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
3009 
3010     assert(S_or_s == 'S' || S_or_s == 's');
3011 
3012     if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
3013                                              characters in this range */
3014         *p = (U8) converted;
3015         *lenp = 1;
3016         return converted;
3017     }
3018 
3019     /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
3020      * which it maps to one of them, so as to only have to have one check for
3021      * it in the main case */
3022     if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3023         switch (c) {
3024             case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
3025                 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3026                 break;
3027             case MICRO_SIGN:
3028                 converted = GREEK_CAPITAL_LETTER_MU;
3029                 break;
3030 #if    UNICODE_MAJOR_VERSION > 2                                        \
3031    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1           \
3032                                   && UNICODE_DOT_DOT_VERSION >= 8)
3033             case LATIN_SMALL_LETTER_SHARP_S:
3034                 *(p)++ = 'S';
3035                 *p = S_or_s;
3036                 *lenp = 2;
3037                 return 'S';
3038 #endif
3039             default:
3040                 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
3041                                  " '%c' to map to '%c'",
3042                                  c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
3043                 NOT_REACHED; /* NOTREACHED */
3044         }
3045     }
3046 
3047     *(p)++ = UTF8_TWO_BYTE_HI(converted);
3048     *p = UTF8_TWO_BYTE_LO(converted);
3049     *lenp = 2;
3050 
3051     return converted;
3052 }
3053 
3054 /* If compiled on an early Unicode version, there may not be auxiliary tables
3055  * */
3056 #ifndef HAS_UC_AUX_TABLES
3057 #  define UC_AUX_TABLE_ptrs     NULL
3058 #  define UC_AUX_TABLE_lengths  NULL
3059 #endif
3060 #ifndef HAS_TC_AUX_TABLES
3061 #  define TC_AUX_TABLE_ptrs     NULL
3062 #  define TC_AUX_TABLE_lengths  NULL
3063 #endif
3064 #ifndef HAS_LC_AUX_TABLES
3065 #  define LC_AUX_TABLE_ptrs     NULL
3066 #  define LC_AUX_TABLE_lengths  NULL
3067 #endif
3068 #ifndef HAS_CF_AUX_TABLES
3069 #  define CF_AUX_TABLE_ptrs     NULL
3070 #  define CF_AUX_TABLE_lengths  NULL
3071 #endif
3072 
3073 /* Call the function to convert a UTF-8 encoded character to the specified case.
3074  * Note that there may be more than one character in the result.
3075  * 's' is a pointer to the first byte of the input character
3076  * 'd' will be set to the first byte of the string of changed characters.  It
3077  *	needs to have space for UTF8_MAXBYTES_CASE+1 bytes
3078  * 'lenp' will be set to the length in bytes of the string of changed characters
3079  *
3080  * The functions return the ordinal of the first character in the string of
3081  * 'd' */
3082 #define CALL_UPPER_CASE(uv, s, d, lenp)                                     \
3083                 _to_utf8_case(uv, s, d, lenp, PL_utf8_toupper,              \
3084                                               Uppercase_Mapping_invmap,     \
3085                                               UC_AUX_TABLE_ptrs,            \
3086                                               UC_AUX_TABLE_lengths,         \
3087                                               "uppercase")
3088 #define CALL_TITLE_CASE(uv, s, d, lenp)                                     \
3089                 _to_utf8_case(uv, s, d, lenp, PL_utf8_totitle,              \
3090                                               Titlecase_Mapping_invmap,     \
3091                                               TC_AUX_TABLE_ptrs,            \
3092                                               TC_AUX_TABLE_lengths,         \
3093                                               "titlecase")
3094 #define CALL_LOWER_CASE(uv, s, d, lenp)                                     \
3095                 _to_utf8_case(uv, s, d, lenp, PL_utf8_tolower,              \
3096                                               Lowercase_Mapping_invmap,     \
3097                                               LC_AUX_TABLE_ptrs,            \
3098                                               LC_AUX_TABLE_lengths,         \
3099                                               "lowercase")
3100 
3101 
3102 /* This additionally has the input parameter 'specials', which if non-zero will
3103  * cause this to use the specials hash for folding (meaning get full case
3104  * folding); otherwise, when zero, this implies a simple case fold */
3105 #define CALL_FOLD_CASE(uv, s, d, lenp, specials)                            \
3106         (specials)                                                          \
3107         ?  _to_utf8_case(uv, s, d, lenp, PL_utf8_tofold,                    \
3108                                           Case_Folding_invmap,              \
3109                                           CF_AUX_TABLE_ptrs,                \
3110                                           CF_AUX_TABLE_lengths,             \
3111                                           "foldcase")                       \
3112         : _to_utf8_case(uv, s, d, lenp, PL_utf8_tosimplefold,               \
3113                                          Simple_Case_Folding_invmap,        \
3114                                          NULL, NULL,                        \
3115                                          "foldcase")
3116 
3117 UV
Perl_to_uni_upper(pTHX_ UV c,U8 * p,STRLEN * lenp)3118 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
3119 {
3120     /* Convert the Unicode character whose ordinal is <c> to its uppercase
3121      * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
3122      * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
3123      * the changed version may be longer than the original character.
3124      *
3125      * The ordinal of the first character of the changed version is returned
3126      * (but note, as explained above, that there may be more.) */
3127 
3128     PERL_ARGS_ASSERT_TO_UNI_UPPER;
3129 
3130     if (c < 256) {
3131         return _to_upper_title_latin1((U8) c, p, lenp, 'S');
3132     }
3133 
3134     return CALL_UPPER_CASE(c, NULL, p, lenp);
3135 }
3136 
3137 UV
Perl_to_uni_title(pTHX_ UV c,U8 * p,STRLEN * lenp)3138 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
3139 {
3140     PERL_ARGS_ASSERT_TO_UNI_TITLE;
3141 
3142     if (c < 256) {
3143         return _to_upper_title_latin1((U8) c, p, lenp, 's');
3144     }
3145 
3146     return CALL_TITLE_CASE(c, NULL, p, lenp);
3147 }
3148 
3149 STATIC U8
S_to_lower_latin1(const U8 c,U8 * p,STRLEN * lenp,const char dummy)3150 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
3151 {
3152     /* We have the latin1-range values compiled into the core, so just use
3153      * those, converting the result to UTF-8.  Since the result is always just
3154      * one character, we allow <p> to be NULL */
3155 
3156     U8 converted = toLOWER_LATIN1(c);
3157 
3158     PERL_UNUSED_ARG(dummy);
3159 
3160     if (p != NULL) {
3161         if (NATIVE_BYTE_IS_INVARIANT(converted)) {
3162             *p = converted;
3163             *lenp = 1;
3164         }
3165         else {
3166             /* Result is known to always be < 256, so can use the EIGHT_BIT
3167              * macros */
3168             *p = UTF8_EIGHT_BIT_HI(converted);
3169             *(p+1) = UTF8_EIGHT_BIT_LO(converted);
3170             *lenp = 2;
3171         }
3172     }
3173     return converted;
3174 }
3175 
3176 UV
Perl_to_uni_lower(pTHX_ UV c,U8 * p,STRLEN * lenp)3177 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
3178 {
3179     PERL_ARGS_ASSERT_TO_UNI_LOWER;
3180 
3181     if (c < 256) {
3182         return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
3183     }
3184 
3185     return CALL_LOWER_CASE(c, NULL, p, lenp);
3186 }
3187 
3188 UV
Perl__to_fold_latin1(const U8 c,U8 * p,STRLEN * lenp,const unsigned int flags)3189 Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
3190 {
3191     /* Corresponds to to_lower_latin1(); <flags> bits meanings:
3192      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3193      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
3194      *
3195      *	Not to be used for locale folds
3196      */
3197 
3198     UV converted;
3199 
3200     PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
3201 
3202     assert (! (flags & FOLD_FLAGS_LOCALE));
3203 
3204     if (UNLIKELY(c == MICRO_SIGN)) {
3205         converted = GREEK_SMALL_LETTER_MU;
3206     }
3207 #if    UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */   \
3208    || (UNICODE_MAJOR_VERSION == 3 && (   UNICODE_DOT_VERSION > 0)       \
3209                                       || UNICODE_DOT_DOT_VERSION > 0)
3210     else if (   (flags & FOLD_FLAGS_FULL)
3211              && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
3212     {
3213         /* If can't cross 127/128 boundary, can't return "ss"; instead return
3214          * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
3215          * under those circumstances. */
3216         if (flags & FOLD_FLAGS_NOMIX_ASCII) {
3217             *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
3218             Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3219                  p, *lenp, U8);
3220             return LATIN_SMALL_LETTER_LONG_S;
3221         }
3222         else {
3223             *(p)++ = 's';
3224             *p = 's';
3225             *lenp = 2;
3226             return 's';
3227         }
3228     }
3229 #endif
3230     else { /* In this range the fold of all other characters is their lower
3231               case */
3232         converted = toLOWER_LATIN1(c);
3233     }
3234 
3235     if (UVCHR_IS_INVARIANT(converted)) {
3236         *p = (U8) converted;
3237         *lenp = 1;
3238     }
3239     else {
3240         *(p)++ = UTF8_TWO_BYTE_HI(converted);
3241         *p = UTF8_TWO_BYTE_LO(converted);
3242         *lenp = 2;
3243     }
3244 
3245     return converted;
3246 }
3247 
3248 UV
Perl__to_uni_fold_flags(pTHX_ UV c,U8 * p,STRLEN * lenp,U8 flags)3249 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
3250 {
3251 
3252     /* Not currently externally documented, and subject to change
3253      *  <flags> bits meanings:
3254      *	    FOLD_FLAGS_FULL  iff full folding is to be used;
3255      *	    FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3256      *	                      locale are to be used.
3257      *	    FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
3258      */
3259 
3260     PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
3261 
3262     if (flags & FOLD_FLAGS_LOCALE) {
3263         /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
3264          * except for potentially warning */
3265         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3266         if (IN_UTF8_CTYPE_LOCALE && ! IN_UTF8_TURKIC_LOCALE) {
3267             flags &= ~FOLD_FLAGS_LOCALE;
3268         }
3269         else {
3270             goto needs_full_generality;
3271         }
3272     }
3273 
3274     if (c < 256) {
3275         return _to_fold_latin1((U8) c, p, lenp,
3276                             flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
3277     }
3278 
3279     /* Here, above 255.  If no special needs, just use the macro */
3280     if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
3281         return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
3282     }
3283     else {  /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
3284                the special flags. */
3285         U8 utf8_c[UTF8_MAXBYTES + 1];
3286 
3287       needs_full_generality:
3288         uvchr_to_utf8(utf8_c, c);
3289         return _toFOLD_utf8_flags(utf8_c, utf8_c + C_ARRAY_LENGTH(utf8_c),
3290                                   p, lenp, flags);
3291     }
3292 }
3293 
3294 PERL_STATIC_INLINE bool
S_is_utf8_common(pTHX_ const U8 * const p,const U8 * const e,SV * const invlist)3295 S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
3296                        SV* const invlist)
3297 {
3298     /* returns a boolean giving whether or not the UTF8-encoded character that
3299      * starts at <p>, and extending no further than <e - 1> is in the inversion
3300      * list <invlist>. */
3301 
3302     UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
3303 
3304     PERL_ARGS_ASSERT_IS_UTF8_COMMON;
3305 
3306     if (cp == 0 && (p >= e || *p != '\0')) {
3307         _force_out_malformed_utf8_message(p, e, 0, 1);
3308         NOT_REACHED; /* NOTREACHED */
3309     }
3310 
3311     assert(invlist);
3312     return _invlist_contains_cp(invlist, cp);
3313 }
3314 
3315 #if 0	/* Not currently used, but may be needed in the future */
PERLVAR(I,seen_deprecated_macro,HV *)3316 PERLVAR(I, seen_deprecated_macro, HV *)
3317 
3318 STATIC void
3319 S_warn_on_first_deprecated_use(pTHX_ U32 category,
3320                                      const char * const name,
3321                                      const char * const alternative,
3322                                      const bool use_locale,
3323                                      const char * const file,
3324                                      const unsigned line)
3325 {
3326     const char * key;
3327 
3328     PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
3329 
3330     if (ckWARN_d(category)) {
3331 
3332         key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
3333         if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
3334             if (! PL_seen_deprecated_macro) {
3335                 PL_seen_deprecated_macro = newHV();
3336             }
3337             if (! hv_store(PL_seen_deprecated_macro, key,
3338                            strlen(key), &PL_sv_undef, 0))
3339             {
3340                 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3341             }
3342 
3343             if (instr(file, "mathoms.c")) {
3344                 Perl_warner(aTHX_ category,
3345                             "In %s, line %d, starting in Perl v5.32, %s()"
3346                             " will be removed.  Avoid this message by"
3347                             " converting to use %s().\n",
3348                             file, line, name, alternative);
3349             }
3350             else {
3351                 Perl_warner(aTHX_ category,
3352                             "In %s, line %d, starting in Perl v5.32, %s() will"
3353                             " require an additional parameter.  Avoid this"
3354                             " message by converting to use %s().\n",
3355                             file, line, name, alternative);
3356             }
3357         }
3358     }
3359 }
3360 #endif
3361 
3362 bool
Perl__is_utf8_FOO(pTHX_ const U8 classnum,const U8 * p,const U8 * const e)3363 Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
3364 {
3365     PERL_ARGS_ASSERT__IS_UTF8_FOO;
3366 
3367     return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
3368 }
3369 
3370 bool
Perl__is_utf8_perl_idstart(pTHX_ const U8 * p,const U8 * const e)3371 Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
3372 {
3373     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
3374 
3375     return is_utf8_common(p, e, PL_utf8_perl_idstart);
3376 }
3377 
3378 bool
Perl__is_utf8_perl_idcont(pTHX_ const U8 * p,const U8 * const e)3379 Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
3380 {
3381     PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
3382 
3383     return is_utf8_common(p, e, PL_utf8_perl_idcont);
3384 }
3385 
3386 STATIC UV
S_to_case_cp_list(pTHX_ const UV original,const U32 ** const remaining_list,Size_t * remaining_count,SV * invlist,const I32 * const invmap,const U32 * const * const aux_tables,const U8 * const aux_table_lengths,const char * const normal)3387 S_to_case_cp_list(pTHX_
3388                   const UV original,
3389                   const U32 ** const remaining_list,
3390                   Size_t * remaining_count,
3391                   SV *invlist, const I32 * const invmap,
3392                   const U32 * const * const aux_tables,
3393                   const U8 * const aux_table_lengths,
3394                   const char * const normal)
3395 {
3396     SSize_t index;
3397     I32 base;
3398 
3399     /* Calculate the changed case of code point 'original'.  The first code
3400      * point of the changed case is returned.
3401      *
3402      * If 'remaining_count' is not NULL, *remaining_count will be set to how
3403      * many *other* code points are in the changed case.  If non-zero and
3404      * 'remaining_list' is also not NULL, *remaining_list will be set to point
3405      * to a non-modifiable array containing the second and potentially third
3406      * code points in the changed case.  (Unicode guarantees a maximum of 3.)
3407      * Note that this means that *remaining_list is undefined unless there are
3408      * multiple code points, and the caller has chosen to find out how many by
3409      * making 'remaining_count' not NULL.
3410      *
3411      * 'normal' is a string to use to name the new case in any generated
3412      * messages, as a fallback if the operation being used is not available.
3413      *
3414      * The casing to use is given by the data structures in the remaining
3415      * arguments.
3416      */
3417 
3418     PERL_ARGS_ASSERT_TO_CASE_CP_LIST;
3419 
3420     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3421      * that covers all possible inputs.  See [perl #133365] */
3422     index = _invlist_search(invlist, original);
3423     base = invmap[index];
3424 
3425     /* Most likely, the case change will contain just a single code point */
3426     if (remaining_count) {
3427         *remaining_count = 0;
3428     }
3429 
3430     if (LIKELY(base == 0)) {    /* 0 => original was unchanged by casing */
3431 
3432         /* At this bottom level routine is where we warn about illegal code
3433          * points */
3434         if (isUNICODE_POSSIBLY_PROBLEMATIC(original)) {
3435             if (UNLIKELY(UNICODE_IS_SURROGATE(original))) {
3436                 if (ckWARN_d(WARN_SURROGATE)) {
3437                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3438                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
3439                         "Operation \"%s\" returns its argument for"
3440                         " UTF-16 surrogate U+%04" UVXf, desc, original);
3441                 }
3442             }
3443             else if (UNLIKELY(UNICODE_IS_SUPER(original))) {
3444                 if (UNLIKELY(original > MAX_LEGAL_CP)) {
3445                     Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, original));
3446                 }
3447                 if (ckWARN_d(WARN_NON_UNICODE)) {
3448                     const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3449                     Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3450                         "Operation \"%s\" returns its argument for"
3451                         " non-Unicode code point 0x%04" UVXf, desc, original);
3452                 }
3453             }
3454 
3455             /* Note that non-characters are perfectly legal, so no warning
3456              * should be given. */
3457         }
3458 
3459         return original;
3460     }
3461 
3462     if (LIKELY(base > 0)) {  /* means original mapped to a single code point,
3463                                 different from itself */
3464         return base + original - invlist_array(invlist)[index];
3465     }
3466 
3467     /* Here 'base' is negative.  That means the mapping is 1-to-many, and
3468      * requires an auxiliary table look up.  abs(base) gives the index into a
3469      * list of such tables which points to the proper aux table.  And a
3470      * parallel list gives the length of each corresponding aux table.  Skip
3471      * the first entry in the *remaining returns, as it is returned by the
3472      * function. */
3473     base = -base;
3474     if (remaining_count) {
3475         *remaining_count = (Size_t) (aux_table_lengths[base] - 1);
3476 
3477         if (remaining_list) {
3478             *remaining_list  = aux_tables[base] + 1;
3479         }
3480     }
3481 
3482     return (UV) aux_tables[base][0];
3483 }
3484 
3485 STATIC UV
S__to_utf8_case(pTHX_ const UV original,const U8 * p,U8 * ustrp,STRLEN * lenp,SV * invlist,const I32 * const invmap,const U32 * const * const aux_tables,const U8 * const aux_table_lengths,const char * const normal)3486 S__to_utf8_case(pTHX_ const UV original, const U8 *p,
3487                       U8* ustrp, STRLEN *lenp,
3488                       SV *invlist, const I32 * const invmap,
3489                       const U32 * const * const aux_tables,
3490                       const U8 * const aux_table_lengths,
3491                       const char * const normal)
3492 {
3493     /* Change the case of code point 'original'.  If 'p' is non-NULL, it points to
3494      * the beginning of the (assumed to be valid) UTF-8 representation of
3495      * 'original'.  'normal' is a string to use to name the new case in any
3496      * generated messages, as a fallback if the operation being used is not
3497      * available.  The new case is given by the data structures in the
3498      * remaining arguments.
3499      *
3500      * On return 'ustrp' points to '*lenp' UTF-8 encoded bytes representing the
3501      * entire changed case string, and the return value is the first code point
3502      * in that string
3503      *
3504      * Note that the <ustrp> needs to be at least UTF8_MAXBYTES_CASE+1 bytes
3505      * since the changed version may be longer than the original character. */
3506 
3507     const U32 * remaining_list;
3508     Size_t remaining_count;
3509     UV first = to_case_cp_list(original,
3510                                &remaining_list, &remaining_count,
3511                                invlist, invmap,
3512                                aux_tables, aux_table_lengths,
3513                                normal);
3514 
3515     PERL_ARGS_ASSERT__TO_UTF8_CASE;
3516 
3517     /* If the code point maps to itself and we already have its representation,
3518      * copy it instead of recalculating */
3519     if (original == first && p) {
3520         *lenp = UTF8SKIP(p);
3521 
3522         if (p != ustrp) {   /* Don't copy onto itself */
3523             Copy(p, ustrp, *lenp, U8);
3524         }
3525     }
3526     else {
3527         U8 * d = ustrp;
3528         Size_t i;
3529 
3530         d = uvchr_to_utf8(d, first);
3531 
3532         for (i = 0; i < remaining_count; i++) {
3533             d = uvchr_to_utf8(d, remaining_list[i]);
3534         }
3535 
3536         *d = '\0';
3537         *lenp = d - ustrp;
3538     }
3539 
3540     return first;
3541 }
3542 
3543 Size_t
Perl__inverse_folds(pTHX_ const UV cp,U32 * first_folds_to,const U32 ** remaining_folds_to)3544 Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
3545                           const U32 ** remaining_folds_to)
3546 {
3547     /* Returns the count of the number of code points that fold to the input
3548      * 'cp' (besides itself).
3549      *
3550      * If the return is 0, there is nothing else that folds to it, and
3551      * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
3552      *
3553      * If the return is 1, '*first_folds_to' is set to the single code point,
3554      * and '*remaining_folds_to' is set to NULL.
3555      *
3556      * Otherwise, '*first_folds_to' is set to a code point, and
3557      * '*remaining_fold_to' is set to an array that contains the others.  The
3558      * length of this array is the returned count minus 1.
3559      *
3560      * The reason for this convolution is to avoid having to deal with
3561      * allocating and freeing memory.  The lists are already constructed, so
3562      * the return can point to them, but single code points aren't, so would
3563      * need to be constructed if we didn't employ something like this API
3564      *
3565      * The code points returned by this function are all legal Unicode, which
3566      * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
3567      * constructed with this size (to save space and memory), and we return
3568      * pointers, so they must be this size */
3569 
3570     /* 'index' is guaranteed to be non-negative, as this is an inversion map
3571      * that covers all possible inputs.  See [perl #133365] */
3572     SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
3573     I32 base = _Perl_IVCF_invmap[index];
3574 
3575     PERL_ARGS_ASSERT__INVERSE_FOLDS;
3576 
3577     if (base == 0) {            /* No fold */
3578         *first_folds_to = 0;
3579         *remaining_folds_to = NULL;
3580         return 0;
3581     }
3582 
3583 #ifndef HAS_IVCF_AUX_TABLES     /* This Unicode version only has 1-1 folds */
3584 
3585     assert(base > 0);
3586 
3587 #else
3588 
3589     if (UNLIKELY(base < 0)) {   /* Folds to more than one character */
3590 
3591         /* The data structure is set up so that the absolute value of 'base' is
3592          * an index into a table of pointers to arrays, with the array
3593          * corresponding to the index being the list of code points that fold
3594          * to 'cp', and the parallel array containing the length of the list
3595          * array */
3596         *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
3597         *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
3598                                                 /* +1 excludes first_folds_to */
3599         return IVCF_AUX_TABLE_lengths[-base];
3600     }
3601 
3602 #endif
3603 
3604     /* Only the single code point.  This works like 'fc(G) = G - A + a' */
3605     *first_folds_to = (U32) (base + cp
3606                                   - invlist_array(PL_utf8_foldclosures)[index]);
3607     *remaining_folds_to = NULL;
3608     return 1;
3609 }
3610 
3611 STATIC UV
S_check_locale_boundary_crossing(pTHX_ const U8 * const p,const UV result,U8 * const ustrp,STRLEN * lenp)3612 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3613                                        U8* const ustrp, STRLEN *lenp)
3614 {
3615     /* This is called when changing the case of a UTF-8-encoded character above
3616      * the Latin1 range, and the operation is in a non-UTF-8 locale.  If the
3617      * result contains a character that crosses the 255/256 boundary, disallow
3618      * the change, and return the original code point.  See L<perlfunc/lc> for
3619      * why;
3620      *
3621      * p	points to the original string whose case was changed; assumed
3622      *          by this routine to be well-formed
3623      * result	the code point of the first character in the changed-case string
3624      * ustrp	points to the changed-case string (<result> represents its
3625      *          first char)
3626      * lenp	points to the length of <ustrp> */
3627 
3628     UV original;    /* To store the first code point of <p> */
3629 
3630     PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3631 
3632     assert(UTF8_IS_ABOVE_LATIN1(*p));
3633 
3634     /* We know immediately if the first character in the string crosses the
3635      * boundary, so can skip testing */
3636     if (result > 255) {
3637 
3638         /* Look at every character in the result; if any cross the
3639         * boundary, the whole thing is disallowed */
3640         U8* s = ustrp + UTF8SKIP(ustrp);
3641         U8* e = ustrp + *lenp;
3642         while (s < e) {
3643             if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3644                 goto bad_crossing;
3645             }
3646             s += UTF8SKIP(s);
3647         }
3648 
3649         /* Here, no characters crossed, result is ok as-is, but we warn. */
3650         _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3651         return result;
3652     }
3653 
3654   bad_crossing:
3655 
3656     /* Failed, have to return the original */
3657     original = valid_utf8_to_uvchr(p, lenp);
3658 
3659     /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3660     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3661                            "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3662                            " locale; resolved to \"\\x{%" UVXf "}\".",
3663                            OP_DESC(PL_op),
3664                            original,
3665                            original);
3666     Copy(p, ustrp, *lenp, char);
3667     return original;
3668 }
3669 
3670 STATIC UV
S_turkic_fc(pTHX_ const U8 * const p,const U8 * const e,U8 * ustrp,STRLEN * lenp)3671 S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
3672                         U8 * ustrp, STRLEN *lenp)
3673 {
3674     /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
3675      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3676      * Otherwise, it returns the first code point of the Turkic foldcased
3677      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3678      * contain *lenp bytes
3679      *
3680      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3681      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3682      * DOTLESS I */
3683 
3684     PERL_ARGS_ASSERT_TURKIC_FC;
3685     assert(e > p);
3686 
3687     if (UNLIKELY(*p == 'I')) {
3688         *lenp = 2;
3689         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
3690         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
3691         return LATIN_SMALL_LETTER_DOTLESS_I;
3692     }
3693 
3694     if (UNLIKELY(memBEGINs(p, e - p,
3695                            LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
3696     {
3697         *lenp = 1;
3698         *ustrp = 'i';
3699         return 'i';
3700     }
3701 
3702     return 0;
3703 }
3704 
3705 STATIC UV
S_turkic_lc(pTHX_ const U8 * const p0,const U8 * const e,U8 * ustrp,STRLEN * lenp)3706 S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
3707                         U8 * ustrp, STRLEN *lenp)
3708 {
3709     /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
3710      * p0..e-1 according to Turkic rules is the same as for non-Turkic.
3711      * Otherwise, it returns the first code point of the Turkic lowercased
3712      * sequence, and the entire sequence will be stored in *ustrp.  ustrp will
3713      * contain *lenp bytes */
3714 
3715     PERL_ARGS_ASSERT_TURKIC_LC;
3716     assert(e > p0);
3717 
3718     /* A 'I' requires context as to what to do */
3719     if (UNLIKELY(*p0 == 'I')) {
3720         const U8 * p = p0 + 1;
3721 
3722         /* According to the Unicode SpecialCasing.txt file, a capital 'I'
3723          * modified by a dot above lowercases to 'i' even in turkic locales. */
3724         while (p < e) {
3725             UV cp;
3726 
3727             if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
3728                 ustrp[0] = 'i';
3729                 *lenp = 1;
3730                 return 'i';
3731             }
3732 
3733             /* For the dot above to modify the 'I', it must be part of a
3734              * combining sequence immediately following the 'I', and no other
3735              * modifier with a ccc of 230 may intervene */
3736             cp = utf8_to_uvchr_buf(p, e, NULL);
3737             if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
3738                 break;
3739             }
3740 
3741             /* Here the combining sequence continues */
3742             p += UTF8SKIP(p);
3743         }
3744     }
3745 
3746     /* In all other cases the lc is the same as the fold */
3747     return turkic_fc(p0, e, ustrp, lenp);
3748 }
3749 
3750 STATIC UV
S_turkic_uc(pTHX_ const U8 * const p,const U8 * const e,U8 * ustrp,STRLEN * lenp)3751 S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
3752                         U8 * ustrp, STRLEN *lenp)
3753 {
3754     /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
3755      * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
3756      * Otherwise, it returns the first code point of the Turkic upper or
3757      * title-cased sequence, and the entire sequence will be stored in *ustrp.
3758      * ustrp will contain *lenp bytes
3759      *
3760      * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
3761      * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
3762      * DOTLESS I */
3763 
3764     PERL_ARGS_ASSERT_TURKIC_UC;
3765     assert(e > p);
3766 
3767     if (*p == 'i') {
3768         *lenp = 2;
3769         ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3770         ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
3771         return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
3772     }
3773 
3774     if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
3775         *lenp = 1;
3776         *ustrp = 'I';
3777         return 'I';
3778     }
3779 
3780     return 0;
3781 }
3782 
3783 /* The process for changing the case is essentially the same for the four case
3784  * change types, except there are complications for folding.  Otherwise the
3785  * difference is only which case to change to.  To make sure that they all do
3786  * the same thing, the bodies of the functions are extracted out into the
3787  * following two macros.  The functions are written with the same variable
3788  * names, and these are known and used inside these macros.  It would be
3789  * better, of course, to have inline functions to do it, but since different
3790  * macros are called, depending on which case is being changed to, this is not
3791  * feasible in C (to khw's knowledge).  Two macros are created so that the fold
3792  * function can start with the common start macro, then finish with its special
3793  * handling; while the other three cases can just use the common end macro.
3794  *
3795  * The algorithm is to use the proper (passed in) macro or function to change
3796  * the case for code points that are below 256.  The macro is used if using
3797  * locale rules for the case change; the function if not.  If the code point is
3798  * above 255, it is computed from the input UTF-8, and another macro is called
3799  * to do the conversion.  If necessary, the output is converted to UTF-8.  If
3800  * using a locale, we have to check that the change did not cross the 255/256
3801  * boundary, see check_locale_boundary_crossing() for further details.
3802  *
3803  * The macros are split with the correct case change for the below-256 case
3804  * stored into 'result', and in the middle of an else clause for the above-255
3805  * case.  At that point in the 'else', 'result' is not the final result, but is
3806  * the input code point calculated from the UTF-8.  The fold code needs to
3807  * realize all this and take it from there.
3808  *
3809  * To deal with Turkic locales, the function specified by the parameter
3810  * 'turkic' is called when appropriate.
3811  *
3812  * If you read the two macros as sequential, it's easier to understand what's
3813  * going on. */
3814 #define CASE_CHANGE_BODY_START(locale_flags, libc_change_function, L1_func,  \
3815                                L1_func_extra_param, turkic)                  \
3816                                                                              \
3817     if (flags & (locale_flags)) {                                            \
3818         CHECK_AND_WARN_PROBLEMATIC_LOCALE_;                                  \
3819         if (IN_UTF8_CTYPE_LOCALE) {                                          \
3820             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {                           \
3821                 UV ret = turkic(p, e, ustrp, lenp);                          \
3822                 if (ret) return ret;                                         \
3823             }                                                                \
3824                                                                              \
3825             /* Otherwise, treat a UTF-8 locale as not being in locale at     \
3826              * all */                                                        \
3827             flags &= ~(locale_flags);                                        \
3828         }                                                                    \
3829     }                                                                        \
3830                                                                              \
3831     if (UTF8_IS_INVARIANT(*p)) {                                             \
3832         if (flags & (locale_flags)) {                                        \
3833             result = libc_change_function(*p);                               \
3834         }                                                                    \
3835         else {                                                               \
3836             return L1_func(*p, ustrp, lenp, L1_func_extra_param);            \
3837         }                                                                    \
3838     }                                                                        \
3839     else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) {                          \
3840         U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));                         \
3841         if (flags & (locale_flags)) {                                        \
3842             result = libc_change_function(c);                                \
3843         }                                                                    \
3844         else {                                                               \
3845             return L1_func(c, ustrp, lenp,  L1_func_extra_param);            \
3846         }                                                                    \
3847     }                                                                        \
3848     else {  /* malformed UTF-8 or ord above 255 */                           \
3849         STRLEN len_result;                                                   \
3850         result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY);     \
3851         if (len_result == (STRLEN) -1) {                                     \
3852             _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ );        \
3853         }
3854 
3855 #define CASE_CHANGE_BODY_END(locale_flags, change_macro)                     \
3856         result = change_macro(result, p, ustrp, lenp);                       \
3857                                                                              \
3858         if (flags & (locale_flags)) {                                        \
3859             result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3860         }                                                                    \
3861         return result;                                                       \
3862     }                                                                        \
3863                                                                              \
3864     /* Here, used locale rules.  Convert back to UTF-8 */                    \
3865     if (UTF8_IS_INVARIANT(result)) {                                         \
3866         *ustrp = (U8) result;                                                \
3867         *lenp = 1;                                                           \
3868     }                                                                        \
3869     else {                                                                   \
3870         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);                             \
3871         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);                       \
3872         *lenp = 2;                                                           \
3873     }                                                                        \
3874                                                                              \
3875     return result;
3876 
3877 /* Not currently externally documented, and subject to change:
3878  * <flags> is set iff the rules from the current underlying locale are to
3879  *         be used. */
3880 
3881 UV
Perl__to_utf8_upper_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3882 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3883                                 const U8 *e,
3884                                 U8* ustrp,
3885                                 STRLEN *lenp,
3886                                 bool flags)
3887 {
3888     UV result;
3889 
3890     PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3891 
3892     /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3893     /* 2nd char of uc(U+DF) is 'S' */
3894     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 'S',
3895                                                                     turkic_uc);
3896     CASE_CHANGE_BODY_END  (~0, CALL_UPPER_CASE);
3897 }
3898 
3899 /* Not currently externally documented, and subject to change:
3900  * <flags> is set iff the rules from the current underlying locale are to be
3901  *         used.  Since titlecase is not defined in POSIX, for other than a
3902  *         UTF-8 locale, uppercase is used instead for code points < 256.
3903  */
3904 
3905 UV
Perl__to_utf8_title_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3906 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3907                                 const U8 *e,
3908                                 U8* ustrp,
3909                                 STRLEN *lenp,
3910                                 bool flags)
3911 {
3912     UV result;
3913 
3914     PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3915 
3916     /* 2nd char of ucfirst(U+DF) is 's' */
3917     CASE_CHANGE_BODY_START(~0, toupper, _to_upper_title_latin1, 's',
3918                                                                     turkic_uc);
3919     CASE_CHANGE_BODY_END  (~0, CALL_TITLE_CASE);
3920 }
3921 
3922 /* Not currently externally documented, and subject to change:
3923  * <flags> is set iff the rules from the current underlying locale are to
3924  *         be used.
3925  */
3926 
3927 UV
Perl__to_utf8_lower_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,bool flags)3928 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3929                                 const U8 *e,
3930                                 U8* ustrp,
3931                                 STRLEN *lenp,
3932                                 bool flags)
3933 {
3934     UV result;
3935 
3936     PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3937 
3938     CASE_CHANGE_BODY_START(~0, tolower, to_lower_latin1, 0 /* 0 is dummy */,
3939                                                                     turkic_lc);
3940     CASE_CHANGE_BODY_END  (~0, CALL_LOWER_CASE)
3941 }
3942 
3943 /* Not currently externally documented, and subject to change,
3944  * in <flags>
3945  *	bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3946  *	                      locale are to be used.
3947  *      bit FOLD_FLAGS_FULL   is set iff full case folds are to be used;
3948  *			      otherwise simple folds
3949  *      bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3950  *			      prohibited
3951  */
3952 
3953 UV
Perl__to_utf8_fold_flags(pTHX_ const U8 * p,const U8 * e,U8 * ustrp,STRLEN * lenp,U8 flags)3954 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3955                                const U8 *e,
3956                                U8* ustrp,
3957                                STRLEN *lenp,
3958                                U8 flags)
3959 {
3960     UV result;
3961 
3962     PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3963 
3964     /* These are mutually exclusive */
3965     assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3966 
3967     assert(p != ustrp); /* Otherwise overwrites */
3968 
3969     CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, tolower, _to_fold_latin1,
3970                  ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
3971                                                                     turkic_fc);
3972 
3973         result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3974 
3975         if (flags & FOLD_FLAGS_LOCALE) {
3976 
3977 #           define LONG_S_T      LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3978 #         ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3979 #           define CAP_SHARP_S   LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3980 
3981             /* Special case these two characters, as what normally gets
3982              * returned under locale doesn't work */
3983             if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
3984             {
3985                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3986                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3987                               "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3988                               "resolved to \"\\x{17F}\\x{17F}\".");
3989                 goto return_long_s;
3990             }
3991             else
3992 #endif
3993                  if (memBEGINs((char *) p, e - p, LONG_S_T))
3994             {
3995                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3996                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3997                               "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3998                               "resolved to \"\\x{FB06}\".");
3999                 goto return_ligature_st;
4000             }
4001 
4002 #if    UNICODE_MAJOR_VERSION   == 3         \
4003     && UNICODE_DOT_VERSION     == 0         \
4004     && UNICODE_DOT_DOT_VERSION == 1
4005 #           define DOTTED_I   LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
4006 
4007             /* And special case this on this Unicode version only, for the same
4008              * reaons the other two are special cased.  They would cross the
4009              * 255/256 boundary which is forbidden under /l, and so the code
4010              * wouldn't catch that they are equivalent (which they are only in
4011              * this release) */
4012             else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
4013                 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
4014                 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
4015                               "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
4016                               "resolved to \"\\x{0131}\".");
4017                 goto return_dotless_i;
4018             }
4019 #endif
4020 
4021             return check_locale_boundary_crossing(p, result, ustrp, lenp);
4022         }
4023         else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
4024             return result;
4025         }
4026         else {
4027             /* This is called when changing the case of a UTF-8-encoded
4028              * character above the ASCII range, and the result should not
4029              * contain an ASCII character. */
4030 
4031             UV original;    /* To store the first code point of <p> */
4032 
4033             /* Look at every character in the result; if any cross the
4034             * boundary, the whole thing is disallowed */
4035             U8* s = ustrp;
4036             U8* send = ustrp + *lenp;
4037             while (s < send) {
4038                 if (isASCII(*s)) {
4039                     /* Crossed, have to return the original */
4040                     original = valid_utf8_to_uvchr(p, lenp);
4041 
4042                     /* But in these instances, there is an alternative we can
4043                      * return that is valid */
4044                     if (original == LATIN_SMALL_LETTER_SHARP_S
4045 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
4046                         || original == LATIN_CAPITAL_LETTER_SHARP_S
4047 #endif
4048                     ) {
4049                         goto return_long_s;
4050                     }
4051                     else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
4052                         goto return_ligature_st;
4053                     }
4054 #if    UNICODE_MAJOR_VERSION   == 3         \
4055     && UNICODE_DOT_VERSION     == 0         \
4056     && UNICODE_DOT_DOT_VERSION == 1
4057 
4058                     else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
4059                         goto return_dotless_i;
4060                     }
4061 #endif
4062                     Copy(p, ustrp, *lenp, char);
4063                     return original;
4064                 }
4065                 s += UTF8SKIP(s);
4066             }
4067 
4068             /* Here, no characters crossed, result is ok as-is */
4069             return result;
4070         }
4071     }
4072 
4073     /* Here, used locale rules.  Convert back to UTF-8 */
4074     if (UTF8_IS_INVARIANT(result)) {
4075         *ustrp = (U8) result;
4076         *lenp = 1;
4077     }
4078     else {
4079         *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
4080         *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
4081         *lenp = 2;
4082     }
4083 
4084     return result;
4085 
4086   return_long_s:
4087     /* Certain folds to 'ss' are prohibited by the options, but they do allow
4088      * folds to a string of two of these characters.  By returning this
4089      * instead, then, e.g.,
4090      *      fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
4091      * works. */
4092 
4093     *lenp = 2 * STRLENs(LATIN_SMALL_LETTER_LONG_S_UTF8);
4094     Copy(LATIN_SMALL_LETTER_LONG_S_UTF8   LATIN_SMALL_LETTER_LONG_S_UTF8,
4095         ustrp, *lenp, U8);
4096     return LATIN_SMALL_LETTER_LONG_S;
4097 
4098   return_ligature_st:
4099     /* Two folds to 'st' are prohibited by the options; instead we pick one and
4100      * have the other one fold to it */
4101 
4102     *lenp = STRLENs(LATIN_SMALL_LIGATURE_ST_UTF8);
4103     Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
4104     return LATIN_SMALL_LIGATURE_ST;
4105 
4106 #if    UNICODE_MAJOR_VERSION   == 3         \
4107     && UNICODE_DOT_VERSION     == 0         \
4108     && UNICODE_DOT_DOT_VERSION == 1
4109 
4110   return_dotless_i:
4111     *lenp = STRLENs(LATIN_SMALL_LETTER_DOTLESS_I_UTF8);
4112     Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
4113     return LATIN_SMALL_LETTER_DOTLESS_I;
4114 
4115 #endif
4116 
4117 }
4118 
4119 bool
4120 Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
4121 {
4122     /* May change: warns if surrogates, non-character code points, or
4123      * non-Unicode code points are in 's' which has length 'len' bytes.
4124      * Returns TRUE if none found; FALSE otherwise.  The only other validity
4125      * check is to make sure that this won't exceed the string's length nor
4126      * overflow */
4127 
4128     const U8* const e = s + len;
4129     bool ok = TRUE;
4130 
4131     PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4132 
4133     while (s < e) {
4134         if (UTF8SKIP(s) > len) {
4135             Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4136                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4137             return FALSE;
4138         }
4139         if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
4140             if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
4141                 if (   ckWARN_d(WARN_NON_UNICODE)
4142                     || UNLIKELY(0 < does_utf8_overflow(s, s + len,
4143                                                0 /* Don't consider overlongs */
4144                                                )))
4145                 {
4146                     /* A side effect of this function will be to warn */
4147                     (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
4148                     ok = FALSE;
4149                 }
4150             }
4151             else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
4152                 if (ckWARN_d(WARN_SURROGATE)) {
4153                     /* This has a different warning than the one the called
4154                      * function would output, so can't just call it, unlike we
4155                      * do for the non-chars and above-unicodes */
4156                     UV uv = utf8_to_uvchr_buf(s, e, NULL);
4157                     Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
4158                         "Unicode surrogate U+%04" UVXf " is illegal in UTF-8",
4159                                              uv);
4160                     ok = FALSE;
4161                 }
4162             }
4163             else if (   UNLIKELY(UTF8_IS_NONCHAR(s, e))
4164                      && (ckWARN_d(WARN_NONCHAR)))
4165             {
4166                 /* A side effect of this function will be to warn */
4167                 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
4168                 ok = FALSE;
4169             }
4170         }
4171         s += UTF8SKIP(s);
4172     }
4173 
4174     return ok;
4175 }
4176 
4177 /*
4178 =for apidoc pv_uni_display
4179 
4180 Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
4181 C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
4182 long (if longer, the rest is truncated and C<"..."> will be appended).
4183 
4184 The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4185 C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4186 to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4187 (C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4188 C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4189 C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
4190 
4191 Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
4192 backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
4193 
4194 The pointer to the PV of the C<dsv> is returned.
4195 
4196 See also L</sv_uni_display>.
4197 
4198 =for apidoc Amnh||UNI_DISPLAY_BACKSLASH
4199 =for apidoc Amnh||UNI_DISPLAY_BACKSPACE
4200 =for apidoc Amnh||UNI_DISPLAY_ISPRINT
4201 =for apidoc Amnh||UNI_DISPLAY_QQ
4202 =for apidoc Amnh||UNI_DISPLAY_REGEX
4203 =cut
4204 */
4205 char *
4206 Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim,
4207                           UV flags)
4208 {
4209     int truncated = 0;
4210     const char *s, *e;
4211 
4212     PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4213 
4214     SvPVCLEAR(dsv);
4215     SvUTF8_off(dsv);
4216     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
4217          UV u;
4218          bool ok = 0;
4219 
4220          if (pvlim && SvCUR(dsv) >= pvlim) {
4221               truncated++;
4222               break;
4223          }
4224          u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
4225          if (u < 256) {
4226              const U8 c = (U8) u;
4227              if (flags & UNI_DISPLAY_BACKSLASH) {
4228                  if (    isMNEMONIC_CNTRL(c)
4229                      && (   c != '\b'
4230                          || (flags & UNI_DISPLAY_BACKSPACE)))
4231                  {
4232                     const char * mnemonic = cntrl_to_mnemonic(c);
4233                     sv_catpvn(dsv, mnemonic, strlen(mnemonic));
4234                     ok = 1;
4235                  }
4236                  else if (c == '\\') {
4237                     sv_catpvs(dsv, "\\\\");
4238                     ok = 1;
4239                  }
4240              }
4241              /* isPRINT() is the locale-blind version. */
4242              if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
4243                  const char string = c;
4244                  sv_catpvn(dsv, &string, 1);
4245                  ok = 1;
4246              }
4247          }
4248          if (!ok)
4249              Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
4250     }
4251     if (truncated)
4252          sv_catpvs(dsv, "...");
4253 
4254     return SvPVX(dsv);
4255 }
4256 
4257 /*
4258 =for apidoc sv_uni_display
4259 
4260 Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4261 the displayable version being at most C<pvlim> bytes long
4262 (if longer, the rest is truncated and "..." will be appended).
4263 
4264 The C<flags> argument is as in L</pv_uni_display>().
4265 
4266 The pointer to the PV of the C<dsv> is returned.
4267 
4268 =cut
4269 */
4270 char *
4271 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4272 {
4273     const char * const ptr =
4274         isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4275 
4276     PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4277 
4278     return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
4279                                 SvCUR(ssv), pvlim, flags);
4280 }
4281 
4282 /*
4283 =for apidoc foldEQ_utf8
4284 
4285 Returns true if the leading portions of the strings C<s1> and C<s2> (either or
4286 both of which may be in UTF-8) are the same case-insensitively; false
4287 otherwise.  How far into the strings to compare is determined by other input
4288 parameters.
4289 
4290 If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4291 otherwise it is assumed to be in native 8-bit encoding.  Correspondingly for
4292 C<u2> with respect to C<s2>.
4293 
4294 If the byte length C<l1> is non-zero, it says how far into C<s1> to check for
4295 fold equality.  In other words, C<s1>+C<l1> will be used as a goal to reach.
4296 The scan will not be considered to be a match unless the goal is reached, and
4297 scanning won't continue past that goal.  Correspondingly for C<l2> with respect
4298 to C<s2>.
4299 
4300 If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that
4301 pointer is considered an end pointer to the position 1 byte past the maximum
4302 point in C<s1> beyond which scanning will not continue under any circumstances.
4303 (This routine assumes that UTF-8 encoded input strings are not malformed;
4304 malformed input can cause it to read past C<pe1>).  This means that if both
4305 C<l1> and C<pe1> are specified, and C<pe1> is less than C<s1>+C<l1>, the match
4306 will never be successful because it can never
4307 get as far as its goal (and in fact is asserted against).  Correspondingly for
4308 C<pe2> with respect to C<s2>.
4309 
4310 At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4311 C<l2> must be non-zero), and if both do, both have to be
4312 reached for a successful match.   Also, if the fold of a character is multiple
4313 characters, all of them must be matched (see tr21 reference below for
4314 'folding').
4315 
4316 Upon a successful match, if C<pe1> is non-C<NULL>,
4317 it will be set to point to the beginning of the I<next> character of C<s1>
4318 beyond what was matched.  Correspondingly for C<pe2> and C<s2>.
4319 
4320 For case-insensitiveness, the "casefolding" of Unicode is used
4321 instead of upper/lowercasing both the characters, see
4322 L<https://www.unicode.org/reports/tr21/> (Case Mappings).
4323 
4324 =for apidoc Cmnh||FOLDEQ_UTF8_NOMIX_ASCII
4325 =for apidoc Cmnh||FOLDEQ_LOCALE
4326 =for apidoc Cmnh||FOLDEQ_S1_ALREADY_FOLDED
4327 =for apidoc Cmnh||FOLDEQ_S1_FOLDS_SANE
4328 =for apidoc Cmnh||FOLDEQ_S2_ALREADY_FOLDED
4329 =for apidoc Cmnh||FOLDEQ_S2_FOLDS_SANE
4330 
4331 =cut */
4332 
4333 /* A flags parameter has been added which may change, and hence isn't
4334  * externally documented.  Currently it is:
4335  *  0 for as-documented above
4336  *  FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4337                             ASCII one, to not match
4338  *  FOLDEQ_LOCALE	    is set iff the rules from the current underlying
4339  *	                    locale are to be used.
4340  *  FOLDEQ_S1_ALREADY_FOLDED  s1 has already been folded before calling this
4341  *                          routine.  This allows that step to be skipped.
4342  *                          Currently, this requires s1 to be encoded as UTF-8
4343  *                          (u1 must be true), which is asserted for.
4344  *  FOLDEQ_S1_FOLDS_SANE    With either NOMIX_ASCII or LOCALE, no folds may
4345  *                          cross certain boundaries.  Hence, the caller should
4346  *                          let this function do the folding instead of
4347  *                          pre-folding.  This code contains an assertion to
4348  *                          that effect.  However, if the caller knows what
4349  *                          it's doing, it can pass this flag to indicate that,
4350  *                          and the assertion is skipped.
4351  *  FOLDEQ_S2_ALREADY_FOLDED  Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
4352  *                          to s2, and s2 doesn't have to be UTF-8 encoded.
4353  *                          This introduces an asymmetry to save a few branches
4354  *                          in a loop.  Currently, this is not a problem, as
4355  *                          never are both inputs pre-folded.  Simply call this
4356  *                          function with the pre-folded one as the second
4357  *                          string.
4358  *  FOLDEQ_S2_FOLDS_SANE
4359  */
4360 
4361 I32
4362 Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
4363                              const char *s2, char **pe2, UV l2, bool u2,
4364                              U32 flags)
4365 {
4366     const U8 *p1  = (const U8*)s1; /* Point to current char */
4367     const U8 *p2  = (const U8*)s2;
4368     const U8 *g1 = NULL;       /* goal for s1 */
4369     const U8 *g2 = NULL;
4370     const U8 *e1 = NULL;       /* Don't scan s1 past this */
4371     U8 *f1 = NULL;             /* Point to current folded */
4372     const U8 *e2 = NULL;
4373     U8 *f2 = NULL;
4374     STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
4375     U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4376     U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
4377     U8 flags_for_folder = FOLD_FLAGS_FULL;
4378 
4379     PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
4380 
4381     assert( ! (             (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
4382                && ((        (flags &  FOLDEQ_S1_ALREADY_FOLDED)
4383                         && !(flags &  FOLDEQ_S1_FOLDS_SANE))
4384                     || (    (flags &  FOLDEQ_S2_ALREADY_FOLDED)
4385                         && !(flags &  FOLDEQ_S2_FOLDS_SANE)))));
4386     /* The algorithm is to trial the folds without regard to the flags on
4387      * the first line of the above assert(), and then see if the result
4388      * violates them.  This means that the inputs can't be pre-folded to a
4389      * violating result, hence the assert.  This could be changed, with the
4390      * addition of extra tests here for the already-folded case, which would
4391      * slow it down.  That cost is more than any possible gain for when these
4392      * flags are specified, as the flags indicate /il or /iaa matching which
4393      * is less common than /iu, and I (khw) also believe that real-world /il
4394      * and /iaa matches are most likely to involve code points 0-255, and this
4395      * function only under rare conditions gets called for 0-255. */
4396 
4397     if (flags & FOLDEQ_LOCALE) {
4398         if (IN_UTF8_CTYPE_LOCALE) {
4399             if (UNLIKELY(IN_UTF8_TURKIC_LOCALE)) {
4400                 flags_for_folder |= FOLD_FLAGS_LOCALE;
4401             }
4402             else {
4403                 flags &= ~FOLDEQ_LOCALE;
4404             }
4405         }
4406         else {
4407             flags_for_folder |= FOLD_FLAGS_LOCALE;
4408         }
4409     }
4410     if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
4411         flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
4412     }
4413 
4414     if (pe1) {
4415         e1 = *(U8**)pe1;
4416     }
4417 
4418     if (l1) {
4419         g1 = (const U8*)s1 + l1;
4420     }
4421 
4422     if (pe2) {
4423         e2 = *(U8**)pe2;
4424     }
4425 
4426     if (l2) {
4427         g2 = (const U8*)s2 + l2;
4428     }
4429 
4430     /* Must have at least one goal */
4431     assert(g1 || g2);
4432 
4433     if (g1) {
4434 
4435         /* Will never match if goal is out-of-bounds */
4436         assert(! e1  || e1 >= g1);
4437 
4438         /* Here, there isn't an end pointer, or it is beyond the goal.  We
4439         * only go as far as the goal */
4440         e1 = g1;
4441     }
4442     else {
4443         assert(e1);    /* Must have an end for looking at s1 */
4444     }
4445 
4446     /* Same for goal for s2 */
4447     if (g2) {
4448         assert(! e2  || e2 >= g2);
4449         e2 = g2;
4450     }
4451     else {
4452         assert(e2);
4453     }
4454 
4455     /* If both operands are already folded, we could just do a memEQ on the
4456      * whole strings at once, but it would be better if the caller realized
4457      * this and didn't even call us */
4458 
4459     /* Look through both strings, a character at a time */
4460     while (p1 < e1 && p2 < e2) {
4461 
4462         /* If at the beginning of a new character in s1, get its fold to use
4463          * and the length of the fold. */
4464         if (n1 == 0) {
4465             if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4466                 f1 = (U8 *) p1;
4467                 assert(u1);
4468                 n1 = UTF8SKIP(f1);
4469             }
4470             else {
4471                 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4472 
4473                     /* We have to forbid mixing ASCII with non-ASCII if the
4474                      * flags so indicate.  And, we can short circuit having to
4475                      * call the general functions for this common ASCII case,
4476                      * all of whose non-locale folds are also ASCII, and hence
4477                      * UTF-8 invariants, so the UTF8ness of the strings is not
4478                      * relevant. */
4479                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4480                         return 0;
4481                     }
4482                     n1 = 1;
4483                     *foldbuf1 = toFOLD(*p1);
4484                 }
4485                 else if (u1) {
4486                     _toFOLD_utf8_flags(p1, e1, foldbuf1, &n1, flags_for_folder);
4487                 }
4488                 else {  /* Not UTF-8, get UTF-8 fold */
4489                     _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4490                 }
4491                 f1 = foldbuf1;
4492             }
4493         }
4494 
4495         if (n2 == 0) {    /* Same for s2 */
4496             if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4497 
4498                 /* Point to the already-folded character.  But for non-UTF-8
4499                  * variants, convert to UTF-8 for the algorithm below */
4500                 if (UTF8_IS_INVARIANT(*p2)) {
4501                     f2 = (U8 *) p2;
4502                     n2 = 1;
4503                 }
4504                 else if (u2) {
4505                     f2 = (U8 *) p2;
4506                     n2 = UTF8SKIP(f2);
4507                 }
4508                 else {
4509                     foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
4510                     foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
4511                     f2 = foldbuf2;
4512                     n2 = 2;
4513                 }
4514             }
4515             else {
4516                 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4517                     if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4518                         return 0;
4519                     }
4520                     n2 = 1;
4521                     *foldbuf2 = toFOLD(*p2);
4522                 }
4523                 else if (u2) {
4524                     _toFOLD_utf8_flags(p2, e2, foldbuf2, &n2, flags_for_folder);
4525                 }
4526                 else {
4527                     _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4528                 }
4529                 f2 = foldbuf2;
4530             }
4531         }
4532 
4533         /* Here f1 and f2 point to the beginning of the strings to compare.
4534          * These strings are the folds of the next character from each input
4535          * string, stored in UTF-8. */
4536 
4537         /* While there is more to look for in both folds, see if they
4538         * continue to match */
4539         while (n1 && n2) {
4540             U8 fold_length = UTF8SKIP(f1);
4541             if (fold_length != UTF8SKIP(f2)
4542                 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4543                                                        function call for single
4544                                                        byte */
4545                 || memNE((char*)f1, (char*)f2, fold_length))
4546             {
4547                 return 0; /* mismatch */
4548             }
4549 
4550             /* Here, they matched, advance past them */
4551             n1 -= fold_length;
4552             f1 += fold_length;
4553             n2 -= fold_length;
4554             f2 += fold_length;
4555         }
4556 
4557         /* When reach the end of any fold, advance the input past it */
4558         if (n1 == 0) {
4559             p1 += u1 ? UTF8SKIP(p1) : 1;
4560         }
4561         if (n2 == 0) {
4562             p2 += u2 ? UTF8SKIP(p2) : 1;
4563         }
4564     } /* End of loop through both strings */
4565 
4566     /* A match is defined by each scan that specified an explicit length
4567     * reaching its final goal, and the other not having matched a partial
4568     * character (which can happen when the fold of a character is more than one
4569     * character). */
4570     if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
4571         return 0;
4572     }
4573 
4574     /* Successful match.  Set output pointers */
4575     if (pe1) {
4576         *pe1 = (char*)p1;
4577     }
4578     if (pe2) {
4579         *pe2 = (char*)p2;
4580     }
4581     return 1;
4582 }
4583 
4584 /*
4585  * ex: set ts=8 sts=4 sw=4 et:
4586  */
4587