1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
4# It tests various malformed UTF-8 sequences and some code points that are
5# "problematic", and verifies that the correct warnings/flags etc are
6# generated when using them.  For the code points, it also takes the UTF-8 and
7# perturbs it to be malformed in various ways, and tests that this gets
8# appropriately detected.
9
10use strict;
11use Test::More;
12
13BEGIN {
14    use_ok('XS::APItest');
15    require 'charset_tools.pl';
16    require './t/utf8_setup.pl';
17};
18
19$|=1;
20
21use XS::APItest;
22
23my @warnings_gotten;
24
25use warnings 'utf8';
26local $SIG{__WARN__} = sub { my @copy = @_;
27                             push @warnings_gotten, map { chomp; $_ } @copy;
28                           };
29
30my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
31my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
32
33# C5 is chosen as it is valid for both ASCII and EBCDIC platforms
34my $known_start_byte = I8_to_native("\xC5");
35
36sub requires_extended_utf8($) {
37
38    # Returns a boolean as to whether or not the code point parameter fits
39    # into 31 bits, subject to the convention that a negative code point
40    # stands for one that overflows the word size, so won't fit in 31 bits.
41
42    return shift > $highest_non_extended_utf8_cp;
43}
44
45sub is_extended_utf8($) {
46
47    # Returns a boolean as to whether or not the input UTF-8 sequence uses
48    # Perl extended UTF-8.
49
50    my $byte = substr(shift, 0, 1);
51    return ord $byte >= 0xFE if isASCII;
52    return $byte == I8_to_native("\xFF");
53}
54
55sub overflow_discern_len($) {
56
57    # Returns how many bytes are needed to tell if a non-overlong UTF-8
58    # sequence is for a code point that won't fit in the platform's word size.
59    # Only the length of the sequence representing a single code point is
60    # needed.
61
62    if (isASCII) {
63        return ($::is64bit) ? 3 : 1;
64
65        # Below is needed for code points above IV_MAX
66        #return ($::is64bit) ? 3 : ((shift == $::max_bytes)
67        #                           ? 1
68        #                           : 2);
69    }
70
71    return ($::is64bit) ? 2 : 8;
72}
73
74sub overlong_discern_len($) {
75
76    # Returns how many bytes are needed to tell if the input UTF-8 sequence
77    # for a code point is overlong
78
79    my $string = shift;
80    my $length = length $string;
81    my $byte = ord native_to_I8(substr($string, 0, 1));
82    if (isASCII) {
83        return ($byte >= 0xFE)
84                ? ((! $::is64bit)
85                    ? 1
86                    : ($byte == 0xFF) ? 7 : 2)
87                : (($length == 2) ? 1 : 2);
88        # Below is needed for code points above IV_MAX
89        #return ($length == $::max_bytes)
90        #          # This is constrained to 1 on 32-bit machines, as it
91        #          # overflows there
92        #        ? (($::is64bit) ? 7 : 1)
93        #        : (($length == 2) ? 1 : 2);
94    }
95
96    return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
97}
98
99my @tests;
100{
101    no warnings qw(portable overflow);
102    @tests = (
103        # $testname,
104        # $bytes,                  UTF-8 string
105        # $allowed_uv,             code point $bytes evaluates to; -1 if
106        #                          overflows
107        # $needed_to_discern_len   optional, how long an initial substring do
108        #                          we need to tell that the string must be for
109        #                          a code point in the category it falls in,
110        #                          like being a surrogate; 0 indicates we need
111        #                          the whole string.  Some categories have a
112        #                          default that is used if this is omitted.
113        [ "orphan continuation byte malformation",
114            I8_to_native("$::I8c"),
115            0xFFFD,
116            1,
117        ],
118        [ "overlong malformation, lowest 2-byte",
119            (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
120            0,   # NUL
121        ],
122        [ "overlong malformation, highest 2-byte",
123            (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
124            (isASCII) ? 0x7F : 0xFF,
125        ],
126        [ "overlong malformation, lowest 3-byte",
127            (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
128            0,   # NUL
129        ],
130        [ "overlong malformation, highest 3-byte",
131            (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
132            (isASCII) ? 0x7FF : 0x3FF,
133        ],
134        [ "lowest surrogate",
135            (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
136            0xD800,
137        ],
138        [ "a middle surrogate",
139            (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
140            0xD90D,
141        ],
142        [ "highest surrogate",
143            (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
144            0xDFFF,
145        ],
146        [ "first of 32 consecutive non-character code points",
147            (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
148            0xFDD0,
149        ],
150        [ "a mid non-character code point of the 32 consecutive ones",
151            (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
152            0xFDE0,
153        ],
154        [ "final of 32 consecutive non-character code points",
155            (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
156            0xFDEF,
157        ],
158        [ "non-character code point U+FFFE",
159            (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
160            0xFFFE,
161        ],
162        [ "non-character code point U+FFFF",
163            (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
164            0xFFFF,
165        ],
166        [ "overlong malformation, lowest 4-byte",
167            (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
168            0,   # NUL
169        ],
170        [ "overlong malformation, highest 4-byte",
171            (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
172            (isASCII) ? 0xFFFF : 0x3FFF,
173        ],
174        [ "non-character code point U+1FFFE",
175            (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
176            0x1FFFE,
177        ],
178        [ "non-character code point U+1FFFF",
179            (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
180            0x1FFFF,
181        ],
182        [ "non-character code point U+2FFFE",
183            (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
184            0x2FFFE,
185        ],
186        [ "non-character code point U+2FFFF",
187            (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
188            0x2FFFF,
189        ],
190        [ "non-character code point U+3FFFE",
191            (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
192            0x3FFFE,
193        ],
194        [ "non-character code point U+3FFFF",
195            (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
196            0x3FFFF,
197        ],
198        [ "non-character code point U+4FFFE",
199            (isASCII)
200            ?               "\xf1\x8f\xbf\xbe"
201            : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
202            0x4FFFE,
203        ],
204        [ "non-character code point U+4FFFF",
205            (isASCII)
206            ?               "\xf1\x8f\xbf\xbf"
207            : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
208            0x4FFFF,
209        ],
210        [ "non-character code point U+5FFFE",
211            (isASCII)
212            ?              "\xf1\x9f\xbf\xbe"
213            : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
214            0x5FFFE,
215        ],
216        [ "non-character code point U+5FFFF",
217            (isASCII)
218            ?              "\xf1\x9f\xbf\xbf"
219            : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
220            0x5FFFF,
221        ],
222        [ "non-character code point U+6FFFE",
223            (isASCII)
224            ?              "\xf1\xaf\xbf\xbe"
225            : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
226            0x6FFFE,
227        ],
228        [ "non-character code point U+6FFFF",
229            (isASCII)
230            ?              "\xf1\xaf\xbf\xbf"
231            : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
232            0x6FFFF,
233        ],
234        [ "non-character code point U+7FFFE",
235            (isASCII)
236            ?              "\xf1\xbf\xbf\xbe"
237            : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
238            0x7FFFE,
239        ],
240        [ "non-character code point U+7FFFF",
241            (isASCII)
242            ?              "\xf1\xbf\xbf\xbf"
243            : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
244            0x7FFFF,
245        ],
246        [ "non-character code point U+8FFFE",
247            (isASCII)
248            ?              "\xf2\x8f\xbf\xbe"
249            : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
250            0x8FFFE,
251        ],
252        [ "non-character code point U+8FFFF",
253            (isASCII)
254            ?              "\xf2\x8f\xbf\xbf"
255            : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
256            0x8FFFF,
257        ],
258        [ "non-character code point U+9FFFE",
259            (isASCII)
260            ?              "\xf2\x9f\xbf\xbe"
261            : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
262            0x9FFFE,
263        ],
264        [ "non-character code point U+9FFFF",
265            (isASCII)
266            ?              "\xf2\x9f\xbf\xbf"
267            : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
268            0x9FFFF,
269        ],
270        [ "non-character code point U+AFFFE",
271            (isASCII)
272            ?              "\xf2\xaf\xbf\xbe"
273            : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
274            0xAFFFE,
275        ],
276        [ "non-character code point U+AFFFF",
277            (isASCII)
278            ?              "\xf2\xaf\xbf\xbf"
279            : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
280            0xAFFFF,
281        ],
282        [ "non-character code point U+BFFFE",
283            (isASCII)
284            ?              "\xf2\xbf\xbf\xbe"
285            : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
286            0xBFFFE,
287        ],
288        [ "non-character code point U+BFFFF",
289            (isASCII)
290            ?              "\xf2\xbf\xbf\xbf"
291            : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
292            0xBFFFF,
293        ],
294        [ "non-character code point U+CFFFE",
295            (isASCII)
296            ?              "\xf3\x8f\xbf\xbe"
297            : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
298            0xCFFFE,
299        ],
300        [ "non-character code point U+CFFFF",
301            (isASCII)
302            ?              "\xf3\x8f\xbf\xbf"
303            : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
304            0xCFFFF,
305        ],
306        [ "non-character code point U+DFFFE",
307            (isASCII)
308            ?              "\xf3\x9f\xbf\xbe"
309            : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
310            0xDFFFE,
311        ],
312        [ "non-character code point U+DFFFF",
313            (isASCII)
314            ?              "\xf3\x9f\xbf\xbf"
315            : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
316            0xDFFFF,
317        ],
318        [ "non-character code point U+EFFFE",
319            (isASCII)
320            ?              "\xf3\xaf\xbf\xbe"
321            : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
322            0xEFFFE,
323        ],
324        [ "non-character code point U+EFFFF",
325            (isASCII)
326            ?              "\xf3\xaf\xbf\xbf"
327            : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
328            0xEFFFF,
329        ],
330        [ "non-character code point U+FFFFE",
331            (isASCII)
332            ?              "\xf3\xbf\xbf\xbe"
333            : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
334            0xFFFFE,
335        ],
336        [ "non-character code point U+FFFFF",
337            (isASCII)
338            ?              "\xf3\xbf\xbf\xbf"
339            : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
340            0xFFFFF,
341        ],
342        [ "non-character code point U+10FFFE",
343            (isASCII)
344            ?              "\xf4\x8f\xbf\xbe"
345            : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
346            0x10FFFE,
347        ],
348        [ "non-character code point U+10FFFF",
349            (isASCII)
350            ?              "\xf4\x8f\xbf\xbf"
351            : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
352            0x10FFFF,
353        ],
354        [ "first non_unicode",
355            (isASCII)
356            ?              "\xf4\x90\x80\x80"
357            : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
358            0x110000,
359            2,
360        ],
361        [ "non_unicode whose first byte tells that",
362            (isASCII)
363            ?              "\xf5\x80\x80\x80"
364            : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
365            (isASCII) ? 0x140000 : 0x200000,
366            1,
367        ],
368        [ "overlong malformation, lowest 5-byte",
369            (isASCII)
370            ?              "\xf8\x80\x80\x80\x80"
371            : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
372            0,   # NUL
373        ],
374        [ "overlong malformation, highest 5-byte",
375            (isASCII)
376            ?              "\xf8\x87\xbf\xbf\xbf"
377            : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
378            (isASCII) ? 0x1FFFFF : 0x3FFFF,
379        ],
380        [ "overlong malformation, lowest 6-byte",
381            (isASCII)
382            ?              "\xfc\x80\x80\x80\x80\x80"
383            : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
384            0,   # NUL
385        ],
386        [ "overlong malformation, highest 6-byte",
387            (isASCII)
388            ?              "\xfc\x83\xbf\xbf\xbf\xbf"
389            : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
390            (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
391        ],
392        [ "overlong malformation, lowest 7-byte",
393            (isASCII)
394            ?              "\xfe\x80\x80\x80\x80\x80\x80"
395            : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
396            0,   # NUL
397        ],
398        [ "overlong malformation, highest 7-byte",
399            (isASCII)
400            ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
401            : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
402            (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
403        ],
404        [ "highest 31 bit code point",
405            (isASCII)
406            ?  "\xfd\xbf\xbf\xbf\xbf\xbf"
407            : I8_to_native(
408               "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
409            0x7FFFFFFF,
410            1,
411        ],
412        [ "lowest 32 bit code point",
413            (isASCII)
414            ?  "\xfe\x82\x80\x80\x80\x80\x80"
415            : I8_to_native(
416                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
417            ($::is64bit) ? 0x80000000 : -1,   # Overflows on 32-bit systems
418            1,
419        ],
420        # Used when UV_MAX is allowed as a code point
421        #[ "highest 32 bit code point",
422        #    (isASCII)
423        #    ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
424        #    : I8_to_native(
425        #       "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
426        #    0xFFFFFFFF,
427        #],
428        #[ "Lowest 33 bit code point",
429        #    (isASCII)
430        #    ?  "\xfe\x84\x80\x80\x80\x80\x80"
431        #    : I8_to_native(
432        #        "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
433        #    ($::is64bit) ? 0x100000000 : 0x0,   # Overflows on 32-bit systems
434        #],
435    );
436
437    if (! $::is64bit) {
438        if (isASCII) {
439            push @tests,
440                [ "overlong malformation, but naively looks like overflow",
441                    "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
442                    0x7FFFFFFF,
443                ],
444                # Used when above IV_MAX are allowed.
445                #[ "overlong malformation, but naively looks like overflow",
446                #    "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
447                #    0xFFFFFFFF,
448                #],
449                [ "overflow that old algorithm failed to detect",
450                    "\xfe\x86\x80\x80\x80\x80\x80",
451                    -1,
452                ];
453        }
454    }
455
456    push @tests,
457        [ "overlong malformation, lowest max-byte",
458            (isASCII)
459             ?      "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
460             : I8_to_native(
461                    "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
462            0,   # NUL
463        ],
464        [ "overlong malformation, highest max-byte",
465            (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
466             ?      "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
467             : I8_to_native(
468                    "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
469            (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
470        ];
471
472    if (isASCII) {
473        push @tests,
474            [ "Lowest code point requiring 13 bytes to represent", # 2**36
475                "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
476                ($::is64bit) ? 0x1000000000 : -1,    # overflows on 32bit
477            ],
478    };
479
480    if ($::is64bit) {
481        push @tests,
482            [ "highest 63 bit code point",
483              (isASCII)
484              ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
485              : I8_to_native(
486                "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
487              0x7FFFFFFFFFFFFFFF,
488              (isASCII) ? 1 : 2,
489            ],
490            [ "first 64 bit code point",
491              (isASCII)
492              ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
493              : I8_to_native(
494                "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
495              -1,
496            ];
497            # Used when UV_MAX is allowed as a code point
498            #[ "highest 64 bit code point",
499            #  (isASCII)
500            #  ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
501            #  : I8_to_native(
502            #    "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
503            #  0xFFFFFFFFFFFFFFFF,
504            #  (isASCII) ? 1 : 2,
505            #],
506            #[ "first 65 bit code point",
507            #  (isASCII)
508            #  ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
509            #  : I8_to_native(
510            #    "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
511            #  0,
512            #];
513        if (isASCII) {
514            push @tests,
515                [ "overflow that old algorithm failed to detect",
516                    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
517                    -1,
518                ];
519        }
520        else {
521            push @tests,    # These could falsely show wrongly in a naive
522                            # implementation
523                [ "requires at least 32 bits",
524                    I8_to_native(
525                    "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
526                    0x800000000,
527                      40000000
528                ],
529                [ "requires at least 32 bits",
530                    I8_to_native(
531                    "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
532                    0x10000000000,
533                ],
534                [ "requires at least 32 bits",
535                    I8_to_native(
536                    "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
537                    0x200000000000,
538                ],
539                [ "requires at least 32 bits",
540                    I8_to_native(
541                    "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
542                    0x4000000000000,
543                ],
544                [ "requires at least 32 bits",
545                    I8_to_native(
546                    "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
547                    0x80000000000000,
548                ],
549                [ "requires at least 32 bits",
550                    I8_to_native(
551                    "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
552                    0x1000000000000000,
553                ];
554        }
555    }
556}
557
558sub flags_to_text($$)
559{
560    my ($flags, $flags_to_text_ref) = @_;
561
562    # Returns a string containing a mnemonic representation of the bits that
563    # are set in the $flags.  These are assumed to be flag bits.  The return
564    # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
565    # array that gives the textual representation of all the possible flags.
566    # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
567    # no bits at all are set the string "0" is returned;
568
569    my @flag_text;
570    my $shift = 0;
571
572    return "0" if $flags == 0;
573
574    while ($flags) {
575        #diag sprintf "%x", $flags;
576        if ($flags & 1) {
577            push @flag_text, $flags_to_text_ref->[$shift];
578        }
579        $shift++;
580        $flags >>= 1;
581    }
582
583    return join "|", @flag_text;
584}
585
586# Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
587# instead of A_, D_, but the prefixes will be used in a a later commit, so
588# minimize churn by having them here.
589my @utf8n_flags_to_text =  ( qw(
590        A_EMPTY
591        A_CONTINUATION
592        A_NON_CONTINUATION
593        A_SHORT
594        A_LONG
595        A_LONG_AND_ITS_VALUE
596        PLACEHOLDER
597        A_OVERFLOW
598        D_SURROGATE
599        W_SURROGATE
600        D_NONCHAR
601        W_NONCHAR
602        D_SUPER
603        W_SUPER
604        D_PERL_EXTENDED
605        W_PERL_EXTENDED
606        CHECK_ONLY
607        NO_CONFIDENCE_IN_CURLEN_
608    ) );
609
610sub utf8n_display_call($)
611{
612    # Converts an eval string that calls test_utf8n_to_uvchr into a more human
613    # readable form, and returns it.  Doesn't work if the byte string contains
614    # an apostrophe.  The return will look something like:
615    #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
616    #diag $_[0];
617
618    $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
619    my $text1 = $1;     # Everything before the byte string
620    my $bytes = $2;
621    my $text2 = $3;     # Includes the length
622    my $flags = $4;
623
624    return $text1
625         . display_bytes($bytes)
626         . $text2
627         . flags_to_text($flags, \@utf8n_flags_to_text)
628         . ')';
629}
630
631my @uvchr_flags_to_text =  ( qw(
632        W_SURROGATE
633        W_NONCHAR
634        W_SUPER
635        W_PERL_EXTENDED
636        D_SURROGATE
637        D_NONCHAR
638        D_SUPER
639        D_PERL_EXTENDED
640) );
641
642sub uvchr_display_call($)
643{
644    # Converts an eval string that calls test_uvchr_to_utf8 into a more human
645    # readable form, and returns it.  The return will look something like:
646    #   test_uvchr_to_utf8n_flags($uv, $flags)
647    #diag $_[0];
648
649
650    $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
651    my $text = $1;
652    my $cp = sprintf "%X", $2;
653    my $flags = $3;
654
655    return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')';
656}
657
658sub do_warnings_test(@)
659{
660    my @expected_warnings = @_;
661
662    # Compares the input expected warnings array with @warnings_gotten,
663    # generating a pass for each found, removing it from @warnings_gotten.
664    # Any discrepancies generate test failures.  Returns TRUE if no
665    # discrepcancies; otherwise FALSE.
666
667    my $succeeded = 1;
668
669    if (@expected_warnings == 0) {
670        if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
671            output_warnings(@warnings_gotten);
672            $succeeded = 0;
673        }
674        return $succeeded;
675    }
676
677    # Check that we got all the expected warnings,
678    # removing each one found
679  WARNING:
680    foreach my $expected (@expected_warnings) {
681        foreach (my $i = 0; $i < @warnings_gotten; $i++) {
682            if ($warnings_gotten[$i] =~ $expected) {
683                pass("    Expected and got warning: "
684                    . " $warnings_gotten[$i]");
685                splice @warnings_gotten, $i, 1;
686                next WARNING;
687            }
688        }
689        fail("    Expected a warning that matches "
690            . $expected . " but didn't get it");
691        $succeeded = 0;
692    }
693
694    if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
695        output_warnings(@warnings_gotten);
696        $succeeded = 0;
697    }
698
699    return $succeeded;
700}
701
702my $min_cont = (isASCII) ? 0x80 : 0xA0;
703my $continuation_shift = (isASCII) ? 6 : 5;
704my $continuation_mask = (1 << $continuation_shift) - 1;
705
706sub isUTF8_CHAR($$) {   # Uses first principals to determine if this is legal
707                        # (Doesn't work if overflows)
708    my ($string, $length) = @_;
709
710    # Uses first principals to calculate if $string is legal
711
712    return 0 if $length <= 0;
713
714    my $first = ord substr($string, 0, 1);
715
716    # Invariant
717    return 1 if $length == 1 && $first < $min_cont;
718
719    return 0 if $first < 0xC0;  # Starts with continuation
720
721    # Calculate the number of leading 1 bits
722    my $utf8skip = 0;
723    my $bits = $first;
724    do {
725        $utf8skip++;
726        $bits = ($bits << 1) & 0xFF;
727    } while ($bits & 0x80);
728
729    return 0 if $utf8skip != $length;
730
731    # Acuumulate the $code point.  The remaining bits in the start byte count
732    # towards it
733    my $cp = $bits >> $utf8skip;
734
735    for my $i (1 .. $length - 1) {
736        my $ord = ord substr($string, $i, 1);
737
738        # Wrong if not a continuation
739        return 0 if $ord < $min_cont || $ord >= 0xC0;
740
741        $cp = ($cp << $continuation_shift)
742            | ($ord & $continuation_mask);
743    }
744
745    # If the calculated value can be expressed in fewer bytes than were passed
746    # in, is an illegal overlong.  XXX if 'chr' is not working properly, this
747    # may not be right
748    my $chr = chr $cp;
749    utf8::upgrade($chr);
750
751    use bytes;
752    return 0 if length $chr < $length;
753
754    return 1;
755}
756
757sub start_mark($) {
758    my $len = shift;
759    return 0xFF if $len >  7;
760    return (0xFF & (0xFE << (7 - $len)));
761}
762
763sub start_mask($) {
764    my $len = shift;
765    return 0 if $len >  7;
766    return 0x1F >> ($len - 2);
767}
768
769# This test is split into this number of files.
770my $num_test_files = $ENV{TEST_JOBS} || 1;
771$num_test_files = 10 if $num_test_files > 10;
772
773# We only really need to test utf8n_to_uvchr_msgs() once with this flag.
774my $tested_CHECK_ONLY = 0;
775
776my $test_count = -1;
777
778# By setting this environment variable to this particular value, we test
779# essentially all combinations of potential UTF-8, so that can get a
780# comprehensive test of the decoding routine.  This test assumes the routine
781# that does the translation from code point to UTF-8 is working.  An assert
782# can be used in the routine to make sure that the dfa is working precisely
783# correctly, and any flaws in it aren't being masked by the remainder of the
784# function.
785if ($::TEST_CHUNK == 0
786&& $ENV{PERL_DEBUG_FULL_TEST}
787&& $ENV{PERL_DEBUG_FULL_TEST} == 97)
788{
789    my $min_cont_mask = $min_cont | 0xF;
790    my @bytes = (   0,  # Placeholder to signify to use an empty string ""
791                ord 'A',# We assume that all the invariant characters are
792                        # properly in the same class, so this is an exemplar
793                        # character
794                $min_cont .. 0xFF   # But test every non-invariant individually
795                );
796    my $shift = (isASCII) ? 6 : 5;
797    my $mark = $min_cont;
798    my $mask = (1 << $shift) - 1;
799    for my $byte1 (@bytes) {
800        for my $byte2 (@bytes) {
801            last if $byte2 && ! $byte1;      # Don't test empty preceding byte
802
803            last if $byte2 && $byte1 < 0xC0; # No need to test more than a
804                                             # single byte unless start byte
805                                             # indicates those.
806
807            for my $byte3 (@bytes) {
808                last if $byte3 && ! $byte2;
809                last if $byte3 && $byte1 < 0xE0;    # Only test 3 bytes for
810                                                    # 3-byte start byte
811
812                # If the preceding byte is a start byte, it should fail, and
813                # there is no need to test illegal bytes that follow.
814                # Instead, limit ourselves to just a few legal bytes that
815                # could follow.  This cuts down tremendously on the number of
816                # tests executed.
817                next if $byte2 >= 0xC0
818                     && $byte3 >= $min_cont
819                     && ($byte3 & $min_cont_mask) != $min_cont;
820
821                for my $byte4 (@bytes) {
822                    last if $byte4 && ! $byte3;
823                    last if $byte4 && $byte1 < 0xF0;  # Only test 4 bytes for
824                                                      # 4 byte strings
825
826                    # Like for byte 3, we limit things that come after a
827                    # mispositioned start-byte to just a few things that
828                    # otherwise would be legal
829                    next if ($byte2 >= 0xC0 || $byte3 >= 0xC0)
830                          && $byte4 >= $min_cont
831                          && ($byte4 & $min_cont_mask) != $min_cont;
832
833                    for my $byte5 (@bytes) {
834                        last if $byte5 && ! $byte4;
835                        last if $byte5 && $byte1 < 0xF8;  # Only test 5 bytes for
836                                                          # 5 byte strings
837
838                        # Like for byte 4, we limit things that come after a
839                        # mispositioned start-byte to just a few things that
840                        # otherwise would be legal
841                        next if (   $byte2 >= 0xC0
842                                 || $byte3 >= 0xC0
843                                 || $byte4 >= 0xC0)
844                              && $byte4 >= $min_cont
845                              && ($byte4 & $min_cont_mask) != $min_cont;
846
847                        my $string = "";
848                        $string .= chr $byte1 if $byte1;
849                        $string .= chr $byte2 if $byte2;
850                        $string .= chr $byte3 if $byte3;
851                        $string .= chr $byte4 if $byte4;
852                        $string .= chr $byte5 if $byte5;
853
854                        my $length = length $string;
855                        next unless $length;
856                        last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA);
857
858                        my $native = I8_to_native($string);
859                        my $is_valid = isUTF8_CHAR($native, $length);
860                        my $got_valid = test_isUTF8_CHAR($native, $length);
861                        my $got_strict
862                                    = test_isSTRICT_UTF8_CHAR($native, $length);
863                        my $got_C9
864                                 = test_isC9_STRICT_UTF8_CHAR($native, $length);
865                        my $ret = test_utf8n_to_uvchr_msgs($native, $length,
866                                            $::UTF8_WARN_ILLEGAL_INTERCHANGE);
867                        my $is_strict = $is_valid;
868                        my $is_C9 = $is_valid;
869
870                        if ($is_valid) {
871
872                            # Here, is legal UTF-8.  Verify that it returned
873                            # the correct code point, and if so, that it
874                            # correctly classifies the result.
875                            my $cp = $ret->[0];
876
877                            my $should_be_string;
878                            if ($length == 1) {
879                                $should_be_string = chr $cp;
880                            }
881                            else {
882
883                                # Starting with the code point, use first
884                                # principals to find the equivalen UTF-8
885                                # string
886                                my @bytes;
887                                my $uv = $cp;
888                                for (my $i = $length - 1; $i > 0; $i--) {
889                                    $bytes[$i] = chr I8_to_native(($uv & $mask)
890                                                                  | $mark);
891                                    $uv >>= $shift;
892                                }
893                                $bytes[0] = chr I8_to_native((   $uv
894                                                        & start_mask($length))
895                                            | start_mark($length));
896                                $should_be_string = join "", @bytes;
897                            }
898
899                            # If the original string and the inverse are the
900                            # same, it worked.
901                            if (is($native, $should_be_string,
902                                    "utf8n_to_uvchr_msgs("
903                                 .  display_bytes($native)
904                                 . ") returns correct uv=0x"
905                                 . sprintf ("%x", $cp)))
906                            {
907                                my $is_surrogate = $cp >= 0xD800
908                                                && $cp <= 0xDFFF;
909                                my $got_surrogate
910                                    = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0;
911                                $is_strict = 0 if $is_surrogate;
912                                $is_C9 = 0 if $is_surrogate;
913
914                                my $is_super = $cp > 0x10FFFF;
915                                my $got_super
916                                        = ($ret->[2] & $::UTF8_GOT_SUPER) != 0;
917                                $is_strict = 0 if $is_super;
918                                $is_C9 = 0 if $is_super;
919
920                                my $is_nonchar = ! $is_super
921                                    && (   ($cp & 0xFFFE) == 0xFFFE
922                                        || ($cp >= 0xFDD0 && $cp <= 0xFDEF));
923                                my $got_nonchar
924                                      = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0;
925                                $is_strict = 0 if $is_nonchar;
926
927                                is($got_surrogate, $is_surrogate,
928                                    "    And correctly flagged it as"
929                                  . ((! $is_surrogate) ? " not" : "")
930                                  . " being a surrogate");
931                                is($got_super, $is_super,
932                                    "    And correctly flagged it as"
933                                  . ((! $is_super) ? " not" : "")
934                                  . " being above Unicode");
935                                is($got_nonchar, $is_nonchar,
936                                    "    And correctly flagged it as"
937                                  . ((! $is_nonchar) ? " not" : "")
938                                  . " being a non-char");
939                            }
940
941                            # This is how we exit the loop normally if things
942                            # are working.  The fail-safe code above is used
943                            # when they aren't.
944                            goto done if $cp > 0x140001;
945                        }
946                        else {
947                            is($ret->[0], 0, "utf8n_to_uvchr_msgs("
948                                            . display_bytes($native)
949                                            . ") correctly returns error");
950                            if (! ($ret->[2] & ($::UTF8_GOT_SHORT
951                                               |$::UTF8_GOT_NON_CONTINUATION
952                                               |$::UTF8_GOT_LONG)))
953                            {
954                                is($ret->[2] & ( $::UTF8_GOT_NONCHAR
955                                                |$::UTF8_GOT_SURROGATE
956                                                |$::UTF8_GOT_SUPER), 0,
957                                "    And isn't a surrogate, non-char, nor"
958                                . " above Unicode");
959                             }
960                        }
961
962                        is($got_valid == 0, $is_valid == 0,
963                           "    And isUTF8_CHAR() correctly returns "
964                         . (($got_valid == 0) ? "0" : "non-zero"));
965                        is($got_strict == 0, $is_strict == 0,
966                           "    And isSTRICT_UTF8_CHAR() correctly returns "
967                         . (($got_strict == 0) ? "0" : "non-zero"));
968                        is($got_C9 == 0, $is_C9 == 0,
969                           "    And isC9_UTF8_CHAR() correctly returns "
970                         . (($got_C9 == 0) ? "0" : "non-zero"));
971                    }
972                }
973            }
974        }
975    }
976  done:
977}
978
979foreach my $test (@tests) {
980  $test_count++;
981  next if $test_count % $num_test_files != $::TEST_CHUNK;
982
983  my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
984
985  my $length = length $bytes;
986  my $initially_overlong = $testname =~ /overlong/;
987  my $initially_orphan   = $testname =~ /orphan/;
988  my $will_overflow = $allowed_uv < 0;
989
990  my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
991  my $display_bytes = display_bytes($bytes);
992
993  my $controlling_warning_category;
994  my $utf8n_flag_to_warn;
995  my $utf8n_flag_to_disallow;
996  my $uvchr_flag_to_warn;
997  my $uvchr_flag_to_disallow;
998
999  # We want to test that the independent flags are actually independent.
1000  # For example, that a surrogate doesn't trigger a non-character warning,
1001  # and conversely, turning off an above-Unicode flag doesn't suppress a
1002  # surrogate warning.  Earlier versions of this file used nested loops to
1003  # test all possible combinations.  But that creates lots of tests, making
1004  # this run too long.  What is now done instead is to use the complement of
1005  # the category we are testing to greatly reduce the combinatorial
1006  # explosion.  For example, if we have a surrogate and we aren't expecting
1007  # a warning about it, we set all the flags for non-surrogates to raise
1008  # warnings.  If one shows up, it indicates the flags aren't independent.
1009  my $utf8n_flag_to_warn_complement;
1010  my $utf8n_flag_to_disallow_complement;
1011  my $uvchr_flag_to_warn_complement;
1012  my $uvchr_flag_to_disallow_complement;
1013
1014  # Many of the code points being tested are middling in that if code point
1015  # edge cases work, these are very likely to as well.  Because this test
1016  # file takes a while to execute, we skip testing the edge effects of code
1017  # points deemed middling, while testing their basics and continuing to
1018  # fully test the non-middling code points.
1019  my $skip_most_tests = 0;
1020
1021  my $cp_message_qr;      # Pattern that matches the message raised when
1022                          # that message contains the problematic code
1023                          # point.  The message is the same (currently) both
1024                          # when going from/to utf8.
1025  my $non_cp_trailing_text;   # The suffix text when the message doesn't
1026                              # contain a code point.  (This is a result of
1027                              # some sort of malformation that means we
1028                              # can't get an exact code poin
1029  my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1030                      \Q requires a Perl extension, and so is not\E
1031                      \Q portable\E/x;
1032  my $extended_non_cp_trailing_text
1033                      = "is a Perl extension, and so is not portable";
1034
1035  # What bytes should have been used to specify a code point that has been
1036  # specified as an overlong.
1037  my $correct_bytes_for_overlong;
1038
1039  # Is this test malformed from the beginning?  If so, we know to generally
1040  # expect that the tests will show it isn't valid.
1041  my $initially_malformed = 0;
1042
1043  if ($initially_overlong || $initially_orphan) {
1044      $non_cp_trailing_text = "if you see this, there is an error";
1045      $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1046      $initially_malformed = 1;
1047      $utf8n_flag_to_warn     = 0;
1048      $utf8n_flag_to_disallow = 0;
1049
1050      $utf8n_flag_to_warn_complement =     $::UTF8_WARN_SURROGATE;
1051      $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
1052      if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
1053          $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_SUPER;
1054          $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
1055          if (($allowed_uv & 0xFFFF) != 0xFFFF) {
1056              $utf8n_flag_to_warn_complement      |= $::UTF8_WARN_NONCHAR;
1057              $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_NONCHAR;
1058          }
1059      }
1060      if (! is_extended_utf8($bytes)) {
1061          $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
1062          $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_PERL_EXTENDED;
1063      }
1064
1065      $controlling_warning_category = 'utf8';
1066
1067      if ($initially_overlong) {
1068          if (! defined $needed_to_discern_len) {
1069              $needed_to_discern_len = overlong_discern_len($bytes);
1070          }
1071          $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
1072      }
1073  }
1074  elsif($will_overflow || $allowed_uv > 0x10FFFF) {
1075
1076      # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
1077      $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
1078      $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
1079      $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
1080      $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
1081
1082      # Below, we add the flags for non-perl_extended to the code points
1083      # that don't fit that category.  Special tests are done for this
1084      # category in the inner loop.
1085      $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
1086                                          |$::UTF8_WARN_SURROGATE;
1087      $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1088                                          |$::UTF8_DISALLOW_SURROGATE;
1089      $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
1090                                          |$::UNICODE_WARN_SURROGATE;
1091      $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1092                                          |$::UNICODE_DISALLOW_SURROGATE;
1093      $controlling_warning_category = 'non_unicode';
1094
1095      if ($will_overflow) {  # This is realy a malformation
1096          $non_cp_trailing_text = "if you see this, there is an error";
1097          $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1098          $initially_malformed = 1;
1099          if (! defined $needed_to_discern_len) {
1100              $needed_to_discern_len = overflow_discern_len($length);
1101          }
1102      }
1103      elsif (requires_extended_utf8($allowed_uv)) {
1104          $cp_message_qr = $extended_cp_message_qr;
1105          $non_cp_trailing_text = $extended_non_cp_trailing_text;
1106          $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
1107      }
1108      else {
1109          $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1110                              \Q may not be portable\E/x;
1111          $non_cp_trailing_text = "is for a non-Unicode code point, may not"
1112                              . " be portable";
1113          $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_PERL_EXTENDED;
1114          $utf8n_flag_to_disallow_complement
1115                                          |= $::UTF8_DISALLOW_PERL_EXTENDED;
1116          $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
1117          $uvchr_flag_to_disallow_complement
1118                                      |= $::UNICODE_DISALLOW_PERL_EXTENDED;
1119      }
1120  }
1121  elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
1122      $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
1123      $non_cp_trailing_text = "is for a surrogate";
1124      $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
1125      $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
1126
1127      $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
1128      $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
1129      $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
1130      $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
1131
1132      $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
1133                                          |$::UTF8_WARN_SUPER
1134                                          |$::UTF8_WARN_PERL_EXTENDED;
1135      $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1136                                          |$::UTF8_DISALLOW_SUPER
1137                                          |$::UTF8_DISALLOW_PERL_EXTENDED;
1138      $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
1139                                          |$::UNICODE_WARN_SUPER
1140                                          |$::UNICODE_WARN_PERL_EXTENDED;
1141      $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1142                                          |$::UNICODE_DISALLOW_SUPER
1143                                          |$::UNICODE_DISALLOW_PERL_EXTENDED;
1144      $controlling_warning_category = 'surrogate';
1145  }
1146  elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
1147          || ($allowed_uv & 0xFFFE) == 0xFFFE)
1148  {
1149      $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
1150                          \Q is not recommended for open interchange\E/x;
1151      $non_cp_trailing_text = "if you see this, there is an error";
1152      $needed_to_discern_len = $length unless defined $needed_to_discern_len;
1153      if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
1154          || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
1155      {
1156          $skip_most_tests = 1;
1157      }
1158
1159      $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
1160      $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
1161      $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
1162      $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
1163
1164      $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
1165                                          |$::UTF8_WARN_SUPER
1166                                          |$::UTF8_WARN_PERL_EXTENDED;
1167      $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
1168                                          |$::UTF8_DISALLOW_SUPER
1169                                          |$::UTF8_DISALLOW_PERL_EXTENDED;
1170      $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
1171                                          |$::UNICODE_WARN_SUPER
1172                                          |$::UNICODE_WARN_PERL_EXTENDED;
1173      $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
1174                                          |$::UNICODE_DISALLOW_SUPER
1175                                          |$::UNICODE_DISALLOW_PERL_EXTENDED;
1176
1177      $controlling_warning_category = 'nonchar';
1178  }
1179  else {
1180      die "Can't figure out what type of warning to test for $testname"
1181  }
1182
1183  die 'Didn\'t set $needed_to_discern_len for ' . $testname
1184                                      unless defined $needed_to_discern_len;
1185
1186  # We try various combinations of malformations that can occur
1187  foreach my $short (0, 1) {
1188    next if $skip_most_tests && $short;
1189    foreach my $unexpected_noncont (0, 1) {
1190      next if $skip_most_tests && $unexpected_noncont;
1191      foreach my $overlong (0, 1) {
1192        next if $overlong && $skip_most_tests;
1193        next if $initially_overlong && ! $overlong;
1194
1195        # If we're creating an overlong, it can't be longer than the
1196        # maximum length, so skip if we're already at that length.
1197        next if   (! $initially_overlong && $overlong)
1198                  &&  $length >= $::max_bytes;
1199
1200        my $this_cp_message_qr = $cp_message_qr;
1201        my $this_non_cp_trailing_text = $non_cp_trailing_text;
1202
1203        foreach my $malformed_allow_type (0..2) {
1204          # 0 don't allow this malformation; ignored if no malformation
1205          # 1 allow, with REPLACEMENT CHARACTER returned
1206          # 2 allow, with intended code point returned.  All malformations
1207          #   other than overlong can't determine the intended code point,
1208          #   so this isn't valid for them.
1209          next if     $malformed_allow_type == 2
1210                  && ($will_overflow || $short || $unexpected_noncont);
1211          next if $skip_most_tests && $malformed_allow_type;
1212
1213          # Here we are in the innermost loop for malformations.  So we
1214          # know which ones are in effect.  Can now change the input to be
1215          # appropriately malformed.  We also can set up certain other
1216          # things now, like whether we expect a return flag from this
1217          # malformation, and which flag.
1218
1219          my $this_bytes = $bytes;
1220          my $this_length = $length;
1221          my $this_expected_len = $length;
1222          my $this_needed_to_discern_len = $needed_to_discern_len;
1223
1224          my @malformation_names;
1225          my @expected_malformation_warnings;
1226          my @expected_malformation_return_flags;
1227
1228          # Contains the flags for any allowed malformations.  Currently no
1229          # combinations of on/off are tested for.  It's either all are
1230          # allowed, or none are.
1231          my $allow_flags = 0;
1232          my $overlong_is_in_perl_extended_utf8 = 0;
1233          my $dont_use_overlong_cp = 0;
1234
1235          if ($initially_orphan) {
1236              next if $overlong || $short || $unexpected_noncont;
1237          }
1238
1239          if ($overlong) {
1240              if (! $initially_overlong) {
1241                  my $new_expected_len;
1242
1243                  # To force this malformation, we convert the original start
1244                  # byte into a continuation byte with the same data bits as
1245                  # originally. ...
1246                  my $start_byte = substr($this_bytes, 0, 1);
1247                  my $converted_to_continuation_byte
1248                                          = start_byte_to_cont($start_byte);
1249
1250                  # ... Then we prepend it with a known overlong sequence.
1251                  # This should evaluate to the exact same code point as the
1252                  # original.  We try to avoid an overlong using Perl
1253                  # extended UTF-8.  The code points are the highest
1254                  # representable as overlongs on the respective platform
1255                  # without using extended UTF-8.
1256                  if (native_to_I8($start_byte) lt "\xFC") {
1257                      $start_byte = I8_to_native("\xFC");
1258                      $new_expected_len = 6;
1259                  }
1260                  elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
1261
1262                      # FE is not extended UTF-8 on EBCDIC
1263                      $start_byte = I8_to_native("\xFE");
1264                      $new_expected_len = 7;
1265                  }
1266                  else {  # Must use extended UTF-8.  On ASCII platforms, we
1267                          # could express some overlongs here starting with
1268                          # \xFE, but there's no real reason to do so.
1269                      $overlong_is_in_perl_extended_utf8 = 1;
1270                      $start_byte = I8_to_native("\xFF");
1271                      $new_expected_len = $::max_bytes;
1272                      $this_cp_message_qr = $extended_cp_message_qr;
1273
1274                      # The warning that gets raised doesn't include the
1275                      # code point in the message if the code point can be
1276                      # expressed without using extended UTF-8, but the
1277                      # particular overlong sequence used is in extended
1278                      # UTF-8.  To do otherwise would be confusing to the
1279                      # user, as it would claim the code point requires
1280                      # extended, when it doesn't.
1281                      $dont_use_overlong_cp = 1
1282                                  unless requires_extended_utf8($allowed_uv);
1283                      $this_non_cp_trailing_text
1284                                            = $extended_non_cp_trailing_text;
1285                  }
1286
1287                  # Splice in the revise continuation byte, preceded by the
1288                  # start byte and the proper number of the lowest
1289                  # continuation bytes.
1290                  $this_bytes =   $start_byte
1291                              . ($native_lowest_continuation_chr
1292                                  x (  $new_expected_len
1293                                      - 1
1294                                      - length($this_bytes)))
1295                              .  $converted_to_continuation_byte
1296                              .  substr($this_bytes, 1);
1297                  $this_length = length($this_bytes);
1298                  $this_needed_to_discern_len =    $new_expected_len
1299                                              - (  $this_expected_len
1300                                              - $this_needed_to_discern_len);
1301                  $this_expected_len = $new_expected_len;
1302              }
1303          }
1304
1305          if ($short) {
1306
1307              # To force this malformation, just tell the test to not look
1308              # as far as it should into the input.
1309              $this_length--;
1310              $this_expected_len--;
1311
1312              $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
1313          }
1314
1315          if ($unexpected_noncont) {
1316
1317              # To force this malformation, change the final continuation
1318              # byte into a start byte.
1319              my $pos = ($short) ? -2 : -1;
1320              substr($this_bytes, $pos, 1) = $known_start_byte;
1321              $this_expected_len--;
1322          }
1323
1324          # The whole point of a test that is malformed from the beginning
1325          # is to test for that malformation.  If we've modified things so
1326          # much that we don't have enough information to detect that
1327          # malformation, there's no point in testing.
1328          next if    $initially_malformed
1329                  && $this_expected_len < $this_needed_to_discern_len;
1330
1331          # Here, we've transformed the input with all of the desired
1332          # non-overflow malformations.  We are now in a position to
1333          # construct any potential warnings for those malformations.  But
1334          # it's a pain to get the detailed messages exactly right, so for
1335          # now XXX, only do so for those that return an explicit code
1336          # point.
1337
1338          if ($initially_orphan) {
1339              push @malformation_names, "orphan continuation";
1340              push @expected_malformation_return_flags,
1341                                                  $::UTF8_GOT_CONTINUATION;
1342              $allow_flags |= $::UTF8_ALLOW_CONTINUATION
1343                                                  if $malformed_allow_type;
1344              push @expected_malformation_warnings, qr/unexpected continuation/;
1345          }
1346
1347          if ($overlong) {
1348              push @malformation_names, 'overlong';
1349              push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
1350
1351              # If one of the other malformation types is also in effect, we
1352              # don't know what the intended code point was.
1353              if ($short || $unexpected_noncont || $will_overflow) {
1354                  push @expected_malformation_warnings, qr/overlong/;
1355              }
1356              else {
1357                  my $wrong_bytes = display_bytes_no_quotes(
1358                                        substr($this_bytes, 0, $this_length));
1359                  if (! defined $correct_bytes_for_overlong) {
1360                      $correct_bytes_for_overlong
1361                                          = display_bytes_no_quotes($bytes);
1362                  }
1363                  my $prefix = (   $allowed_uv > 0x10FFFF
1364                                || ! isASCII && $allowed_uv < 256)
1365                                ? "0x"
1366                                : "U+";
1367                  push @expected_malformation_warnings,
1368                          qr/\QMalformed UTF-8 character: $wrong_bytes\E
1369                              \Q (overlong; instead use\E
1370                              \Q $correct_bytes_for_overlong to\E
1371                              \Q represent $prefix$uv_string)/x;
1372              }
1373
1374              if ($malformed_allow_type == 2) {
1375                  $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1376              }
1377              elsif ($malformed_allow_type) {
1378                  $allow_flags |= $::UTF8_ALLOW_LONG;
1379              }
1380          }
1381          if ($short) {
1382              push @malformation_names, 'short';
1383              push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1384              push @expected_malformation_warnings, qr/too short/;
1385          }
1386          if ($unexpected_noncont) {
1387              push @malformation_names, 'unexpected non-continuation';
1388              push @expected_malformation_return_flags,
1389                              $::UTF8_GOT_NON_CONTINUATION;
1390              $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1391                                                  if $malformed_allow_type;
1392              push @expected_malformation_warnings,
1393                                      qr/unexpected non-continuation byte/;
1394          }
1395
1396          # The overflow malformation is done differently than other
1397          # malformations.  It comes from manually typed tests in the test
1398          # array.  We now make it be treated like one of the other
1399          # malformations.  But some has to be deferred until the inner loop
1400          my $overflow_msg_pattern;
1401          if ($will_overflow) {
1402              push @malformation_names, 'overflow';
1403
1404              $overflow_msg_pattern = display_bytes_no_quotes(
1405                                  substr($this_bytes, 0, $this_expected_len));
1406              $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1407                                          \Q $overflow_msg_pattern\E
1408                                          \Q (overflows)\E/x;
1409              push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1410              $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1411          }
1412
1413          # And we can create the malformation-related text for the the test
1414          # names we eventually will generate.
1415          my $malformations_name = "";
1416          if (@malformation_names) {
1417              $malformations_name .= "dis" unless $malformed_allow_type;
1418              $malformations_name .= "allowed ";
1419              $malformations_name .= "malformation";
1420              $malformations_name .= "s" if @malformation_names > 1;
1421              $malformations_name .= ": ";
1422              $malformations_name .=  join "/", @malformation_names;
1423              $malformations_name =  " ($malformations_name)";
1424          }
1425
1426          # Done setting up the malformation related stuff
1427
1428          {   # First test the isFOO calls
1429              use warnings; # XXX no warnings 'deprecated';   # Make sure these don't raise warnings
1430              undef @warnings_gotten;
1431
1432              my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
1433              my $ret_flags
1434                      = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
1435              if ($malformations_name) {
1436                  is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
1437                  is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
1438              }
1439              else {
1440                  is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
1441                                        . " expected length: $this_length");
1442                  is($ret_flags, $this_length,
1443                      "    And isUTF8_CHAR_flags(...,0) returns expected"
1444                    . " length: $this_length");
1445              }
1446              is(scalar @warnings_gotten, 0,
1447                  "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
1448                . " generated any warnings")
1449              or output_warnings(@warnings_gotten);
1450
1451              undef @warnings_gotten;
1452              $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
1453              if ($malformations_name) {
1454                  is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
1455              }
1456              else {
1457                  my $expected_ret
1458                              = (   $testname =~ /surrogate|non-character/
1459                                  || $allowed_uv > 0x10FFFF)
1460                                ? 0
1461                                : $this_length;
1462                  is($ret, $expected_ret,
1463                      "    And isSTRICT_UTF8_CHAR() returns expected"
1464                    . " length: $expected_ret");
1465                  $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1466                                      $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1467                  is($ret, $expected_ret,
1468                      "    And isUTF8_CHAR_flags('"
1469                    . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
1470                    . " isSTRICT_UTF8_CHAR");
1471              }
1472              is(scalar @warnings_gotten, 0,
1473                      "    And neither isSTRICT_UTF8_CHAR() nor"
1474                    . " isUTF8_CHAR_flags generated any warnings")
1475              or output_warnings(@warnings_gotten);
1476
1477              undef @warnings_gotten;
1478              $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
1479              if ($malformations_name) {
1480                  is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
1481              }
1482              else {
1483                  my $expected_ret = (   $testname =~ /surrogate/
1484                                      || $allowed_uv > 0x10FFFF)
1485                                      ? 0
1486                                      : $this_expected_len;
1487                  is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
1488                                        . " returns expected length:"
1489                                        . " $expected_ret");
1490                  $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1491                                  $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1492                  is($ret, $expected_ret,
1493                      "    And isUTF8_CHAR_flags('"
1494                    . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
1495                    . " isC9_STRICT_UTF8_CHAR");
1496              }
1497              is(scalar @warnings_gotten, 0,
1498                      "    And neither isC9_STRICT_UTF8_CHAR() nor"
1499                    . " isUTF8_CHAR_flags generated any warnings")
1500              or output_warnings(@warnings_gotten);
1501
1502              foreach my $disallow_type (0..2) {
1503                  # 0 is don't disallow this type of code point
1504                  # 1 is do disallow
1505                  # 2 is do disallow, but only code points requiring
1506                  #   perl-extended-UTF8
1507
1508                  my $disallow_flags;
1509                  my $expected_ret;
1510
1511                  if ($malformations_name) {
1512
1513                      # Malformations are by default disallowed, so testing
1514                      # with $disallow_type equal to 0 is sufficicient.
1515                      next if $disallow_type;
1516
1517                      $disallow_flags = 0;
1518                      $expected_ret = 0;
1519                  }
1520                  elsif ($disallow_type == 1) {
1521                      $disallow_flags = $utf8n_flag_to_disallow;
1522                      $expected_ret = 0;
1523                  }
1524                  elsif ($disallow_type == 2) {
1525                      next if ! requires_extended_utf8($allowed_uv);
1526                      $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
1527                      $expected_ret = 0;
1528                  }
1529                  else {  # type is 0
1530                      $disallow_flags = $utf8n_flag_to_disallow_complement;
1531                      $expected_ret = $this_length;
1532                  }
1533
1534                  $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1535                                                $disallow_flags);
1536                  is($ret, $expected_ret,
1537                            "    And isUTF8_CHAR_flags($display_bytes,"
1538                          . " $disallow_flags) returns $expected_ret")
1539                    or diag "The flags mean "
1540                          . flags_to_text($disallow_flags,
1541                                          \@utf8n_flags_to_text);
1542                  is(scalar @warnings_gotten, 0,
1543                          "    And isUTF8_CHAR_flags(...) generated"
1544                        . " no warnings")
1545                    or output_warnings(@warnings_gotten);
1546
1547                  # Test partial character handling, for each byte not a
1548                  # full character
1549                  my $did_test_partial = 0;
1550                  for (my $j = 1; $j < $this_length - 1; $j++) {
1551                      $did_test_partial = 1;
1552                      my $partial = substr($this_bytes, 0, $j);
1553                      my $ret_should_be;
1554                      my $comment;
1555                      if ($disallow_type || $malformations_name) {
1556                          $ret_should_be = 0;
1557                          $comment = "disallowed";
1558
1559                          # The number of bytes required to tell if a
1560                          # sequence has something wrong is the smallest of
1561                          # all the things wrong with it.  We start with the
1562                          # number for this type of code point, if that is
1563                          # disallowed; or the whole length if not.  The
1564                          # latter is what a couple of the malformations
1565                          # require.
1566                          my $needed_to_tell = ($disallow_type)
1567                                                ? $this_needed_to_discern_len
1568                                                : $this_expected_len;
1569
1570                          # Then we see if the malformations that are
1571                          # detectable early in the string are present.
1572                          if ($overlong) {
1573                              my $dl = overlong_discern_len($this_bytes);
1574                              $needed_to_tell = $dl if $dl < $needed_to_tell;
1575                          }
1576                          if ($will_overflow) {
1577                              my $dl = overflow_discern_len($length);
1578                              $needed_to_tell = $dl if $dl < $needed_to_tell;
1579                          }
1580
1581                          if ($j < $needed_to_tell) {
1582                              $ret_should_be = 1;
1583                              $comment .= ", but need $needed_to_tell"
1584                                        . " bytes to discern:";
1585                          }
1586                      }
1587                      else {
1588                          $ret_should_be = 1;
1589                          $comment = "allowed";
1590                      }
1591
1592                      undef @warnings_gotten;
1593
1594                      $ret = test_is_utf8_valid_partial_char_flags($partial,
1595                                                      $j, $disallow_flags);
1596                      is($ret, $ret_should_be,
1597                          "    And is_utf8_valid_partial_char_flags("
1598                          . display_bytes($partial)
1599                          . ", $disallow_flags), $comment: returns"
1600                          . " $ret_should_be")
1601                      or diag "The flags mean "
1602                      . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
1603                  }
1604
1605                  if ($did_test_partial) {
1606                      is(scalar @warnings_gotten, 0,
1607                          "    And is_utf8_valid_partial_char_flags()"
1608                          . " generated no warnings for any of the lengths")
1609                        or output_warnings(@warnings_gotten);
1610                  }
1611              }
1612          }
1613
1614          # Now test the to/from UTF-8 calls.  There are several orthogonal
1615          # variables involved.  We test most possible combinations
1616
1617          foreach my $do_disallow (0, 1) {
1618            if ($do_disallow) {
1619              next if $initially_overlong || $initially_orphan;
1620            }
1621            else {
1622              next if $skip_most_tests;
1623            }
1624
1625            # This tests four functions: utf8n_to_uvchr_error,
1626            # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and
1627            # uvchr_to_utf8_msgs.  The first two are variants of each other,
1628            # and the final two also form a pair.  We use a loop 'which_func'
1629            # to determine which of each pair is being tested.  The main loop
1630            # tests either the first and third, or the 2nd and fourth.
1631            # which_func is sets whether we are expecting warnings or not in
1632            # certain places.  The _msgs() version of the functions expects
1633            # warnings even if lexical ones are turned off, so by making its
1634            # which_func == 1, we can say we want warnings; whereas the other
1635            # one with the value 0, doesn't get them.
1636            for my $which_func (0, 1) {
1637              my $utf8_func = ($which_func)
1638                          ? 'utf8n_to_uvchr_msgs'
1639                          : 'utf8n_to_uvchr_error';
1640
1641              # We classify the warnings into certain "interesting" types,
1642              # described later
1643              foreach my $warning_type (0..4) {
1644                next if $skip_most_tests && $warning_type != 1;
1645                foreach my $use_warn_flag (0, 1) {
1646                    if ($use_warn_flag) {
1647                        next if $initially_overlong || $initially_orphan;
1648
1649                        # Since foo_msgs() expects warnings even when lexical
1650                        # ones are turned off, we can skip testing it when
1651                        # they are turned on, with little likelihood of
1652                        # missing an error case.
1653                        next if $which_func;
1654                    }
1655                    else {
1656                        next if $skip_most_tests;
1657                    }
1658
1659                    # Finally, here is the inner loop
1660
1661                    my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1662                    my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1663                    my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1664                    my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1665
1666                    my $eval_warn;
1667                    my $expect_regular_warnings;
1668                    my $expect_warnings_for_malformed;
1669                    my $expect_warnings_for_overflow;
1670
1671                    if ($warning_type == 0) {
1672                        $eval_warn = "use warnings";
1673                        $expect_regular_warnings = $use_warn_flag;
1674
1675                        # We ordinarily expect overflow warnings here.  But it
1676                        # is somewhat more complicated, and the final
1677                        # determination is deferred to one place in the file
1678                        # where we handle overflow.
1679                        $expect_warnings_for_overflow = 1;
1680
1681                        # We would ordinarily expect malformed warnings in
1682                        # this case, but not if malformations are allowed.
1683                        $expect_warnings_for_malformed
1684                                                = $malformed_allow_type == 0;
1685                    }
1686                    elsif ($warning_type == 1) {
1687                        $eval_warn = "no warnings";
1688                        $expect_regular_warnings = $which_func;
1689                        $expect_warnings_for_overflow = $which_func;
1690                        $expect_warnings_for_malformed = $which_func;
1691                    }
1692                    elsif ($warning_type == 2) {
1693                        $eval_warn = "no warnings; use warnings 'utf8'";
1694                        $expect_regular_warnings = $use_warn_flag;
1695                        $expect_warnings_for_overflow = 1;
1696                        $expect_warnings_for_malformed
1697                                                = $malformed_allow_type == 0;
1698                    }
1699                    elsif ($warning_type == 3) {
1700                        $eval_warn = "no warnings; use warnings"
1701                                   . " '$controlling_warning_category'";
1702                        $expect_regular_warnings = $use_warn_flag;
1703                        $expect_warnings_for_overflow
1704                            = $controlling_warning_category eq 'non_unicode';
1705                        $expect_warnings_for_malformed = $which_func;
1706                    }
1707                    elsif ($warning_type == 4) {  # Like type 3, but uses the
1708                                                  # PERL_EXTENDED flags
1709                        # The complement flags were set up so that the
1710                        # PERL_EXTENDED flags have been tested that they don't
1711                        # trigger wrongly for too small code points.  And the
1712                        # flags have been set up so that those small code
1713                        # points are tested for being above Unicode.  What's
1714                        # left to test is that the large code points do
1715                        # trigger the PERL_EXTENDED flags.
1716                        next if ! requires_extended_utf8($allowed_uv);
1717                        next if $controlling_warning_category ne 'non_unicode';
1718                        $eval_warn = "no warnings; use warnings 'non_unicode'";
1719                        $expect_regular_warnings = 1;
1720                        $expect_warnings_for_overflow = 1;
1721                        $expect_warnings_for_malformed = 0;
1722                        $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1723                        $this_utf8n_flag_to_disallow
1724                                             = $::UTF8_DISALLOW_PERL_EXTENDED;
1725                        $this_uvchr_flag_to_warn
1726                                              = $::UNICODE_WARN_PERL_EXTENDED;
1727                        $this_uvchr_flag_to_disallow
1728                                          = $::UNICODE_DISALLOW_PERL_EXTENDED;
1729                    }
1730                    else {
1731                       die "Unexpected warning type '$warning_type'";
1732                    }
1733
1734                    # We only need to test the case where all warnings are
1735                    # enabled (type 0) to see if turning off the warning flag
1736                    # causes things to not be output.  If those pass, then
1737                    # turning on some sub-category of warnings, or turning off
1738                    # warnings altogether are extremely likely to not output
1739                    # warnings either, given how the warnings subsystem is
1740                    # supposed to work, and this file assumes it does work.
1741                    next if $warning_type != 0 && ! $use_warn_flag;
1742
1743                    # The convention is that the 'got' flag is the same value
1744                    # as the disallow one.  If this were violated, the tests
1745                    # here should start failing.
1746                    my $return_flag = $this_utf8n_flag_to_disallow;
1747
1748                    # If we aren't expecting warnings/disallow for this, turn
1749                    # on all the other flags.  That makes sure that they all
1750                    # are independent of this flag, and so we don't need to
1751                    # test them individually.
1752                    my $this_warning_flags
1753                            = ($use_warn_flag)
1754                              ? $this_utf8n_flag_to_warn
1755                              : ($overlong_is_in_perl_extended_utf8
1756                                ? ($utf8n_flag_to_warn_complement
1757                                    & ~$::UTF8_WARN_PERL_EXTENDED)
1758                                :  $utf8n_flag_to_warn_complement);
1759                    my $this_disallow_flags
1760                            = ($do_disallow)
1761                              ? $this_utf8n_flag_to_disallow
1762                              : ($overlong_is_in_perl_extended_utf8
1763                                 ? ($utf8n_flag_to_disallow_complement
1764                                    & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1765                                 :  $utf8n_flag_to_disallow_complement);
1766                    my $expected_uv = $allowed_uv;
1767                    my $this_uv_string = $uv_string;
1768
1769                    my @expected_return_flags
1770                                        = @expected_malformation_return_flags;
1771                    my @expected_warnings;
1772                    push @expected_warnings, @expected_malformation_warnings
1773                                            if $expect_warnings_for_malformed;
1774
1775                    # The overflow malformation is done differently than other
1776                    # malformations.  It comes from manually typed tests in
1777                    # the test array, but it also is above Unicode and uses
1778                    # Perl extended UTF-8, so affects some of the flags being
1779                    # tested.  We now make it be treated like one of the other
1780                    # generated malformations.
1781                    if ($will_overflow) {
1782
1783                        # An overflow is (way) above Unicode, and overrides
1784                        # everything else.
1785                        $expect_regular_warnings = 0;
1786
1787                        # Earlier, we tentatively calculated whether this
1788                        # should emit a message or not.  It's tentative
1789                        # because, even if we ordinarily would output it, we
1790                        # don't if malformations are allowed -- except an
1791                        # overflow is also a SUPER and PERL_EXTENDED, and if
1792                        # warnings for those are enabled, the overflow
1793                        # warning does get raised.
1794                        if (   $expect_warnings_for_overflow
1795                            && (    $malformed_allow_type == 0
1796                                ||   (   $this_warning_flags
1797                                      & ($::UTF8_WARN_SUPER
1798                                        |$::UTF8_WARN_PERL_EXTENDED))))
1799                        {
1800                            push @expected_warnings, $overflow_msg_pattern;
1801                        }
1802                    }
1803
1804                    # It may be that the malformations have shortened the
1805                    # amount of input we look at so much that we can't tell
1806                    # what the category the code point was in.  Otherwise, set
1807                    # up the expected return flags based on the warnings and
1808                    # disallowments.
1809                    if ($this_expected_len < $this_needed_to_discern_len) {
1810                        $expect_regular_warnings = 0;
1811                    }
1812                    elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1813                           || (  $this_disallow_flags
1814                               & $this_utf8n_flag_to_disallow))
1815                    {
1816                        push @expected_return_flags, $return_flag;
1817                    }
1818
1819                    # Finish setting up the expected warning.
1820                    if ($expect_regular_warnings) {
1821
1822                        # So far the array contains warnings generated by
1823                        # malformations.  Add the expected regular one.
1824                        unshift @expected_warnings, $this_cp_message_qr;
1825
1826                        # But it may need to be modified, because either of
1827                        # these malformations means we can't determine the
1828                        # expected code point.
1829                        if (   $short || $unexpected_noncont
1830                            || $dont_use_overlong_cp)
1831                        {
1832                            my $first_byte = substr($this_bytes, 0, 1);
1833                            $expected_warnings[0] = display_bytes(
1834                                    substr($this_bytes, 0, $this_expected_len));
1835                            $expected_warnings[0]
1836                                = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1837                                     \Q $expected_warnings[0]\E
1838                                     \Q $this_non_cp_trailing_text\E/x;
1839                        }
1840                    }
1841
1842                    # Is effectively disallowed if we've set up a malformation
1843                    # (unless malformations are allowed), even if the flag
1844                    # indicates it is allowed.  Fix up test name to indicate
1845                    # this as well
1846                    my $disallowed = 0;
1847                    if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1848                        && $this_expected_len >= $this_needed_to_discern_len)
1849                    {
1850                        $disallowed = 1;
1851                    }
1852                    if ($malformations_name) {
1853                        if ($malformed_allow_type == 0) {
1854                            $disallowed = 1;
1855                        }
1856                        elsif ($malformed_allow_type == 1) {
1857
1858                            # Even if allowed, the malformation returns the
1859                            # REPLACEMENT CHARACTER.
1860                            $expected_uv = 0xFFFD;
1861                            $this_uv_string = "0xFFFD"
1862                        }
1863                    }
1864
1865                    my $this_name = "$utf8_func() $testname: ";
1866                    my @scratch_expected_return_flags = @expected_return_flags;
1867                    if (! $initially_malformed) {
1868                        $this_name .= ($disallowed)
1869                                       ? 'disallowed, '
1870                                       : 'allowed, ';
1871                    }
1872                    $this_name .= "$eval_warn";
1873                    $this_name .= ", " . ((  $this_warning_flags
1874                                            & $this_utf8n_flag_to_warn)
1875                                          ? 'with flag for raising warnings'
1876                                          : 'no flag for raising warnings');
1877                    $this_name .= $malformations_name;
1878
1879                    # Do the actual test using an eval
1880                    undef @warnings_gotten;
1881                    my $ret_ref;
1882                    my $this_flags
1883                        = $allow_flags|$this_warning_flags|$this_disallow_flags;
1884                    my $eval_text =      "$eval_warn; \$ret_ref"
1885                            . " = test_$utf8_func("
1886                            . "'$this_bytes', $this_length, $this_flags)";
1887                    eval "$eval_text";
1888                    if (! ok ($@ eq "", "$this_name: eval succeeded"))
1889                    {
1890                        diag "\$@='$@'; call was: "
1891                           . utf8n_display_call($eval_text);
1892                        next;
1893                    }
1894
1895                    if ($disallowed) {
1896                        is($ret_ref->[0], 0, "    And returns 0")
1897                          or diag "Call was: " . utf8n_display_call($eval_text);
1898                    }
1899                    else {
1900                        is($ret_ref->[0], $expected_uv,
1901                                "    And returns expected uv: "
1902                              . $this_uv_string)
1903                          or diag "Call was: " . utf8n_display_call($eval_text);
1904                    }
1905                    is($ret_ref->[1], $this_expected_len,
1906                                        "    And returns expected length:"
1907                                      . " $this_expected_len")
1908                      or diag "Call was: " . utf8n_display_call($eval_text);
1909
1910                    my $returned_flags = $ret_ref->[2];
1911
1912                    for (my $i = @scratch_expected_return_flags - 1;
1913                         $i >= 0;
1914                         $i--)
1915                    {
1916                      if ($scratch_expected_return_flags[$i] & $returned_flags)
1917                      {
1918                          if ($scratch_expected_return_flags[$i]
1919                                              == $::UTF8_GOT_PERL_EXTENDED)
1920                          {
1921                              pass("    Expected and got return flag for"
1922                                  . " PERL_EXTENDED");
1923                          }
1924                                  # The first entries in this are
1925                                  # malformations
1926                          elsif ($i > @malformation_names - 1)  {
1927                              pass("    Expected and got return flag"
1928                                  . " for " . $controlling_warning_category);
1929                          }
1930                          else {
1931                              pass("    Expected and got return flag for "
1932                                  . $malformation_names[$i]
1933                                  . " malformation");
1934                          }
1935                          $returned_flags
1936                                      &= ~$scratch_expected_return_flags[$i];
1937                          splice @scratch_expected_return_flags, $i, 1;
1938                      }
1939                    }
1940
1941                    if (! is($returned_flags, 0,
1942                       "    Got no unexpected return flags"))
1943                    {
1944                        diag "The unexpected flags gotten were: "
1945                           . (flags_to_text($returned_flags,
1946                                            \@utf8n_flags_to_text)
1947                                # We strip off any prefixes from the flag
1948                                # names
1949                             =~ s/ \b [A-Z] _ //xgr);
1950                        diag "Call was: " . utf8n_display_call($eval_text);
1951                    }
1952
1953                    if (! is (scalar @scratch_expected_return_flags, 0,
1954                        "    Got all expected return flags"))
1955                    {
1956                        diag "The expected flags not gotten were: "
1957                           . (flags_to_text(eval join("|",
1958                                                @scratch_expected_return_flags),
1959                                            \@utf8n_flags_to_text)
1960                                # We strip off any prefixes from the flag
1961                                # names
1962                             =~ s/ \b [A-Z] _ //xgr);
1963                        diag "Call was: " . utf8n_display_call($eval_text);
1964                    }
1965
1966                    if ($which_func) {
1967                        my @returned_warnings;
1968                        for my $element_ref (@{$ret_ref->[3]}) {
1969                            push @returned_warnings, $element_ref->{'text'};
1970                            my $text = $element_ref->{'text'};
1971                            my $flag = $element_ref->{'flag_bit'};
1972                            my $category = $element_ref->{'warning_category'};
1973
1974                            if (! ok(($flag & ($flag-1)) == 0,
1975                                      "flag for returned msg is a single bit"))
1976                            {
1977                              diag sprintf("flags are %x; msg=%s", $flag, $text);
1978                            }
1979                            else {
1980                              if (grep { $_ == $flag } @expected_return_flags) {
1981                                  pass("flag for returned msg is expected");
1982                              }
1983                              else {
1984                                  fail("flag ("
1985                                     . flags_to_text($flag, \@utf8n_flags_to_text)
1986                                     . ") for returned msg is expected");
1987                              }
1988                            }
1989
1990                            # In perl space, don't know the category numbers
1991                            isnt($category, 0,
1992                                          "returned category for msg isn't 0");
1993                        }
1994
1995                        ok(@warnings_gotten == 0, "$utf8_func raised no warnings;"
1996                              . " the next tests are for ones in the returned"
1997                              . " variable")
1998                            or diag join "\n", "The unexpected warnings were:",
1999                                                              @warnings_gotten;
2000                        @warnings_gotten = @returned_warnings;
2001                    }
2002
2003                    do_warnings_test(@expected_warnings)
2004                      or diag "Call was: " . utf8n_display_call($eval_text);
2005                    undef @warnings_gotten;
2006
2007                    # Check CHECK_ONLY results when the input is
2008                    # disallowed.  Do this when actually disallowed,
2009                    # not just when the $this_disallow_flags is set.  We only
2010                    # test once utf8n_to_uvchr_msgs() with this.
2011                    if (   $disallowed
2012                        && ($which_func == 0 || ! $tested_CHECK_ONLY))
2013                    {
2014                        $tested_CHECK_ONLY = 1;
2015                        my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
2016                        my $eval_text = "use warnings; \$ret_ref ="
2017                                      . " test_$utf8_func('"
2018                                      . "$this_bytes', $this_length,"
2019                                      . " $this_flags)";
2020                        eval $eval_text;
2021                        if (! ok ($@ eq "",
2022                            "    And eval succeeded with CHECK_ONLY"))
2023                        {
2024                            diag "\$@='$@'; Call was: "
2025                               . utf8n_display_call($eval_text);
2026                            next;
2027                        }
2028                        is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
2029                          or diag "Call was: " . utf8n_display_call($eval_text);
2030                        is($ret_ref->[1], -1,
2031                                       "    CHECK_ONLY: returns -1 for length")
2032                          or diag "Call was: " . utf8n_display_call($eval_text);
2033                        if (! is(scalar @warnings_gotten, 0,
2034                                      "    CHECK_ONLY: no warnings generated"))
2035                        {
2036                            diag "Call was: " . utf8n_display_call($eval_text);
2037                            output_warnings(@warnings_gotten);
2038                        }
2039                    }
2040
2041                    # Now repeat some of the above, but for
2042                    # uvchr_to_utf8_flags().  Since this comes from an
2043                    # existing code point, it hasn't overflowed, and isn't
2044                    # malformed.
2045                    next if @malformation_names;
2046
2047                    my $uvchr_func = ($which_func)
2048                                     ? 'uvchr_to_utf8_flags_msgs'
2049                                     : 'uvchr_to_utf8_flags';
2050
2051                    $this_warning_flags = ($use_warn_flag)
2052                                          ? $this_uvchr_flag_to_warn
2053                                          : 0;
2054                    $this_disallow_flags = ($do_disallow)
2055                                           ? $this_uvchr_flag_to_disallow
2056                                           : 0;
2057
2058                    $disallowed = $this_disallow_flags
2059                                & $this_uvchr_flag_to_disallow;
2060                    $this_name .= ", " . ((  $this_warning_flags
2061                                           & $this_utf8n_flag_to_warn)
2062                                          ? 'with flag for raising warnings'
2063                                          : 'no flag for raising warnings');
2064
2065                    $this_name = "$uvchr_func() $testname: "
2066                                        . (($disallowed)
2067                                           ? 'disallowed'
2068                                           : 'allowed');
2069                    $this_name .= ", $eval_warn";
2070                    $this_name .= ", " . ((  $this_warning_flags
2071                                           & $this_uvchr_flag_to_warn)
2072                                        ? 'with warning flag'
2073                                        : 'no warning flag');
2074
2075                    undef @warnings_gotten;
2076                    my $ret;
2077                    $this_flags = $this_warning_flags|$this_disallow_flags;
2078                    $eval_text = "$eval_warn; \$ret ="
2079                            . " test_$uvchr_func("
2080                            . "$allowed_uv, $this_flags)";
2081                    eval "$eval_text";
2082                    if (! ok ($@ eq "", "$this_name: eval succeeded"))
2083                    {
2084                        diag "\$@='$@'; call was: "
2085                           . uvchr_display_call($eval_text);
2086                        next;
2087                    }
2088
2089                    if ($which_func) {
2090                        if (defined $ret->[1]) {
2091                            my @returned_warnings;
2092                            push @returned_warnings, $ret->[1]{'text'};
2093                            my $text = $ret->[1]{'text'};
2094                            my $flag = $ret->[1]{'flag_bit'};
2095                            my $category = $ret->[1]{'warning_category'};
2096
2097                            if (! ok(($flag & ($flag-1)) == 0,
2098                                        "flag for returned msg is a single bit"))
2099                            {
2100                                diag sprintf("flags are %x; msg=%s", $flag, $text);
2101                            }
2102                            else {
2103                                if ($flag & $this_uvchr_flag_to_disallow) {
2104                                    pass("flag for returned msg is expected");
2105                                }
2106                                else {
2107                                    fail("flag ("
2108                                        . flags_to_text($flag, \@utf8n_flags_to_text)
2109                                        . ") for returned msg is expected");
2110                                }
2111                            }
2112
2113                            # In perl space, don't know the category numbers
2114                            isnt($category, 0,
2115                                            "returned category for msg isn't 0");
2116
2117                            ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;"
2118                                . " the next tests are for ones in the returned"
2119                                . " variable")
2120                                or diag join "\n", "The unexpected warnings were:",
2121                                                                @warnings_gotten;
2122                            @warnings_gotten = @returned_warnings;
2123                        }
2124
2125                        $ret = $ret->[0];
2126                    }
2127
2128                    if ($disallowed) {
2129                        is($ret, undef, "    And returns undef")
2130                          or diag "Call was: " . uvchr_display_call($eval_text);
2131                    }
2132                    else {
2133                        is($ret, $this_bytes, "    And returns expected string")
2134                          or diag "Call was: " . uvchr_display_call($eval_text);
2135                    }
2136
2137                    do_warnings_test(@expected_warnings)
2138                      or diag "Call was: " . uvchr_display_call($eval_text);
2139                }
2140              }
2141            }
2142          }
2143        }
2144      }
2145    }
2146  }
2147}
2148
2149done_testing;
2150