1#!perl -w
2
3BEGIN {
4    require 'loc_tools.pl';   # Contains locales_enabled() and
5                              # find_utf8_ctype_locale()
6}
7
8use strict;
9use Test::More;
10use Config;
11
12use XS::APItest;
13
14my $tab = " " x 4;  # Indent subsidiary tests this much
15
16use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
17my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
18
19sub get_charname($) {
20    my $cp = shift;
21
22    # If there is a an abbreviation for the code point name, use it
23    my $name_index = search_invlist(\@{$charname_list}, $cp);
24    if (defined $name_index) {
25        my $synonyms = $charname_map->[$name_index];
26        if (ref $synonyms) {
27            my $pat = qr/: abbreviation/;
28            my @abbreviations = grep { $_ =~ $pat } @$synonyms;
29            if (@abbreviations) {
30                return $abbreviations[0] =~ s/$pat//r;
31            }
32        }
33    }
34
35    # Otherwise, use the full name
36    use charnames ();
37    return charnames::viacode($cp) // "No name";
38}
39
40sub truth($) {  # Converts values so is() works
41    return (shift) ? 1 : 0;
42}
43
44my $base_locale;
45my $utf8_locale;
46if(locales_enabled('LC_ALL')) {
47    require POSIX;
48    $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
49    if (defined $base_locale && $base_locale eq 'C') {
50        use locale; # make \w work right in non-ASCII lands
51
52        # Some locale implementations don't have the 128-255 characters all
53        # mean nothing.  Skip the locale tests in that situation
54        for my $u (128 .. 255) {
55            if (chr(utf8::unicode_to_native($u)) =~ /[[:print:]]/) {
56                undef $base_locale;
57                last;
58            }
59        }
60
61        $utf8_locale = find_utf8_ctype_locale() if $base_locale;
62    }
63}
64
65sub get_display_locale_or_skip($$) {
66
67    # Helper function intimately tied to its callers.  It knows the loop
68    # iterates with a locale of "", meaning don't use locale; $base_locale
69    # meaning to use a non-UTF-8 locale; and $utf8_locale.
70    #
71    # It checks to see if the current test should be skipped or executed,
72    # returning an empty list for the former, and for the latter:
73    #   ( 'locale display name',
74    #     bool of is this a UTF-8 locale )
75    #
76    # The display name is the empty string if not using locale.  Functions
77    # with _LC in their name are skipped unless in locale, and functions
78    # without _LC are executed only outside locale.
79
80    my ($locale, $suffix) = @_;
81
82    # The test should be skipped if the input is for a non-existent locale
83    return unless defined $locale;
84
85    # Here the input is defined, either a locale name or "".  If the test is
86    # for not using locales, we want to do the test for non-LC functions,
87    # and skip it for LC ones.
88    if ($locale eq "") {
89        return ("", 0) if $suffix !~ /LC/;
90        return;
91    }
92
93    # Here the input is for a real locale.  We don't test the non-LC functions
94    # for locales.
95    return if $suffix !~ /LC/;
96
97    # Here is for a LC function and a real locale.  The base locale is not
98    # UTF-8.
99    return (" ($locale locale)", 0) if $locale eq $base_locale;
100
101    # The only other possibility is that we have a UTF-8 locale
102    return (" ($locale)", 1);
103}
104
105sub try_malforming($$$)
106{
107    # Determines if the tests for malformed UTF-8 should be done.  When done,
108    # the .xs code creates malformations by pretending the length is shorter
109    # than it actually is.  Some things can't be malformed, and sometimes this
110    # test knows that the current code doesn't look for a malformation under
111    # various circumstances.
112
113    my ($u, $function, $using_locale) = @_;
114    # $u is unicode code point;
115
116    # Single bytes can't be malformed
117    return 0 if $u < ((ord "A" == 65) ? 128 : 160);
118
119    # ASCII doesn't need to ever look beyond the first byte.
120    return 0 if $function eq "ASCII";
121
122    # Nor, on EBCDIC systems, does CNTRL
123    return 0 if ord "A" != 65 && $function eq "CNTRL";
124
125    # No controls above 255, so the code doesn't look at those
126    return 0 if $u > 255 && $function eq "CNTRL";
127
128    # No non-ASCII digits below 256, except if using locales.
129    return 0 if $u < 256 && ! $using_locale && $function =~ /X?DIGIT/;
130
131    return 1;
132}
133
134my %properties = (
135                   # name => Lookup-property name
136                   alnum => 'Word',
137                   wordchar => 'Word',
138                   alphanumeric => 'Alnum',
139                   alpha => 'XPosixAlpha',
140                   ascii => 'ASCII',
141                   blank => 'Blank',
142                   cntrl => 'Control',
143                   digit => 'Digit',
144                   graph => 'Graph',
145                   idfirst => '_Perl_IDStart',
146                   idcont => '_Perl_IDCont',
147                   lower => 'XPosixLower',
148                   print => 'Print',
149                   psxspc => 'XPosixSpace',
150                   punct => 'XPosixPunct',
151                   quotemeta => '_Perl_Quotemeta',
152                   space => 'XPerlSpace',
153                   vertws => 'VertSpace',
154                   upper => 'XPosixUpper',
155                   xdigit => 'XDigit',
156                );
157
158my %seen;
159my @warnings;
160local $SIG{__WARN__} = sub { push @warnings, @_ };
161
162my %utf8_param_code = (
163                        "_safe"                 =>  0,
164                        "_safe, malformed"      =>  1,
165                        "deprecated unsafe"     => -1,
166                        "deprecated mathoms"    => -2,
167                      );
168
169# This test is split into this number of files.
170my $num_test_files = $ENV{TEST_JOBS} || 1;
171$::TEST_CHUNK = 0 if $num_test_files == 1 && ! defined $::TEST_CHUNK;
172$num_test_files = 10 if $num_test_files > 10;
173
174my $property_count = -1;
175foreach my $name (sort keys %properties, 'octal') {
176
177    # We test every nth property in this run so that this test is split into
178    # smaller chunks to minimize test suite elapsed time when run in parallel.
179    $property_count++;
180    next if $property_count % $num_test_files != $::TEST_CHUNK;
181
182    my @invlist;
183    if ($name eq 'octal') {
184        # Hand-roll an inversion list with 0-7 in it and nothing else.
185        push @invlist, ord "0", ord "8";
186    }
187    else {
188        my $property = $properties{$name};
189        @invlist = prop_invlist($property, '_perl_core_internal_ok');
190        if (! @invlist) {
191
192            # An empty return could mean an unknown property, or merely that
193            # it is empty.  Call in scalar context to differentiate
194            if (! prop_invlist($property, '_perl_core_internal_ok')) {
195                fail("No inversion list found for $property");
196                next;
197            }
198        }
199    }
200
201    # Include all the Latin1 code points, plus 0x100.
202    my @code_points = (0 .. 256);
203
204    # Then include the next few boundaries above those from this property
205    my $above_latins = 0;
206    foreach my $range_start (@invlist) {
207        next if $range_start < 257;
208        push @code_points, $range_start - 1, $range_start;
209        $above_latins++;
210        last if $above_latins > 5;
211    }
212
213    # This makes sure we are using the Perl definition of idfirst and idcont,
214    # and not the Unicode.  There are a few differences.
215    push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
216    if ($name eq "idcont") {    # And some that are continuation but not start
217        push @code_points, ord("\N{GREEK ANO TELEIA}"),
218                           ord("\N{COMBINING GRAVE ACCENT}");
219    }
220
221    # And finally one non-Unicode code point.
222    push @code_points, 0x110000;    # Above Unicode, no prop should match
223    no warnings 'non_unicode';
224
225    for my $n (@code_points) {
226        my $u = utf8::native_to_unicode($n);
227        my $function = uc($name);
228
229        is (@warnings, 0, "Got no unexpected warnings in previous iteration")
230           or diag("@warnings");
231        undef @warnings;
232
233        my $matches = search_invlist(\@invlist, $n);
234        if (! defined $matches) {
235            $matches = 0;
236        }
237        else {
238            $matches = truth(! ($matches % 2));
239        }
240
241        my $ret;
242        my $char_name = get_charname($n);
243        my $display_name = sprintf "\\x{%02X, %s}", $n, $char_name;
244        my $display_call = "is${function}( $display_name )";
245
246        foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
247                            "_LC_uvchr", "_utf8", "_LC_utf8")
248        {
249
250            # Not all possible macros have been defined
251            if ($name eq 'vertws') {
252
253                # vertws is always all of Unicode
254                next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
255            }
256            elsif ($name eq 'alnum') {
257
258                # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
259                # suffixes were added later, after WORDCHAR was created to be
260                # a clearer synonym for ALNUM
261                next if    $suffix eq '_A'
262                        || $suffix eq '_L1'
263                        || $suffix eq '_uvchr';
264            }
265            elsif ($name eq 'octal') {
266                next if $suffix ne ""  && $suffix ne '_A' && $suffix ne '_L1';
267            }
268            elsif ($name eq 'quotemeta') {
269                # There is only one macro for this, and is defined only for
270                # Latin1 range
271                next if $suffix ne ""
272            }
273
274            foreach my $locale ("", $base_locale, $utf8_locale) {
275
276                my ($display_locale, $locale_is_utf8)
277                                = get_display_locale_or_skip($locale, $suffix);
278                next unless defined $display_locale;
279
280                use if $locale, "locale";
281                POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
282
283                if ($suffix !~ /utf8/) {    # _utf8 has to handled specially
284                    my $display_call
285                       = "is${function}$suffix( $display_name )$display_locale";
286                    $ret = truth eval "test_is${function}$suffix($n)";
287                    if (is ($@, "", "$display_call didn't give error")) {
288                        my $truth = $matches;
289                        if ($truth) {
290
291                            # The single byte functions are false for
292                            # above-Latin1
293                            if ($n >= 256) {
294                                $truth = 0
295                                        if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
296                            }
297                            elsif (   $u >= 128
298                                   && $name ne 'quotemeta')
299                            {
300
301                                # The no-suffix and _A functions are false
302                                # for non-ASCII.  So are  _LC  functions on a
303                                # non-UTF-8 locale
304                                $truth = 0 if    $suffix eq "_A"
305                                              || $suffix eq ""
306                                              || (     $suffix =~ /LC/
307                                                  && ! $locale_is_utf8);
308                            }
309                        }
310
311                        is ($ret, $truth, "${tab}And correctly returns $truth");
312                    }
313                }
314                else {  # _utf8 suffix
315                    my $char = chr($n);
316                    utf8::upgrade($char);
317                    $char = quotemeta $char if $char eq '\\' || $char eq "'";
318                    my $truth;
319                    if (   $suffix =~ /LC/
320                        && ! $locale_is_utf8
321                        && $n < 256
322                        && $u >= 128)
323                    {   # The C-locale _LC function returns FALSE for Latin1
324                        # above ASCII
325                        $truth = 0;
326                    }
327                    else {
328                        $truth = $matches;
329                    }
330
331                    foreach my $utf8_param("_safe",
332                                           "_safe, malformed",
333                                           "deprecated unsafe"
334                                          )
335                    {
336                        my $utf8_param_code = $utf8_param_code{$utf8_param};
337                        my $expect_error = $utf8_param_code > 0;
338                        next if      $expect_error
339                                && ! try_malforming($u, $function,
340                                                    $suffix =~ /LC/);
341
342                        my $display_call = "is${function}$suffix( $display_name"
343                                         . ", $utf8_param )$display_locale";
344                        $ret = truth eval "test_is${function}$suffix('$char',"
345                                        . " $utf8_param_code)";
346                        if ($expect_error) {
347                            isnt ($@, "",
348                                    "expected and got error in $display_call");
349                            like($@, qr/Malformed UTF-8 character/,
350                                "${tab}And got expected message");
351                            if (is (@warnings, 1,
352                                           "${tab}Got a single warning besides"))
353                            {
354                                like($warnings[0],
355                                     qr/Malformed UTF-8 character.*short/,
356                                     "${tab}Got expected warning");
357                            }
358                            else {
359                                diag("@warnings");
360                            }
361                            undef @warnings;
362                        }
363                        elsif (is ($@, "", "$display_call didn't give error")) {
364                            is ($ret, $truth,
365                                "${tab}And correctly returned $truth");
366                            if ($utf8_param_code < 0) {
367                                my $warnings_ok;
368                                my $unique_function = "is" . $function . $suffix;
369                                if (! $seen{$unique_function}++) {
370                                    $warnings_ok = is(@warnings, 1,
371                                        "${tab}This is first call to"
372                                      . " $unique_function; Got a single"
373                                      . " warning");
374                                    if ($warnings_ok) {
375                                        $warnings_ok = like($warnings[0],
376                qr/starting in Perl .* will require an additional parameter/,
377                                            "${tab}The warning was the expected"
378                                          . " deprecation one");
379                                    }
380                                }
381                                else {
382                                    $warnings_ok = is(@warnings, 0,
383                                        "${tab}This subsequent call to"
384                                      . " $unique_function did not warn");
385                                }
386                                $warnings_ok or diag("@warnings");
387                                undef @warnings;
388                            }
389                        }
390                    }
391                }
392            }
393        }
394    }
395}
396
397my %to_properties = (
398                FOLD  => 'Case_Folding',
399                LOWER => 'Lowercase_Mapping',
400                TITLE => 'Titlecase_Mapping',
401                UPPER => 'Uppercase_Mapping',
402            );
403
404$property_count = -1;
405foreach my $name (sort keys %to_properties) {
406
407    $property_count++;
408    next if $property_count % $num_test_files != $::TEST_CHUNK;
409
410    my $property = $to_properties{$name};
411    my ($list_ref, $map_ref, $format, $missing)
412                                      = prop_invmap($property, );
413    if (! $list_ref || ! $map_ref) {
414        fail("No inversion map found for $property");
415        next;
416    }
417    if ($format !~ / ^ a l? $ /x) {
418        fail("Unexpected inversion map format ('$format') found for $property");
419        next;
420    }
421
422    # Include all the Latin1 code points, plus 0x100.
423    my @code_points = (0 .. 256);
424
425    # Then include the next few multi-char folds above those from this
426    # property, and include the next few single folds as well
427    my $above_latins = 0;
428    my $multi_char = 0;
429    for my $i (0 .. @{$list_ref} - 1) {
430        my $range_start = $list_ref->[$i];
431        next if $range_start < 257;
432        if (ref $map_ref->[$i] && $multi_char < 5)  {
433            push @code_points, $range_start - 1
434                                        if $code_points[-1] != $range_start - 1;
435            push @code_points, $range_start;
436            $multi_char++;
437        }
438        elsif ($above_latins < 5) {
439            push @code_points, $range_start - 1
440                                        if $code_points[-1] != $range_start - 1;
441            push @code_points, $range_start;
442            $above_latins++;
443        }
444        last if $above_latins >= 5 && $multi_char >= 5;
445    }
446
447    # And finally one non-Unicode code point.
448    push @code_points, 0x110000;    # Above Unicode, no prop should match
449    no warnings 'non_unicode';
450
451    # $n is native; $u unicode.
452    for my $n (@code_points) {
453        my $u = utf8::native_to_unicode($n);
454        my $function = $name;
455
456        my $index = search_invlist(\@{$list_ref}, $n);
457
458        my $ret;
459        my $char_name = get_charname($n);
460        my $display_name = sprintf "\\N{U+%02X, %s}", $n, $char_name;
461
462        foreach my $suffix ("", "_L1", "_LC") {
463
464            # This is the only macro defined for L1
465            next if $suffix eq "_L1" && $function ne "LOWER";
466
467          SKIP:
468            foreach my $locale ("", $base_locale, $utf8_locale) {
469
470                # titlecase is not defined in locales.
471                next if $name eq 'TITLE' && $suffix eq "_LC";
472
473                my ($display_locale, $locale_is_utf8)
474                                = get_display_locale_or_skip($locale, $suffix);
475                next unless defined $display_locale;
476
477                skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
478                  . "$display_locale", 1)
479                            if  $u == 0xDF && $name =~ / FOLD | UPPER /x
480                             && $suffix eq "_LC" && $locale_is_utf8;
481
482                use if $locale, "locale";
483                POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
484
485                my $display_call = "to${function}$suffix("
486                                 . " $display_name )$display_locale";
487                $ret = eval "test_to${function}$suffix($n)";
488                if (is ($@, "", "$display_call didn't give error")) {
489                    my $should_be;
490                    if ($n > 255) {
491                        $should_be = $n;
492                    }
493                    elsif (     $u > 127
494                            && (   $suffix eq ""
495                                || ($suffix eq "_LC" && ! $locale_is_utf8)))
496                    {
497                        $should_be = $n;
498                    }
499                    elsif ($map_ref->[$index] != $missing) {
500                        $should_be = $map_ref->[$index] + $n - $list_ref->[$index]
501                    }
502                    else {
503                        $should_be = $n;
504                    }
505
506                    is ($ret, $should_be,
507                        sprintf("${tab}And correctly returned 0x%02X",
508                                                              $should_be));
509                }
510            }
511        }
512
513        # The _uni, uvchr, and _utf8 functions return both the ordinal of the
514        # first code point of the result, and the result in utf8.  The .xs
515        # tests return these in an array, in [0] and [1] respectively, with
516        # [2] the length of the utf8 in bytes.
517        my $utf8_should_be = "";
518        my $first_ord_should_be;
519        if (ref $map_ref->[$index]) {   # A multi-char result
520            for my $n (0 .. @{$map_ref->[$index]} - 1) {
521                $utf8_should_be .= chr $map_ref->[$index][$n];
522            }
523
524            $first_ord_should_be = $map_ref->[$index][0];
525        }
526        else {  # A single-char result
527            $first_ord_should_be = ($map_ref->[$index] != $missing)
528                                    ? $map_ref->[$index] + $n
529                                                         - $list_ref->[$index]
530                                    : $n;
531            $utf8_should_be = chr $first_ord_should_be;
532        }
533        utf8::upgrade($utf8_should_be);
534
535        # Test _uni, uvchr
536        foreach my $suffix ('_uni', '_uvchr') {
537            my $s;
538            my $len;
539            my $display_call = "to${function}$suffix( $display_name )";
540            $ret = eval "test_to${function}$suffix($n)";
541            if (is ($@, "", "$display_call didn't give error")) {
542                is ($ret->[0], $first_ord_should_be,
543                    sprintf("${tab}And correctly returned 0x%02X",
544                                                    $first_ord_should_be));
545                is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
546                use bytes;
547                is ($ret->[2], length $utf8_should_be,
548                    "${tab}Got correct number of bytes for utf8 length");
549            }
550        }
551
552        # Test _utf8
553        my $char = chr($n);
554        utf8::upgrade($char);
555        $char = quotemeta $char if $char eq '\\' || $char eq "'";
556        foreach my $utf8_param("_safe",
557                                "_safe, malformed",
558                                "deprecated unsafe",
559                                "deprecated mathoms",
560                                )
561        {
562            use Config;
563            next if    $utf8_param eq 'deprecated mathoms'
564                    && $Config{'ccflags'} =~ /-DNO_MATHOMS/;
565
566            my $utf8_param_code = $utf8_param_code{$utf8_param};
567            my $expect_error = $utf8_param_code > 0;
568
569            # Skip if can't malform (because is a UTF-8 invariant)
570            next if $expect_error && $u < ((ord "A" == 65) ? 128 : 160);
571
572            my $display_call = "to${function}_utf8($display_name, $utf8_param )";
573            $ret = eval   "test_to${function}_utf8('$char', $utf8_param_code)";
574            if ($expect_error) {
575                isnt ($@, "", "expected and got error in $display_call");
576                like($@, qr/Malformed UTF-8 character/,
577                     "${tab}And got expected message");
578                undef @warnings;
579            }
580            elsif (is ($@, "", "$display_call didn't give error")) {
581                is ($ret->[0], $first_ord_should_be,
582                    sprintf("${tab}And correctly returned 0x%02X",
583                                                    $first_ord_should_be));
584                is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
585                use bytes;
586                is ($ret->[2], length $utf8_should_be,
587                    "${tab}Got correct number of bytes for utf8 length");
588                if ($utf8_param_code < 0) {
589                    my $warnings_ok;
590                    if (! $seen{"${function}_utf8$utf8_param"}++) {
591                        $warnings_ok = is(@warnings, 1,
592                                                   "${tab}Got a single warning");
593                        if ($warnings_ok) {
594                            my $expected;
595                            if ($utf8_param_code == -2) {
596                                my $lc_func = lc $function;
597                                $expected
598                = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/;
599                            }
600                            else {
601                                $expected
602                = qr/starting in Perl .* will require an additional parameter/;
603                            }
604                            $warnings_ok = like($warnings[0], $expected,
605                                      "${tab}Got expected deprecation warning");
606                        }
607                    }
608                    else {
609                        $warnings_ok = is(@warnings, 0,
610                                  "${tab}Deprecation warned only the one time");
611                    }
612                    $warnings_ok or diag("@warnings");
613                    undef @warnings;
614                }
615            }
616        }
617    }
618}
619
620# This is primarily to make sure that no non-Unicode warnings get generated
621is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
622  or diag @warnings;
623
624done_testing;
625