xref: /openbsd/gnu/usr.bin/perl/t/re/charset.t (revision 5dea098c)
1# Test the /a, /d, etc regex modifiers
2
3BEGIN {
4    chdir 't' if -d 't';
5    require './test.pl';
6    set_up_inc('../lib', '../dist/if');
7    require './loc_tools.pl';
8}
9
10use strict;
11use warnings;
12no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
13                        # they work, even though they warn.
14use Config;
15
16plan('no_plan');
17
18# Each case is a valid element of its hash key.  Choose, where available, an
19# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
20my %testcases = (
21    '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ],   # Below expects these to all be alpha
22    '\d' => [ ord("0"), 0x0662 ],
23    '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ],  # Below expects these to be [:blank:]
24    '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
25    '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
26                                                                     # [:print:]
27    '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
28    '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
29    '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
30    '[:xdigit:]' => [ ord("4"), 0xFF15 ],
31);
32
33$testcases{'[:digit:]'} = $testcases{'\d'};
34$testcases{'[:alnum:]'} = $testcases{'\w'};
35$testcases{'[:alpha:]'} = $testcases{'\w'};
36$testcases{'[:blank:]'} = $testcases{'\s'};
37$testcases{'[:print:]'} = $testcases{'[:graph:]'};
38$testcases{'[:space:]'} = $testcases{'\s'};
39$testcases{'[:word:]'} = $testcases{'\w'};
40
41my $utf8_locale;
42
43my @charsets = qw(a d u aa);
44my $locales_ok = locales_enabled('LC_CTYPE');
45if (! is_miniperl() && $locales_ok) {
46    require POSIX;
47    my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
48    if ($current_locale eq 'C') {
49
50        # test for d_setlocale is repeated here because this one is compile
51        # time, and the one above is run time
52        use if $Config{d_setlocale}, 'locale';
53
54        # Some implementations don't have the 128-255 range characters all
55        # mean nothing under the C locale (an example being VMS).  This is
56        # legal, but since we don't know what the right answers should be,
57        # skip the locale tests in that situation.
58        for my $i (128 .. 255) {
59            goto skip_adding_C_locale
60                              if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
61        }
62        push @charsets, 'l';
63
64    skip_adding_C_locale:
65
66        # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
67        $utf8_locale = find_utf8_ctype_locale();
68        push @charsets, 'L' if defined $utf8_locale;
69    }
70}
71
72# For each possible character set...
73foreach my $charset (@charsets) {
74    my $locale;
75    my $charset_mod = lc $charset;
76    my $charset_display;
77    if ($charset_mod eq 'l') {
78        $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
79                                            ? "C"
80                                            : $utf8_locale
81                           );
82        die "Couldn't change locale" unless $locale;
83        $charset_display = $charset_mod . " ($locale)";
84    }
85    else {
86        $charset_display = $charset_mod;
87    }
88
89    # And in utf8 or not
90    foreach my $upgrade ("", 'utf8::upgrade($a); ') {
91
92        # reverse gets the, \w, \s, \d first.
93        for my $class (reverse sort keys %testcases) {
94
95            # The complement of \w is \W; of [:posix:] is [:^posix:]
96            my $complement = $class;
97            if ($complement !~ s/ ( \[: ) /$1^/x) {
98                $complement = uc($class);
99            }
100
101            # For each test case
102            foreach my $ord (@{$testcases{$class}}) {
103                my $char = chr($ord);
104                $char = ($char eq '$') ? '\$' : display($char);
105
106                # > 255 already implies upgraded.  Skip the ones that don't
107                # have an explicit upgrade.  This shows more clearly in the
108                # output which tests are in utf8, or not.
109                next if $ord > 255 && ! $upgrade;
110
111                my $reason = "";    # Explanation output with each test
112                my $neg_reason = "";
113                my $match = 1;      # Calculated whether test regex should
114                                    # match or not
115
116                # Everything always matches in ASCII, or under /u, or under /l
117                # with a UTF-8 locale
118                if (utf8::native_to_unicode($ord) < 128
119                    || $charset eq 'u'
120                    || $charset eq 'L')
121                {
122                    $reason = "\"$char\" is a $class under /$charset_display";
123                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
124                }
125                elsif ($charset eq "a" || $charset eq "aa") {
126                    $match = 0;
127                    $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
128                    $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
129                }
130                elsif ($ord > 255) {
131                    $reason = "\"$char\" is a $class under /$charset_display";
132                    $neg_reason = "\"$char\" is not a $complement under /$charset_display";
133                }
134                elsif ($charset eq 'l') {
135
136                    # We are using the C locale, which is essentially ASCII,
137                    # but under utf8, the above-latin1 chars are treated as
138                    # Unicode)
139                    $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
140                    $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
141                    $match = 0;
142                }
143                elsif ($upgrade) {
144                    $reason = "\"$char\" is a $class in utf8 under /$charset_display";
145                    $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
146                }
147                else {
148                    $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
149                    $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
150                    $match = 0;
151                }
152                $reason = "; $reason" if $reason;
153                $neg_reason = "; $neg_reason" if $neg_reason;
154
155                my $op;
156                my $neg_op;
157                if ($match) {
158                    $op = '=~';
159                    $neg_op = '!~';
160                }
161                else {
162                    $op = '!~';
163                    $neg_op = '=~';
164                }
165
166                # In [...] or not
167                foreach my $bracketed (0, 1) {
168                    my $lb = "";
169                    my $rb = "";
170                    if ($bracketed) {
171
172                        # Adds an extra char to the character class to make sure
173                        # that the class doesn't get optimized away.
174                        $lb = ($bracketed) ? '[_' : "";
175                        $rb = ($bracketed) ? ']' : "";
176                    }
177                    else {  # [:posix:] must be inside outer [ ]
178                        next if $class =~ /\[/;
179                    }
180
181                    my $length = 10;    # For regexec.c regrepeat() cases by
182                                        # matching more than one item
183                    # Test both class and its complement, and with one or more
184                    # than one item to match.
185                    foreach my $eval (
186                        qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
187                        qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
188                    ) {
189                        ok (eval $eval, $eval . $reason);
190                    }
191                    foreach my $eval (
192                        qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
193                        qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
194                    ) {
195                        ok (eval $eval, $eval . $neg_reason);
196                    }
197                }
198
199                next if $class ne '\w';
200
201                # Test \b, \B at beginning and end of string
202                foreach my $eval (
203                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
204                    qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
205                ) {
206                    ok (eval $eval, $eval . $reason);
207                }
208                foreach my $eval (
209                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
210                    qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
211                ) {
212                    ok (eval $eval, $eval . $neg_reason);
213                }
214
215                # Test \b, \B adjacent to a non-word char, both before it and
216                # after.  We test with ASCII, Latin1 and Unicode non-word chars
217                foreach my $space_ord (@{$testcases{'\s'}}) {
218
219                    # Useless to try to test non-utf8 when the ord itself
220                    # forces utf8
221                    next if $space_ord > 255 && ! $upgrade;
222
223                    my $space = display(chr $space_ord);
224
225                    foreach my $eval (
226                        qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
227                        qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
228                    ) {
229                        ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
230                    }
231                    foreach my $eval (
232                        qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
233                        qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
234                    ) {
235                        ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
236                    }
237                }
238
239                # Test \b, \B in the middle of two nominally word chars, but
240                # one or both may be considered non-word depending on range
241                # and charset.
242                foreach my $other_ord (@{$testcases{'\w'}}) {
243                    next if $other_ord > 255 && ! $upgrade;
244                    my $other = display(chr $other_ord);
245
246                    # Determine if the other char is a word char in current
247                    # circumstances
248                    my $other_is_word = 1;
249                    my $other_reason = "\"$other\" is a $class under /$charset_display";
250                    my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
251                    if (utf8::native_to_unicode($other_ord) > 127
252                        && $charset ne 'u' && $charset ne 'L'
253                        && (($charset eq "a" || $charset eq "aa")
254                            || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
255                    {
256                        $other_is_word = 0;
257                        $other_reason = "\"$other\" is not a $class under /$charset_display";
258                        $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
259                    }
260                    my $both_reason = $reason;
261                    $both_reason .= "; $other_reason" if $other_ord != $ord;
262                    my $both_neg_reason = $neg_reason;
263                    $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
264
265                    # If both are the same wordness, then \b will fail; \B
266                    # succeed
267                    if ($match == $other_is_word) {
268                        $op = '!~';
269                        $neg_op = '=~';
270                    }
271                    else {
272                        $op = '=~';
273                        $neg_op = '!~';
274                    }
275
276                    foreach my $eval (
277                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
278                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
279                    ) {
280                        ok (eval $eval, $eval . $both_reason);
281                    }
282                    foreach my $eval (
283                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
284                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
285                    ) {
286                        ok (eval $eval, $eval . $both_neg_reason);
287                    }
288
289                    next if $other_ord == $ord;
290
291                    # These start with the \b or \B.  They are included, based
292                    # on source code analysis, to force the testing of the FBC
293                    # (find_by_class) portions of regexec.c.
294                    foreach my $eval (
295                        qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
296                        qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
297                    ) {
298                        ok (eval $eval, $eval . $both_reason);
299                    }
300                    foreach my $eval (
301                        qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
302                        qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
303                    ) {
304                        ok (eval $eval, $eval . $both_neg_reason);
305                    }
306                }
307            } # End of each test case in a class
308        } # End of \w, \s, ...
309    } # End of utf8 upgraded or not
310}
311
312plan(curr_test() - 1);
313