1# Test the /a, /d, etc regex modifiers 2 3BEGIN { 4 chdir 't' if -d 't'; 5 require './test.pl'; 6 set_up_inc('../lib', '../dist/if'); 7 require './loc_tools.pl'; 8} 9 10use strict; 11use warnings; 12no warnings 'locale'; # Some /l tests use above-latin1 chars to make sure 13 # they work, even though they warn. 14use Config; 15 16plan('no_plan'); 17 18# Each case is a valid element of its hash key. Choose, where available, an 19# ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point. 20my %testcases = ( 21 '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ], # Below expects these to all be alpha 22 '\d' => [ ord("0"), 0x0662 ], 23 '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ], # Below expects these to be [:blank:] 24 '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ], 25 '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be 26 # [:print:] 27 '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ], 28 '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ], 29 '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ], 30 '[:xdigit:]' => [ ord("4"), 0xFF15 ], 31); 32 33$testcases{'[:digit:]'} = $testcases{'\d'}; 34$testcases{'[:alnum:]'} = $testcases{'\w'}; 35$testcases{'[:alpha:]'} = $testcases{'\w'}; 36$testcases{'[:blank:]'} = $testcases{'\s'}; 37$testcases{'[:print:]'} = $testcases{'[:graph:]'}; 38$testcases{'[:space:]'} = $testcases{'\s'}; 39$testcases{'[:word:]'} = $testcases{'\w'}; 40 41my $utf8_locale; 42 43my @charsets = qw(a d u aa); 44my $locales_ok = locales_enabled('LC_CTYPE'); 45if (! is_miniperl() && $locales_ok) { 46 require POSIX; 47 my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // ""; 48 if ($current_locale eq 'C') { 49 50 # test for d_setlocale is repeated here because this one is compile 51 # time, and the one above is run time 52 use if $Config{d_setlocale}, 'locale'; 53 54 # Some implementations don't have the 128-255 range characters all 55 # mean nothing under the C locale (an example being VMS). This is 56 # legal, but since we don't know what the right answers should be, 57 # skip the locale tests in that situation. 58 for my $i (128 .. 255) { 59 goto skip_adding_C_locale 60 if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/; 61 } 62 push @charsets, 'l'; 63 64 skip_adding_C_locale: 65 66 # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale 67 $utf8_locale = find_utf8_ctype_locale(); 68 push @charsets, 'L' if defined $utf8_locale; 69 } 70} 71 72# For each possible character set... 73foreach my $charset (@charsets) { 74 my $locale; 75 my $charset_mod = lc $charset; 76 my $charset_display; 77 if ($charset_mod eq 'l') { 78 $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l') 79 ? "C" 80 : $utf8_locale 81 ); 82 die "Couldn't change locale" unless $locale; 83 $charset_display = $charset_mod . " ($locale)"; 84 } 85 else { 86 $charset_display = $charset_mod; 87 } 88 89 # And in utf8 or not 90 foreach my $upgrade ("", 'utf8::upgrade($a); ') { 91 92 # reverse gets the, \w, \s, \d first. 93 for my $class (reverse sort keys %testcases) { 94 95 # The complement of \w is \W; of [:posix:] is [:^posix:] 96 my $complement = $class; 97 if ($complement !~ s/ ( \[: ) /$1^/x) { 98 $complement = uc($class); 99 } 100 101 # For each test case 102 foreach my $ord (@{$testcases{$class}}) { 103 my $char = chr($ord); 104 $char = ($char eq '$') ? '\$' : display($char); 105 106 # > 255 already implies upgraded. Skip the ones that don't 107 # have an explicit upgrade. This shows more clearly in the 108 # output which tests are in utf8, or not. 109 next if $ord > 255 && ! $upgrade; 110 111 my $reason = ""; # Explanation output with each test 112 my $neg_reason = ""; 113 my $match = 1; # Calculated whether test regex should 114 # match or not 115 116 # Everything always matches in ASCII, or under /u, or under /l 117 # with a UTF-8 locale 118 if (utf8::native_to_unicode($ord) < 128 119 || $charset eq 'u' 120 || $charset eq 'L') 121 { 122 $reason = "\"$char\" is a $class under /$charset_display"; 123 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 124 } 125 elsif ($charset eq "a" || $charset eq "aa") { 126 $match = 0; 127 $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display"; 128 $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display"; 129 } 130 elsif ($ord > 255) { 131 $reason = "\"$char\" is a $class under /$charset_display"; 132 $neg_reason = "\"$char\" is not a $complement under /$charset_display"; 133 } 134 elsif ($charset eq 'l') { 135 136 # We are using the C locale, which is essentially ASCII, 137 # but under utf8, the above-latin1 chars are treated as 138 # Unicode) 139 $reason = "\"$char\" is not a $class in the C locale under /$charset_mod"; 140 $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod"; 141 $match = 0; 142 } 143 elsif ($upgrade) { 144 $reason = "\"$char\" is a $class in utf8 under /$charset_display"; 145 $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display"; 146 } 147 else { 148 $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display"; 149 $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)"; 150 $match = 0; 151 } 152 $reason = "; $reason" if $reason; 153 $neg_reason = "; $neg_reason" if $neg_reason; 154 155 my $op; 156 my $neg_op; 157 if ($match) { 158 $op = '=~'; 159 $neg_op = '!~'; 160 } 161 else { 162 $op = '!~'; 163 $neg_op = '=~'; 164 } 165 166 # In [...] or not 167 foreach my $bracketed (0, 1) { 168 my $lb = ""; 169 my $rb = ""; 170 if ($bracketed) { 171 172 # Adds an extra char to the character class to make sure 173 # that the class doesn't get optimized away. 174 $lb = ($bracketed) ? '[_' : ""; 175 $rb = ($bracketed) ? ']' : ""; 176 } 177 else { # [:posix:] must be inside outer [ ] 178 next if $class =~ /\[/; 179 } 180 181 my $length = 10; # For regexec.c regrepeat() cases by 182 # matching more than one item 183 # Test both class and its complement, and with one or more 184 # than one item to match. 185 foreach my $eval ( 186 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x], 187 qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x], 188 ) { 189 ok (eval $eval, $eval . $reason); 190 } 191 foreach my $eval ( 192 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x], 193 qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x], 194 ) { 195 ok (eval $eval, $eval . $neg_reason); 196 } 197 } 198 199 next if $class ne '\w'; 200 201 # Test \b, \B at beginning and end of string 202 foreach my $eval ( 203 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x], 204 qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x], 205 ) { 206 ok (eval $eval, $eval . $reason); 207 } 208 foreach my $eval ( 209 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x], 210 qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x], 211 ) { 212 ok (eval $eval, $eval . $neg_reason); 213 } 214 215 # Test \b, \B adjacent to a non-word char, both before it and 216 # after. We test with ASCII, Latin1 and Unicode non-word chars 217 foreach my $space_ord (@{$testcases{'\s'}}) { 218 219 # Useless to try to test non-utf8 when the ord itself 220 # forces utf8 221 next if $space_ord > 255 && ! $upgrade; 222 223 my $space = display(chr $space_ord); 224 225 foreach my $eval ( 226 qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 227 qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x], 228 ) { 229 ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w"); 230 } 231 foreach my $eval ( 232 qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 233 qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x], 234 ) { 235 ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w"); 236 } 237 } 238 239 # Test \b, \B in the middle of two nominally word chars, but 240 # one or both may be considered non-word depending on range 241 # and charset. 242 foreach my $other_ord (@{$testcases{'\w'}}) { 243 next if $other_ord > 255 && ! $upgrade; 244 my $other = display(chr $other_ord); 245 246 # Determine if the other char is a word char in current 247 # circumstances 248 my $other_is_word = 1; 249 my $other_reason = "\"$other\" is a $class under /$charset_display"; 250 my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display"; 251 if (utf8::native_to_unicode($other_ord) > 127 252 && $charset ne 'u' && $charset ne 'L' 253 && (($charset eq "a" || $charset eq "aa") 254 || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade)))) 255 { 256 $other_is_word = 0; 257 $other_reason = "\"$other\" is not a $class under /$charset_display"; 258 $other_neg_reason = "\"$other\" is a $complement under /$charset_display"; 259 } 260 my $both_reason = $reason; 261 $both_reason .= "; $other_reason" if $other_ord != $ord; 262 my $both_neg_reason = $neg_reason; 263 $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord; 264 265 # If both are the same wordness, then \b will fail; \B 266 # succeed 267 if ($match == $other_is_word) { 268 $op = '!~'; 269 $neg_op = '=~'; 270 } 271 else { 272 $op = '=~'; 273 $neg_op = '!~'; 274 } 275 276 foreach my $eval ( 277 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x], 278 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x], 279 ) { 280 ok (eval $eval, $eval . $both_reason); 281 } 282 foreach my $eval ( 283 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x], 284 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x], 285 ) { 286 ok (eval $eval, $eval . $both_neg_reason); 287 } 288 289 next if $other_ord == $ord; 290 291 # These start with the \b or \B. They are included, based 292 # on source code analysis, to force the testing of the FBC 293 # (find_by_class) portions of regexec.c. 294 foreach my $eval ( 295 qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x], 296 qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x], 297 ) { 298 ok (eval $eval, $eval . $both_reason); 299 } 300 foreach my $eval ( 301 qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x], 302 qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x], 303 ) { 304 ok (eval $eval, $eval . $both_neg_reason); 305 } 306 } 307 } # End of each test case in a class 308 } # End of \w, \s, ... 309 } # End of utf8 upgraded or not 310} 311 312plan(curr_test() - 1); 313