1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# An optional sixth column is used to give a reason, only when skipping tests 7# 8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or 9# C<//>. Modifiers can be put after the closing delimiter. C<''> will 10# automatically be added to any other patterns. 11# 12# Column 2 contains the string to be matched. 13# 14# Column 3 contains the expected result: 15# y expect a match 16# n expect no match 17# c expect an error 18# T the test is a TODO (can be combined with y/n/c) 19# M skip test on miniperl (combine with y/n/c/T) 20# B test exposes a known bug in Perl, should be skipped 21# b test exposes a known bug in Perl, should be skipped if noamp 22# t test exposes a bug with threading, TODO if qr_embed_thr 23# s test should only be run for regex_sets_compat.t 24# S test should not be run for regex_sets_compat.t 25# a test should only be run on ASCII platforms 26# e test should only be run on EBCDIC platforms 27# 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 29# 30# Column 4 contains a string, usually C<$&>. 31# 32# Column 5 contains the expected result of double-quote 33# interpolating that string after the match, or start of error message. 34# 35# Column 6, if present, contains a reason why the test is skipped. 36# This is printed with "skipped", for harness to pick up. 37# 38# Column 7 can be used for comments 39# 40# \n in the tests are interpolated, as are variables of the form ${\w+}. 41# 42# Blanks lines are treated as PASSING tests to keep the line numbers 43# linked to the test number. 44# 45# If you want to add a regular expression test that can't be expressed 46# in this format, don't add it here: put it in re/pat.t instead. 47# 48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 49# This means this file cannot be used for testing anything that the lexer 50# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 51# 52# Note that columns 2,3 and 5 are all enclosed in double quotes and then 53# evalled; so something like a\"\x{100}$1 has length 3+length($1). 54# 55# \x... and \o{...} constants are automatically converted to the native 56# character set if necessary. \[0-7] constants aren't 57 58my ($file, $iters); 59BEGIN { 60 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 61 62 # Do this open before any chdir 63 $file = shift; 64 if (defined $file) { 65 open TESTS, $file or die "Can't open $file"; 66 } 67 68 chdir 't' if -d 't'; 69 @INC = qw '../lib ../ext/re'; 70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl 71 print("1..0 # Skip Unicode tables not built yet\n"), exit 72 unless eval 'require "unicore/UCD.pl"'; 73 } 74 75 # Some of the tests need a locale; which one doesn't much matter, except 76 # that it be valid. Make sure of that 77 eval { require POSIX; 78 POSIX->import(qw(LC_ALL setlocale)); 79 POSIX::setlocale(&LC_ALL, "C"); 80 }; 81} 82 83sub _comment { 84 return map { /^#/ ? "$_\n" : "# $_\n" } 85 map { split /\n/ } @_; 86} 87 88use strict; 89use warnings FATAL=>"all"; 90no warnings 'experimental::vlb'; 91our ($bang, $ffff, $nulnul); # used by the tests 92our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers 93 94if ($no_null && ! eval { require XS::APItest }) { 95 print("1..0 # Skip XS::APItest not available\n"), exit 96} 97 98my $expanded_text = "expanded name from original test number"; 99my $expanded_text_re = qr/$expanded_text/; 100 101if (!defined $file) { 102 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 103} 104 105my @tests = <TESTS>; 106 107close TESTS; 108 109my $test_num = 0; 110 111# Some scenarios add extra tests to those just read in. For those where there 112# is a character set translation, the added test will already have been 113# translated, so any test number beginning with this one shouldn't be 114# translated again. 115my $first_already_converted_test_num = @tests + 1; 116 117sub convert_from_ascii_guts { 118 my $string_ref = shift; 119 120 return if $test_num >= $first_already_converted_test_num; 121 122 #my $save = $string_ref; 123 # Convert \x{...}, \o{...} 124 $$string_ref =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; 125 $$string_ref =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; 126 127 # Convert \xAB 128 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; 129 130 # Convert \xA 131 $$string_ref =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; 132 133 #print STDERR __LINE__, ": $save\n$string_ref\n" if $save ne $string_ref; 134 return; 135} 136 137*convert_from_ascii = (ord("A") == 65) 138 ? sub { 1; } 139 : \&convert_from_ascii_guts; 140 141$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 142$ffff = chr(0xff) x 2; 143$nulnul = "\0" x 2; 144my $OP = $qr ? 'qr' : 'm'; 145 146$| = 1; 147$::normalize_pat = $::normalize_pat; # silence warning 148TEST: 149foreach (@tests) { 150 $test_num++; 151 if (!/\S/ || /^\s*#/ || /^__END__$/) { 152 chomp; 153 my ($not,$comment)= split /\s*#\s*/, $_, 2; 154 $comment ||= "(blank line)"; 155 print "ok $test_num # $comment\n"; 156 next; 157 } 158 chomp; 159 s/\\n/\n/g unless $regex_sets; 160 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); 161 $comment = "" unless defined $comment; 162 if (!defined $subject) { 163 die "Bad test definition on line $test_num: $_\n"; 164 } 165 $reason = '' unless defined $reason; 166 my $input = join(':',$pat,$subject,$result,$repl,$expect); 167 168 # the double '' below keeps simple syntax highlighters from going crazy 169 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 170 $pat =~ s/(\$\{\w+\})/$1/eeg; 171 $pat =~ s/\\n/\n/g unless $regex_sets; 172 convert_from_ascii(\$pat); 173 174 my $no_null_pat; 175 if ($no_null && $pat =~ /^'(.*)'\z/) { 176 $no_null_pat = XS::APItest::string_without_null($1); 177 } 178 179 convert_from_ascii(\$subject); 180 $subject = eval qq("$subject"); die $@ if $@; 181 182 convert_from_ascii(\$expect); 183 $expect = eval qq("$expect"); die $@ if $@; 184 my $has_amp = $input =~ /\$[&\`\']/; 185 $expect = $repl = '-' if $skip_amp and $has_amp; 186 187 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 188 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 189 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 190 191 if ($::normalize_pat) { 192 my $opat= $pat; 193 # Convert (x)? to (?:(x)|) and (x)+ to (?:(x))+ and (x)* to (?:(x))* 194 $pat =~ s/\(([\w|.]+)\)\?(?![+*?])/(?:($1)|)/g; 195 $pat =~ s/\(([\w|.]+)\)([+*])(?![+*?])/(?:($1))$2/g; 196 if ($opat eq $pat) { 197 # we didn't change anything, no point in testing it again. 198 $skip++; 199 $reason = "Test not valid for $0"; 200 } elsif ($comment=~/!\s*normal/) { 201 $result .= "T"; 202 $comment = "# Known to be broken under $0"; 203 } 204 } 205 206 if ($result =~ s/ ( [Ss] ) //x) { 207 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 208 $skip++; 209 $reason = "Test not valid for $0"; 210 } 211 } 212 if ($result =~ s/a// && ord("A") != 65) { 213 $skip++; 214 $reason = "Test is only valid for ASCII platforms. $reason"; 215 } 216 if ($result =~ s/e// && ord("A") != 193) { 217 $skip++; 218 $reason = "Test is only valid for EBCDIC platforms. $reason"; 219 } 220 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 221 $result =~ s/B//i unless $skip; 222 my $todo= ($result =~ s/T// && (!$skip_amp || !$has_amp)) ? " # TODO" : ""; 223 my $testname= $test_num; 224 if ($comment) { 225 $comment=~s/^\s*(?:#\s*)?//; 226 $testname .= " - $comment" if $comment; 227 } 228 if (! $skip && $alpha_assertions) { 229 my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x; 230 if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) { 231 $skip++; 232 $reason = "Pattern doesn't contain assertions"; 233 } 234 elsif ($comment !~ $expanded_text_re) { 235 my $expanded_pat = $pat; 236 237 $pat =~ s/\( \? > /(*atomic:/xg; 238 239 if ($pat =~ s/\( \? = /(*pla:/xg) { 240 $expanded_pat =~ s//(*positive_lookahead:/g; 241 } 242 if ($pat =~ s/\( \? ! /(*nla:/xg) { 243 $expanded_pat =~ s//(*negative_lookahead:/g; 244 } 245 if ($pat =~ s/\( \? <= /(*plb:/xg) { 246 $expanded_pat =~ s//(*positive_lookbehind:/g; 247 } 248 if ($pat =~ s/\( \? <! /(*nlb:/xg) { 249 $expanded_pat =~ s//(*negative_lookbehind:/g; 250 } 251 if ($expanded_pat ne $pat) { 252 $comment .= " $expanded_text $test_num"; 253 push @tests, join "\t", $expanded_pat, 254 $subject // "", 255 $result // "", 256 $repl // "", 257 $expect // "", 258 $reason // "", 259 $comment; 260 } 261 } 262 } 263 elsif (! $skip && $regex_sets) { 264 265 # If testing regex sets, change the [bracketed] classes into 266 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a 267 # class. (We don't bother looking for an odd number of backslashes, 268 # as this hasn't been needed so far.) 269 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { 270 $skip++; 271 $reason = "Pattern doesn't contain [brackets]"; 272 } 273 else { # Use non-regex features of Perl to accomplish this. 274 my $modified = ""; 275 my $in_brackets = 0; 276 277 # Go through the pattern character-by-character. We also add 278 # blanks around each token to test the /x parts of (?[ ]) 279 my $pat_len = length($pat); 280 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 281 my $curchar = substr($pat, $i, 1); 282 if ($curchar eq '\\') { 283 $modified .= " " if $in_brackets; 284 $modified .= $curchar; 285 $i++; 286 287 # Get the character the backslash is escaping 288 $curchar = substr($pat, $i, 1); 289 $modified .= $curchar; 290 291 # If the character following that is a '{}', treat the 292 # entire amount as a single token 293 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 294 my $j = index($pat, '}', $i+2); 295 if ($j < 0) { 296 last unless $in_brackets; 297 if ($result eq 'c') { 298 $skip++; 299 $reason = "Can't handle compilation errors with unmatched '{'"; 300 } 301 else { 302 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 303 next TEST; 304 } 305 } 306 $modified .= substr($pat, $i+1, $j - $i); 307 $i = $j; 308 } 309 elsif ($curchar eq 'x') { 310 311 # \x without brackets is supposed to be followed by 2 312 # hex digits. Take up to 2, and then add a blank 313 # after the last one. This avoids getting errors from 314 # (?[ ]) for run-ons, like \xabc 315 my $j = $i + 1; 316 for (; $j < $i + 3 && $j < $pat_len; $j++) { 317 my $curord = ord(substr($pat, $j, 1)); 318 if (!(($curord >= ord("A") && $curord <= ord("F")) 319 || ($curord >= ord("a") && $curord <= ord("f")) 320 || ($curord >= ord("0") && $curord <= ord("9")))) 321 { 322 $j++; 323 last; 324 } 325 } 326 $j--; 327 $modified .= substr($pat, $i + 1, $j - $i); 328 $modified .= " " if $in_brackets; 329 $i = $j; 330 } 331 elsif (ord($curchar) >= ord('0') 332 && (ord($curchar) <= ord('7'))) 333 { 334 # Similarly, octal constants have up to 3 digits. 335 my $j = $i + 1; 336 for (; $j < $i + 3 && $j < $pat_len; $j++) { 337 my $curord = ord(substr($pat, $j, 1)); 338 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 339 $j++; 340 last; 341 } 342 } 343 $j--; 344 $modified .= substr($pat, $i + 1, $j - $i); 345 $i = $j; 346 } 347 348 next; 349 } # End of processing a backslash sequence 350 351 if (! $in_brackets # Skip (?{ }) 352 && $curchar eq '(' 353 && $i < $pat_len - 2 354 && substr($pat, $i+1, 1) eq '?' 355 && substr($pat, $i+2, 1) eq '{') 356 { 357 $skip++; 358 $reason = "Pattern contains '(?{'"; 359 last; 360 } 361 362 # Closing ']' 363 if ($curchar eq ']' && $in_brackets) { 364 $modified .= " ] ])"; 365 $in_brackets = 0; 366 next; 367 } 368 369 # A regular character. 370 if ($curchar ne '[') { 371 $modified .= " " if $in_brackets; 372 $modified .= $curchar; 373 next; 374 } 375 376 # Here is a '['; If not in a bracketed class, treat as the 377 # beginning of one. 378 if (! $in_brackets) { 379 $in_brackets = 1; 380 $modified .= "(?[ [ "; 381 382 # An immediately following ']' or '^]' is not the ending 383 # of the class, but is to be treated literally. 384 if ($i < $pat_len - 1 385 && substr($pat, $i+1, 1) eq ']') 386 { 387 $i ++; 388 $modified .= " ] "; 389 } 390 elsif ($i < $pat_len - 2 391 && substr($pat, $i+1, 1) eq '^' 392 && substr($pat, $i+2, 1) eq ']') 393 { 394 $i += 2; 395 $modified .= " ^ ] "; 396 } 397 next; 398 } 399 400 # Here is a plain '[' within [ ]. Could mean wants to 401 # match a '[', or it could be a posix class that has a 402 # corresponding ']'. Absorb either 403 404 $modified .= ' ['; 405 last if $i >= $pat_len - 1; 406 407 $i++; 408 $curchar = substr($pat, $i, 1); 409 if ($curchar =~ /[:=.]/) { 410 for (my $j = $i + 1; $j < $pat_len; $j++) { 411 next unless substr($pat, $j, 1) eq ']'; 412 last if $j - $i < 2; 413 if (substr($pat, $j - 1, 1) eq $curchar) { 414 # Here, is a posix class 415 $modified .= substr($pat, $i, $j - $i + 1) . " "; 416 $i = $j; 417 next CHAR; 418 } 419 } 420 } 421 422 # Here wasn't a posix class, just process normally 423 $modified .= " $curchar "; 424 } 425 426 if ($in_brackets && ! $skip) { 427 if ($result eq 'c') { 428 $skip++; 429 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 430 } 431 else { 432 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 433 next TEST; 434 } 435 } 436 437 # Use our modified pattern instead of the original 438 $pat = $modified; 439 } 440 } 441 if ($::normalize_pat){ 442 if (!$skip && ($result eq "y" or $result eq "n")) { 443 my $opat= $pat; 444 # Convert (x)? to (?:(x)|) and (x)+ to (?:(x))+ and (x)* to (?:(x))* 445 $pat =~ s/\(([\w|.]+)\)\?(?![+*?])/(?:($1)|)/g; 446 $pat =~ s/\(([\w|.]+)\)([+*])(?![+*?])/(?:($1))$2/g; 447 # inject an EVAL into the front of the pattern. 448 # this should disable all optimizations. 449 $pat =~ s/\A(.)/$1(?{ \$the_counter++ })/ 450 or die $pat; 451 } elsif (!$skip) { 452 $skip = $reason = "Test not applicable to $0"; 453 } 454 } 455 456 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);', 457 'utf8::upgrade($subject); study $subject;') { 458 if ( $skip ) { 459 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; 460 next TEST; 461 } 462 our $the_counter = 0; # used in normalization tests 463 # Need to make a copy, else the utf8::upgrade of an already studied 464 # scalar confuses things. 465 my $subject = $subject; 466 $subject = XS::APItest::string_without_null($subject) if $no_null; 467 my $c = $iters; 468 my ($code, $match, $got); 469 if ($repl eq 'pos') { 470 my $patcode = defined $no_null_pat ? '/$no_null_pat/g' 471 : "m${pat}g"; 472 $code= <<EOFCODE; 473 $study 474 pos(\$subject)=0; 475 \$match = ( \$subject =~ $patcode ); 476 \$got = pos(\$subject); 477EOFCODE 478 } 479 elsif ($qr_embed) { 480 $code= <<EOFCODE; 481 my \$RE = qr$pat; 482 $study 483 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 484 \$got = "$repl"; 485EOFCODE 486 } 487 elsif ($qr_embed_thr) { 488 $code= <<EOFCODE; 489 # Can't run the match in a subthread, but can do this and 490 # clone the pattern the other way. 491 my \$RE = threads->new(sub {qr$pat})->join(); 492 $study 493 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 494 \$got = "$repl"; 495EOFCODE 496 } 497 elsif ($no_null) { 498 my $patcode = defined $no_null_pat ? '/$no_null_pat/' 499 : $pat; 500 $code= <<EOFCODE; 501 $study 502 \$match = (\$subject =~ $OP$pat) while \$c--; 503 \$got = "$repl"; 504EOFCODE 505 } 506 else { 507 $code= <<EOFCODE; 508 $study 509 \$match = (\$subject =~ $OP$pat) while \$c--; 510 \$got = "$repl"; 511EOFCODE 512 } 513 $code = "$code" if $regex_sets; 514 #$code.=qq[\n\$expect="$expect";\n]; 515 #use Devel::Peek; 516 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 517 { 518 # Probably we should annotate specific tests with which warnings 519 # categories they're known to trigger, and hence should be 520 # disabled just for that test 521 no warnings qw(uninitialized regexp deprecated); 522 eval $code; 523 } 524 chomp( my $err = $@ ); 525 if ($result eq 'c') { 526 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } 527 last; # no need to study a syntax error 528 } 529 elsif ( $todo_qr ) { 530 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; 531 next TEST; 532 } 533 elsif ($@) { 534 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 535 } 536 elsif ($result =~ /^n/) { 537 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } 538 } 539 else { 540 if (!$match || $got ne $expect) { 541 eval { require Data::Dumper }; 542 no warnings "utf8"; # But handle should be utf8 543 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 544 # Data::Dumper will load on miniperl, but fail when used in 545 # anger as it tries to load B. I'd prefer to keep the 546 # regular calls below outside of an eval so that real 547 # (unknown) failures get spotted, not ignored. 548 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 549 } 550 else { # better diagnostics 551 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 552 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 553 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump; 554 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n"); 555 } 556 next TEST; 557 } 558 } 559 } 560 print "ok $testname$todo\n"; 561} 562 563printf "1..%d\n# $iters iterations\n", scalar @tests; 564 5651; 566