1#!./perl 2 3# The tests are in a separate file 't/re/re_tests'. 4# Each line in that file is a separate test. 5# There are five columns, separated by tabs. 6# An optional sixth column is used to give a reason, only when skipping tests 7# 8# Column 1 contains the pattern, optionally enclosed in C<''> C<::> or 9# C<//>. Modifiers can be put after the closing delimiter. C<''> will 10# automatically be added to any other patterns. 11# 12# Column 2 contains the string to be matched. 13# 14# Column 3 contains the expected result: 15# y expect a match 16# n expect no match 17# c expect an error 18# T the test is a TODO (can be combined with y/n/c) 19# M skip test on miniperl (combine with y/n/c/T) 20# B test exposes a known bug in Perl, should be skipped 21# b test exposes a known bug in Perl, should be skipped if noamp 22# t test exposes a bug with threading, TODO if qr_embed_thr 23# s test should only be run for regex_sets_compat.t 24# S test should not be run for regex_sets_compat.t 25# a test should only be run on ASCII platforms 26# e test should only be run on EBCDIC platforms 27# 28# Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 29# 30# Column 4 contains a string, usually C<$&>. 31# 32# Column 5 contains the expected result of double-quote 33# interpolating that string after the match, or start of error message. 34# 35# Column 6, if present, contains a reason why the test is skipped. 36# This is printed with "skipped", for harness to pick up. 37# 38# Column 7 can be used for comments 39# 40# \n in the tests are interpolated, as are variables of the form ${\w+}. 41# 42# Blanks lines are treated as PASSING tests to keep the line numbers 43# linked to the test number. 44# 45# If you want to add a regular expression test that can't be expressed 46# in this format, don't add it here: put it in re/pat.t instead. 47# 48# Note that the inputs get passed on as "m're'", so the re bypasses the lexer. 49# This means this file cannot be used for testing anything that the lexer 50# handles; in 5.12 this means just \N{NAME} and \N{U+...}. 51# 52# Note that columns 2,3 and 5 are all enclosed in double quotes and then 53# evalled; so something like a\"\x{100}$1 has length 3+length($1). 54# 55# \x... and \o{...} constants are automatically converted to the native 56# character set if necessary. \[0-7] constants aren't 57 58my ($file, $iters); 59BEGIN { 60 $iters = shift || 1; # Poor man performance suite, 10000 is OK. 61 62 # Do this open before any chdir 63 $file = shift; 64 if (defined $file) { 65 open TESTS, $file or die "Can't open $file"; 66 } 67 68 chdir 't' if -d 't'; 69 @INC = qw '../lib ../ext/re'; 70 if (!defined &DynaLoader::boot_DynaLoader) { # miniperl 71 print("1..0 # Skip Unicode tables not built yet\n"), exit 72 unless eval 'require "unicore/Heavy.pl"'; 73 } 74 75 # Some of the tests need a locale; which one doesn't much matter, except 76 # that it be valid. Make sure of that 77 eval { require POSIX; 78 POSIX->import(qw(LC_ALL setlocale)); 79 POSIX::setlocale(&LC_ALL, "C"); 80 }; 81} 82 83sub _comment { 84 return map { /^#/ ? "$_\n" : "# $_\n" } 85 map { split /\n/ } @_; 86} 87 88sub convert_from_ascii { 89 my $string = shift; 90 91 #my $save = $string; 92 # Convert \x{...}, \o{...} 93 $string =~ s/ (?<! \\ ) \\x\{ ( .*? ) } / "\\x{" . sprintf("%X", utf8::unicode_to_native(hex $1)) . "}" /gex; 94 $string =~ s/ (?<! \\ ) \\o\{ ( .*? ) } / "\\o{" . sprintf("%o", utf8::unicode_to_native(oct $1)) . "}" /gex; 95 96 # Convert \xAB 97 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9]{2} ) / "\\x" . sprintf("%02X", utf8::unicode_to_native(hex $1)) /gex; 98 99 # Convert \xA 100 $string =~ s/ (?<! \\ ) \\x ( [A-Fa-f0-9] ) (?! [A-Fa-f0-9] ) / "\\x" . sprintf("%X", utf8::unicode_to_native(hex $1)) /gex; 101 102 #print STDERR __LINE__, ": $save\n$string\n" if $save ne $string; 103 return $string; 104} 105 106use strict; 107use warnings FATAL=>"all"; 108no warnings 'experimental::vlb'; 109our ($bang, $ffff, $nulnul); # used by the tests 110our ($qr, $skip_amp, $qr_embed, $qr_embed_thr, $regex_sets, $alpha_assertions, $no_null); # set by our callers 111 112my $expanded_text = "expanded name from original test number"; 113my $expanded_text_re = qr/$expanded_text/; 114 115if (!defined $file) { 116 open TESTS, 're/re_tests' or die "Can't open re/re_tests: $!"; 117} 118 119my @tests = <TESTS>; 120 121close TESTS; 122 123$bang = sprintf "\\%03o", ord "!"; # \41 would not be portable. 124$ffff = chr(0xff) x 2; 125$nulnul = "\0" x 2; 126my $OP = $qr ? 'qr' : 'm'; 127 128$| = 1; 129 130my $test; 131TEST: 132foreach (@tests) { 133 $test++; 134 if (!/\S/ || /^\s*#/ || /^__END__$/) { 135 chomp; 136 my ($not,$comment)= split /\s*#\s*/, $_, 2; 137 $comment ||= "(blank line)"; 138 print "ok $test # $comment\n"; 139 next; 140 } 141 chomp; 142 s/\\n/\n/g unless $regex_sets; 143 my ($pat, $subject, $result, $repl, $expect, $reason, $comment) = split(/\t/,$_,7); 144 $comment = "" unless defined $comment; 145 if (!defined $subject) { 146 die "Bad test definition on line $test: $_\n"; 147 } 148 $reason = '' unless defined $reason; 149 my $input = join(':',$pat,$subject,$result,$repl,$expect); 150 151 # the double '' below keeps simple syntax highlighters from going crazy 152 $pat = "'$pat'" unless $pat =~ /^[:''\/]/; 153 $pat =~ s/(\$\{\w+\})/$1/eeg; 154 $pat =~ s/\\n/\n/g unless $regex_sets; 155 $pat = convert_from_ascii($pat) if ord("A") != 65; 156 157 my $no_null_pat; 158 if ($no_null && $pat =~ /^'(.*)'\z/) { 159 $no_null_pat = XS::APItest::string_without_null($1); 160 } 161 162 $subject = convert_from_ascii($subject) if ord("A") != 65; 163 $subject = eval qq("$subject"); die $@ if $@; 164 165 $expect = convert_from_ascii($expect) if ord("A") != 65; 166 $expect = eval qq("$expect"); die $@ if $@; 167 $expect = $repl = '-' if $skip_amp and $input =~ /\$[&\`\']/; 168 169 my $todo_qr = $qr_embed_thr && ($result =~ s/t//); 170 my $skip = ($skip_amp ? ($result =~ s/B//i) : ($result =~ s/B//)); 171 ++$skip if $result =~ s/M// && !defined &DynaLoader::boot_DynaLoader; 172 if ($result =~ s/ ( [Ss] ) //x) { 173 if (($1 eq 'S' && $regex_sets) || ($1 eq 's' && ! $regex_sets)) { 174 $skip++; 175 $reason = "Test not valid for $0"; 176 } 177 } 178 if ($result =~ s/a// && ord("A") != 65) { 179 $skip++; 180 $reason = "Test is only valid for ASCII platforms. $reason"; 181 } 182 if ($result =~ s/e// && ord("A") != 193) { 183 $skip++; 184 $reason = "Test is only valid for EBCDIC platforms. $reason"; 185 } 186 $reason = 'skipping $&' if $reason eq '' && $skip_amp; 187 $result =~ s/B//i unless $skip; 188 my $todo= $result =~ s/T// ? " # TODO" : ""; 189 my $testname= $test; 190 if ($comment) { 191 $comment=~s/^\s*(?:#\s*)?//; 192 $testname .= " - $comment" if $comment; 193 } 194 if (! $skip && $alpha_assertions) { 195 my $assertions_re = qr/ (?: \Q(?\E (?: > | <? [=>] ) ) /x; 196 if ($pat !~ $assertions_re && $comment !~ $expanded_text_re) { 197 $skip++; 198 $reason = "Pattern doesn't contain assertions"; 199 } 200 elsif ($comment !~ $expanded_text_re) { 201 my $expanded_pat = $pat; 202 203 $pat =~ s/\( \? > /(*atomic:/xg; 204 205 if ($pat =~ s/\( \? = /(*pla:/xg) { 206 $expanded_pat =~ s//(*positive_lookahead:/g; 207 } 208 if ($pat =~ s/\( \? ! /(*nla:/xg) { 209 $expanded_pat =~ s//(*negative_lookahead:/g; 210 } 211 if ($pat =~ s/\( \? <= /(*plb:/xg) { 212 $expanded_pat =~ s//(*positive_lookbehind:/g; 213 } 214 if ($pat =~ s/\( \? <! /(*nlb:/xg) { 215 $expanded_pat =~ s//(*negative_lookbehind:/g; 216 } 217 if ($expanded_pat ne $pat) { 218 $comment .= " $expanded_text $test"; 219 push @tests, join "\t", $expanded_pat, 220 $subject // "", 221 $result // "", 222 $repl // "", 223 $expect // "", 224 $reason // "", 225 $comment; 226 } 227 } 228 } 229 elsif (! $skip && $regex_sets) { 230 231 # If testing regex sets, change the [bracketed] classes into 232 # (?[bracketed]). But note that '\[' and '\c[' don't introduce such a 233 # class. (We don't bother looking for an odd number of backslashes, 234 # as this hasn't been needed so far.) 235 if ($pat !~ / (?<!\\c) (?<!\\) \[ /x) { 236 $skip++; 237 $reason = "Pattern doesn't contain [brackets]"; 238 } 239 else { # Use non-regex features of Perl to accomplish this. 240 my $modified = ""; 241 my $in_brackets = 0; 242 243 # Go through the pattern character-by-character. We also add 244 # blanks around each token to test the /x parts of (?[ ]) 245 my $pat_len = length($pat); 246 CHAR: for (my $i = 0; $i < $pat_len; $i++) { 247 my $curchar = substr($pat, $i, 1); 248 if ($curchar eq '\\') { 249 $modified .= " " if $in_brackets; 250 $modified .= $curchar; 251 $i++; 252 253 # Get the character the backslash is escaping 254 $curchar = substr($pat, $i, 1); 255 $modified .= $curchar; 256 257 # If the character following that is a '{}', treat the 258 # entire amount as a single token 259 if ($i < $pat_len -1 && substr($pat, $i+1, 1) eq '{') { 260 my $j = index($pat, '}', $i+2); 261 if ($j < 0) { 262 last unless $in_brackets; 263 if ($result eq 'c') { 264 $skip++; 265 $reason = "Can't handle compilation errors with unmatched '{'"; 266 } 267 else { 268 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 269 next TEST; 270 } 271 } 272 $modified .= substr($pat, $i+1, $j - $i); 273 $i = $j; 274 } 275 elsif ($curchar eq 'x') { 276 277 # \x without brackets is supposed to be followed by 2 278 # hex digits. Take up to 2, and then add a blank 279 # after the last one. This avoids getting errors from 280 # (?[ ]) for run-ons, like \xabc 281 my $j = $i + 1; 282 for (; $j < $i + 3 && $j < $pat_len; $j++) { 283 my $curord = ord(substr($pat, $j, 1)); 284 if (!(($curord >= ord("A") && $curord <= ord("F")) 285 || ($curord >= ord("a") && $curord <= ord("f")) 286 || ($curord >= ord("0") && $curord <= ord("9")))) 287 { 288 $j++; 289 last; 290 } 291 } 292 $j--; 293 $modified .= substr($pat, $i + 1, $j - $i); 294 $modified .= " " if $in_brackets; 295 $i = $j; 296 } 297 elsif (ord($curchar) >= ord('0') 298 && (ord($curchar) <= ord('7'))) 299 { 300 # Similarly, octal constants have up to 3 digits. 301 my $j = $i + 1; 302 for (; $j < $i + 3 && $j < $pat_len; $j++) { 303 my $curord = ord(substr($pat, $j, 1)); 304 if (! ($curord >= ord("0") && $curord <= ord("7"))) { 305 $j++; 306 last; 307 } 308 } 309 $j--; 310 $modified .= substr($pat, $i + 1, $j - $i); 311 $i = $j; 312 } 313 314 next; 315 } # End of processing a backslash sequence 316 317 if (! $in_brackets # Skip (?{ }) 318 && $curchar eq '(' 319 && $i < $pat_len - 2 320 && substr($pat, $i+1, 1) eq '?' 321 && substr($pat, $i+2, 1) eq '{') 322 { 323 $skip++; 324 $reason = "Pattern contains '(?{'"; 325 last; 326 } 327 328 # Closing ']' 329 if ($curchar eq ']' && $in_brackets) { 330 $modified .= " ] ])"; 331 $in_brackets = 0; 332 next; 333 } 334 335 # A regular character. 336 if ($curchar ne '[') { 337 $modified .= " " if $in_brackets; 338 $modified .= $curchar; 339 next; 340 } 341 342 # Here is a '['; If not in a bracketed class, treat as the 343 # beginning of one. 344 if (! $in_brackets) { 345 $in_brackets = 1; 346 $modified .= "(?[ [ "; 347 348 # An immediately following ']' or '^]' is not the ending 349 # of the class, but is to be treated literally. 350 if ($i < $pat_len - 1 351 && substr($pat, $i+1, 1) eq ']') 352 { 353 $i ++; 354 $modified .= " ] "; 355 } 356 elsif ($i < $pat_len - 2 357 && substr($pat, $i+1, 1) eq '^' 358 && substr($pat, $i+2, 1) eq ']') 359 { 360 $i += 2; 361 $modified .= " ^ ] "; 362 } 363 next; 364 } 365 366 # Here is a plain '[' within [ ]. Could mean wants to 367 # match a '[', or it could be a posix class that has a 368 # corresponding ']'. Absorb either 369 370 $modified .= ' ['; 371 last if $i >= $pat_len - 1; 372 373 $i++; 374 $curchar = substr($pat, $i, 1); 375 if ($curchar =~ /[:=.]/) { 376 for (my $j = $i + 1; $j < $pat_len; $j++) { 377 next unless substr($pat, $j, 1) eq ']'; 378 last if $j - $i < 2; 379 if (substr($pat, $j - 1, 1) eq $curchar) { 380 # Here, is a posix class 381 $modified .= substr($pat, $i, $j - $i + 1) . " "; 382 $i = $j; 383 next CHAR; 384 } 385 } 386 } 387 388 # Here wasn't a posix class, just process normally 389 $modified .= " $curchar "; 390 } 391 392 if ($in_brackets && ! $skip) { 393 if ($result eq 'c') { 394 $skip++; 395 $reason = "Can't figure out where to put the (?[ and ]) since is a compilation error"; 396 } 397 else { 398 print "not ok $testname # Problem in $0; original = '$pat'; mod = '$modified'\n"; 399 next TEST; 400 } 401 } 402 403 # Use our modified pattern instead of the original 404 $pat = $modified; 405 } 406 } 407 408 for my $study ('', 'study $subject;', 'utf8::upgrade($subject);', 409 'utf8::upgrade($subject); study $subject;') { 410 # Need to make a copy, else the utf8::upgrade of an already studied 411 # scalar confuses things. 412 my $subject = $subject; 413 $subject = XS::APItest::string_without_null($subject) if $no_null; 414 my $c = $iters; 415 my ($code, $match, $got); 416 if ($repl eq 'pos') { 417 my $patcode = defined $no_null_pat ? '/$no_null_pat/g' 418 : "m${pat}g"; 419 $code= <<EOFCODE; 420 $study 421 pos(\$subject)=0; 422 \$match = ( \$subject =~ $patcode ); 423 \$got = pos(\$subject); 424EOFCODE 425 } 426 elsif ($qr_embed) { 427 $code= <<EOFCODE; 428 my \$RE = qr$pat; 429 $study 430 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 431 \$got = "$repl"; 432EOFCODE 433 } 434 elsif ($qr_embed_thr) { 435 $code= <<EOFCODE; 436 # Can't run the match in a subthread, but can do this and 437 # clone the pattern the other way. 438 my \$RE = threads->new(sub {qr$pat})->join(); 439 $study 440 \$match = (\$subject =~ /(?:)\$RE(?:)/) while \$c--; 441 \$got = "$repl"; 442EOFCODE 443 } 444 elsif ($no_null) { 445 my $patcode = defined $no_null_pat ? '/$no_null_pat/' 446 : $pat; 447 $code= <<EOFCODE; 448 $study 449 \$match = (\$subject =~ $OP$pat) while \$c--; 450 \$got = "$repl"; 451EOFCODE 452 } 453 else { 454 $code= <<EOFCODE; 455 $study 456 \$match = (\$subject =~ $OP$pat) while \$c--; 457 \$got = "$repl"; 458EOFCODE 459 } 460 $code = "no warnings 'experimental::regex_sets';$code" if $regex_sets; 461 $code = "no warnings 'experimental::alpha_assertions';$code" if $alpha_assertions; 462 #$code.=qq[\n\$expect="$expect";\n]; 463 #use Devel::Peek; 464 #die Dump($code) if $pat=~/\\h/ and $subject=~/\x{A0}/; 465 { 466 # Probably we should annotate specific tests with which warnings 467 # categories they're known to trigger, and hence should be 468 # disabled just for that test 469 no warnings qw(uninitialized regexp deprecated); 470 eval $code; 471 } 472 chomp( my $err = $@ ); 473 if ( $skip ) { 474 print "ok $testname # skipped", length($reason) ? ". $reason" : '', "\n"; 475 next TEST; 476 } 477 elsif ($result eq 'c') { 478 if ($err !~ m!^\Q$expect!) { print "not ok $testname$todo (compile) $input => '$err'\n"; next TEST } 479 last; # no need to study a syntax error 480 } 481 elsif ( $todo_qr ) { 482 print "not ok $testname # TODO", length($reason) ? " - $reason" : '', "\n"; 483 next TEST; 484 } 485 elsif ($@) { 486 print "not ok $testname$todo $input => error '$err'\n", _comment("$code\n$@\n"); next TEST; 487 } 488 elsif ($result =~ /^n/) { 489 if ($match) { print "not ok $testname$todo ($study) $input => false positive\n"; next TEST } 490 } 491 else { 492 if (!$match || $got ne $expect) { 493 eval { require Data::Dumper }; 494 no warnings "utf8"; # But handle should be utf8 495 if ($@ || !defined &DynaLoader::boot_DynaLoader) { 496 # Data::Dumper will load on miniperl, but fail when used in 497 # anger as it tries to load B. I'd prefer to keep the 498 # regular calls below outside of an eval so that real 499 # (unknown) failures get spotted, not ignored. 500 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$code\n"); 501 } 502 else { # better diagnostics 503 my $s = Data::Dumper->new([$subject],['subject'])->Useqq(1)->Dump; 504 my $g = Data::Dumper->new([$got],['got'])->Useqq(1)->Dump; 505 my $e = Data::Dumper->new([$expect],['expected'])->Useqq(1)->Dump; 506 print "not ok $testname$todo ($study) $input => '$got', match=$match\n", _comment("$s\n$code\n$g\n$e\n"); 507 } 508 next TEST; 509 } 510 } 511 } 512 print "ok $testname$todo\n"; 513} 514 515printf "1..%d\n# $iters iterations\n", scalar @tests; 516 5171; 518