1#!./perl 2# 3# This is a home for regular expression tests that don't fit into 4# the format supported by re/regexp.t. If you want to add a test 5# that does fit that format, add it to re/re_tests, not here. 6 7use strict; 8use warnings; 9use 5.010; 10 11 12sub run_tests; 13 14$| = 1; 15 16 17BEGIN { 18 chdir 't' if -d 't'; 19 @INC = ('../lib','.'); 20 do "re/ReTest.pl" or die $@; 21} 22 23 24plan tests => 1159; # Update this when adding/deleting tests. 25 26run_tests() unless caller; 27 28# 29# Tests start here. 30# 31sub run_tests { 32 33 SKIP: 34 { 35 local $Message = '\C matches octet'; 36 $_ = "a\x{100}b"; 37 ok /(.)(\C)(\C)(.)/ or skip q [\C doesn't match], 4; 38 iseq $1, "a"; 39 if ($IS_ASCII) { # ASCII (or equivalent), should be UTF-8 40 iseq $2, "\xC4"; 41 iseq $3, "\x80"; 42 } 43 elsif ($IS_EBCDIC) { # EBCDIC (or equivalent), should be UTF-EBCDIC 44 iseq $2, "\x8C"; 45 iseq $3, "\x41"; 46 } 47 else { 48 SKIP: { 49 ok 0, "Unexpected platform", "ord ('A') = $ordA"; 50 skip "Unexpected platform"; 51 } 52 } 53 iseq $4, "b"; 54 } 55 56 57 SKIP: 58 { 59 local $Message = '\C matches octet'; 60 $_ = "\x{100}"; 61 ok /(\C)/g or skip q [\C doesn't match], 2; 62 if ($IS_ASCII) { 63 iseq $1, "\xC4"; 64 } 65 elsif ($IS_EBCDIC) { 66 iseq $1, "\x8C"; 67 } 68 else { 69 ok 0, "Unexpected platform", "ord ('A') = $ordA"; 70 } 71 ok /(\C)/g or skip q [\C doesn't match]; 72 if ($IS_ASCII) { 73 iseq $1, "\x80"; 74 } 75 elsif ($IS_EBCDIC) { 76 iseq $1, "\x41"; 77 } 78 else { 79 ok 0, "Unexpected platform", "ord ('A') = $ordA"; 80 } 81 } 82 83 84 { 85 # Japhy -- added 03/03/2001 86 () = (my $str = "abc") =~ /(...)/; 87 $str = "def"; 88 iseq $1, "abc", 'Changing subject does not modify $1'; 89 } 90 91 92 SKIP: 93 { 94 # The trick is that in EBCDIC the explicit numeric range should 95 # match (as also in non-EBCDIC) but the explicit alphabetic range 96 # should not match. 97 ok "\x8e" =~ /[\x89-\x91]/, '"\x8e" =~ /[\x89-\x91]/'; 98 ok "\xce" =~ /[\xc9-\xd1]/, '"\xce" =~ /[\xc9-\xd1]/'; 99 100 skip "Not an EBCDIC platform", 2 unless ord ('i') == 0x89 && 101 ord ('J') == 0xd1; 102 103 # In most places these tests would succeed since \x8e does not 104 # in most character sets match 'i' or 'j' nor would \xce match 105 # 'I' or 'J', but strictly speaking these tests are here for 106 # the good of EBCDIC, so let's test these only there. 107 nok "\x8e" !~ /[i-j]/, '"\x8e" !~ /[i-j]/'; 108 nok "\xce" !~ /[I-J]/, '"\xce" !~ /[I-J]/'; 109 } 110 111 112 { 113 ok "\x{ab}" =~ /\x{ab}/, '"\x{ab}" =~ /\x{ab}/ '; 114 ok "\x{abcd}" =~ /\x{abcd}/, '"\x{abcd}" =~ /\x{abcd}/'; 115 } 116 117 118 { 119 local $Message = 'bug id 20001008.001'; 120 121 my @x = ("stra\337e 138", "stra\337e 138"); 122 for (@x) { 123 ok s/(\d+)\s*([\w\-]+)/$1 . uc $2/e; 124 ok my ($latin) = /^(.+)(?:\s+\d)/; 125 iseq $latin, "stra\337e"; 126 ok $latin =~ s/stra\337e/straße/; 127 # 128 # Previous code follows, but outcommented - there were no tests. 129 # 130 # $latin =~ s/stra\337e/straße/; # \303\237 after the 2nd a 131 # use utf8; # needed for the raw UTF-8 132 # $latin =~ s!(s)tr(?:aß|s+e)!$1tr.!; # \303\237 after the a 133 } 134 } 135 136 137 { 138 local $Message = 'Test \x escapes'; 139 ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; 140 ok "ba\xd4c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; 141 ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\x{d4}"; 142 ok "ba\x{d4}c" =~ /([a\xd4]+)/ && $1 eq "a\xd4"; 143 ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; 144 ok "ba\xd4c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; 145 ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\x{d4}"; 146 ok "ba\x{d4}c" =~ /([a\x{d4}]+)/ && $1 eq "a\xd4"; 147 } 148 149 150 SKIP: 151 { 152 local $Message = 'Match code points > 255'; 153 $_ = "abc\x{100}\x{200}\x{300}\x{380}\x{400}defg"; 154 ok /(.\x{300})./ or skip "No match", 4; 155 ok $` eq "abc\x{100}" && length ($`) == 4; 156 ok $& eq "\x{200}\x{300}\x{380}" && length ($&) == 3; 157 ok $' eq "\x{400}defg" && length ($') == 5; 158 ok $1 eq "\x{200}\x{300}" && length ($1) == 2; 159 } 160 161 162 163 { 164 my $x = "\x{10FFFD}"; 165 $x =~ s/(.)/$1/g; 166 ok ord($x) == 0x10FFFD && length($x) == 1, "From Robin Houston"; 167 } 168 169 170 { 171 my %d = ( 172 "7f" => [0, 0, 0], 173 "80" => [1, 1, 0], 174 "ff" => [1, 1, 0], 175 "100" => [0, 1, 1], 176 ); 177 SKIP: 178 while (my ($code, $match) = each %d) { 179 local $Message = "Properties of \\x$code"; 180 my $char = eval qq ["\\x{$code}"]; 181 my $i = 0; 182 ok (($char =~ /[\x80-\xff]/) xor !$$match [$i ++]); 183 ok (($char =~ /[\x80-\x{100}]/) xor !$$match [$i ++]); 184 ok (($char =~ /[\x{100}]/) xor !$$match [$i ++]); 185 } 186 } 187 188 189 { 190 # From Japhy 191 local $Message; 192 must_warn 'qr/(?c)/', '^Useless \(\?c\)'; 193 must_warn 'qr/(?-c)/', '^Useless \(\?-c\)'; 194 must_warn 'qr/(?g)/', '^Useless \(\?g\)'; 195 must_warn 'qr/(?-g)/', '^Useless \(\?-g\)'; 196 must_warn 'qr/(?o)/', '^Useless \(\?o\)'; 197 must_warn 'qr/(?-o)/', '^Useless \(\?-o\)'; 198 199 # Now test multi-error regexes 200 must_warn 'qr/(?g-o)/', '^Useless \(\?g\).*\nUseless \(\?-o\)'; 201 must_warn 'qr/(?g-c)/', '^Useless \(\?g\).*\nUseless \(\?-c\)'; 202 # (?c) means (?g) error won't be thrown 203 must_warn 'qr/(?o-cg)/', '^Useless \(\?o\).*\nUseless \(\?-c\)'; 204 must_warn 'qr/(?ogc)/', '^Useless \(\?o\).*\nUseless \(\?g\).*\n' . 205 'Useless \(\?c\)'; 206 } 207 208 209 { 210 local $Message = "/x tests"; 211 $_ = "foo"; 212 eval_ok <<" --"; 213 /f 214 o\r 215 o 216 \$ 217 /x 218 -- 219 eval_ok <<" --"; 220 /f 221 o 222 o 223 \$\r 224 /x 225 -- 226 } 227 228 229 { 230 local $Message = "/o feature"; 231 sub test_o {$_ [0] =~ /$_[1]/o; return $1} 232 iseq test_o ('abc', '(.)..'), 'a'; 233 iseq test_o ('abc', '..(.)'), 'a'; 234 } 235 236 { 237 # Test basic $^N usage outside of a regex 238 local $Message = '$^N usage outside of a regex'; 239 my $x = "abcdef"; 240 ok ($x =~ /cde/ and !defined $^N); 241 ok ($x =~ /(cde)/ and $^N eq "cde"); 242 ok ($x =~ /(c)(d)(e)/ and $^N eq "e"); 243 ok ($x =~ /(c(d)e)/ and $^N eq "cde"); 244 ok ($x =~ /(foo)|(c(d)e)/ and $^N eq "cde"); 245 ok ($x =~ /(c(d)e)|(foo)/ and $^N eq "cde"); 246 ok ($x =~ /(c(d)e)|(abc)/ and $^N eq "abc"); 247 ok ($x =~ /(c(d)e)|(abc)x/ and $^N eq "cde"); 248 ok ($x =~ /(c(d)e)(abc)?/ and $^N eq "cde"); 249 ok ($x =~ /(?:c(d)e)/ and $^N eq "d"); 250 ok ($x =~ /(?:c(d)e)(?:f)/ and $^N eq "d"); 251 ok ($x =~ /(?:([abc])|([def]))*/ and $^N eq "f"); 252 ok ($x =~ /(?:([ace])|([bdf]))*/ and $^N eq "f"); 253 ok ($x =~ /(([ace])|([bd]))*/ and $^N eq "e"); 254 {ok ($x =~ /(([ace])|([bdf]))*/ and $^N eq "f");} 255 ## Test to see if $^N is automatically localized -- it should now 256 ## have the value set in the previous test. 257 iseq $^N, "e", '$^N is automatically localized'; 258 259 # Now test inside (?{ ... }) 260 local $Message = '$^N usage inside (?{ ... })'; 261 our ($y, $z); 262 ok ($x =~ /a([abc])(?{$y=$^N})c/ and $y eq "b"); 263 ok ($x =~ /a([abc]+)(?{$y=$^N})d/ and $y eq "bc"); 264 ok ($x =~ /a([abcdefg]+)(?{$y=$^N})d/ and $y eq "bc"); 265 ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})d)(?{$z=$^N})e/ and $y eq "bc" 266 and $z eq "abcd"); 267 ok ($x =~ /(a([abcdefg]+)(?{$y=$^N})de)(?{$z=$^N})/ and $y eq "bc" 268 and $z eq "abcde"); 269 270 } 271 272 273 SKIP: 274 { 275 ## Should probably put in tests for all the POSIX stuff, 276 ## but not sure how to guarantee a specific locale...... 277 278 skip "Not an ASCII platform", 2 unless $IS_ASCII; 279 local $Message = 'Test [[:cntrl:]]'; 280 my $AllBytes = join "" => map {chr} 0 .. 255; 281 (my $x = $AllBytes) =~ s/[[:cntrl:]]//g; 282 iseq $x, join "", map {chr} 0x20 .. 0x7E, 0x80 .. 0xFF; 283 284 ($x = $AllBytes) =~ s/[^[:cntrl:]]//g; 285 iseq $x, join "", map {chr} 0x00 .. 0x1F, 0x7F; 286 } 287 288 289 { 290 # With /s modifier UTF8 chars were interpreted as bytes 291 local $Message = "UTF-8 chars aren't bytes"; 292 my $a = "Hello \x{263A} World"; 293 my @a = ($a =~ /./gs); 294 iseq $#a, 12; 295 } 296 297 298 { 299 local $Message = '. matches \n with /s'; 300 my $str1 = "foo\nbar"; 301 my $str2 = "foo\n\x{100}bar"; 302 my ($a, $b) = map {chr} $IS_ASCII ? (0xc4, 0x80) : (0x8c, 0x41); 303 my @a; 304 @a = $str1 =~ /./g; iseq @a, 6; iseq "@a", "f o o b a r"; 305 @a = $str1 =~ /./gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; 306 @a = $str1 =~ /\C/g; iseq @a, 7; iseq "@a", "f o o \n b a r"; 307 @a = $str1 =~ /\C/gs; iseq @a, 7; iseq "@a", "f o o \n b a r"; 308 @a = $str2 =~ /./g; iseq @a, 7; iseq "@a", "f o o \x{100} b a r"; 309 @a = $str2 =~ /./gs; iseq @a, 8; iseq "@a", "f o o \n \x{100} b a r"; 310 @a = $str2 =~ /\C/g; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; 311 @a = $str2 =~ /\C/gs; iseq @a, 9; iseq "@a", "f o o \n $a $b b a r"; 312 } 313 314 315 { 316 no warnings 'digit'; 317 # Check that \x## works. 5.6.1 and 5.005_03 fail some of these. 318 my $x; 319 $x = "\x4e" . "E"; 320 ok ($x =~ /^\x4EE$/, "Check only 2 bytes of hex are matched."); 321 322 $x = "\x4e" . "i"; 323 ok ($x =~ /^\x4Ei$/, "Check that invalid hex digit stops it (2)"); 324 325 $x = "\x4" . "j"; 326 ok ($x =~ /^\x4j$/, "Check that invalid hex digit stops it (1)"); 327 328 $x = "\x0" . "k"; 329 ok ($x =~ /^\xk$/, "Check that invalid hex digit stops it (0)"); 330 331 $x = "\x0" . "x"; 332 ok ($x =~ /^\xx$/, "\\xx isn't to be treated as \\0"); 333 334 $x = "\x0" . "xa"; 335 ok ($x =~ /^\xxa$/, "\\xxa isn't to be treated as \\xa"); 336 337 $x = "\x9" . "_b"; 338 ok ($x =~ /^\x9_b$/, "\\x9_b isn't to be treated as \\x9b"); 339 340 # and now again in [] ranges 341 342 $x = "\x4e" . "E"; 343 ok ($x =~ /^[\x4EE]{2}$/, "Check only 2 bytes of hex are matched."); 344 345 $x = "\x4e" . "i"; 346 ok ($x =~ /^[\x4Ei]{2}$/, "Check that invalid hex digit stops it (2)"); 347 348 $x = "\x4" . "j"; 349 ok ($x =~ /^[\x4j]{2}$/, "Check that invalid hex digit stops it (1)"); 350 351 $x = "\x0" . "k"; 352 ok ($x =~ /^[\xk]{2}$/, "Check that invalid hex digit stops it (0)"); 353 354 $x = "\x0" . "x"; 355 ok ($x =~ /^[\xx]{2}$/, "\\xx isn't to be treated as \\0"); 356 357 $x = "\x0" . "xa"; 358 ok ($x =~ /^[\xxa]{3}$/, "\\xxa isn't to be treated as \\xa"); 359 360 $x = "\x9" . "_b"; 361 ok ($x =~ /^[\x9_b]{3}$/, "\\x9_b isn't to be treated as \\x9b"); 362 363 # Check that \x{##} works. 5.6.1 fails quite a few of these. 364 365 $x = "\x9b"; 366 ok ($x =~ /^\x{9_b}$/, "\\x{9_b} is to be treated as \\x9b"); 367 368 $x = "\x9b" . "y"; 369 ok ($x =~ /^\x{9_b}y$/, "\\x{9_b} is to be treated as \\x9b (again)"); 370 371 $x = "\x9b" . "y"; 372 ok ($x =~ /^\x{9b_}y$/, "\\x{9b_} is to be treated as \\x9b"); 373 374 $x = "\x9b" . "y"; 375 ok ($x =~ /^\x{9_bq}y$/, "\\x{9_bc} is to be treated as \\x9b"); 376 377 $x = "\x0" . "y"; 378 ok ($x =~ /^\x{x9b}y$/, "\\x{x9b} is to be treated as \\x0"); 379 380 $x = "\x0" . "y"; 381 ok ($x =~ /^\x{0x9b}y$/, "\\x{0x9b} is to be treated as \\x0"); 382 383 $x = "\x9b" . "y"; 384 ok ($x =~ /^\x{09b}y$/, "\\x{09b} is to be treated as \\x9b"); 385 386 $x = "\x9b"; 387 ok ($x =~ /^[\x{9_b}]$/, "\\x{9_b} is to be treated as \\x9b"); 388 389 $x = "\x9b" . "y"; 390 ok ($x =~ /^[\x{9_b}y]{2}$/, 391 "\\x{9_b} is to be treated as \\x9b (again)"); 392 393 $x = "\x9b" . "y"; 394 ok ($x =~ /^[\x{9b_}y]{2}$/, "\\x{9b_} is to be treated as \\x9b"); 395 396 $x = "\x9b" . "y"; 397 ok ($x =~ /^[\x{9_bq}y]{2}$/, "\\x{9_bc} is to be treated as \\x9b"); 398 399 $x = "\x0" . "y"; 400 ok ($x =~ /^[\x{x9b}y]{2}$/, "\\x{x9b} is to be treated as \\x0"); 401 402 $x = "\x0" . "y"; 403 ok ($x =~ /^[\x{0x9b}y]{2}$/, "\\x{0x9b} is to be treated as \\x0"); 404 405 $x = "\x9b" . "y"; 406 ok ($x =~ /^[\x{09b}y]{2}$/, "\\x{09b} is to be treated as \\x9b"); 407 408 } 409 410 411 { 412 # High bit bug -- japhy 413 my $x = "ab\200d"; 414 ok $x =~ /.*?\200/, "High bit fine"; 415 } 416 417 418 { 419 # The basic character classes and Unicode 420 ok "\x{0100}" =~ /\w/, 'LATIN CAPITAL LETTER A WITH MACRON in /\w/'; 421 ok "\x{0660}" =~ /\d/, 'ARABIC-INDIC DIGIT ZERO in /\d/'; 422 ok "\x{1680}" =~ /\s/, 'OGHAM SPACE MARK in /\s/'; 423 } 424 425 426 { 427 local $Message = "Folding matches and Unicode"; 428 ok "a\x{100}" =~ /A/i; 429 ok "A\x{100}" =~ /a/i; 430 ok "a\x{100}" =~ /a/i; 431 ok "A\x{100}" =~ /A/i; 432 ok "\x{101}a" =~ /\x{100}/i; 433 ok "\x{100}a" =~ /\x{100}/i; 434 ok "\x{101}a" =~ /\x{101}/i; 435 ok "\x{100}a" =~ /\x{101}/i; 436 ok "a\x{100}" =~ /A\x{100}/i; 437 ok "A\x{100}" =~ /a\x{100}/i; 438 ok "a\x{100}" =~ /a\x{100}/i; 439 ok "A\x{100}" =~ /A\x{100}/i; 440 ok "a\x{100}" =~ /[A]/i; 441 ok "A\x{100}" =~ /[a]/i; 442 ok "a\x{100}" =~ /[a]/i; 443 ok "A\x{100}" =~ /[A]/i; 444 ok "\x{101}a" =~ /[\x{100}]/i; 445 ok "\x{100}a" =~ /[\x{100}]/i; 446 ok "\x{101}a" =~ /[\x{101}]/i; 447 ok "\x{100}a" =~ /[\x{101}]/i; 448 } 449 450 451 { 452 use charnames ':full'; 453 local $Message = "Folding 'LATIN LETTER A WITH GRAVE'"; 454 455 my $lower = "\N{LATIN SMALL LETTER A WITH GRAVE}"; 456 my $UPPER = "\N{LATIN CAPITAL LETTER A WITH GRAVE}"; 457 458 ok $lower =~ m/$UPPER/i; 459 ok $UPPER =~ m/$lower/i; 460 ok $lower =~ m/[$UPPER]/i; 461 ok $UPPER =~ m/[$lower]/i; 462 463 local $Message = "Folding 'GREEK LETTER ALPHA WITH VRACHY'"; 464 465 $lower = "\N{GREEK CAPITAL LETTER ALPHA WITH VRACHY}"; 466 $UPPER = "\N{GREEK SMALL LETTER ALPHA WITH VRACHY}"; 467 468 ok $lower =~ m/$UPPER/i; 469 ok $UPPER =~ m/$lower/i; 470 ok $lower =~ m/[$UPPER]/i; 471 ok $UPPER =~ m/[$lower]/i; 472 473 local $Message = "Folding 'LATIN LETTER Y WITH DIAERESIS'"; 474 475 $lower = "\N{LATIN SMALL LETTER Y WITH DIAERESIS}"; 476 $UPPER = "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}"; 477 478 ok $lower =~ m/$UPPER/i; 479 ok $UPPER =~ m/$lower/i; 480 ok $lower =~ m/[$UPPER]/i; 481 ok $UPPER =~ m/[$lower]/i; 482 } 483 484 485 { 486 use charnames ':full'; 487 local $PatchId = "13843"; 488 local $Message = "GREEK CAPITAL LETTER SIGMA vs " . 489 "COMBINING GREEK PERISPOMENI"; 490 491 my $SIGMA = "\N{GREEK CAPITAL LETTER SIGMA}"; 492 my $char = "\N{COMBINING GREEK PERISPOMENI}"; 493 494 may_not_warn sub {ok "_:$char:_" !~ m/_:$SIGMA:_/i}; 495 } 496 497 498 { 499 local $Message = '\X'; 500 use charnames ':full'; 501 502 ok "a!" =~ /^(\X)!/ && $1 eq "a"; 503 ok "\xDF!" =~ /^(\X)!/ && $1 eq "\xDF"; 504 ok "\x{100}!" =~ /^(\X)!/ && $1 eq "\x{100}"; 505 ok "\x{100}\x{300}!" =~ /^(\X)!/ && $1 eq "\x{100}\x{300}"; 506 ok "\N{LATIN CAPITAL LETTER E}!" =~ /^(\X)!/ && 507 $1 eq "\N{LATIN CAPITAL LETTER E}"; 508 ok "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}!" 509 =~ /^(\X)!/ && 510 $1 eq "\N{LATIN CAPITAL LETTER E}\N{COMBINING GRAVE ACCENT}"; 511 512 local $Message = '\C and \X'; 513 ok "!abc!" =~ /a\Cc/; 514 ok "!abc!" =~ /a\Xc/; 515 } 516 517 518 { 519 local $Message = "Final Sigma"; 520 521 my $SIGMA = "\x{03A3}"; # CAPITAL 522 my $Sigma = "\x{03C2}"; # SMALL FINAL 523 my $sigma = "\x{03C3}"; # SMALL 524 525 ok $SIGMA =~ /$SIGMA/i; 526 ok $SIGMA =~ /$Sigma/i; 527 ok $SIGMA =~ /$sigma/i; 528 529 ok $Sigma =~ /$SIGMA/i; 530 ok $Sigma =~ /$Sigma/i; 531 ok $Sigma =~ /$sigma/i; 532 533 ok $sigma =~ /$SIGMA/i; 534 ok $sigma =~ /$Sigma/i; 535 ok $sigma =~ /$sigma/i; 536 537 ok $SIGMA =~ /[$SIGMA]/i; 538 ok $SIGMA =~ /[$Sigma]/i; 539 ok $SIGMA =~ /[$sigma]/i; 540 541 ok $Sigma =~ /[$SIGMA]/i; 542 ok $Sigma =~ /[$Sigma]/i; 543 ok $Sigma =~ /[$sigma]/i; 544 545 ok $sigma =~ /[$SIGMA]/i; 546 ok $sigma =~ /[$Sigma]/i; 547 ok $sigma =~ /[$sigma]/i; 548 549 local $Message = "More final Sigma"; 550 551 my $S3 = "$SIGMA$Sigma$sigma"; 552 553 ok ":$S3:" =~ /:(($SIGMA)+):/i && $1 eq $S3 && $2 eq $sigma; 554 ok ":$S3:" =~ /:(($Sigma)+):/i && $1 eq $S3 && $2 eq $sigma; 555 ok ":$S3:" =~ /:(($sigma)+):/i && $1 eq $S3 && $2 eq $sigma; 556 557 ok ":$S3:" =~ /:(([$SIGMA])+):/i && $1 eq $S3 && $2 eq $sigma; 558 ok ":$S3:" =~ /:(([$Sigma])+):/i && $1 eq $S3 && $2 eq $sigma; 559 ok ":$S3:" =~ /:(([$sigma])+):/i && $1 eq $S3 && $2 eq $sigma; 560 } 561 562 563 { 564 use charnames ':full'; 565 local $Message = "Parlez-Vous " . 566 "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais?"; 567 568 ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran.ais/ && 569 $& eq "Francais"; 570 ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran.ais/ && 571 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; 572 ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Cais/ && 573 $& eq "Francais"; 574 # COMBINING CEDILLA is two bytes when encoded 575 ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\C\Cais/; 576 ok "Fran\N{LATIN SMALL LETTER C}ais" =~ /Fran\Xais/ && 577 $& eq "Francais"; 578 ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ /Fran\Xais/ && 579 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; 580 ok "Franc\N{COMBINING CEDILLA}ais" =~ /Fran\Xais/ && 581 $& eq "Franc\N{COMBINING CEDILLA}ais"; 582 ok "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais" =~ 583 /Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais/ && 584 $& eq "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"; 585 ok "Franc\N{COMBINING CEDILLA}ais" =~ /Franc\N{COMBINING CEDILLA}ais/ && 586 $& eq "Franc\N{COMBINING CEDILLA}ais"; 587 588 my @f = ( 589 ["Fran\N{LATIN SMALL LETTER C}ais", "Francais"], 590 ["Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais", 591 "Fran\N{LATIN SMALL LETTER C WITH CEDILLA}ais"], 592 ["Franc\N{COMBINING CEDILLA}ais", "Franc\N{COMBINING CEDILLA}ais"], 593 ); 594 foreach my $entry (@f) { 595 my ($subject, $match) = @$entry; 596 ok $subject =~ /Fran(?:c\N{COMBINING CEDILLA}?| 597 \N{LATIN SMALL LETTER C WITH CEDILLA})ais/x && 598 $& eq $match; 599 } 600 } 601 602 603 { 604 local $Message = "Lingering (and useless) UTF8 flag doesn't mess up /i"; 605 my $pat = "ABcde"; 606 my $str = "abcDE\x{100}"; 607 chop $str; 608 ok $str =~ /$pat/i; 609 610 $pat = "ABcde\x{100}"; 611 $str = "abcDE"; 612 chop $pat; 613 ok $str =~ /$pat/i; 614 615 $pat = "ABcde\x{100}"; 616 $str = "abcDE\x{100}"; 617 chop $pat; 618 chop $str; 619 ok $str =~ /$pat/i; 620 } 621 622 623 { 624 use charnames ':full'; 625 local $Message = "LATIN SMALL LETTER SHARP S " . 626 "(\N{LATIN SMALL LETTER SHARP S})"; 627 628 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 629 /\N{LATIN SMALL LETTER SHARP S}/; 630 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 631 /\N{LATIN SMALL LETTER SHARP S}/i; 632 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 633 /[\N{LATIN SMALL LETTER SHARP S}]/; 634 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 635 /[\N{LATIN SMALL LETTER SHARP S}]/i; 636 637 ok "ss" =~ /\N{LATIN SMALL LETTER SHARP S}/i; 638 ok "SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i; 639 ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; 640 ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i; 641 642 ok "\N{LATIN SMALL LETTER SHARP S}" =~ /ss/i; 643 ok "\N{LATIN SMALL LETTER SHARP S}" =~ /SS/i; 644 645 local $Message = "Unoptimized named sequence in class"; 646 ok "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; 647 ok "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}x]/i; 648 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 649 /[\N{LATIN SMALL LETTER SHARP S}x]/; 650 ok "\N{LATIN SMALL LETTER SHARP S}" =~ 651 /[\N{LATIN SMALL LETTER SHARP S}x]/i; 652 } 653 654 655 { 656 # More whitespace: U+0085, U+2028, U+2029\n"; 657 658 # U+0085, U+00A0 need to be forced to be Unicode, the \x{100} does that. 659 SKIP: { 660 skip "EBCDIC platform", 4 if $IS_EBCDIC; 661 # Do \x{0015} and \x{0041} match \s in EBCDIC? 662 ok "<\x{100}\x{0085}>" =~ /<\x{100}\s>/, '\x{0085} in \s'; 663 ok "<\x{0085}>" =~ /<\v>/, '\x{0085} in \v'; 664 ok "<\x{100}\x{00A0}>" =~ /<\x{100}\s>/, '\x{00A0} in \s'; 665 ok "<\x{00A0}>" =~ /<\h>/, '\x{00A0} in \h'; 666 } 667 my @h = map {sprintf "%05x" => $_} 0x01680, 0x0180E, 0x02000 .. 0x0200A, 668 0x0202F, 0x0205F, 0x03000; 669 my @v = map {sprintf "%05x" => $_} 0x02028, 0x02029; 670 671 my @H = map {sprintf "%05x" => $_} 0x01361, 0x0200B, 0x02408, 0x02420, 672 0x0303F, 0xE0020; 673 my @V = map {sprintf "%05x" => $_} 0x0008A .. 0x0008D, 0x00348, 0x10100, 674 0xE005F, 0xE007C; 675 676 for my $hex (@h) { 677 my $str = eval qq ["<\\x{$hex}>"]; 678 ok $str =~ /<\s>/, "\\x{$hex} in \\s"; 679 ok $str =~ /<\h>/, "\\x{$hex} in \\h"; 680 ok $str !~ /<\v>/, "\\x{$hex} not in \\v"; 681 } 682 683 for my $hex (@v) { 684 my $str = eval qq ["<\\x{$hex}>"]; 685 ok $str =~ /<\s>/, "\\x{$hex} in \\s"; 686 ok $str =~ /<\v>/, "\\x{$hex} in \\v"; 687 ok $str !~ /<\h>/, "\\x{$hex} not in \\h"; 688 } 689 690 for my $hex (@H) { 691 my $str = eval qq ["<\\x{$hex}>"]; 692 ok $str =~ /<\S>/, "\\x{$hex} in \\S"; 693 ok $str =~ /<\H>/, "\\x{$hex} in \\H"; 694 } 695 696 for my $hex (@V) { 697 my $str = eval qq ["<\\x{$hex}>"]; 698 ok $str =~ /<\S>/, "\\x{$hex} in \\S"; 699 ok $str =~ /<\V>/, "\\x{$hex} in \\V"; 700 } 701 } 702 703 704 { 705 # . with /s should work on characters, as opposed to bytes 706 local $Message = ". with /s works on characters, not bytes"; 707 708 my $s = "\x{e4}\x{100}"; 709 # This is not expected to match: the point is that 710 # neither should we get "Malformed UTF-8" warnings. 711 may_not_warn sub {$s =~ /\G(.+?)\n/gcs}, "No 'Malformed UTF-8' warning"; 712 713 my @c; 714 push @c => $1 while $s =~ /\G(.)/gs; 715 716 local $" = ""; 717 iseq "@c", $s; 718 719 # Test only chars < 256 720 my $t1 = "Q003\n\n\x{e4}\x{f6}\n\nQ004\n\n\x{e7}"; 721 my $r1 = ""; 722 while ($t1 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { 723 $r1 .= $1 . $2; 724 } 725 726 my $t2 = $t1 . "\x{100}"; # Repeat with a larger char 727 my $r2 = ""; 728 while ($t2 =~ / \G ( .+? ) \n\s+ ( .+? ) ( $ | \n\s+ ) /xgcs) { 729 $r2 .= $1 . $2; 730 } 731 $r2 =~ s/\x{100}//; 732 733 iseq $r1, $r2; 734 } 735 736 737 { 738 local $Message = "Unicode lookbehind"; 739 ok "A\x{100}B" =~ /(?<=A.)B/; 740 ok "A\x{200}\x{300}B" =~ /(?<=A..)B/; 741 ok "\x{400}AB" =~ /(?<=\x{400}.)B/; 742 ok "\x{500}\x{600}B" =~ /(?<=\x{500}.)B/; 743 744 # Original code also contained: 745 # ok "\x{500\x{600}}B" =~ /(?<=\x{500}.)B/; 746 # but that looks like a typo. 747 } 748 749 750 { 751 local $Message = 'UTF-8 hash keys and /$/'; 752 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters 753 # /2002-01/msg01327.html 754 755 my $u = "a\x{100}"; 756 my $v = substr ($u, 0, 1); 757 my $w = substr ($u, 1, 1); 758 my %u = ($u => $u, $v => $v, $w => $w); 759 for (keys %u) { 760 my $m1 = /^\w*$/ ? 1 : 0; 761 my $m2 = $u {$_} =~ /^\w*$/ ? 1 : 0; 762 iseq $m1, $m2; 763 } 764 } 765 766 767 { 768 local $Message = "No SEGV in s/// and UTF-8"; 769 my $s = "s#\x{100}" x 4; 770 ok $s =~ s/[^\w]/ /g; 771 if ( 1 or $ENV{PERL_TEST_LEGACY_POSIX_CC} ) { 772 iseq $s, "s \x{100}" x 4; 773 } 774 else { 775 iseq $s, "s " x 4; 776 } 777 } 778 779 780 { 781 local $Message = "UTF-8 bug (maybe already known?)"; 782 my $u = "foo"; 783 $u =~ s/./\x{100}/g; 784 iseq $u, "\x{100}\x{100}\x{100}"; 785 786 $u = "foobar"; 787 $u =~ s/[ao]/\x{100}/g; 788 iseq $u, "f\x{100}\x{100}b\x{100}r"; 789 790 $u =~ s/\x{100}/e/g; 791 iseq $u, "feeber"; 792 } 793 794 795 { 796 local $Message = "UTF-8 bug with s///"; 797 # check utf8/non-utf8 mixtures 798 # try to force all float/anchored check combinations 799 800 my $c = "\x{100}"; 801 my $subst; 802 for my $re ("xx.*$c", "x.*$c$c", "$c.*xx", "$c$c.*x", 803 "xx.*(?=$c)", "(?=$c).*xx",) { 804 ok "xxx" !~ /$re/; 805 ok +($subst = "xxx") !~ s/$re//; 806 } 807 for my $re ("xx.*$c*", "$c*.*xx") { 808 ok "xxx" =~ /$re/; 809 ok +($subst = "xxx") =~ s/$re//; 810 iseq $subst, ""; 811 } 812 for my $re ("xxy*", "y*xx") { 813 ok "xx$c" =~ /$re/; 814 ok +($subst = "xx$c") =~ s/$re//; 815 iseq $subst, $c; 816 ok "xy$c" !~ /$re/; 817 ok +($subst = "xy$c") !~ s/$re//; 818 } 819 for my $re ("xy$c*z", "x$c*yz") { 820 ok "xyz" =~ /$re/; 821 ok +($subst = "xyz") =~ s/$re//; 822 iseq $subst, ""; 823 } 824 } 825 826 827 { 828 local $Message = "qr /.../x"; 829 my $R = qr / A B C # D E/x; 830 ok "ABCDE" =~ $R && $& eq "ABC"; 831 ok "ABCDE" =~ /$R/ && $& eq "ABC"; 832 ok "ABCDE" =~ m/$R/ && $& eq "ABC"; 833 ok "ABCDE" =~ /($R)/ && $1 eq "ABC"; 834 ok "ABCDE" =~ m/($R)/ && $1 eq "ABC"; 835 } 836 837 838 839 840 { 841 local $\; 842 $_ = 'aaaaaaaaaa'; 843 utf8::upgrade($_); chop $_; $\="\n"; 844 ok /[^\s]+/, 'm/[^\s]/ utf8'; 845 ok /[^\d]+/, 'm/[^\d]/ utf8'; 846 ok +($a = $_, $_ =~ s/[^\s]+/./g), 's/[^\s]/ utf8'; 847 ok +($a = $_, $a =~ s/[^\d]+/./g), 's/[^\s]/ utf8'; 848 } 849 850 851 852 853 { 854 # Subject: Odd regexp behavior 855 # From: Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk> 856 # Date: Wed, 26 Feb 2003 16:53:12 +0000 857 # Message-Id: <E18o4nw-0008Ly-00@wisbech.cl.cam.ac.uk> 858 # To: perl-unicode@perl.org 859 860 local $Message = 'Markus Kuhn 2003-02-26'; 861 862 my $x = "\x{2019}\nk"; 863 ok $x =~ s/(\S)\n(\S)/$1 $2/sg; 864 ok $x eq "\x{2019} k"; 865 866 $x = "b\nk"; 867 ok $x =~ s/(\S)\n(\S)/$1 $2/sg; 868 ok $x eq "b k"; 869 870 ok "\x{2019}" =~ /\S/; 871 } 872 873 874 { 875 # XXX DAPM 13-Apr-06. Recursive split is still broken. It's only luck it 876 # hasn't been crashing. Disable this test until it is fixed properly. 877 # XXX also check what it returns rather than just doing ok(1,...) 878 # split /(?{ split "" })/, "abc"; 879 local $TODO = "Recursive split is still broken"; 880 ok 0, 'cache_re & "(?{": it dumps core in 5.6.1 & 5.8.0'; 881 } 882 883 884 { 885 ok "\x{100}\n" =~ /\x{100}\n$/, "UTF-8 length cache and fbm_compile"; 886 } 887 888 889 { 890 package Str; 891 use overload q /""/ => sub {${$_ [0]};}; 892 sub new {my ($c, $v) = @_; bless \$v, $c;} 893 894 package main; 895 $_ = Str -> new ("a\x{100}/\x{100}b"); 896 ok join (":", /\b(.)\x{100}/g) eq "a:/", "re_intuit_start and PL_bostr"; 897 } 898 899 { 900 my $re = qq /^([^X]*)X/; 901 utf8::upgrade ($re); 902 ok "\x{100}X" =~ /$re/, "S_cl_and ANYOF_UNICODE & ANYOF_INVERTED"; 903 } 904 905 { 906 ok "123\x{100}" =~ /^.*1.*23\x{100}$/, 907 'UTF-8 + multiple floating substr'; 908 } 909 910 { 911 local $Message = '<20030808193656.5109.1@llama.ni-s.u-net.com>'; 912 913 # LATIN SMALL/CAPITAL LETTER A WITH MACRON 914 ok " \x{101}" =~ qr/\x{100}/i; 915 916 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW 917 ok " \x{1E01}" =~ qr/\x{1E00}/i; 918 919 # DESERET SMALL/CAPITAL LETTER LONG I 920 ok " \x{10428}" =~ qr/\x{10400}/i; 921 922 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' 923 ok " \x{1E01}x" =~ qr/\x{1E00}X/i; 924 } 925 926 { 927 for (120 .. 130) { 928 my $head = 'x' x $_; 929 local $Message = q [Don't misparse \x{...} in regexp ] . 930 q [near 127 char EXACT limit]; 931 for my $tail ('\x{0061}', '\x{1234}', '\x61') { 932 eval_ok qq ["$head$tail" =~ /$head$tail/]; 933 } 934 local $Message = q [Don't misparse \N{...} in regexp ] . 935 q [near 127 char EXACT limit]; 936 for my $tail ('\N{SNOWFLAKE}') { 937 eval_ok qq [use charnames ':full'; 938 "$head$tail" =~ /$head$tail/]; 939 } 940 } 941 } 942 943 { # TRIE related 944 our @got = (); 945 "words" =~ /(word|word|word)(?{push @got, $1})s$/; 946 iseq @got, 1, "TRIE optimation"; 947 948 @got = (); 949 "words" =~ /(word|word|word)(?{push @got,$1})s$/i; 950 iseq @got, 1,"TRIEF optimisation"; 951 952 my @nums = map {int rand 1000} 1 .. 100; 953 my $re = "(" . (join "|", @nums) . ")"; 954 $re = qr/\b$re\b/; 955 956 foreach (@nums) { 957 ok $_ =~ /$re/, "Trie nums"; 958 } 959 960 $_ = join " ", @nums; 961 @got = (); 962 push @got, $1 while /$re/g; 963 964 my %count; 965 $count {$_} ++ for @got; 966 my $ok = 1; 967 for (@nums) { 968 $ok = 0 if --$count {$_} < 0; 969 } 970 ok $ok, "Trie min count matches"; 971 } 972 973 974 { 975 # TRIE related 976 # LATIN SMALL/CAPITAL LETTER A WITH MACRON 977 ok "foba \x{101}foo" =~ qr/(foo|\x{100}foo|bar)/i && 978 $1 eq "\x{101}foo", 979 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH MACRON"; 980 981 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW 982 ok "foba \x{1E01}foo" =~ qr/(foo|\x{1E00}foo|bar)/i && 983 $1 eq "\x{1E01}foo", 984 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW"; 985 986 # DESERET SMALL/CAPITAL LETTER LONG I 987 ok "foba \x{10428}foo" =~ qr/(foo|\x{10400}foo|bar)/i && 988 $1 eq "\x{10428}foo", 989 "TRIEF + DESERET SMALL/CAPITAL LETTER LONG I"; 990 991 # LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X' 992 ok "foba \x{1E01}xfoo" =~ qr/(foo|\x{1E00}Xfoo|bar)/i && 993 $1 eq "\x{1E01}xfoo", 994 "TRIEF + LATIN SMALL/CAPITAL LETTER A WITH RING BELOW + 'X'"; 995 996 use charnames ':full'; 997 998 my $s = "\N{LATIN SMALL LETTER SHARP S}"; 999 ok "foba ba$s" =~ qr/(foo|Ba$s|bar)/i && $1 eq "ba$s", 1000 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 1001 ok "foba ba$s" =~ qr/(Ba$s|foo|bar)/i && $1 eq "ba$s", 1002 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 1003 ok "foba ba$s" =~ qr/(foo|bar|Ba$s)/i && $1 eq "ba$s", 1004 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 1005 1006 ok "foba ba$s" =~ qr/(foo|Bass|bar)/i && $1 eq "ba$s", 1007 "TRIEF + LATIN SMALL LETTER SHARP S =~ ss"; 1008 1009 ok "foba ba$s" =~ qr/(foo|BaSS|bar)/i && $1 eq "ba$s", 1010 "TRIEF + LATIN SMALL LETTER SHARP S =~ SS"; 1011 1012 ok "foba ba${s}pxySS$s$s" =~ qr/(b(?:a${s}t|a${s}f|a${s}p)[xy]+$s*)/i 1013 && $1 eq "ba${s}pxySS$s$s", 1014 "COMMON PREFIX TRIEF + LATIN SMALL LETTER SHARP S"; 1015 } 1016 1017 1018 1019 1020 { 1021 BEGIN { 1022 unshift @INC, 'lib'; 1023 } 1024 use Cname; 1025 1026 ok 'fooB' =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname"; 1027 # 1028 # Why doesn't must_warn work here? 1029 # 1030 my $w; 1031 local $SIG {__WARN__} = sub {$w .= "@_"}; 1032 eval 'q(xxWxx) =~ /[\N{WARN}]/'; 1033 ok $w && $w =~ /Using just the first character returned by \\N{} in character class/, 1034 "single character in [\\N{}] warning"; 1035 1036 undef $w; 1037 eval q [ok "\0" !~ /[\N{EMPTY-STR}XY]/, 1038 "Zerolength charname in charclass doesn't match \\0"]; 1039 ok $w && $w =~ /Ignoring zero length/, 1040 'Ignoring zero length \N{} in character class warning'; 1041 1042 ok 'AB' =~ /(\N{EVIL})/ && $1 eq 'A', 'Charname caching $1'; 1043 ok 'ABC' =~ /(\N{EVIL})/, 'Charname caching $1'; 1044 ok 'xy' =~ /x\N{EMPTY-STR}y/, 1045 'Empty string charname produces NOTHING node'; 1046 ok '' =~ /\N{EMPTY-STR}/, 1047 'Empty string charname produces NOTHING node'; 1048 ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works'; 1049 ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works'; 1050 1051 # If remove the limitation in regcomp code these should work 1052 # differently 1053 undef $w; 1054 eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully']; 1055 ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit'; 1056 undef $w; 1057 eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully']; 1058 ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit'; 1059 undef $w; 1060 eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work']; 1061 ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit'; 1062 undef $w; 1063 eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work']; 1064 ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit'; 1065 undef $w; 1066 eval 'q(syntax error) =~ /\N{MALFORMED}/'; 1067 ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error'; 1068 undef $w; 1069 eval 'q() =~ /\N{4F}/'; 1070 ok $w && $w =~ /Deprecated/, 'Verify that leading digit in name gives warning'; 1071 undef $w; 1072 eval 'q() =~ /\N{COM,MA}/'; 1073 ok $w && $w =~ /Deprecated/, 'Verify that comma in name gives warning'; 1074 undef $w; 1075 my $name = "A\x{D7}O"; 1076 eval "q(W) =~ /\\N{$name}/"; 1077 ok $w && $w =~ /Deprecated/, 'Verify that latin1 symbol in name gives warning'; 1078 undef $w; 1079 $name = "A\x{D1}O"; 1080 eval "q(W) =~ /\\N{$name}/"; 1081 ok ! $w, 'Verify that latin1 letter in name doesnt give warning'; 1082 1083 } 1084 1085 1086 { 1087 use charnames ':full'; 1088 1089 ok 'aabc' !~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against aabc'; 1090 ok 'a+bc' =~ /a\N{PLUS SIGN}b/, '/a\N{PLUS SIGN}b/ against a+bc'; 1091 1092 ok ' A B' =~ /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, 1093 'Intermixed named and unicode escapes'; 1094 ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ 1095 /\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}/, 1096 'Intermixed named and unicode escapes'; 1097 ok "\N{SPACE}\N{U+0041}\N{SPACE}\N{U+0042}" =~ 1098 /[\N{SPACE}\N{U+0041}][\N{SPACE}\N{U+0042}]/, 1099 'Intermixed named and unicode escapes'; 1100 ok "\0" =~ /^\N{NULL}$/, 'Verify that \N{NULL} works; is not confused with an error'; 1101 } 1102 1103 1104 { 1105 our $brackets; 1106 $brackets = qr{ 1107 { (?> [^{}]+ | (??{ $brackets }) )* } 1108 }x; 1109 1110 ok "{b{c}d" !~ m/^((??{ $brackets }))/, "Bracket mismatch"; 1111 1112 SKIP: { 1113 our @stack = (); 1114 my @expect = qw( 1115 stuff1 1116 stuff2 1117 <stuff1>and<stuff2> 1118 right 1119 <right> 1120 <<right>> 1121 <<<right>>> 1122 <<stuff1>and<stuff2>><<<<right>>>> 1123 ); 1124 1125 local $_ = '<<<stuff1>and<stuff2>><<<<right>>>>>'; 1126 ok /^(<((?:(?>[^<>]+)|(?1))*)>(?{push @stack, $2 }))$/, 1127 "Recursion matches"; 1128 iseq @stack, @expect, "Right amount of matches" 1129 or skip "Won't test individual results as count isn't equal", 1130 0 + @expect; 1131 my $idx = 0; 1132 foreach my $expect (@expect) { 1133 iseq $stack [$idx], $expect, 1134 "Expecting '$expect' at stack pos #$idx"; 1135 $idx ++; 1136 } 1137 } 1138 } 1139 1140 1141 { 1142 my $s = '123453456'; 1143 $s =~ s/(?<digits>\d+)\k<digits>/$+{digits}/; 1144 ok $s eq '123456', 'Named capture (angle brackets) s///'; 1145 $s = '123453456'; 1146 $s =~ s/(?'digits'\d+)\k'digits'/$+{digits}/; 1147 ok $s eq '123456', 'Named capture (single quotes) s///'; 1148 } 1149 1150 1151 { 1152 my @ary = ( 1153 pack('U', 0x00F1), # n-tilde 1154 '_'.pack('U', 0x00F1), # _ + n-tilde 1155 'c'.pack('U', 0x0327), # c + cedilla 1156 pack('U*', 0x00F1, 0x0327), # n-tilde + cedilla 1157 pack('U', 0x0391), # ALPHA 1158 pack('U', 0x0391).'2', # ALPHA + 2 1159 pack('U', 0x0391).'_', # ALPHA + _ 1160 ); 1161 1162 for my $uni (@ary) { 1163 my ($r1, $c1, $r2, $c2) = eval qq { 1164 use utf8; 1165 scalar ("..foo foo.." =~ /(?'${uni}'foo) \\k'${uni}'/), 1166 \$+{${uni}}, 1167 scalar ("..bar bar.." =~ /(?<${uni}>bar) \\k<${uni}>/), 1168 \$+{${uni}}; 1169 }; 1170 ok $r1, "Named capture UTF (?'')"; 1171 ok defined $c1 && $c1 eq 'foo', "Named capture UTF \%+"; 1172 ok $r2, "Named capture UTF (?<>)"; 1173 ok defined $c2 && $c2 eq 'bar', "Named capture UTF \%+"; 1174 } 1175 } 1176 1177 { 1178 my $s = 'foo bar baz'; 1179 my @res; 1180 if ('1234' =~ /(?<A>1)(?<B>2)(?<A>3)(?<B>4)/) { 1181 foreach my $name (sort keys(%-)) { 1182 my $ary = $- {$name}; 1183 foreach my $idx (0 .. $#$ary) { 1184 push @res, "$name:$idx:$ary->[$idx]"; 1185 } 1186 } 1187 } 1188 my @expect = qw (A:0:1 A:1:3 B:0:2 B:1:4); 1189 iseq "@res", "@expect", "Check %-"; 1190 eval' 1191 no warnings "uninitialized"; 1192 print for $- {this_key_doesnt_exist}; 1193 '; 1194 ok !$@,'lvalue $- {...} should not throw an exception'; 1195 } 1196 1197 { 1198 # \, breaks {3,4} 1199 ok "xaaay" !~ /xa{3\,4}y/, '\, in a pattern'; 1200 ok "xa{3,4}y" =~ /xa{3\,4}y/, '\, in a pattern'; 1201 1202 # \c\ followed by _ 1203 ok "x\c_y" !~ /x\c\_y/, '\_ in a pattern'; 1204 ok "x\c\_y" =~ /x\c\_y/, '\_ in a pattern'; 1205 1206 # \c\ followed by other characters 1207 for my $c ("z", "\0", "!", chr(254), chr(256)) { 1208 my $targ = "a\034$c"; 1209 my $reg = "a\\c\\$c"; 1210 ok eval ("qq/$targ/ =~ /$reg/"), "\\c\\ in pattern"; 1211 } 1212 } 1213 1214 { # Test the (*PRUNE) pattern 1215 our $count = 0; 1216 'aaab' =~ /a+b?(?{$count++})(*FAIL)/; 1217 iseq $count, 9, "Expect 9 for no (*PRUNE)"; 1218 $count = 0; 1219 'aaab' =~ /a+b?(*PRUNE)(?{$count++})(*FAIL)/; 1220 iseq $count, 3, "Expect 3 with (*PRUNE)"; 1221 local $_ = 'aaab'; 1222 $count = 0; 1223 1 while /.(*PRUNE)(?{$count++})(*FAIL)/g; 1224 iseq $count, 4, "/.(*PRUNE)/"; 1225 $count = 0; 1226 'aaab' =~ /a+b?(??{'(*PRUNE)'})(?{$count++})(*FAIL)/; 1227 iseq $count, 3, "Expect 3 with (*PRUNE)"; 1228 local $_ = 'aaab'; 1229 $count = 0; 1230 1 while /.(??{'(*PRUNE)'})(?{$count++})(*FAIL)/g; 1231 iseq $count, 4, "/.(*PRUNE)/"; 1232 } 1233 1234 1235 { # Test the (*SKIP) pattern 1236 our $count = 0; 1237 'aaab' =~ /a+b?(*SKIP)(?{$count++})(*FAIL)/; 1238 iseq $count, 1, "Expect 1 with (*SKIP)"; 1239 local $_ = 'aaab'; 1240 $count = 0; 1241 1 while /.(*SKIP)(?{$count++})(*FAIL)/g; 1242 iseq $count, 4, "/.(*SKIP)/"; 1243 $_ = 'aaabaaab'; 1244 $count = 0; 1245 our @res = (); 1246 1 while /(a+b?)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; 1247 iseq $count, 2, "Expect 2 with (*SKIP)"; 1248 iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; 1249 } 1250 1251 1252 { # Test the (*SKIP) pattern 1253 our $count = 0; 1254 'aaab' =~ /a+b?(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/; 1255 iseq $count, 1, "Expect 1 with (*SKIP)"; 1256 local $_ = 'aaab'; 1257 $count = 0; 1258 1 while /.(*MARK:foo)(*SKIP)(?{$count++})(*FAIL)/g; 1259 iseq $count, 4, "/.(*SKIP)/"; 1260 $_ = 'aaabaaab'; 1261 $count = 0; 1262 our @res = (); 1263 1 while /(a+b?)(*MARK:foo)(*SKIP)(?{$count++; push @res,$1})(*FAIL)/g; 1264 iseq $count, 2, "Expect 2 with (*SKIP)"; 1265 iseq "@res", "aaab aaab", "Adjacent (*SKIP) works as expected"; 1266 } 1267 1268 1269 { # Test the (*SKIP) pattern 1270 our $count = 0; 1271 'aaab' =~ /a*(*MARK:a)b?(*MARK:b)(*SKIP:a)(?{$count++})(*FAIL)/; 1272 iseq $count, 3, "Expect 3 with *MARK:a)b?(*MARK:b)(*SKIP:a)"; 1273 local $_ = 'aaabaaab'; 1274 $count = 0; 1275 our @res = (); 1276 1 while 1277 /(a*(*MARK:a)b?)(*MARK:x)(*SKIP:a)(?{$count++; push @res,$1})(*FAIL)/g; 1278 iseq $count, 5, "Expect 5 with (*MARK:a)b?)(*MARK:x)(*SKIP:a)"; 1279 iseq "@res", "aaab b aaab b ", 1280 "Adjacent (*MARK:a)b?)(*MARK:x)(*SKIP:a) works as expected"; 1281 } 1282 1283 1284 { # Test the (*COMMIT) pattern 1285 our $count = 0; 1286 'aaabaaab' =~ /a+b?(*COMMIT)(?{$count++})(*FAIL)/; 1287 iseq $count, 1, "Expect 1 with (*COMMIT)"; 1288 local $_ = 'aaab'; 1289 $count = 0; 1290 1 while /.(*COMMIT)(?{$count++})(*FAIL)/g; 1291 iseq $count, 1, "/.(*COMMIT)/"; 1292 $_ = 'aaabaaab'; 1293 $count = 0; 1294 our @res = (); 1295 1 while /(a+b?)(*COMMIT)(?{$count++; push @res,$1})(*FAIL)/g; 1296 iseq $count, 1, "Expect 1 with (*COMMIT)"; 1297 iseq "@res", "aaab", "Adjacent (*COMMIT) works as expected"; 1298 } 1299 1300 1301 { 1302 # Test named commits and the $REGERROR var 1303 our $REGERROR; 1304 for my $name ('', ':foo') { 1305 for my $pat ("(*PRUNE$name)", 1306 ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", 1307 "(*COMMIT$name)") { 1308 for my $suffix ('(*FAIL)', '') { 1309 'aaaab' =~ /a+b$pat$suffix/; 1310 iseq $REGERROR, 1311 ($suffix ? ($name ? 'foo' : "1") : ""), 1312 "Test $pat and \$REGERROR $suffix"; 1313 } 1314 } 1315 } 1316 } 1317 1318 1319 { 1320 # Test named commits and the $REGERROR var 1321 package Fnorble; 1322 our $REGERROR; 1323 for my $name ('', ':foo') { 1324 for my $pat ("(*PRUNE$name)", 1325 ($name ? "(*MARK$name)" : "") . "(*SKIP$name)", 1326 "(*COMMIT$name)") { 1327 for my $suffix ('(*FAIL)','') { 1328 'aaaab' =~ /a+b$pat$suffix/; 1329 ::iseq $REGERROR, 1330 ($suffix ? ($name ? 'foo' : "1") : ""), 1331 "Test $pat and \$REGERROR $suffix"; 1332 } 1333 } 1334 } 1335 } 1336 1337 1338 { 1339 # Test named commits and the $REGERROR var 1340 local $Message = '$REGERROR'; 1341 our $REGERROR; 1342 for my $word (qw (bar baz bop)) { 1343 $REGERROR = ""; 1344 "aaaaa$word" =~ 1345 /a+(?:bar(*COMMIT:bar)|baz(*COMMIT:baz)|bop(*COMMIT:bop))(*FAIL)/; 1346 iseq $REGERROR, $word; 1347 } 1348 } 1349 1350 { 1351 #Mindnumbingly simple test of (*THEN) 1352 for ("ABC","BAX") { 1353 ok /A (*THEN) X | B (*THEN) C/x, "Simple (*THEN) test"; 1354 } 1355 } 1356 1357 1358 { 1359 local $Message = "Relative Recursion"; 1360 my $parens = qr/(\((?:[^()]++|(?-1))*+\))/; 1361 local $_ = 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; 1362 my ($all, $one, $two) = ('', '', ''); 1363 ok m/foo $parens \s* \+ \s* bar $parens/x; 1364 iseq $1, '((2*3)+4-3)'; 1365 iseq $2, '(2*(3+4)-1*(2-3))'; 1366 iseq $&, 'foo((2*3)+4-3) + bar(2*(3+4)-1*(2-3))'; 1367 iseq $&, $_; 1368 } 1369 1370 { 1371 my $spaces=" "; 1372 local $_ = join 'bar', $spaces, $spaces; 1373 our $count = 0; 1374 s/(?>\s+bar)(?{$count++})//g; 1375 iseq $_, $spaces, "SUSPEND final string"; 1376 iseq $count, 1, "Optimiser should have prevented more than one match"; 1377 } 1378 1379 1380 { 1381 # From Message-ID: <877ixs6oa6.fsf@k75.linux.bogus> 1382 my $dow_name = "nada"; 1383 my $parser = "(\$dow_name) = \$time_string =~ /(D\x{e9}\\ " . 1384 "C\x{e9}adaoin|D\x{e9}\\ Sathairn|\\w+|\x{100})/"; 1385 my $time_string = "D\x{e9} C\x{e9}adaoin"; 1386 eval $parser; 1387 ok !$@, "Test Eval worked"; 1388 iseq $dow_name, $time_string, "UTF-8 trie common prefix extraction"; 1389 } 1390 1391 1392 { 1393 my $v; 1394 ($v = 'bar') =~ /(\w+)/g; 1395 $v = 'foo'; 1396 iseq "$1", 'bar', '$1 is safe after /g - may fail due ' . 1397 'to specialized config in pp_hot.c' 1398 } 1399 1400 1401 { 1402 local $Message = "http://nntp.perl.org/group/perl.perl5.porters/118663"; 1403 my $qr_barR1 = qr/(bar)\g-1/; 1404 ok "foobarbarxyz" =~ $qr_barR1; 1405 ok "foobarbarxyz" =~ qr/foo${qr_barR1}xyz/; 1406 ok "foobarbarxyz" =~ qr/(foo)${qr_barR1}xyz/; 1407 ok "foobarbarxyz" =~ qr/(foo)(bar)\g{-1}xyz/; 1408 ok "foobarbarxyz" =~ qr/(foo${qr_barR1})xyz/; 1409 ok "foobarbarxyz" =~ qr/(foo(bar)\g{-1})xyz/; 1410 } 1411 1412 { 1413 local $Message = '$REGMARK'; 1414 our @r = (); 1415 our ($REGMARK, $REGERROR); 1416 ok 'foofoo' =~ /foo (*MARK:foo) (?{push @r,$REGMARK}) /x; 1417 iseq "@r","foo"; 1418 iseq $REGMARK, "foo"; 1419 ok 'foofoo' !~ /foo (*MARK:foo) (*FAIL) /x; 1420 ok !$REGMARK; 1421 iseq $REGERROR, 'foo'; 1422 } 1423 1424 1425 { 1426 local $Message = '\K test'; 1427 my $x; 1428 $x = "abc.def.ghi.jkl"; 1429 $x =~ s/.*\K\..*//; 1430 iseq $x, "abc.def.ghi"; 1431 1432 $x = "one two three four"; 1433 $x =~ s/o+ \Kthree//g; 1434 iseq $x, "one two four"; 1435 1436 $x = "abcde"; 1437 $x =~ s/(.)\K/$1/g; 1438 iseq $x, "aabbccddee"; 1439 } 1440 1441 1442 { 1443 sub kt { 1444 return '4' if $_[0] eq '09028623'; 1445 } 1446 # Nested EVAL using PL_curpm (via $1 or friends) 1447 my $re; 1448 our $grabit = qr/ ([0-6][0-9]{7}) (??{ kt $1 }) [890] /x; 1449 $re = qr/^ ( (??{ $grabit }) ) $ /x; 1450 my @res = '0902862349' =~ $re; 1451 iseq join ("-", @res), "0902862349", 1452 'PL_curpm is set properly on nested eval'; 1453 1454 our $qr = qr/ (o) (??{ $1 }) /x; 1455 ok 'boob'=~/( b (??{ $qr }) b )/x && 1, "PL_curpm, nested eval"; 1456 } 1457 1458 1459 { 1460 use charnames ":full"; 1461 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "I =~ Alphabetic"; 1462 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{Uppercase}/, "I =~ Uppercase"; 1463 ok "\N{ROMAN NUMERAL ONE}" !~ /\p{Lowercase}/, "I !~ Lowercase"; 1464 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "I =~ ID_Start"; 1465 ok "\N{ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "I =~ ID_Continue"; 1466 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Alphabetic}/, "i =~ Alphabetic"; 1467 ok "\N{SMALL ROMAN NUMERAL ONE}" !~ /\p{Uppercase}/, "i !~ Uppercase"; 1468 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{Lowercase}/, "i =~ Lowercase"; 1469 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDStart}/, "i =~ ID_Start"; 1470 ok "\N{SMALL ROMAN NUMERAL ONE}" =~ /\p{IDContinue}/, "i =~ ID_Continue" 1471 } 1472 1473 1474 { 1475 # requirement of Unicode Technical Standard #18, 1.7 Code Points 1476 # cf. http://www.unicode.org/reports/tr18/#Supplementary_Characters 1477 for my $u (0x7FF, 0x800, 0xFFFF, 0x10000) { 1478 no warnings 'utf8'; # oops 1479 my $c = chr $u; 1480 my $x = sprintf '%04X', $u; 1481 ok "A${c}B" =~ /A[\0-\x{10000}]B/, "Unicode range - $x"; 1482 } 1483 } 1484 1485 1486 { 1487 my $res=""; 1488 1489 if ('1' =~ /(?|(?<digit>1)|(?<digit>2))/) { 1490 $res = "@{$- {digit}}"; 1491 } 1492 iseq $res, "1", 1493 "Check that (?|...) doesnt cause dupe entries in the names array"; 1494 1495 $res = ""; 1496 if ('11' =~ /(?|(?<digit>1)|(?<digit>2))(?&digit)/) { 1497 $res = "@{$- {digit}}"; 1498 } 1499 iseq $res, "1", "Check that (?&..) to a buffer inside " . 1500 "a (?|...) goes to the leftmost"; 1501 } 1502 1503 1504 { 1505 use warnings; 1506 local $Message = "ASCII pattern that really is UTF-8"; 1507 my @w; 1508 local $SIG {__WARN__} = sub {push @w, "@_"}; 1509 my $c = qq (\x{DF}); 1510 ok $c =~ /${c}|\x{100}/; 1511 ok @w == 0; 1512 } 1513 1514 1515 { 1516 local $Message = "Corruption of match results of qr// across scopes"; 1517 my $qr = qr/(fo+)(ba+r)/; 1518 'foobar' =~ /$qr/; 1519 iseq "$1$2", "foobar"; 1520 { 1521 'foooooobaaaaar' =~ /$qr/; 1522 iseq "$1$2", 'foooooobaaaaar'; 1523 } 1524 iseq "$1$2", "foobar"; 1525 } 1526 1527 1528 { 1529 local $Message = "HORIZWS"; 1530 local $_ = "\t \r\n \n \t".chr(11)."\n"; 1531 s/\H/H/g; 1532 s/\h/h/g; 1533 iseq $_, "hhHHhHhhHH"; 1534 $_ = "\t \r\n \n \t" . chr (11) . "\n"; 1535 utf8::upgrade ($_); 1536 s/\H/H/g; 1537 s/\h/h/g; 1538 iseq $_, "hhHHhHhhHH"; 1539 } 1540 1541 1542 { 1543 local $Message = "Various whitespace special patterns"; 1544 my @h = map {chr $_} 0x09, 0x20, 0xa0, 0x1680, 0x180e, 0x2000, 1545 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 1546 0x2007, 0x2008, 0x2009, 0x200a, 0x202f, 0x205f, 1547 0x3000; 1548 my @v = map {chr $_} 0x0a, 0x0b, 0x0c, 0x0d, 0x85, 0x2028, 1549 0x2029; 1550 my @lb = ("\x0D\x0A", map {chr $_} 0x0A .. 0x0D, 0x85, 0x2028, 0x2029); 1551 foreach my $t ([\@h, qr/\h/, qr/\h+/], 1552 [\@v, qr/\v/, qr/\v+/], 1553 [\@lb, qr/\R/, qr/\R+/],) { 1554 my $ary = shift @$t; 1555 foreach my $pat (@$t) { 1556 foreach my $str (@$ary) { 1557 ok $str =~ /($pat)/, $pat; 1558 iseq $1, $str, $pat; 1559 utf8::upgrade ($str); 1560 ok $str =~ /($pat)/, "Upgraded string - $pat"; 1561 iseq $1, $str, "Upgraded string - $pat"; 1562 } 1563 } 1564 } 1565 } 1566 1567 1568 { 1569 local $Message = "Check that \\xDF match properly in its various forms"; 1570 # Test that \xDF matches properly. this is pretty hacky stuff, 1571 # but its actually needed. The malarky with '-' is to prevent 1572 # compilation caching from playing any role in the test. 1573 my @df = (chr (0xDF), '-', chr (0xDF)); 1574 utf8::upgrade ($df [2]); 1575 my @strs = ('ss', 'sS', 'Ss', 'SS', chr (0xDF)); 1576 my @ss = map {("$_", "$_")} @strs; 1577 utf8::upgrade ($ss [$_ * 2 + 1]) for 0 .. $#strs; 1578 1579 for my $ssi (0 .. $#ss) { 1580 for my $dfi (0 .. $#df) { 1581 my $pat = $df [$dfi]; 1582 my $str = $ss [$ssi]; 1583 my $utf_df = ($dfi > 1) ? 'utf8' : ''; 1584 my $utf_ss = ($ssi % 2) ? 'utf8' : ''; 1585 (my $sstr = $str) =~ s/\xDF/\\xDF/; 1586 1587 if ($utf_df || $utf_ss || length ($ss [$ssi]) == 1) { 1588 my $ret = $str =~ /$pat/i; 1589 next if $pat eq '-'; 1590 ok $ret, "\"$sstr\" =~ /\\xDF/i " . 1591 "(str is @{[$utf_ss||'latin']}, pat is " . 1592 "@{[$utf_df||'latin']})"; 1593 } 1594 else { 1595 my $ret = $str !~ /$pat/i; 1596 next if $pat eq '-'; 1597 ok $ret, "\"$sstr\" !~ /\\xDF/i " . 1598 "(str is @{[$utf_ss||'latin']}, pat is " . 1599 "@{[$utf_df||'latin']})"; 1600 } 1601 } 1602 } 1603 } 1604 1605 1606 { 1607 local $Message = "BBC(Bleadperl Breaks CPAN) Today: String::Multibyte"; 1608 my $re = qr/(?:[\x00-\xFF]{4})/; 1609 my $hyp = "\0\0\0-"; 1610 my $esc = "\0\0\0\\"; 1611 1612 my $str = "$esc$hyp$hyp$esc$esc"; 1613 my @a = ($str =~ /\G(?:\Q$esc$esc\E|\Q$esc$hyp\E|$re)/g); 1614 1615 iseq @a,3; 1616 local $" = "="; 1617 iseq "@a","$esc$hyp=$hyp=$esc$esc"; 1618 } 1619 1620 1621 { 1622 # Test for keys in %+ and %- 1623 local $Message = 'Test keys in %+ and %-'; 1624 no warnings 'uninitialized'; 1625 my $_ = "abcdef"; 1626 /(?<foo>a)|(?<foo>b)/; 1627 iseq ((join ",", sort keys %+), "foo"); 1628 iseq ((join ",", sort keys %-), "foo"); 1629 iseq ((join ",", sort values %+), "a"); 1630 iseq ((join ",", sort map "@$_", values %-), "a "); 1631 /(?<bar>a)(?<bar>b)(?<quux>.)/; 1632 iseq ((join ",", sort keys %+), "bar,quux"); 1633 iseq ((join ",", sort keys %-), "bar,quux"); 1634 iseq ((join ",", sort values %+), "a,c"); # leftmost 1635 iseq ((join ",", sort map "@$_", values %-), "a b,c"); 1636 /(?<un>a)(?<deux>c)?/; # second buffer won't capture 1637 iseq ((join ",", sort keys %+), "un"); 1638 iseq ((join ",", sort keys %-), "deux,un"); 1639 iseq ((join ",", sort values %+), "a"); 1640 iseq ((join ",", sort map "@$_", values %-), ",a"); 1641 } 1642 1643 1644 { 1645 # length() on captures, the numbered ones end up in Perl_magic_len 1646 my $_ = "aoeu \xe6var ook"; 1647 /^ \w+ \s (?<eek>\S+)/x; 1648 1649 iseq length ($`), 0, q[length $`]; 1650 iseq length ($'), 4, q[length $']; 1651 iseq length ($&), 9, q[length $&]; 1652 iseq length ($1), 4, q[length $1]; 1653 iseq length ($+{eek}), 4, q[length $+{eek} == length $1]; 1654 } 1655 1656 1657 { 1658 my $ok = -1; 1659 1660 $ok = exists ($-{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; 1661 iseq $ok, 1, '$-{x} exists after "bar"=~/(?<x>foo)|bar/'; 1662 iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; 1663 iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; 1664 1665 $ok = -1; 1666 $ok = exists ($+{x}) ? 1 : 0 if 'bar' =~ /(?<x>foo)|bar/; 1667 iseq $ok, 0, '$+{x} not exists after "bar"=~/(?<x>foo)|bar/'; 1668 iseq scalar (%+), 0, 'scalar %+ == 0 after "bar"=~/(?<x>foo)|bar/'; 1669 iseq scalar (%-), 1, 'scalar %- == 1 after "bar"=~/(?<x>foo)|bar/'; 1670 1671 $ok = -1; 1672 $ok = exists ($-{x}) ? 1 : 0 if 'foo' =~ /(?<x>foo)|bar/; 1673 iseq $ok, 1, '$-{x} exists after "foo"=~/(?<x>foo)|bar/'; 1674 iseq scalar (%+), 1, 'scalar %+ == 1 after "foo"=~/(?<x>foo)|bar/'; 1675 iseq scalar (%-), 1, 'scalar %- == 1 after "foo"=~/(?<x>foo)|bar/'; 1676 1677 $ok = -1; 1678 $ok = exists ($+{x}) ? 1 : 0 if 'foo'=~/(?<x>foo)|bar/; 1679 iseq $ok, 1, '$+{x} exists after "foo"=~/(?<x>foo)|bar/'; 1680 } 1681 1682 1683 { 1684 local $_; 1685 ($_ = 'abc') =~ /(abc)/g; 1686 $_ = '123'; 1687 iseq "$1", 'abc', "/g leads to unsafe match vars: $1"; 1688 } 1689 1690 1691 { 1692 local $Message = 'Message-ID: <20070818091501.7eff4831@r2d2>'; 1693 my $str = ""; 1694 for (0 .. 5) { 1695 my @x; 1696 $str .= "@x"; # this should ALWAYS be the empty string 1697 'a' =~ /(a|)/; 1698 push @x, 1; 1699 } 1700 iseq length ($str), 0, "Trie scope error, string should be empty"; 1701 $str = ""; 1702 my @foo = ('a') x 5; 1703 for (@foo) { 1704 my @bar; 1705 $str .= "@bar"; 1706 s/a|/push @bar, 1/e; 1707 } 1708 iseq length ($str), 0, "Trie scope error, string should be empty"; 1709 } 1710 1711 1712 { 1713# more TRIE/AHOCORASICK problems with mixed utf8 / latin-1 and case folding 1714 for my $chr (160 .. 255) { 1715 my $chr_byte = chr($chr); 1716 my $chr_utf8 = chr($chr); utf8::upgrade($chr_utf8); 1717 my $rx = qr{$chr_byte|X}i; 1718 ok($chr_utf8 =~ $rx, "utf8/latin, codepoint $chr"); 1719 } 1720 } 1721 1722 { 1723 our $a = 3; "" =~ /(??{ $a })/; 1724 our $b = $a; 1725 iseq $b, $a, "Copy of scalar used for postponed subexpression"; 1726 } 1727 1728 1729 { 1730 our @ctl_n = (); 1731 our @plus = (); 1732 our $nested_tags; 1733 $nested_tags = qr{ 1734 < 1735 (\w+) 1736 (?{ 1737 push @ctl_n,$^N; 1738 push @plus,$+; 1739 }) 1740 > 1741 (??{$nested_tags})* 1742 </\s* \w+ \s*> 1743 }x; 1744 1745 my $match = '<bla><blubb></blubb></bla>' =~ m/^$nested_tags$/; 1746 ok $match, 'nested construct matches'; 1747 iseq "@ctl_n", "bla blubb", '$^N inside of (?{}) works as expected'; 1748 iseq "@plus", "bla blubb", '$+ inside of (?{}) works as expected'; 1749 } 1750 1751 1752 SKIP: { 1753 # XXX: This set of tests is essentially broken, POSIX character classes 1754 # should not have differing definitions under Unicode. 1755 # There are property names for that. 1756 skip "Tests assume ASCII", 4 unless $IS_ASCII; 1757 1758 my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/} 1759 map {chr} 0x20 .. 0x7f; 1760 iseq join ('', @notIsPunct), '$+<=>^`|~', 1761 '[:punct:] disagress with IsPunct on Symbols'; 1762 1763 my @isPrint = grep {not /[[:print:]]/ and /\p{IsPrint}/} 1764 map {chr} 0 .. 0x1f, 0x7f .. 0x9f; 1765 iseq join ('', @isPrint), "", 1766 'IsPrint agrees with [:print:] on control characters'; 1767 1768 my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/} 1769 map {chr} 0x80 .. 0xff; 1770 iseq join ('', @isPunct), "\xa1\xab\xb7\xbb\xbf", # ¡ « · » ¿ 1771 'IsPunct disagrees with [:punct:] outside ASCII'; 1772 1773 my @isPunctLatin1 = eval q { 1774 use encoding 'latin1'; 1775 grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80 .. 0xff; 1776 }; 1777 skip "Eval failed ($@)", 1 if $@; 1778 skip "PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS set to 0", 1 1779 if !$ENV{PERL_TEST_LEGACY_POSIX_CC}; 1780 iseq join ('', @isPunctLatin1), '', 1781 'IsPunct agrees with [:punct:] with explicit Latin1'; 1782 } 1783 1784 # 1785 # Keep the following tests last -- they may crash perl 1786 # 1787 print "# Tests that follow may crash perl\n"; 1788 { 1789 eval '/\k/'; 1790 ok $@ =~ /\QSequence \k... not terminated in regex;\E/, 1791 'Lone \k not allowed'; 1792 } 1793 1794 { 1795 local $Message = "Substitution with lookahead (possible segv)"; 1796 $_ = "ns1ns1ns1"; 1797 s/ns(?=\d)/ns_/g; 1798 iseq $_, "ns_1ns_1ns_1"; 1799 $_ = "ns1"; 1800 s/ns(?=\d)/ns_/; 1801 iseq $_, "ns_1"; 1802 $_ = "123"; 1803 s/(?=\d+)|(?<=\d)/!Bang!/g; 1804 iseq $_, "!Bang!1!Bang!2!Bang!3!Bang!"; 1805 } 1806 1807 { 1808 # Earlier versions of Perl said this was fatal. 1809 local $Message = "U+0FFFF shouldn't crash the regex engine"; 1810 no warnings 'utf8'; 1811 my $a = eval "chr(65535)"; 1812 use warnings; 1813 my $warning_message; 1814 local $SIG{__WARN__} = sub { $warning_message = $_[0] }; 1815 eval $a =~ /[a-z]/; 1816 ok(1); # If it didn't crash, it worked. 1817 } 1818} # End of sub run_tests 1819 18201; 1821