1package Text::Balanced; 2 3# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 4# FOR FULL DOCUMENTATION SEE Balanced.pod 5 6use 5.005; 7use strict; 8use Exporter (); 9use SelfLoader; 10 11use vars qw { $VERSION @ISA %EXPORT_TAGS }; 12BEGIN { 13 $VERSION = '2.03'; 14 @ISA = 'Exporter'; 15 %EXPORT_TAGS = ( 16 ALL => [ qw{ 17 &extract_delimited 18 &extract_bracketed 19 &extract_quotelike 20 &extract_codeblock 21 &extract_variable 22 &extract_tagged 23 &extract_multiple 24 &gen_delimited_pat 25 &gen_extract_tagged 26 &delimited_pat 27 } ], 28 ); 29} 30 31Exporter::export_ok_tags('ALL'); 32 33# PROTOTYPES 34 35sub _match_bracketed($$$$$$); 36sub _match_variable($$); 37sub _match_codeblock($$$$$$$); 38sub _match_quotelike($$$$); 39 40# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 41 42sub _failmsg { 43 my ($message, $pos) = @_; 44 $@ = bless { 45 error => $message, 46 pos => $pos, 47 }, 'Text::Balanced::ErrorMsg'; 48} 49 50sub _fail { 51 my ($wantarray, $textref, $message, $pos) = @_; 52 _failmsg $message, $pos if $message; 53 return (undef, $$textref, undef) if $wantarray; 54 return undef; 55} 56 57sub _succeed { 58 $@ = undef; 59 my ($wantarray,$textref) = splice @_, 0, 2; 60 my ($extrapos, $extralen) = @_ > 18 61 ? splice(@_, -2, 2) 62 : (0, 0); 63 my ($startlen, $oppos) = @_[5,6]; 64 my $remainderpos = $_[2]; 65 if ( $wantarray ) { 66 my @res; 67 while (my ($from, $len) = splice @_, 0, 2) { 68 push @res, substr($$textref, $from, $len); 69 } 70 if ( $extralen ) { # CORRECT FILLET 71 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); 72 $res[1] = "$extra$res[1]"; 73 eval { substr($$textref,$remainderpos,0) = $extra; 74 substr($$textref,$extrapos,$extralen,"\n")} ; 75 #REARRANGE HERE DOC AND FILLET IF POSSIBLE 76 pos($$textref) = $remainderpos-$extralen+1; # RESET \G 77 } else { 78 pos($$textref) = $remainderpos; # RESET \G 79 } 80 return @res; 81 } else { 82 my $match = substr($$textref,$_[0],$_[1]); 83 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 84 my $extra = $extralen 85 ? substr($$textref, $extrapos, $extralen)."\n" : ""; 86 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 87 pos($$textref) = $_[4]; # RESET \G 88 return $match; 89 } 90} 91 92# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 93 94sub gen_delimited_pat($;$) # ($delimiters;$escapes) 95{ 96 my ($dels, $escs) = @_; 97 return "" unless $dels =~ /\S/; 98 $escs = '\\' unless $escs; 99 $escs .= substr($escs,-1) x (length($dels)-length($escs)); 100 my @pat = (); 101 my $i; 102 for ($i=0; $i<length $dels; $i++) 103 { 104 my $del = quotemeta substr($dels,$i,1); 105 my $esc = quotemeta substr($escs,$i,1); 106 if ($del eq $esc) 107 { 108 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 109 } 110 else 111 { 112 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 113 } 114 } 115 my $pat = join '|', @pat; 116 return "(?:$pat)"; 117} 118 119*delimited_pat = \&gen_delimited_pat; 120 121# THE EXTRACTION FUNCTIONS 122 123sub extract_delimited (;$$$$) 124{ 125 my $textref = defined $_[0] ? \$_[0] : \$_; 126 my $wantarray = wantarray; 127 my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 128 my $pre = defined $_[2] ? $_[2] : '\s*'; 129 my $esc = defined $_[3] ? $_[3] : qq{\\}; 130 my $pat = gen_delimited_pat($del, $esc); 131 my $startpos = pos $$textref || 0; 132 return _fail($wantarray, $textref, "Not a delimited pattern", 0) 133 unless $$textref =~ m/\G($pre)($pat)/gc; 134 my $prelen = length($1); 135 my $matchpos = $startpos+$prelen; 136 my $endpos = pos $$textref; 137 return _succeed $wantarray, $textref, 138 $matchpos, $endpos-$matchpos, # MATCH 139 $endpos, length($$textref)-$endpos, # REMAINDER 140 $startpos, $prelen; # PREFIX 141} 142 143sub extract_bracketed (;$$$) 144{ 145 my $textref = defined $_[0] ? \$_[0] : \$_; 146 my $ldel = defined $_[1] ? $_[1] : '{([<'; 147 my $pre = defined $_[2] ? $_[2] : '\s*'; 148 my $wantarray = wantarray; 149 my $qdel = ""; 150 my $quotelike; 151 $ldel =~ s/'//g and $qdel .= q{'}; 152 $ldel =~ s/"//g and $qdel .= q{"}; 153 $ldel =~ s/`//g and $qdel .= q{`}; 154 $ldel =~ s/q//g and $quotelike = 1; 155 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 156 my $rdel = $ldel; 157 unless ($rdel =~ tr/[({</])}>/) 158 { 159 return _fail $wantarray, $textref, 160 "Did not find a suitable bracket in delimiter: \"$_[1]\"", 161 0; 162 } 163 my $posbug = pos; 164 $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 165 $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 166 pos = $posbug; 167 168 my $startpos = pos $$textref || 0; 169 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); 170 171 return _fail ($wantarray, $textref) unless @match; 172 173 return _succeed ( $wantarray, $textref, 174 $match[2], $match[5]+2, # MATCH 175 @match[8,9], # REMAINDER 176 @match[0,1], # PREFIX 177 ); 178} 179 180sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 181{ 182 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 183 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 184 unless ($$textref =~ m/\G$pre/gc) 185 { 186 _failmsg "Did not find prefix: /$pre/", $startpos; 187 return; 188 } 189 190 $ldelpos = pos $$textref; 191 192 unless ($$textref =~ m/\G($ldel)/gc) 193 { 194 _failmsg "Did not find opening bracket after prefix: \"$pre\"", 195 pos $$textref; 196 pos $$textref = $startpos; 197 return; 198 } 199 200 my @nesting = ( $1 ); 201 my $textlen = length $$textref; 202 while (pos $$textref < $textlen) 203 { 204 next if $$textref =~ m/\G\\./gcs; 205 206 if ($$textref =~ m/\G($ldel)/gc) 207 { 208 push @nesting, $1; 209 } 210 elsif ($$textref =~ m/\G($rdel)/gc) 211 { 212 my ($found, $brackettype) = ($1, $1); 213 if ($#nesting < 0) 214 { 215 _failmsg "Unmatched closing bracket: \"$found\"", 216 pos $$textref; 217 pos $$textref = $startpos; 218 return; 219 } 220 my $expected = pop(@nesting); 221 $expected =~ tr/({[</)}]>/; 222 if ($expected ne $brackettype) 223 { 224 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 225 pos $$textref; 226 pos $$textref = $startpos; 227 return; 228 } 229 last if $#nesting < 0; 230 } 231 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) 232 { 233 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 234 _failmsg "Unmatched embedded quote ($1)", 235 pos $$textref; 236 pos $$textref = $startpos; 237 return; 238 } 239 elsif ($quotelike && _match_quotelike($textref,"",1,0)) 240 { 241 next; 242 } 243 244 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 245 } 246 if ($#nesting>=0) 247 { 248 _failmsg "Unmatched opening bracket(s): " 249 . join("..",@nesting)."..", 250 pos $$textref; 251 pos $$textref = $startpos; 252 return; 253 } 254 255 $endpos = pos $$textref; 256 257 return ( 258 $startpos, $ldelpos-$startpos, # PREFIX 259 $ldelpos, 1, # OPENING BRACKET 260 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 261 $endpos-1, 1, # CLOSING BRACKET 262 $endpos, length($$textref)-$endpos, # REMAINDER 263 ); 264} 265 266sub _revbracket($) 267{ 268 my $brack = reverse $_[0]; 269 $brack =~ tr/[({</])}>/; 270 return $brack; 271} 272 273my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 274 275sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 276{ 277 my $textref = defined $_[0] ? \$_[0] : \$_; 278 my $ldel = $_[1]; 279 my $rdel = $_[2]; 280 my $pre = defined $_[3] ? $_[3] : '\s*'; 281 my %options = defined $_[4] ? %{$_[4]} : (); 282 my $omode = defined $options{fail} ? $options{fail} : ''; 283 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 284 : defined($options{reject}) ? $options{reject} 285 : '' 286 ; 287 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 288 : defined($options{ignore}) ? $options{ignore} 289 : '' 290 ; 291 292 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 293 $@ = undef; 294 295 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 296 297 return _fail(wantarray, $textref) unless @match; 298 return _succeed wantarray, $textref, 299 $match[2], $match[3]+$match[5]+$match[7], # MATCH 300 @match[8..9,0..1,2..7]; # REM, PRE, BITS 301} 302 303sub _match_tagged # ($$$$$$$) 304{ 305 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 306 my $rdelspec; 307 308 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 309 310 unless ($$textref =~ m/\G($pre)/gc) 311 { 312 _failmsg "Did not find prefix: /$pre/", pos $$textref; 313 goto failed; 314 } 315 316 $opentagpos = pos($$textref); 317 318 unless ($$textref =~ m/\G$ldel/gc) 319 { 320 _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 321 goto failed; 322 } 323 324 $textpos = pos($$textref); 325 326 if (!defined $rdel) 327 { 328 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); 329 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) 330 { 331 _failmsg "Unable to construct closing tag to match: $rdel", 332 pos $$textref; 333 goto failed; 334 } 335 } 336 else 337 { 338 $rdelspec = eval "qq{$rdel}" || do { 339 my $del; 340 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 341 { next if $rdel =~ /\Q$_/; $del = $_; last } 342 unless ($del) { 343 use Carp; 344 croak "Can't interpolate right delimiter $rdel" 345 } 346 eval "qq$del$rdel$del"; 347 }; 348 } 349 350 while (pos($$textref) < length($$textref)) 351 { 352 next if $$textref =~ m/\G\\./gc; 353 354 if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 355 { 356 $parapos = pos($$textref) - length($1) 357 unless defined $parapos; 358 } 359 elsif ($$textref =~ m/\G($rdelspec)/gc ) 360 { 361 $closetagpos = pos($$textref)-length($1); 362 goto matched; 363 } 364 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 365 { 366 next; 367 } 368 elsif ($bad && $$textref =~ m/\G($bad)/gcs) 369 { 370 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 371 goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 372 _failmsg "Found invalid nested tag: $1", pos $$textref; 373 goto failed; 374 } 375 elsif ($$textref =~ m/\G($ldel)/gc) 376 { 377 my $tag = $1; 378 pos($$textref) -= length($tag); # REWIND TO NESTED TAG 379 unless (_match_tagged(@_)) # MATCH NESTED TAG 380 { 381 goto short if $omode eq 'PARA' || $omode eq 'MAX'; 382 _failmsg "Found unbalanced nested tag: $tag", 383 pos $$textref; 384 goto failed; 385 } 386 } 387 else { $$textref =~ m/./gcs } 388 } 389 390short: 391 $closetagpos = pos($$textref); 392 goto matched if $omode eq 'MAX'; 393 goto failed unless $omode eq 'PARA'; 394 395 if (defined $parapos) { pos($$textref) = $parapos } 396 else { $parapos = pos($$textref) } 397 398 return ( 399 $startpos, $opentagpos-$startpos, # PREFIX 400 $opentagpos, $textpos-$opentagpos, # OPENING TAG 401 $textpos, $parapos-$textpos, # TEXT 402 $parapos, 0, # NO CLOSING TAG 403 $parapos, length($$textref)-$parapos, # REMAINDER 404 ); 405 406matched: 407 $endpos = pos($$textref); 408 return ( 409 $startpos, $opentagpos-$startpos, # PREFIX 410 $opentagpos, $textpos-$opentagpos, # OPENING TAG 411 $textpos, $closetagpos-$textpos, # TEXT 412 $closetagpos, $endpos-$closetagpos, # CLOSING TAG 413 $endpos, length($$textref)-$endpos, # REMAINDER 414 ); 415 416failed: 417 _failmsg "Did not find closing tag", pos $$textref unless $@; 418 pos($$textref) = $startpos; 419 return; 420} 421 422sub extract_variable (;$$) 423{ 424 my $textref = defined $_[0] ? \$_[0] : \$_; 425 return ("","","") unless defined $$textref; 426 my $pre = defined $_[1] ? $_[1] : '\s*'; 427 428 my @match = _match_variable($textref,$pre); 429 430 return _fail wantarray, $textref unless @match; 431 432 return _succeed wantarray, $textref, 433 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 434} 435 436sub _match_variable($$) 437{ 438# $# 439# $^ 440# $$ 441 my ($textref, $pre) = @_; 442 my $startpos = pos($$textref) = pos($$textref)||0; 443 unless ($$textref =~ m/\G($pre)/gc) 444 { 445 _failmsg "Did not find prefix: /$pre/", pos $$textref; 446 return; 447 } 448 my $varpos = pos($$textref); 449 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 450 { 451 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 452 { 453 _failmsg "Did not find leading dereferencer", pos $$textref; 454 pos $$textref = $startpos; 455 return; 456 } 457 my $deref = $1; 458 459 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 460 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) 461 or $deref eq '$#' or $deref eq '$$' ) 462 { 463 _failmsg "Bad identifier after dereferencer", pos $$textref; 464 pos $$textref = $startpos; 465 return; 466 } 467 } 468 469 while (1) 470 { 471 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 472 next if _match_codeblock($textref, 473 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 474 qr/[({[]/, qr/[)}\]]/, 475 qr/[({[]/, qr/[)}\]]/, 0); 476 next if _match_codeblock($textref, 477 qr/\s*/, qr/[{[]/, qr/[}\]]/, 478 qr/[{[]/, qr/[}\]]/, 0); 479 next if _match_variable($textref,'\s*->\s*'); 480 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 481 last; 482 } 483 484 my $endpos = pos($$textref); 485 return ($startpos, $varpos-$startpos, 486 $varpos, $endpos-$varpos, 487 $endpos, length($$textref)-$endpos 488 ); 489} 490 491sub extract_codeblock (;$$$$$) 492{ 493 my $textref = defined $_[0] ? \$_[0] : \$_; 494 my $wantarray = wantarray; 495 my $ldel_inner = defined $_[1] ? $_[1] : '{'; 496 my $pre = defined $_[2] ? $_[2] : '\s*'; 497 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 498 my $rd = $_[4]; 499 my $rdel_inner = $ldel_inner; 500 my $rdel_outer = $ldel_outer; 501 my $posbug = pos; 502 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 503 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 504 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 505 { 506 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 507 } 508 pos = $posbug; 509 510 my @match = _match_codeblock($textref, $pre, 511 $ldel_outer, $rdel_outer, 512 $ldel_inner, $rdel_inner, 513 $rd); 514 return _fail($wantarray, $textref) unless @match; 515 return _succeed($wantarray, $textref, 516 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 517 ); 518 519} 520 521sub _match_codeblock($$$$$$$) 522{ 523 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; 524 my $startpos = pos($$textref) = pos($$textref) || 0; 525 unless ($$textref =~ m/\G($pre)/gc) 526 { 527 _failmsg qq{Did not match prefix /$pre/ at"} . 528 substr($$textref,pos($$textref),20) . 529 q{..."}, 530 pos $$textref; 531 return; 532 } 533 my $codepos = pos($$textref); 534 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 535 { 536 _failmsg qq{Did not find expected opening bracket at "} . 537 substr($$textref,pos($$textref),20) . 538 q{..."}, 539 pos $$textref; 540 pos $$textref = $startpos; 541 return; 542 } 543 my $closing = $1; 544 $closing =~ tr/([<{/)]>}/; 545 my $matched; 546 my $patvalid = 1; 547 while (pos($$textref) < length($$textref)) 548 { 549 $matched = ''; 550 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 551 { 552 $patvalid = 0; 553 next; 554 } 555 556 if ($$textref =~ m/\G\s*#.*/gc) 557 { 558 next; 559 } 560 561 if ($$textref =~ m/\G\s*($rdel_outer)/gc) 562 { 563 unless ($matched = ($closing && $1 eq $closing) ) 564 { 565 next if $1 eq '>'; # MIGHT BE A "LESS THAN" 566 _failmsg q{Mismatched closing bracket at "} . 567 substr($$textref,pos($$textref),20) . 568 qq{...". Expected '$closing'}, 569 pos $$textref; 570 } 571 last; 572 } 573 574 if (_match_variable($textref,'\s*') || 575 _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) 576 { 577 $patvalid = 0; 578 next; 579 } 580 581 582 # NEED TO COVER MANY MORE CASES HERE!!! 583 if ($$textref =~ m#\G\s*(?!$ldel_inner) 584 ( [-+*x/%^&|.]=? 585 | [!=]~ 586 | =(?!>) 587 | (\*\*|&&|\|\||<<|>>)=? 588 | split|grep|map|return 589 | [([] 590 )#gcx) 591 { 592 $patvalid = 1; 593 next; 594 } 595 596 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) 597 { 598 $patvalid = 1; 599 next; 600 } 601 602 if ($$textref =~ m/\G\s*$ldel_outer/gc) 603 { 604 _failmsg q{Improperly nested codeblock at "} . 605 substr($$textref,pos($$textref),20) . 606 q{..."}, 607 pos $$textref; 608 last; 609 } 610 611 $patvalid = 0; 612 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 613 } 614 continue { $@ = undef } 615 616 unless ($matched) 617 { 618 _failmsg 'No match found for opening bracket', pos $$textref 619 unless $@; 620 return; 621 } 622 623 my $endpos = pos($$textref); 624 return ( $startpos, $codepos-$startpos, 625 $codepos, $endpos-$codepos, 626 $endpos, length($$textref)-$endpos, 627 ); 628} 629 630 631my %mods = ( 632 'none' => '[cgimsox]*', 633 'm' => '[cgimsox]*', 634 's' => '[cegimsox]*', 635 'tr' => '[cds]*', 636 'y' => '[cds]*', 637 'qq' => '', 638 'qx' => '', 639 'qw' => '', 640 'qr' => '[imsx]*', 641 'q' => '', 642 ); 643 644sub extract_quotelike (;$$) 645{ 646 my $textref = $_[0] ? \$_[0] : \$_; 647 my $wantarray = wantarray; 648 my $pre = defined $_[1] ? $_[1] : '\s*'; 649 650 my @match = _match_quotelike($textref,$pre,1,0); 651 return _fail($wantarray, $textref) unless @match; 652 return _succeed($wantarray, $textref, 653 $match[2], $match[18]-$match[2], # MATCH 654 @match[18,19], # REMAINDER 655 @match[0,1], # PREFIX 656 @match[2..17], # THE BITS 657 @match[20,21], # ANY FILLET? 658 ); 659}; 660 661sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) 662{ 663 my ($textref, $pre, $rawmatch, $qmark) = @_; 664 665 my ($textlen,$startpos, 666 $oppos, 667 $preld1pos,$ld1pos,$str1pos,$rd1pos, 668 $preld2pos,$ld2pos,$str2pos,$rd2pos, 669 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 670 671 unless ($$textref =~ m/\G($pre)/gc) 672 { 673 _failmsg qq{Did not find prefix /$pre/ at "} . 674 substr($$textref, pos($$textref), 20) . 675 q{..."}, 676 pos $$textref; 677 return; 678 } 679 $oppos = pos($$textref); 680 681 my $initial = substr($$textref,$oppos,1); 682 683 if ($initial && $initial =~ m|^[\"\'\`]| 684 || $rawmatch && $initial =~ m|^/| 685 || $qmark && $initial =~ m|^\?|) 686 { 687 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 688 { 689 _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 690 substr($$textref, $oppos, 20) . 691 q{..."}, 692 pos $$textref; 693 pos $$textref = $startpos; 694 return; 695 } 696 $modpos= pos($$textref); 697 $rd1pos = $modpos-1; 698 699 if ($initial eq '/' || $initial eq '?') 700 { 701 $$textref =~ m/\G$mods{none}/gc 702 } 703 704 my $endpos = pos($$textref); 705 return ( 706 $startpos, $oppos-$startpos, # PREFIX 707 $oppos, 0, # NO OPERATOR 708 $oppos, 1, # LEFT DEL 709 $oppos+1, $rd1pos-$oppos-1, # STR/PAT 710 $rd1pos, 1, # RIGHT DEL 711 $modpos, 0, # NO 2ND LDEL 712 $modpos, 0, # NO 2ND STR 713 $modpos, 0, # NO 2ND RDEL 714 $modpos, $endpos-$modpos, # MODIFIERS 715 $endpos, $textlen-$endpos, # REMAINDER 716 ); 717 } 718 719 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) 720 { 721 _failmsg q{No quotelike operator found after prefix at "} . 722 substr($$textref, pos($$textref), 20) . 723 q{..."}, 724 pos $$textref; 725 pos $$textref = $startpos; 726 return; 727 } 728 729 my $op = $1; 730 $preld1pos = pos($$textref); 731 if ($op eq '<<') { 732 $ld1pos = pos($$textref); 733 my $label; 734 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 735 $label = $1; 736 } 737 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 738 | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 739 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 740 }gcsx) { 741 $label = $+; 742 } 743 else { 744 $label = ""; 745 } 746 my $extrapos = pos($$textref); 747 $$textref =~ m{.*\n}gc; 748 $str1pos = pos($$textref)--; 749 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { 750 _failmsg qq{Missing here doc terminator ('$label') after "} . 751 substr($$textref, $startpos, 20) . 752 q{..."}, 753 pos $$textref; 754 pos $$textref = $startpos; 755 return; 756 } 757 $rd1pos = pos($$textref); 758 $$textref =~ m{\Q$label\E\n}gc; 759 $ld2pos = pos($$textref); 760 return ( 761 $startpos, $oppos-$startpos, # PREFIX 762 $oppos, length($op), # OPERATOR 763 $ld1pos, $extrapos-$ld1pos, # LEFT DEL 764 $str1pos, $rd1pos-$str1pos, # STR/PAT 765 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 766 $ld2pos, 0, # NO 2ND LDEL 767 $ld2pos, 0, # NO 2ND STR 768 $ld2pos, 0, # NO 2ND RDEL 769 $ld2pos, 0, # NO MODIFIERS 770 $ld2pos, $textlen-$ld2pos, # REMAINDER 771 $extrapos, $str1pos-$extrapos, # FILLETED BIT 772 ); 773 } 774 775 $$textref =~ m/\G\s*/gc; 776 $ld1pos = pos($$textref); 777 $str1pos = $ld1pos+1; 778 779 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 780 { 781 _failmsg "No block delimiter found after quotelike $op", 782 pos $$textref; 783 pos $$textref = $startpos; 784 return; 785 } 786 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 787 my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 788 if ($ldel1 =~ /[[(<{]/) 789 { 790 $rdel1 =~ tr/[({</])}>/; 791 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1)) 792 || do { pos $$textref = $startpos; return }; 793 $ld2pos = pos($$textref); 794 $rd1pos = $ld2pos-1; 795 } 796 else 797 { 798 $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 799 || do { pos $$textref = $startpos; return }; 800 $ld2pos = $rd1pos = pos($$textref)-1; 801 } 802 803 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 804 if ($second_arg) 805 { 806 my ($ldel2, $rdel2); 807 if ($ldel1 =~ /[[(<{]/) 808 { 809 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 810 { 811 _failmsg "Missing second block for quotelike $op", 812 pos $$textref; 813 pos $$textref = $startpos; 814 return; 815 } 816 $ldel2 = $rdel2 = "\Q$1"; 817 $rdel2 =~ tr/[({</])}>/; 818 } 819 else 820 { 821 $ldel2 = $rdel2 = $ldel1; 822 } 823 $str2pos = $ld2pos+1; 824 825 if ($ldel2 =~ /[[(<{]/) 826 { 827 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 828 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2)) 829 || do { pos $$textref = $startpos; return }; 830 } 831 else 832 { 833 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 834 || do { pos $$textref = $startpos; return }; 835 } 836 $rd2pos = pos($$textref)-1; 837 } 838 else 839 { 840 $ld2pos = $str2pos = $rd2pos = $rd1pos; 841 } 842 843 $modpos = pos $$textref; 844 845 $$textref =~ m/\G($mods{$op})/gc; 846 my $endpos = pos $$textref; 847 848 return ( 849 $startpos, $oppos-$startpos, # PREFIX 850 $oppos, length($op), # OPERATOR 851 $ld1pos, 1, # LEFT DEL 852 $str1pos, $rd1pos-$str1pos, # STR/PAT 853 $rd1pos, 1, # RIGHT DEL 854 $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 855 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 856 $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 857 $modpos, $endpos-$modpos, # MODIFIERS 858 $endpos, $textlen-$endpos, # REMAINDER 859 ); 860} 861 862my $def_func = [ 863 sub { extract_variable($_[0], '') }, 864 sub { extract_quotelike($_[0],'') }, 865 sub { extract_codeblock($_[0],'{}','') }, 866]; 867 868sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 869{ 870 my $textref = defined($_[0]) ? \$_[0] : \$_; 871 my $posbug = pos; 872 my ($lastpos, $firstpos); 873 my @fields = (); 874 875 #for ($$textref) 876 { 877 my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 878 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 879 my $igunk = $_[3]; 880 881 pos $$textref ||= 0; 882 883 unless (wantarray) 884 { 885 use Carp; 886 carp "extract_multiple reset maximal count to 1 in scalar context" 887 if $^W && defined($_[2]) && $max > 1; 888 $max = 1 889 } 890 891 my $unkpos; 892 my $func; 893 my $class; 894 895 my @class; 896 foreach $func ( @func ) 897 { 898 if (ref($func) eq 'HASH') 899 { 900 push @class, (keys %$func)[0]; 901 $func = (values %$func)[0]; 902 } 903 else 904 { 905 push @class, undef; 906 } 907 } 908 909 FIELD: while (pos($$textref) < length($$textref)) 910 { 911 my ($field, $rem); 912 my @bits; 913 foreach my $i ( 0..$#func ) 914 { 915 my $pref; 916 $func = $func[$i]; 917 $class = $class[$i]; 918 $lastpos = pos $$textref; 919 if (ref($func) eq 'CODE') 920 { ($field,$rem,$pref) = @bits = $func->($$textref) } 921 elsif (ref($func) eq 'Text::Balanced::Extractor') 922 { @bits = $field = $func->extract($$textref) } 923 elsif( $$textref =~ m/\G$func/gc ) 924 { @bits = $field = defined($1) 925 ? $1 926 : substr($$textref, $-[0], $+[0] - $-[0]) 927 } 928 $pref ||= ""; 929 if (defined($field) && length($field)) 930 { 931 if (!$igunk) { 932 $unkpos = $lastpos 933 if length($pref) && !defined($unkpos); 934 if (defined $unkpos) 935 { 936 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 937 $firstpos = $unkpos unless defined $firstpos; 938 undef $unkpos; 939 last FIELD if @fields == $max; 940 } 941 } 942 push @fields, $class 943 ? bless (\$field, $class) 944 : $field; 945 $firstpos = $lastpos unless defined $firstpos; 946 $lastpos = pos $$textref; 947 last FIELD if @fields == $max; 948 next FIELD; 949 } 950 } 951 if ($$textref =~ /\G(.)/gcs) 952 { 953 $unkpos = pos($$textref)-1 954 unless $igunk || defined $unkpos; 955 } 956 } 957 958 if (defined $unkpos) 959 { 960 push @fields, substr($$textref, $unkpos); 961 $firstpos = $unkpos unless defined $firstpos; 962 $lastpos = length $$textref; 963 } 964 last; 965 } 966 967 pos $$textref = $lastpos; 968 return @fields if wantarray; 969 970 $firstpos ||= 0; 971 eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 972 pos $$textref = $firstpos }; 973 return $fields[0]; 974} 975 976sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 977{ 978 my $ldel = $_[0]; 979 my $rdel = $_[1]; 980 my $pre = defined $_[2] ? $_[2] : '\s*'; 981 my %options = defined $_[3] ? %{$_[3]} : (); 982 my $omode = defined $options{fail} ? $options{fail} : ''; 983 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 984 : defined($options{reject}) ? $options{reject} 985 : '' 986 ; 987 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 988 : defined($options{ignore}) ? $options{ignore} 989 : '' 990 ; 991 992 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 993 994 my $posbug = pos; 995 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } 996 pos = $posbug; 997 998 my $closure = sub 999 { 1000 my $textref = defined $_[0] ? \$_[0] : \$_; 1001 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1002 1003 return _fail(wantarray, $textref) unless @match; 1004 return _succeed wantarray, $textref, 1005 $match[2], $match[3]+$match[5]+$match[7], # MATCH 1006 @match[8..9,0..1,2..7]; # REM, PRE, BITS 1007 }; 1008 1009 bless $closure, 'Text::Balanced::Extractor'; 1010} 1011 1012package Text::Balanced::Extractor; 1013 1014sub extract($$) # ($self, $text) 1015{ 1016 &{$_[0]}($_[1]); 1017} 1018 1019package Text::Balanced::ErrorMsg; 1020 1021use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; 1022 10231; 1024 1025__END__ 1026 1027=pod 1028 1029=head1 NAME 1030 1031Text::Balanced - Extract delimited text sequences from strings. 1032 1033=head1 SYNOPSIS 1034 1035 use Text::Balanced qw ( 1036 extract_delimited 1037 extract_bracketed 1038 extract_quotelike 1039 extract_codeblock 1040 extract_variable 1041 extract_tagged 1042 extract_multiple 1043 gen_delimited_pat 1044 gen_extract_tagged 1045 ); 1046 1047 # Extract the initial substring of $text that is delimited by 1048 # two (unescaped) instances of the first character in $delim. 1049 1050 ($extracted, $remainder) = extract_delimited($text,$delim); 1051 1052 1053 # Extract the initial substring of $text that is bracketed 1054 # with a delimiter(s) specified by $delim (where the string 1055 # in $delim contains one or more of '(){}[]<>'). 1056 1057 ($extracted, $remainder) = extract_bracketed($text,$delim); 1058 1059 1060 # Extract the initial substring of $text that is bounded by 1061 # an XML tag. 1062 1063 ($extracted, $remainder) = extract_tagged($text); 1064 1065 1066 # Extract the initial substring of $text that is bounded by 1067 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1068 1069 ($extracted, $remainder) = 1070 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1071 1072 1073 # Extract the initial substring of $text that represents a 1074 # Perl "quote or quote-like operation" 1075 1076 ($extracted, $remainder) = extract_quotelike($text); 1077 1078 1079 # Extract the initial substring of $text that represents a block 1080 # of Perl code, bracketed by any of character(s) specified by $delim 1081 # (where the string $delim contains one or more of '(){}[]<>'). 1082 1083 ($extracted, $remainder) = extract_codeblock($text,$delim); 1084 1085 1086 # Extract the initial substrings of $text that would be extracted by 1087 # one or more sequential applications of the specified functions 1088 # or regular expressions 1089 1090 @extracted = extract_multiple($text, 1091 [ \&extract_bracketed, 1092 \&extract_quotelike, 1093 \&some_other_extractor_sub, 1094 qr/[xyz]*/, 1095 'literal', 1096 ]); 1097 1098# Create a string representing an optimized pattern (a la Friedl) 1099# that matches a substring delimited by any of the specified characters 1100# (in this case: any type of quote or a slash) 1101 1102 $patstring = gen_delimited_pat(q{'"`/}); 1103 1104# Generate a reference to an anonymous sub that is just like extract_tagged 1105# but pre-compiled and optimized for a specific pair of tags, and consequently 1106# much faster (i.e. 3 times faster). It uses qr// for better performance on 1107# repeated calls, so it only works under Perl 5.005 or later. 1108 1109 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1110 1111 ($extracted, $remainder) = $extract_head->($text); 1112 1113=head1 DESCRIPTION 1114 1115The various C<extract_...> subroutines may be used to 1116extract a delimited substring, possibly after skipping a 1117specified prefix string. By default, that prefix is 1118optional whitespace (C</\s*/>), but you can change it to whatever 1119you wish (see below). 1120 1121The substring to be extracted must appear at the 1122current C<pos> location of the string's variable 1123(or at index zero, if no C<pos> position is defined). 1124In other words, the C<extract_...> subroutines I<don't> 1125extract the first occurrence of a substring anywhere 1126in a string (like an unanchored regex would). Rather, 1127they extract an occurrence of the substring appearing 1128immediately at the current matching position in the 1129string (like a C<\G>-anchored regex would). 1130 1131=head2 General behaviour in list contexts 1132 1133In a list context, all the subroutines return a list, the first three 1134elements of which are always: 1135 1136=over 4 1137 1138=item [0] 1139 1140The extracted string, including the specified delimiters. 1141If the extraction fails C<undef> is returned. 1142 1143=item [1] 1144 1145The remainder of the input string (i.e. the characters after the 1146extracted string). On failure, the entire string is returned. 1147 1148=item [2] 1149 1150The skipped prefix (i.e. the characters before the extracted string). 1151On failure, C<undef> is returned. 1152 1153=back 1154 1155Note that in a list context, the contents of the original input text (the first 1156argument) are not modified in any way. 1157 1158However, if the input text was passed in a variable, that variable's 1159C<pos> value is updated to point at the first character after the 1160extracted text. That means that in a list context the various 1161subroutines can be used much like regular expressions. For example: 1162 1163 while ( $next = (extract_quotelike($text))[0] ) 1164 { 1165 # process next quote-like (in $next) 1166 } 1167 1168=head2 General behaviour in scalar and void contexts 1169 1170In a scalar context, the extracted string is returned, having first been 1171removed from the input text. Thus, the following code also processes 1172each quote-like operation, but actually removes them from $text: 1173 1174 while ( $next = extract_quotelike($text) ) 1175 { 1176 # process next quote-like (in $next) 1177 } 1178 1179Note that if the input text is a read-only string (i.e. a literal), 1180no attempt is made to remove the extracted text. 1181 1182In a void context the behaviour of the extraction subroutines is 1183exactly the same as in a scalar context, except (of course) that the 1184extracted substring is not returned. 1185 1186=head2 A note about prefixes 1187 1188Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1189This can bite you if you're expecting a prefix specification like 1190'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1191pattern will only succeed if the <H1> tag is on the current line, since 1192. normally doesn't match newlines. 1193 1194To overcome this limitation, you need to turn on /s matching within 1195the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1196 1197=head2 C<extract_delimited> 1198 1199The C<extract_delimited> function formalizes the common idiom 1200of extracting a single-character-delimited substring from the start of 1201a string. For example, to extract a single-quote delimited string, the 1202following code is typically used: 1203 1204 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1205 $extracted = $1; 1206 1207but with C<extract_delimited> it can be simplified to: 1208 1209 ($extracted,$remainder) = extract_delimited($text, "'"); 1210 1211C<extract_delimited> takes up to four scalars (the input text, the 1212delimiters, a prefix pattern to be skipped, and any escape characters) 1213and extracts the initial substring of the text that 1214is appropriately delimited. If the delimiter string has multiple 1215characters, the first one encountered in the text is taken to delimit 1216the substring. 1217The third argument specifies a prefix pattern that is to be skipped 1218(but must be present!) before the substring is extracted. 1219The final argument specifies the escape character to be used for each 1220delimiter. 1221 1222All arguments are optional. If the escape characters are not specified, 1223every delimiter is escaped with a backslash (C<\>). 1224If the prefix is not specified, the 1225pattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1226is also not specified, the set C</["'`]/> is used. If the text to be processed 1227is not specified either, C<$_> is used. 1228 1229In list context, C<extract_delimited> returns a array of three 1230elements, the extracted substring (I<including the surrounding 1231delimiters>), the remainder of the text, and the skipped prefix (if 1232any). If a suitable delimited substring is not found, the first 1233element of the array is the empty string, the second is the complete 1234original text, and the prefix returned in the third element is an 1235empty string. 1236 1237In a scalar context, just the extracted substring is returned. In 1238a void context, the extracted substring (and any prefix) are simply 1239removed from the beginning of the first argument. 1240 1241Examples: 1242 1243 # Remove a single-quoted substring from the very beginning of $text: 1244 1245 $substring = extract_delimited($text, "'", ''); 1246 1247 # Remove a single-quoted Pascalish substring (i.e. one in which 1248 # doubling the quote character escapes it) from the very 1249 # beginning of $text: 1250 1251 $substring = extract_delimited($text, "'", '', "'"); 1252 1253 # Extract a single- or double- quoted substring from the 1254 # beginning of $text, optionally after some whitespace 1255 # (note the list context to protect $text from modification): 1256 1257 ($substring) = extract_delimited $text, q{"'}; 1258 1259 # Delete the substring delimited by the first '/' in $text: 1260 1261 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1262 1263Note that this last example is I<not> the same as deleting the first 1264quote-like pattern. For instance, if C<$text> contained the string: 1265 1266 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1267 1268then after the deletion it would contain: 1269 1270 "if ('.$UNIXCMD/s) { $cmd = $1; }" 1271 1272not: 1273 1274 "if ('./cmd' =~ ms) { $cmd = $1; }" 1275 1276See L<"extract_quotelike"> for a (partial) solution to this problem. 1277 1278=head2 C<extract_bracketed> 1279 1280Like C<"extract_delimited">, the C<extract_bracketed> function takes 1281up to three optional scalar arguments: a string to extract from, a delimiter 1282specifier, and a prefix pattern. As before, a missing prefix defaults to 1283optional whitespace and a missing text defaults to C<$_>. However, a missing 1284delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1285 1286C<extract_bracketed> extracts a balanced-bracket-delimited 1287substring (using any one (or more) of the user-specified delimiter 1288brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1289respect quoted unbalanced brackets (see below). 1290 1291A "delimiter bracket" is a bracket in list of delimiters passed as 1292C<extract_bracketed>'s second argument. Delimiter brackets are 1293specified by giving either the left or right (or both!) versions 1294of the required bracket(s). Note that the order in which 1295two or more delimiter brackets are specified is not significant. 1296 1297A "balanced-bracket-delimited substring" is a substring bounded by 1298matched brackets, such that any other (left or right) delimiter 1299bracket I<within> the substring is also matched by an opposite 1300(right or left) delimiter bracket I<at the same level of nesting>. Any 1301type of bracket not in the delimiter list is treated as an ordinary 1302character. 1303 1304In other words, each type of bracket specified as a delimiter must be 1305balanced and correctly nested within the substring, and any other kind of 1306("non-delimiter") bracket in the substring is ignored. 1307 1308For example, given the string: 1309 1310 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1311 1312then a call to C<extract_bracketed> in a list context: 1313 1314 @result = extract_bracketed( $text, '{}' ); 1315 1316would return: 1317 1318 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1319 1320since both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1321(In a scalar context just the first element of the array would be returned. In 1322a void context, C<$text> would be replaced by an empty string.) 1323 1324Likewise the call in: 1325 1326 @result = extract_bracketed( $text, '{[' ); 1327 1328would return the same result, since all sets of both types of specified 1329delimiter brackets are correctly nested and balanced. 1330 1331However, the call in: 1332 1333 @result = extract_bracketed( $text, '{([<' ); 1334 1335would fail, returning: 1336 1337 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1338 1339because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1340the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1341return an empty string. In a void context, C<$text> would be unchanged.) 1342 1343Note that the embedded single-quotes in the string don't help in this 1344case, since they have not been specified as acceptable delimiters and are 1345therefore treated as non-delimiter characters (and ignored). 1346 1347However, if a particular species of quote character is included in the 1348delimiter specification, then that type of quote will be correctly handled. 1349for example, if C<$text> is: 1350 1351 $text = '<A HREF=">>>>">link</A>'; 1352 1353then 1354 1355 @result = extract_bracketed( $text, '<">' ); 1356 1357returns: 1358 1359 ( '<A HREF=">>>>">', 'link</A>', "" ) 1360 1361as expected. Without the specification of C<"> as an embedded quoter: 1362 1363 @result = extract_bracketed( $text, '<>' ); 1364 1365the result would be: 1366 1367 ( '<A HREF=">', '>>>">link</A>', "" ) 1368 1369In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1370quoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1371letter 'q' as a delimiter. Hence: 1372 1373 @result = extract_bracketed( $text, '<q>' ); 1374 1375would correctly match something like this: 1376 1377 $text = '<leftop: conj /and/ conj>'; 1378 1379See also: C<"extract_quotelike"> and C<"extract_codeblock">. 1380 1381=head2 C<extract_variable> 1382 1383C<extract_variable> extracts any valid Perl variable or 1384variable-involved expression, including scalars, arrays, hashes, array 1385accesses, hash look-ups, method calls through objects, subroutine calls 1386through subroutine references, etc. 1387 1388The subroutine takes up to two optional arguments: 1389 1390=over 4 1391 1392=item 1. 1393 1394A string to be processed (C<$_> if the string is omitted or C<undef>) 1395 1396=item 2. 1397 1398A string specifying a pattern to be matched as a prefix (which is to be 1399skipped). If omitted, optional whitespace is skipped. 1400 1401=back 1402 1403On success in a list context, an array of 3 elements is returned. The 1404elements are: 1405 1406=over 4 1407 1408=item [0] 1409 1410the extracted variable, or variablish expression 1411 1412=item [1] 1413 1414the remainder of the input text, 1415 1416=item [2] 1417 1418the prefix substring (if any), 1419 1420=back 1421 1422On failure, all of these values (except the remaining text) are C<undef>. 1423 1424In a scalar context, C<extract_variable> returns just the complete 1425substring that matched a variablish expression. C<undef> is returned on 1426failure. In addition, the original input text has the returned substring 1427(and any prefix) removed from it. 1428 1429In a void context, the input text just has the matched substring (and 1430any specified prefix) removed. 1431 1432 1433=head2 C<extract_tagged> 1434 1435C<extract_tagged> extracts and segments text between (balanced) 1436specified tags. 1437 1438The subroutine takes up to five optional arguments: 1439 1440=over 4 1441 1442=item 1. 1443 1444A string to be processed (C<$_> if the string is omitted or C<undef>) 1445 1446=item 2. 1447 1448A string specifying a pattern to be matched as the opening tag. 1449If the pattern string is omitted (or C<undef>) then a pattern 1450that matches any standard XML tag is used. 1451 1452=item 3. 1453 1454A string specifying a pattern to be matched at the closing tag. 1455If the pattern string is omitted (or C<undef>) then the closing 1456tag is constructed by inserting a C</> after any leading bracket 1457characters in the actual opening tag that was matched (I<not> the pattern 1458that matched the tag). For example, if the opening tag pattern 1459is specified as C<'{{\w+}}'> and actually matched the opening tag 1460C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1461 1462=item 4. 1463 1464A string specifying a pattern to be matched as a prefix (which is to be 1465skipped). If omitted, optional whitespace is skipped. 1466 1467=item 5. 1468 1469A hash reference containing various parsing options (see below) 1470 1471=back 1472 1473The various options that can be specified are: 1474 1475=over 4 1476 1477=item C<reject =E<gt> $listref> 1478 1479The list reference contains one or more strings specifying patterns 1480that must I<not> appear within the tagged text. 1481 1482For example, to extract 1483an HTML link (which should not contain nested links) use: 1484 1485 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1486 1487=item C<ignore =E<gt> $listref> 1488 1489The list reference contains one or more strings specifying patterns 1490that are I<not> be be treated as nested tags within the tagged text 1491(even if they would match the start tag pattern). 1492 1493For example, to extract an arbitrary XML tag, but ignore "empty" elements: 1494 1495 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1496 1497(also see L<"gen_delimited_pat"> below). 1498 1499=item C<fail =E<gt> $str> 1500 1501The C<fail> option indicates the action to be taken if a matching end 1502tag is not encountered (i.e. before the end of the string or some 1503C<reject> pattern matches). By default, a failure to match a closing 1504tag causes C<extract_tagged> to immediately fail. 1505 1506However, if the string value associated with <reject> is "MAX", then 1507C<extract_tagged> returns the complete text up to the point of failure. 1508If the string is "PARA", C<extract_tagged> returns only the first paragraph 1509after the tag (up to the first line that is either empty or contains 1510only whitespace characters). 1511If the string is "", the default behaviour (i.e. failure) is reinstated. 1512 1513For example, suppose the start tag "/para" introduces a paragraph, which then 1514continues until the next "/endpara" tag or until another "/para" tag is 1515encountered: 1516 1517 $text = "/para line 1\n\nline 3\n/para line 4"; 1518 1519 extract_tagged($text, '/para', '/endpara', undef, 1520 {reject => '/para', fail => MAX ); 1521 1522 # EXTRACTED: "/para line 1\n\nline 3\n" 1523 1524Suppose instead, that if no matching "/endpara" tag is found, the "/para" 1525tag refers only to the immediately following paragraph: 1526 1527 $text = "/para line 1\n\nline 3\n/para line 4"; 1528 1529 extract_tagged($text, '/para', '/endpara', undef, 1530 {reject => '/para', fail => MAX ); 1531 1532 # EXTRACTED: "/para line 1\n" 1533 1534Note that the specified C<fail> behaviour applies to nested tags as well. 1535 1536=back 1537 1538On success in a list context, an array of 6 elements is returned. The elements are: 1539 1540=over 4 1541 1542=item [0] 1543 1544the extracted tagged substring (including the outermost tags), 1545 1546=item [1] 1547 1548the remainder of the input text, 1549 1550=item [2] 1551 1552the prefix substring (if any), 1553 1554=item [3] 1555 1556the opening tag 1557 1558=item [4] 1559 1560the text between the opening and closing tags 1561 1562=item [5] 1563 1564the closing tag (or "" if no closing tag was found) 1565 1566=back 1567 1568On failure, all of these values (except the remaining text) are C<undef>. 1569 1570In a scalar context, C<extract_tagged> returns just the complete 1571substring that matched a tagged text (including the start and end 1572tags). C<undef> is returned on failure. In addition, the original input 1573text has the returned substring (and any prefix) removed from it. 1574 1575In a void context, the input text just has the matched substring (and 1576any specified prefix) removed. 1577 1578=head2 C<gen_extract_tagged> 1579 1580(Note: This subroutine is only available under Perl5.005) 1581 1582C<gen_extract_tagged> generates a new anonymous subroutine which 1583extracts text between (balanced) specified tags. In other words, 1584it generates a function identical in function to C<extract_tagged>. 1585 1586The difference between C<extract_tagged> and the anonymous 1587subroutines generated by 1588C<gen_extract_tagged>, is that those generated subroutines: 1589 1590=over 4 1591 1592=item * 1593 1594do not have to reparse tag specification or parsing options every time 1595they are called (whereas C<extract_tagged> has to effectively rebuild 1596its tag parser on every call); 1597 1598=item * 1599 1600make use of the new qr// construct to pre-compile the regexes they use 1601(whereas C<extract_tagged> uses standard string variable interpolation 1602to create tag-matching patterns). 1603 1604=back 1605 1606The subroutine takes up to four optional arguments (the same set as 1607C<extract_tagged> except for the string to be processed). It returns 1608a reference to a subroutine which in turn takes a single argument (the text to 1609be extracted from). 1610 1611In other words, the implementation of C<extract_tagged> is exactly 1612equivalent to: 1613 1614 sub extract_tagged 1615 { 1616 my $text = shift; 1617 $extractor = gen_extract_tagged(@_); 1618 return $extractor->($text); 1619 } 1620 1621(although C<extract_tagged> is not currently implemented that way, in order 1622to preserve pre-5.005 compatibility). 1623 1624Using C<gen_extract_tagged> to create extraction functions for specific tags 1625is a good idea if those functions are going to be called more than once, since 1626their performance is typically twice as good as the more general-purpose 1627C<extract_tagged>. 1628 1629 1630=head2 C<extract_quotelike> 1631 1632C<extract_quotelike> attempts to recognize, extract, and segment any 1633one of the various Perl quotes and quotelike operators (see 1634L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1635delimiters (for the quotelike operators), and trailing modifiers are 1636all caught. For example, in: 1637 1638 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1639 1640 extract_quotelike ' "You said, \"Use sed\"." ' 1641 1642 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1643 1644 extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1645 1646the full Perl quotelike operations are all extracted correctly. 1647 1648Note too that, when using the /x modifier on a regex, any comment 1649containing the current pattern delimiter will cause the regex to be 1650immediately terminated. In other words: 1651 1652 'm / 1653 (?i) # CASE INSENSITIVE 1654 [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1655 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1656 /x' 1657 1658will be extracted as if it were: 1659 1660 'm / 1661 (?i) # CASE INSENSITIVE 1662 [a-z_] # LEADING ALPHABETIC/' 1663 1664This behaviour is identical to that of the actual compiler. 1665 1666C<extract_quotelike> takes two arguments: the text to be processed and 1667a prefix to be matched at the very beginning of the text. If no prefix 1668is specified, optional whitespace is the default. If no text is given, 1669C<$_> is used. 1670 1671In a list context, an array of 11 elements is returned. The elements are: 1672 1673=over 4 1674 1675=item [0] 1676 1677the extracted quotelike substring (including trailing modifiers), 1678 1679=item [1] 1680 1681the remainder of the input text, 1682 1683=item [2] 1684 1685the prefix substring (if any), 1686 1687=item [3] 1688 1689the name of the quotelike operator (if any), 1690 1691=item [4] 1692 1693the left delimiter of the first block of the operation, 1694 1695=item [5] 1696 1697the text of the first block of the operation 1698(that is, the contents of 1699a quote, the regex of a match or substitution or the target list of a 1700translation), 1701 1702=item [6] 1703 1704the right delimiter of the first block of the operation, 1705 1706=item [7] 1707 1708the left delimiter of the second block of the operation 1709(that is, if it is a C<s>, C<tr>, or C<y>), 1710 1711=item [8] 1712 1713the text of the second block of the operation 1714(that is, the replacement of a substitution or the translation list 1715of a translation), 1716 1717=item [9] 1718 1719the right delimiter of the second block of the operation (if any), 1720 1721=item [10] 1722 1723the trailing modifiers on the operation (if any). 1724 1725=back 1726 1727For each of the fields marked "(if any)" the default value on success is 1728an empty string. 1729On failure, all of these values (except the remaining text) are C<undef>. 1730 1731In a scalar context, C<extract_quotelike> returns just the complete substring 1732that matched a quotelike operation (or C<undef> on failure). In a scalar or 1733void context, the input text has the same substring (and any specified 1734prefix) removed. 1735 1736Examples: 1737 1738 # Remove the first quotelike literal that appears in text 1739 1740 $quotelike = extract_quotelike($text,'.*?'); 1741 1742 # Replace one or more leading whitespace-separated quotelike 1743 # literals in $_ with "<QLL>" 1744 1745 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1746 1747 1748 # Isolate the search pattern in a quotelike operation from $text 1749 1750 ($op,$pat) = (extract_quotelike $text)[3,5]; 1751 if ($op =~ /[ms]/) 1752 { 1753 print "search pattern: $pat\n"; 1754 } 1755 else 1756 { 1757 print "$op is not a pattern matching operation\n"; 1758 } 1759 1760=head2 C<extract_quotelike> and "here documents" 1761 1762C<extract_quotelike> can successfully extract "here documents" from an input 1763string, but with an important caveat in list contexts. 1764 1765Unlike other types of quote-like literals, a here document is rarely 1766a contiguous substring. For example, a typical piece of code using 1767here document might look like this: 1768 1769 <<'EOMSG' || die; 1770 This is the message. 1771 EOMSG 1772 exit; 1773 1774Given this as an input string in a scalar context, C<extract_quotelike> 1775would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1776leaving the string " || die;\nexit;" in the original variable. In other words, 1777the two separate pieces of the here document are successfully extracted and 1778concatenated. 1779 1780In a list context, C<extract_quotelike> would return the list 1781 1782=over 4 1783 1784=item [0] 1785 1786"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1787including fore and aft delimiters), 1788 1789=item [1] 1790 1791" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1792 1793=item [2] 1794 1795"" (i.e. the prefix substring -- trivial in this case), 1796 1797=item [3] 1798 1799"<<" (i.e. the "name" of the quotelike operator) 1800 1801=item [4] 1802 1803"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1804 1805=item [5] 1806 1807"This is the message.\n" (i.e. the text of the here document), 1808 1809=item [6] 1810 1811"EOMSG" (i.e. the right delimiter of the here document), 1812 1813=item [7..10] 1814 1815"" (a here document has no second left delimiter, second text, second right 1816delimiter, or trailing modifiers). 1817 1818=back 1819 1820However, the matching position of the input variable would be set to 1821"exit;" (i.e. I<after> the closing delimiter of the here document), 1822which would cause the earlier " || die;\nexit;" to be skipped in any 1823sequence of code fragment extractions. 1824 1825To avoid this problem, when it encounters a here document whilst 1826extracting from a modifiable string, C<extract_quotelike> silently 1827rearranges the string to an equivalent piece of Perl: 1828 1829 <<'EOMSG' 1830 This is the message. 1831 EOMSG 1832 || die; 1833 exit; 1834 1835in which the here document I<is> contiguous. It still leaves the 1836matching position after the here document, but now the rest of the line 1837on which the here document starts is not skipped. 1838 1839To prevent <extract_quotelike> from mucking about with the input in this way 1840(this is the only case where a list-context C<extract_quotelike> does so), 1841you can pass the input variable as an interpolated literal: 1842 1843 $quotelike = extract_quotelike("$var"); 1844 1845=head2 C<extract_codeblock> 1846 1847C<extract_codeblock> attempts to recognize and extract a balanced 1848bracket delimited substring that may contain unbalanced brackets 1849inside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1850is like a combination of C<"extract_bracketed"> and 1851C<"extract_quotelike">. 1852 1853C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1854a text to process, a set of delimiter brackets to look for, and a prefix to 1855match first. It also takes an optional fourth parameter, which allows the 1856outermost delimiter brackets to be specified separately (see below). 1857 1858Omitting the first argument (input text) means process C<$_> instead. 1859Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1860Omitting the third argument (prefix argument) implies optional whitespace at the start. 1861Omitting the fourth argument (outermost delimiter brackets) indicates that the 1862value of the second argument is to be used for the outermost delimiters. 1863 1864Once the prefix an dthe outermost opening delimiter bracket have been 1865recognized, code blocks are extracted by stepping through the input text and 1866trying the following alternatives in sequence: 1867 1868=over 4 1869 1870=item 1. 1871 1872Try and match a closing delimiter bracket. If the bracket was the same 1873species as the last opening bracket, return the substring to that 1874point. If the bracket was mismatched, return an error. 1875 1876=item 2. 1877 1878Try to match a quote or quotelike operator. If found, call 1879C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1880the error it returned. Otherwise go back to step 1. 1881 1882=item 3. 1883 1884Try to match an opening delimiter bracket. If found, call 1885C<extract_codeblock> recursively to eat the embedded block. If the 1886recursive call fails, return an error. Otherwise, go back to step 1. 1887 1888=item 4. 1889 1890Unconditionally match a bareword or any other single character, and 1891then go back to step 1. 1892 1893=back 1894 1895Examples: 1896 1897 # Find a while loop in the text 1898 1899 if ($text =~ s/.*?while\s*\{/{/) 1900 { 1901 $loop = "while " . extract_codeblock($text); 1902 } 1903 1904 # Remove the first round-bracketed list (which may include 1905 # round- or curly-bracketed code blocks or quotelike operators) 1906 1907 extract_codeblock $text, "(){}", '[^(]*'; 1908 1909 1910The ability to specify a different outermost delimiter bracket is useful 1911in some circumstances. For example, in the Parse::RecDescent module, 1912parser actions which are to be performed only on a successful parse 1913are specified using a C<E<lt>defer:...E<gt>> directive. For example: 1914 1915 sentence: subject verb object 1916 <defer: {$::theVerb = $item{verb}} > 1917 1918Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1919within the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1920 1921A deferred action like this: 1922 1923 <defer: {if ($count>10) {$count--}} > 1924 1925will be incorrectly parsed as: 1926 1927 <defer: {if ($count> 1928 1929because the "less than" operator is interpreted as a closing delimiter. 1930 1931But, by extracting the directive using 1932S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1933the '>' character is only treated as a delimited at the outermost 1934level of the code block, so the directive is parsed correctly. 1935 1936=head2 C<extract_multiple> 1937 1938The C<extract_multiple> subroutine takes a string to be processed and a 1939list of extractors (subroutines or regular expressions) to apply to that string. 1940 1941In an array context C<extract_multiple> returns an array of substrings 1942of the original string, as extracted by the specified extractors. 1943In a scalar context, C<extract_multiple> returns the first 1944substring successfully extracted from the original string. In both 1945scalar and void contexts the original string has the first successfully 1946extracted substring removed from it. In all contexts 1947C<extract_multiple> starts at the current C<pos> of the string, and 1948sets that C<pos> appropriately after it matches. 1949 1950Hence, the aim of of a call to C<extract_multiple> in a list context 1951is to split the processed string into as many non-overlapping fields as 1952possible, by repeatedly applying each of the specified extractors 1953to the remainder of the string. Thus C<extract_multiple> is 1954a generalized form of Perl's C<split> subroutine. 1955 1956The subroutine takes up to four optional arguments: 1957 1958=over 4 1959 1960=item 1. 1961 1962A string to be processed (C<$_> if the string is omitted or C<undef>) 1963 1964=item 2. 1965 1966A reference to a list of subroutine references and/or qr// objects and/or 1967literal strings and/or hash references, specifying the extractors 1968to be used to split the string. If this argument is omitted (or 1969C<undef>) the list: 1970 1971 [ 1972 sub { extract_variable($_[0], '') }, 1973 sub { extract_quotelike($_[0],'') }, 1974 sub { extract_codeblock($_[0],'{}','') }, 1975 ] 1976 1977is used. 1978 1979=item 3. 1980 1981An number specifying the maximum number of fields to return. If this 1982argument is omitted (or C<undef>), split continues as long as possible. 1983 1984If the third argument is I<N>, then extraction continues until I<N> fields 1985have been successfully extracted, or until the string has been completely 1986processed. 1987 1988Note that in scalar and void contexts the value of this argument is 1989automatically reset to 1 (under C<-w>, a warning is issued if the argument 1990has to be reset). 1991 1992=item 4. 1993 1994A value indicating whether unmatched substrings (see below) within the 1995text should be skipped or returned as fields. If the value is true, 1996such substrings are skipped. Otherwise, they are returned. 1997 1998=back 1999 2000The extraction process works by applying each extractor in 2001sequence to the text string. 2002 2003If the extractor is a subroutine it is called in a list context and is 2004expected to return a list of a single element, namely the extracted 2005text. It may optionally also return two further arguments: a string 2006representing the text left after extraction (like $' for a pattern 2007match), and a string representing any prefix skipped before the 2008extraction (like $` in a pattern match). Note that this is designed 2009to facilitate the use of other Text::Balanced subroutines with 2010C<extract_multiple>. Note too that the value returned by an extractor 2011subroutine need not bear any relationship to the corresponding substring 2012of the original text (see examples below). 2013 2014If the extractor is a precompiled regular expression or a string, 2015it is matched against the text in a scalar context with a leading 2016'\G' and the gc modifiers enabled. The extracted value is either 2017$1 if that variable is defined after the match, or else the 2018complete match (i.e. $&). 2019 2020If the extractor is a hash reference, it must contain exactly one element. 2021The value of that element is one of the 2022above extractor types (subroutine reference, regular expression, or string). 2023The key of that element is the name of a class into which the successful 2024return value of the extractor will be blessed. 2025 2026If an extractor returns a defined value, that value is immediately 2027treated as the next extracted field and pushed onto the list of fields. 2028If the extractor was specified in a hash reference, the field is also 2029blessed into the appropriate class, 2030 2031If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2032assumed to have failed to extract. 2033If none of the extractor subroutines succeeds, then one 2034character is extracted from the start of the text and the extraction 2035subroutines reapplied. Characters which are thus removed are accumulated and 2036eventually become the next field (unless the fourth argument is true, in which 2037case they are discarded). 2038 2039For example, the following extracts substrings that are valid Perl variables: 2040 2041 @fields = extract_multiple($text, 2042 [ sub { extract_variable($_[0]) } ], 2043 undef, 1); 2044 2045This example separates a text into fields which are quote delimited, 2046curly bracketed, and anything else. The delimited and bracketed 2047parts are also blessed to identify them (the "anything else" is unblessed): 2048 2049 @fields = extract_multiple($text, 2050 [ 2051 { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2052 { Brack => sub { extract_bracketed($_[0],'{}') } }, 2053 ]); 2054 2055This call extracts the next single substring that is a valid Perl quotelike 2056operator (and removes it from $text): 2057 2058 $quotelike = extract_multiple($text, 2059 [ 2060 sub { extract_quotelike($_[0]) }, 2061 ], undef, 1); 2062 2063Finally, here is yet another way to do comma-separated value parsing: 2064 2065 @fields = extract_multiple($csv_text, 2066 [ 2067 sub { extract_delimited($_[0],q{'"}) }, 2068 qr/([^,]+)(.*)/, 2069 ], 2070 undef,1); 2071 2072The list in the second argument means: 2073I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2074The undef third argument means: 2075I<"...as many times as possible...">, 2076and the true value in the fourth argument means 2077I<"...discarding anything else that appears (i.e. the commas)">. 2078 2079If you wanted the commas preserved as separate fields (i.e. like split 2080does if your split pattern has capturing parentheses), you would 2081just make the last parameter undefined (or remove it). 2082 2083=head2 C<gen_delimited_pat> 2084 2085The C<gen_delimited_pat> subroutine takes a single (string) argument and 2086 > builds a Friedl-style optimized regex that matches a string delimited 2087by any one of the characters in the single argument. For example: 2088 2089 gen_delimited_pat(q{'"}) 2090 2091returns the regex: 2092 2093 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2094 2095Note that the specified delimiters are automatically quotemeta'd. 2096 2097A typical use of C<gen_delimited_pat> would be to build special purpose tags 2098for C<extract_tagged>. For example, to properly ignore "empty" XML elements 2099(which might contain quoted strings): 2100 2101 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2102 2103 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2104 2105C<gen_delimited_pat> may also be called with an optional second argument, 2106which specifies the "escape" character(s) to be used for each delimiter. 2107For example to match a Pascal-style string (where ' is the delimiter 2108and '' is a literal ' within the string): 2109 2110 gen_delimited_pat(q{'},q{'}); 2111 2112Different escape characters can be specified for different delimiters. 2113For example, to specify that '/' is the escape for single quotes 2114and '%' is the escape for double quotes: 2115 2116 gen_delimited_pat(q{'"},q{/%}); 2117 2118If more delimiters than escape chars are specified, the last escape char 2119is used for the remaining delimiters. 2120If no escape char is specified for a given specified delimiter, '\' is used. 2121 2122=head2 C<delimited_pat> 2123 2124Note that C<gen_delimited_pat> was previously called C<delimited_pat>. 2125That name may still be used, but is now deprecated. 2126 2127 2128=head1 DIAGNOSTICS 2129 2130In a list context, all the functions return C<(undef,$original_text)> 2131on failure. In a scalar context, failure is indicated by returning C<undef> 2132(in this case the input text is not modified in any way). 2133 2134In addition, on failure in I<any> context, the C<$@> variable is set. 2135Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2136below. 2137Accessing C<$@-E<gt>{pos}> returns the offset into the original string at 2138which the error was detected (although not necessarily where it occurred!) 2139Printing C<$@> directly produces the error message, with the offset appended. 2140On success, the C<$@> variable is guaranteed to be C<undef>. 2141 2142The available diagnostics are: 2143 2144=over 4 2145 2146=item C<Did not find a suitable bracket: "%s"> 2147 2148The delimiter provided to C<extract_bracketed> was not one of 2149C<'()[]E<lt>E<gt>{}'>. 2150 2151=item C<Did not find prefix: /%s/> 2152 2153A non-optional prefix was specified but wasn't found at the start of the text. 2154 2155=item C<Did not find opening bracket after prefix: "%s"> 2156 2157C<extract_bracketed> or C<extract_codeblock> was expecting a 2158particular kind of bracket at the start of the text, and didn't find it. 2159 2160=item C<No quotelike operator found after prefix: "%s"> 2161 2162C<extract_quotelike> didn't find one of the quotelike operators C<q>, 2163C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2164it was extracting. 2165 2166=item C<Unmatched closing bracket: "%c"> 2167 2168C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2169a closing bracket where none was expected. 2170 2171=item C<Unmatched opening bracket(s): "%s"> 2172 2173C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2174out of characters in the text before closing one or more levels of nested 2175brackets. 2176 2177=item C<Unmatched embedded quote (%s)> 2178 2179C<extract_bracketed> attempted to match an embedded quoted substring, but 2180failed to find a closing quote to match it. 2181 2182=item C<Did not find closing delimiter to match '%s'> 2183 2184C<extract_quotelike> was unable to find a closing delimiter to match the 2185one that opened the quote-like operation. 2186 2187=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2188 2189C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2190a valid bracket delimiter, but it was the wrong species. This usually 2191indicates a nesting error, but may indicate incorrect quoting or escaping. 2192 2193=item C<No block delimiter found after quotelike "%s"> 2194 2195C<extract_quotelike> or C<extract_codeblock> found one of the 2196quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2197without a suitable block after it. 2198 2199=item C<Did not find leading dereferencer> 2200 2201C<extract_variable> was expecting one of '$', '@', or '%' at the start of 2202a variable, but didn't find any of them. 2203 2204=item C<Bad identifier after dereferencer> 2205 2206C<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2207character was not followed by a legal Perl identifier. 2208 2209=item C<Did not find expected opening bracket at %s> 2210 2211C<extract_codeblock> failed to find any of the outermost opening brackets 2212that were specified. 2213 2214=item C<Improperly nested codeblock at %s> 2215 2216A nested code block was found that started with a delimiter that was specified 2217as being only to be used as an outermost bracket. 2218 2219=item C<Missing second block for quotelike "%s"> 2220 2221C<extract_codeblock> or C<extract_quotelike> found one of the 2222quotelike operators C<s>, C<tr> or C<y> followed by only one block. 2223 2224=item C<No match found for opening bracket> 2225 2226C<extract_codeblock> failed to find a closing bracket to match the outermost 2227opening bracket. 2228 2229=item C<Did not find opening tag: /%s/> 2230 2231C<extract_tagged> did not find a suitable opening tag (after any specified 2232prefix was removed). 2233 2234=item C<Unable to construct closing tag to match: /%s/> 2235 2236C<extract_tagged> matched the specified opening tag and tried to 2237modify the matched text to produce a matching closing tag (because 2238none was specified). It failed to generate the closing tag, almost 2239certainly because the opening tag did not start with a 2240bracket of some kind. 2241 2242=item C<Found invalid nested tag: %s> 2243 2244C<extract_tagged> found a nested tag that appeared in the "reject" list 2245(and the failure mode was not "MAX" or "PARA"). 2246 2247=item C<Found unbalanced nested tag: %s> 2248 2249C<extract_tagged> found a nested opening tag that was not matched by a 2250corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2251 2252=item C<Did not find closing tag> 2253 2254C<extract_tagged> reached the end of the text without finding a closing tag 2255to match the original opening tag (and the failure mode was not 2256"MAX" or "PARA"). 2257 2258=back 2259 2260=head1 AUTHOR 2261 2262Damian Conway (damian@conway.org) 2263 2264=head1 BUGS AND IRRITATIONS 2265 2266There are undoubtedly serious bugs lurking somewhere in this code, if 2267only because parts of it give the impression of understanding a great deal 2268more about Perl than they really do. 2269 2270Bug reports and other feedback are most welcome. 2271 2272=head1 COPYRIGHT 2273 2274Copyright 1997 - 2001 Damian Conway. All Rights Reserved. 2275 2276Some (minor) parts copyright 2009 Adam Kennedy. 2277 2278This module is free software. It may be used, redistributed 2279and/or modified under the same terms as Perl itself. 2280 2281=cut 2282