1# Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2# Copyright (C) 2009 Adam Kennedy. 3# Copyright (C) 2015 Steve Hay. All rights reserved. 4 5# This module is free software; you can redistribute it and/or modify it under 6# the same terms as Perl itself, i.e. under the terms of either the GNU General 7# Public License or the Artistic License, as specified in the F<LICENCE> file. 8 9package Text::Balanced; 10 11# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 12# FOR FULL DOCUMENTATION SEE Balanced.pod 13 14use 5.008001; 15use strict; 16use Exporter (); 17 18use vars qw { $VERSION @ISA %EXPORT_TAGS }; 19BEGIN { 20 $VERSION = '2.04'; 21 @ISA = 'Exporter'; 22 %EXPORT_TAGS = ( 23 ALL => [ qw{ 24 &extract_delimited 25 &extract_bracketed 26 &extract_quotelike 27 &extract_codeblock 28 &extract_variable 29 &extract_tagged 30 &extract_multiple 31 &gen_delimited_pat 32 &gen_extract_tagged 33 &delimited_pat 34 } ], 35 ); 36} 37 38Exporter::export_ok_tags('ALL'); 39 40## no critic (Subroutines::ProhibitSubroutinePrototypes) 41 42# PROTOTYPES 43 44sub _match_bracketed($$$$$$); 45sub _match_variable($$); 46sub _match_codeblock($$$$$$$); 47sub _match_quotelike($$$$); 48 49# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 50 51sub _failmsg { 52 my ($message, $pos) = @_; 53 $@ = bless { 54 error => $message, 55 pos => $pos, 56 }, 'Text::Balanced::ErrorMsg'; 57} 58 59sub _fail { 60 my ($wantarray, $textref, $message, $pos) = @_; 61 _failmsg $message, $pos if $message; 62 return (undef, $$textref, undef) if $wantarray; 63 return; 64} 65 66sub _succeed { 67 $@ = undef; 68 my ($wantarray,$textref) = splice @_, 0, 2; 69 my ($extrapos, $extralen) = @_ > 18 70 ? splice(@_, -2, 2) 71 : (0, 0); 72 my ($startlen, $oppos) = @_[5,6]; 73 my $remainderpos = $_[2]; 74 if ( $wantarray ) { 75 my @res; 76 while (my ($from, $len) = splice @_, 0, 2) { 77 push @res, substr($$textref, $from, $len); 78 } 79 if ( $extralen ) { # CORRECT FILLET 80 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); 81 $res[1] = "$extra$res[1]"; 82 eval { substr($$textref,$remainderpos,0) = $extra; 83 substr($$textref,$extrapos,$extralen,"\n")} ; 84 #REARRANGE HERE DOC AND FILLET IF POSSIBLE 85 pos($$textref) = $remainderpos-$extralen+1; # RESET \G 86 } else { 87 pos($$textref) = $remainderpos; # RESET \G 88 } 89 return @res; 90 } else { 91 my $match = substr($$textref,$_[0],$_[1]); 92 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 93 my $extra = $extralen 94 ? substr($$textref, $extrapos, $extralen)."\n" : ""; 95 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 96 pos($$textref) = $_[4]; # RESET \G 97 return $match; 98 } 99} 100 101# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 102 103sub gen_delimited_pat($;$) # ($delimiters;$escapes) 104{ 105 my ($dels, $escs) = @_; 106 return "" unless $dels =~ /\S/; 107 $escs = '\\' unless $escs; 108 $escs .= substr($escs,-1) x (length($dels)-length($escs)); 109 my @pat = (); 110 my $i; 111 for ($i=0; $i<length $dels; $i++) 112 { 113 my $del = quotemeta substr($dels,$i,1); 114 my $esc = quotemeta substr($escs,$i,1); 115 if ($del eq $esc) 116 { 117 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 118 } 119 else 120 { 121 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 122 } 123 } 124 my $pat = join '|', @pat; 125 return "(?:$pat)"; 126} 127 128*delimited_pat = \&gen_delimited_pat; 129 130# THE EXTRACTION FUNCTIONS 131 132sub extract_delimited (;$$$$) 133{ 134 my $textref = defined $_[0] ? \$_[0] : \$_; 135 my $wantarray = wantarray; 136 my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 137 my $pre = defined $_[2] ? $_[2] : '\s*'; 138 my $esc = defined $_[3] ? $_[3] : qq{\\}; 139 my $pat = gen_delimited_pat($del, $esc); 140 my $startpos = pos $$textref || 0; 141 return _fail($wantarray, $textref, "Not a delimited pattern", 0) 142 unless $$textref =~ m/\G($pre)($pat)/gc; 143 my $prelen = length($1); 144 my $matchpos = $startpos+$prelen; 145 my $endpos = pos $$textref; 146 return _succeed $wantarray, $textref, 147 $matchpos, $endpos-$matchpos, # MATCH 148 $endpos, length($$textref)-$endpos, # REMAINDER 149 $startpos, $prelen; # PREFIX 150} 151 152sub extract_bracketed (;$$$) 153{ 154 my $textref = defined $_[0] ? \$_[0] : \$_; 155 my $ldel = defined $_[1] ? $_[1] : '{([<'; 156 my $pre = defined $_[2] ? $_[2] : '\s*'; 157 my $wantarray = wantarray; 158 my $qdel = ""; 159 my $quotelike; 160 $ldel =~ s/'//g and $qdel .= q{'}; 161 $ldel =~ s/"//g and $qdel .= q{"}; 162 $ldel =~ s/`//g and $qdel .= q{`}; 163 $ldel =~ s/q//g and $quotelike = 1; 164 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 165 my $rdel = $ldel; 166 unless ($rdel =~ tr/[({</])}>/) 167 { 168 return _fail $wantarray, $textref, 169 "Did not find a suitable bracket in delimiter: \"$_[1]\"", 170 0; 171 } 172 my $posbug = pos; 173 $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 174 $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 175 pos = $posbug; 176 177 my $startpos = pos $$textref || 0; 178 my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); 179 180 return _fail ($wantarray, $textref) unless @match; 181 182 return _succeed ( $wantarray, $textref, 183 $match[2], $match[5]+2, # MATCH 184 @match[8,9], # REMAINDER 185 @match[0,1], # PREFIX 186 ); 187} 188 189sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 190{ 191 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 192 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 193 unless ($$textref =~ m/\G$pre/gc) 194 { 195 _failmsg "Did not find prefix: /$pre/", $startpos; 196 return; 197 } 198 199 $ldelpos = pos $$textref; 200 201 unless ($$textref =~ m/\G($ldel)/gc) 202 { 203 _failmsg "Did not find opening bracket after prefix: \"$pre\"", 204 pos $$textref; 205 pos $$textref = $startpos; 206 return; 207 } 208 209 my @nesting = ( $1 ); 210 my $textlen = length $$textref; 211 while (pos $$textref < $textlen) 212 { 213 next if $$textref =~ m/\G\\./gcs; 214 215 if ($$textref =~ m/\G($ldel)/gc) 216 { 217 push @nesting, $1; 218 } 219 elsif ($$textref =~ m/\G($rdel)/gc) 220 { 221 my ($found, $brackettype) = ($1, $1); 222 if ($#nesting < 0) 223 { 224 _failmsg "Unmatched closing bracket: \"$found\"", 225 pos $$textref; 226 pos $$textref = $startpos; 227 return; 228 } 229 my $expected = pop(@nesting); 230 $expected =~ tr/({[</)}]>/; 231 if ($expected ne $brackettype) 232 { 233 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 234 pos $$textref; 235 pos $$textref = $startpos; 236 return; 237 } 238 last if $#nesting < 0; 239 } 240 elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) 241 { 242 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 243 _failmsg "Unmatched embedded quote ($1)", 244 pos $$textref; 245 pos $$textref = $startpos; 246 return; 247 } 248 elsif ($quotelike && _match_quotelike($textref,"",1,0)) 249 { 250 next; 251 } 252 253 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 254 } 255 if ($#nesting>=0) 256 { 257 _failmsg "Unmatched opening bracket(s): " 258 . join("..",@nesting)."..", 259 pos $$textref; 260 pos $$textref = $startpos; 261 return; 262 } 263 264 $endpos = pos $$textref; 265 266 return ( 267 $startpos, $ldelpos-$startpos, # PREFIX 268 $ldelpos, 1, # OPENING BRACKET 269 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 270 $endpos-1, 1, # CLOSING BRACKET 271 $endpos, length($$textref)-$endpos, # REMAINDER 272 ); 273} 274 275sub _revbracket($) 276{ 277 my $brack = reverse $_[0]; 278 $brack =~ tr/[({</])}>/; 279 return $brack; 280} 281 282my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 283 284sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 285{ 286 my $textref = defined $_[0] ? \$_[0] : \$_; 287 my $ldel = $_[1]; 288 my $rdel = $_[2]; 289 my $pre = defined $_[3] ? $_[3] : '\s*'; 290 my %options = defined $_[4] ? %{$_[4]} : (); 291 my $omode = defined $options{fail} ? $options{fail} : ''; 292 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 293 : defined($options{reject}) ? $options{reject} 294 : '' 295 ; 296 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 297 : defined($options{ignore}) ? $options{ignore} 298 : '' 299 ; 300 301 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 302 $@ = undef; 303 304 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 305 306 return _fail(wantarray, $textref) unless @match; 307 return _succeed wantarray, $textref, 308 $match[2], $match[3]+$match[5]+$match[7], # MATCH 309 @match[8..9,0..1,2..7]; # REM, PRE, BITS 310} 311 312sub _match_tagged # ($$$$$$$) 313{ 314 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 315 my $rdelspec; 316 317 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 318 319 unless ($$textref =~ m/\G($pre)/gc) 320 { 321 _failmsg "Did not find prefix: /$pre/", pos $$textref; 322 goto failed; 323 } 324 325 $opentagpos = pos($$textref); 326 327 unless ($$textref =~ m/\G$ldel/gc) 328 { 329 _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 330 goto failed; 331 } 332 333 $textpos = pos($$textref); 334 335 if (!defined $rdel) 336 { 337 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); 338 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) 339 { 340 _failmsg "Unable to construct closing tag to match: $rdel", 341 pos $$textref; 342 goto failed; 343 } 344 } 345 else 346 { 347 ## no critic (BuiltinFunctions::ProhibitStringyEval) 348 $rdelspec = eval "qq{$rdel}" || do { 349 my $del; 350 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 351 { next if $rdel =~ /\Q$_/; $del = $_; last } 352 unless ($del) { 353 use Carp; 354 croak "Can't interpolate right delimiter $rdel" 355 } 356 eval "qq$del$rdel$del"; 357 }; 358 } 359 360 while (pos($$textref) < length($$textref)) 361 { 362 next if $$textref =~ m/\G\\./gc; 363 364 if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 365 { 366 $parapos = pos($$textref) - length($1) 367 unless defined $parapos; 368 } 369 elsif ($$textref =~ m/\G($rdelspec)/gc ) 370 { 371 $closetagpos = pos($$textref)-length($1); 372 goto matched; 373 } 374 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 375 { 376 next; 377 } 378 elsif ($bad && $$textref =~ m/\G($bad)/gcs) 379 { 380 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 381 goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 382 _failmsg "Found invalid nested tag: $1", pos $$textref; 383 goto failed; 384 } 385 elsif ($$textref =~ m/\G($ldel)/gc) 386 { 387 my $tag = $1; 388 pos($$textref) -= length($tag); # REWIND TO NESTED TAG 389 unless (_match_tagged(@_)) # MATCH NESTED TAG 390 { 391 goto short if $omode eq 'PARA' || $omode eq 'MAX'; 392 _failmsg "Found unbalanced nested tag: $tag", 393 pos $$textref; 394 goto failed; 395 } 396 } 397 else { $$textref =~ m/./gcs } 398 } 399 400short: 401 $closetagpos = pos($$textref); 402 goto matched if $omode eq 'MAX'; 403 goto failed unless $omode eq 'PARA'; 404 405 if (defined $parapos) { pos($$textref) = $parapos } 406 else { $parapos = pos($$textref) } 407 408 return ( 409 $startpos, $opentagpos-$startpos, # PREFIX 410 $opentagpos, $textpos-$opentagpos, # OPENING TAG 411 $textpos, $parapos-$textpos, # TEXT 412 $parapos, 0, # NO CLOSING TAG 413 $parapos, length($$textref)-$parapos, # REMAINDER 414 ); 415 416matched: 417 $endpos = pos($$textref); 418 return ( 419 $startpos, $opentagpos-$startpos, # PREFIX 420 $opentagpos, $textpos-$opentagpos, # OPENING TAG 421 $textpos, $closetagpos-$textpos, # TEXT 422 $closetagpos, $endpos-$closetagpos, # CLOSING TAG 423 $endpos, length($$textref)-$endpos, # REMAINDER 424 ); 425 426failed: 427 _failmsg "Did not find closing tag", pos $$textref unless $@; 428 pos($$textref) = $startpos; 429 return; 430} 431 432sub extract_variable (;$$) 433{ 434 my $textref = defined $_[0] ? \$_[0] : \$_; 435 return ("","","") unless defined $$textref; 436 my $pre = defined $_[1] ? $_[1] : '\s*'; 437 438 my @match = _match_variable($textref,$pre); 439 440 return _fail wantarray, $textref unless @match; 441 442 return _succeed wantarray, $textref, 443 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 444} 445 446sub _match_variable($$) 447{ 448# $# 449# $^ 450# $$ 451 my ($textref, $pre) = @_; 452 my $startpos = pos($$textref) = pos($$textref)||0; 453 unless ($$textref =~ m/\G($pre)/gc) 454 { 455 _failmsg "Did not find prefix: /$pre/", pos $$textref; 456 return; 457 } 458 my $varpos = pos($$textref); 459 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 460 { 461 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 462 { 463 _failmsg "Did not find leading dereferencer", pos $$textref; 464 pos $$textref = $startpos; 465 return; 466 } 467 my $deref = $1; 468 469 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 470 or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) 471 or $deref eq '$#' or $deref eq '$$' ) 472 { 473 _failmsg "Bad identifier after dereferencer", pos $$textref; 474 pos $$textref = $startpos; 475 return; 476 } 477 } 478 479 while (1) 480 { 481 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 482 next if _match_codeblock($textref, 483 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 484 qr/[({[]/, qr/[)}\]]/, 485 qr/[({[]/, qr/[)}\]]/, 0); 486 next if _match_codeblock($textref, 487 qr/\s*/, qr/[{[]/, qr/[}\]]/, 488 qr/[{[]/, qr/[}\]]/, 0); 489 next if _match_variable($textref,'\s*->\s*'); 490 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 491 last; 492 } 493 494 my $endpos = pos($$textref); 495 return ($startpos, $varpos-$startpos, 496 $varpos, $endpos-$varpos, 497 $endpos, length($$textref)-$endpos 498 ); 499} 500 501sub extract_codeblock (;$$$$$) 502{ 503 my $textref = defined $_[0] ? \$_[0] : \$_; 504 my $wantarray = wantarray; 505 my $ldel_inner = defined $_[1] ? $_[1] : '{'; 506 my $pre = defined $_[2] ? $_[2] : '\s*'; 507 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 508 my $rd = $_[4]; 509 my $rdel_inner = $ldel_inner; 510 my $rdel_outer = $ldel_outer; 511 my $posbug = pos; 512 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 513 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 514 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 515 { 516 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 517 } 518 pos = $posbug; 519 520 my @match = _match_codeblock($textref, $pre, 521 $ldel_outer, $rdel_outer, 522 $ldel_inner, $rdel_inner, 523 $rd); 524 return _fail($wantarray, $textref) unless @match; 525 return _succeed($wantarray, $textref, 526 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 527 ); 528 529} 530 531sub _match_codeblock($$$$$$$) 532{ 533 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; 534 my $startpos = pos($$textref) = pos($$textref) || 0; 535 unless ($$textref =~ m/\G($pre)/gc) 536 { 537 _failmsg qq{Did not match prefix /$pre/ at"} . 538 substr($$textref,pos($$textref),20) . 539 q{..."}, 540 pos $$textref; 541 return; 542 } 543 my $codepos = pos($$textref); 544 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 545 { 546 _failmsg qq{Did not find expected opening bracket at "} . 547 substr($$textref,pos($$textref),20) . 548 q{..."}, 549 pos $$textref; 550 pos $$textref = $startpos; 551 return; 552 } 553 my $closing = $1; 554 $closing =~ tr/([<{/)]>}/; 555 my $matched; 556 my $patvalid = 1; 557 while (pos($$textref) < length($$textref)) 558 { 559 $matched = ''; 560 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 561 { 562 $patvalid = 0; 563 next; 564 } 565 566 if ($$textref =~ m/\G\s*#.*/gc) 567 { 568 next; 569 } 570 571 if ($$textref =~ m/\G\s*($rdel_outer)/gc) 572 { 573 unless ($matched = ($closing && $1 eq $closing) ) 574 { 575 next if $1 eq '>'; # MIGHT BE A "LESS THAN" 576 _failmsg q{Mismatched closing bracket at "} . 577 substr($$textref,pos($$textref),20) . 578 qq{...". Expected '$closing'}, 579 pos $$textref; 580 } 581 last; 582 } 583 584 if (_match_variable($textref,'\s*') || 585 _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) 586 { 587 $patvalid = 0; 588 next; 589 } 590 591 592 # NEED TO COVER MANY MORE CASES HERE!!! 593 if ($$textref =~ m#\G\s*(?!$ldel_inner) 594 ( [-+*x/%^&|.]=? 595 | [!=]~ 596 | =(?!>) 597 | (\*\*|&&|\|\||<<|>>)=? 598 | split|grep|map|return 599 | [([] 600 )#gcx) 601 { 602 $patvalid = 1; 603 next; 604 } 605 606 if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) 607 { 608 $patvalid = 1; 609 next; 610 } 611 612 if ($$textref =~ m/\G\s*$ldel_outer/gc) 613 { 614 _failmsg q{Improperly nested codeblock at "} . 615 substr($$textref,pos($$textref),20) . 616 q{..."}, 617 pos $$textref; 618 last; 619 } 620 621 $patvalid = 0; 622 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 623 } 624 continue { $@ = undef } 625 626 unless ($matched) 627 { 628 _failmsg 'No match found for opening bracket', pos $$textref 629 unless $@; 630 return; 631 } 632 633 my $endpos = pos($$textref); 634 return ( $startpos, $codepos-$startpos, 635 $codepos, $endpos-$codepos, 636 $endpos, length($$textref)-$endpos, 637 ); 638} 639 640 641my %mods = ( 642 'none' => '[cgimsox]*', 643 'm' => '[cgimsox]*', 644 's' => '[cegimsox]*', 645 'tr' => '[cds]*', 646 'y' => '[cds]*', 647 'qq' => '', 648 'qx' => '', 649 'qw' => '', 650 'qr' => '[imsx]*', 651 'q' => '', 652); 653 654sub extract_quotelike (;$$) 655{ 656 my $textref = $_[0] ? \$_[0] : \$_; 657 my $wantarray = wantarray; 658 my $pre = defined $_[1] ? $_[1] : '\s*'; 659 660 my @match = _match_quotelike($textref,$pre,1,0); 661 return _fail($wantarray, $textref) unless @match; 662 return _succeed($wantarray, $textref, 663 $match[2], $match[18]-$match[2], # MATCH 664 @match[18,19], # REMAINDER 665 @match[0,1], # PREFIX 666 @match[2..17], # THE BITS 667 @match[20,21], # ANY FILLET? 668 ); 669}; 670 671sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) 672{ 673 my ($textref, $pre, $rawmatch, $qmark) = @_; 674 675 my ($textlen,$startpos, 676 $oppos, 677 $preld1pos,$ld1pos,$str1pos,$rd1pos, 678 $preld2pos,$ld2pos,$str2pos,$rd2pos, 679 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 680 681 unless ($$textref =~ m/\G($pre)/gc) 682 { 683 _failmsg qq{Did not find prefix /$pre/ at "} . 684 substr($$textref, pos($$textref), 20) . 685 q{..."}, 686 pos $$textref; 687 return; 688 } 689 $oppos = pos($$textref); 690 691 my $initial = substr($$textref,$oppos,1); 692 693 if ($initial && $initial =~ m|^[\"\'\`]| 694 || $rawmatch && $initial =~ m|^/| 695 || $qmark && $initial =~ m|^\?|) 696 { 697 unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 698 { 699 _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 700 substr($$textref, $oppos, 20) . 701 q{..."}, 702 pos $$textref; 703 pos $$textref = $startpos; 704 return; 705 } 706 $modpos= pos($$textref); 707 $rd1pos = $modpos-1; 708 709 if ($initial eq '/' || $initial eq '?') 710 { 711 $$textref =~ m/\G$mods{none}/gc 712 } 713 714 my $endpos = pos($$textref); 715 return ( 716 $startpos, $oppos-$startpos, # PREFIX 717 $oppos, 0, # NO OPERATOR 718 $oppos, 1, # LEFT DEL 719 $oppos+1, $rd1pos-$oppos-1, # STR/PAT 720 $rd1pos, 1, # RIGHT DEL 721 $modpos, 0, # NO 2ND LDEL 722 $modpos, 0, # NO 2ND STR 723 $modpos, 0, # NO 2ND RDEL 724 $modpos, $endpos-$modpos, # MODIFIERS 725 $endpos, $textlen-$endpos, # REMAINDER 726 ); 727 } 728 729 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) 730 { 731 _failmsg q{No quotelike operator found after prefix at "} . 732 substr($$textref, pos($$textref), 20) . 733 q{..."}, 734 pos $$textref; 735 pos $$textref = $startpos; 736 return; 737 } 738 739 my $op = $1; 740 $preld1pos = pos($$textref); 741 if ($op eq '<<') { 742 $ld1pos = pos($$textref); 743 my $label; 744 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 745 $label = $1; 746 } 747 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 748 | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 749 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 750 }gcsx) { 751 $label = $+; 752 } 753 else { 754 $label = ""; 755 } 756 my $extrapos = pos($$textref); 757 $$textref =~ m{.*\n}gc; 758 $str1pos = pos($$textref)--; 759 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { 760 _failmsg qq{Missing here doc terminator ('$label') after "} . 761 substr($$textref, $startpos, 20) . 762 q{..."}, 763 pos $$textref; 764 pos $$textref = $startpos; 765 return; 766 } 767 $rd1pos = pos($$textref); 768 $$textref =~ m{\Q$label\E\n}gc; 769 $ld2pos = pos($$textref); 770 return ( 771 $startpos, $oppos-$startpos, # PREFIX 772 $oppos, length($op), # OPERATOR 773 $ld1pos, $extrapos-$ld1pos, # LEFT DEL 774 $str1pos, $rd1pos-$str1pos, # STR/PAT 775 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 776 $ld2pos, 0, # NO 2ND LDEL 777 $ld2pos, 0, # NO 2ND STR 778 $ld2pos, 0, # NO 2ND RDEL 779 $ld2pos, 0, # NO MODIFIERS 780 $ld2pos, $textlen-$ld2pos, # REMAINDER 781 $extrapos, $str1pos-$extrapos, # FILLETED BIT 782 ); 783 } 784 785 $$textref =~ m/\G\s*/gc; 786 $ld1pos = pos($$textref); 787 $str1pos = $ld1pos+1; 788 789 unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 790 { 791 _failmsg "No block delimiter found after quotelike $op", 792 pos $$textref; 793 pos $$textref = $startpos; 794 return; 795 } 796 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 797 my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 798 if ($ldel1 =~ /[[(<{]/) 799 { 800 $rdel1 =~ tr/[({</])}>/; 801 defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1)) 802 || do { pos $$textref = $startpos; return }; 803 $ld2pos = pos($$textref); 804 $rd1pos = $ld2pos-1; 805 } 806 else 807 { 808 $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 809 || do { pos $$textref = $startpos; return }; 810 $ld2pos = $rd1pos = pos($$textref)-1; 811 } 812 813 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 814 if ($second_arg) 815 { 816 my ($ldel2, $rdel2); 817 if ($ldel1 =~ /[[(<{]/) 818 { 819 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 820 { 821 _failmsg "Missing second block for quotelike $op", 822 pos $$textref; 823 pos $$textref = $startpos; 824 return; 825 } 826 $ldel2 = $rdel2 = "\Q$1"; 827 $rdel2 =~ tr/[({</])}>/; 828 } 829 else 830 { 831 $ldel2 = $rdel2 = $ldel1; 832 } 833 $str2pos = $ld2pos+1; 834 835 if ($ldel2 =~ /[[(<{]/) 836 { 837 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 838 defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2)) 839 || do { pos $$textref = $startpos; return }; 840 } 841 else 842 { 843 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 844 || do { pos $$textref = $startpos; return }; 845 } 846 $rd2pos = pos($$textref)-1; 847 } 848 else 849 { 850 $ld2pos = $str2pos = $rd2pos = $rd1pos; 851 } 852 853 $modpos = pos $$textref; 854 855 $$textref =~ m/\G($mods{$op})/gc; 856 my $endpos = pos $$textref; 857 858 return ( 859 $startpos, $oppos-$startpos, # PREFIX 860 $oppos, length($op), # OPERATOR 861 $ld1pos, 1, # LEFT DEL 862 $str1pos, $rd1pos-$str1pos, # STR/PAT 863 $rd1pos, 1, # RIGHT DEL 864 $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 865 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 866 $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 867 $modpos, $endpos-$modpos, # MODIFIERS 868 $endpos, $textlen-$endpos, # REMAINDER 869 ); 870} 871 872my $def_func = [ 873 sub { extract_variable($_[0], '') }, 874 sub { extract_quotelike($_[0],'') }, 875 sub { extract_codeblock($_[0],'{}','') }, 876]; 877 878sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 879{ 880 my $textref = defined($_[0]) ? \$_[0] : \$_; 881 my $posbug = pos; 882 my ($lastpos, $firstpos); 883 my @fields = (); 884 885 #for ($$textref) 886 { 887 my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 888 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 889 my $igunk = $_[3]; 890 891 pos $$textref ||= 0; 892 893 unless (wantarray) 894 { 895 use Carp; 896 carp "extract_multiple reset maximal count to 1 in scalar context" 897 if $^W && defined($_[2]) && $max > 1; 898 $max = 1 899 } 900 901 my $unkpos; 902 my $class; 903 904 my @class; 905 foreach my $func ( @func ) 906 { 907 if (ref($func) eq 'HASH') 908 { 909 push @class, (keys %$func)[0]; 910 $func = (values %$func)[0]; 911 } 912 else 913 { 914 push @class, undef; 915 } 916 } 917 918 FIELD: while (pos($$textref) < length($$textref)) 919 { 920 my ($field, $rem); 921 my @bits; 922 foreach my $i ( 0..$#func ) 923 { 924 my $pref; 925 my $func = $func[$i]; 926 $class = $class[$i]; 927 $lastpos = pos $$textref; 928 if (ref($func) eq 'CODE') 929 { ($field,$rem,$pref) = @bits = $func->($$textref) } 930 elsif (ref($func) eq 'Text::Balanced::Extractor') 931 { @bits = $field = $func->extract($$textref) } 932 elsif( $$textref =~ m/\G$func/gc ) 933 { @bits = $field = defined($1) 934 ? $1 935 : substr($$textref, $-[0], $+[0] - $-[0]) 936 } 937 $pref ||= ""; 938 if (defined($field) && length($field)) 939 { 940 if (!$igunk) { 941 $unkpos = $lastpos 942 if length($pref) && !defined($unkpos); 943 if (defined $unkpos) 944 { 945 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 946 $firstpos = $unkpos unless defined $firstpos; 947 undef $unkpos; 948 last FIELD if @fields == $max; 949 } 950 } 951 push @fields, $class 952 ? bless (\$field, $class) 953 : $field; 954 $firstpos = $lastpos unless defined $firstpos; 955 $lastpos = pos $$textref; 956 last FIELD if @fields == $max; 957 next FIELD; 958 } 959 } 960 if ($$textref =~ /\G(.)/gcs) 961 { 962 $unkpos = pos($$textref)-1 963 unless $igunk || defined $unkpos; 964 } 965 } 966 967 if (defined $unkpos) 968 { 969 push @fields, substr($$textref, $unkpos); 970 $firstpos = $unkpos unless defined $firstpos; 971 $lastpos = length $$textref; 972 } 973 last; 974 } 975 976 pos $$textref = $lastpos; 977 return @fields if wantarray; 978 979 $firstpos ||= 0; 980 eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 981 pos $$textref = $firstpos }; 982 return $fields[0]; 983} 984 985sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 986{ 987 my $ldel = $_[0]; 988 my $rdel = $_[1]; 989 my $pre = defined $_[2] ? $_[2] : '\s*'; 990 my %options = defined $_[3] ? %{$_[3]} : (); 991 my $omode = defined $options{fail} ? $options{fail} : ''; 992 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 993 : defined($options{reject}) ? $options{reject} 994 : '' 995 ; 996 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 997 : defined($options{ignore}) ? $options{ignore} 998 : '' 999 ; 1000 1001 if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } 1002 1003 my $posbug = pos; 1004 for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } 1005 pos = $posbug; 1006 1007 my $closure = sub 1008 { 1009 my $textref = defined $_[0] ? \$_[0] : \$_; 1010 my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1011 1012 return _fail(wantarray, $textref) unless @match; 1013 return _succeed wantarray, $textref, 1014 $match[2], $match[3]+$match[5]+$match[7], # MATCH 1015 @match[8..9,0..1,2..7]; # REM, PRE, BITS 1016 }; 1017 1018 bless $closure, 'Text::Balanced::Extractor'; 1019} 1020 1021package Text::Balanced::Extractor; 1022 1023sub extract($$) # ($self, $text) 1024{ 1025 &{$_[0]}($_[1]); 1026} 1027 1028package Text::Balanced::ErrorMsg; 1029 1030use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; 1031 10321; 1033 1034__END__ 1035 1036=pod 1037 1038=head1 NAME 1039 1040Text::Balanced - Extract delimited text sequences from strings. 1041 1042=head1 SYNOPSIS 1043 1044 use Text::Balanced qw ( 1045 extract_delimited 1046 extract_bracketed 1047 extract_quotelike 1048 extract_codeblock 1049 extract_variable 1050 extract_tagged 1051 extract_multiple 1052 gen_delimited_pat 1053 gen_extract_tagged 1054 ); 1055 1056 # Extract the initial substring of $text that is delimited by 1057 # two (unescaped) instances of the first character in $delim. 1058 1059 ($extracted, $remainder) = extract_delimited($text,$delim); 1060 1061 # Extract the initial substring of $text that is bracketed 1062 # with a delimiter(s) specified by $delim (where the string 1063 # in $delim contains one or more of '(){}[]<>'). 1064 1065 ($extracted, $remainder) = extract_bracketed($text,$delim); 1066 1067 # Extract the initial substring of $text that is bounded by 1068 # an XML tag. 1069 1070 ($extracted, $remainder) = extract_tagged($text); 1071 1072 # Extract the initial substring of $text that is bounded by 1073 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1074 1075 ($extracted, $remainder) = 1076 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1077 1078 # Extract the initial substring of $text that represents a 1079 # Perl "quote or quote-like operation" 1080 1081 ($extracted, $remainder) = extract_quotelike($text); 1082 1083 # Extract the initial substring of $text that represents a block 1084 # of Perl code, bracketed by any of character(s) specified by $delim 1085 # (where the string $delim contains one or more of '(){}[]<>'). 1086 1087 ($extracted, $remainder) = extract_codeblock($text,$delim); 1088 1089 # Extract the initial substrings of $text that would be extracted by 1090 # one or more sequential applications of the specified functions 1091 # or regular expressions 1092 1093 @extracted = extract_multiple($text, 1094 [ \&extract_bracketed, 1095 \&extract_quotelike, 1096 \&some_other_extractor_sub, 1097 qr/[xyz]*/, 1098 'literal', 1099 ]); 1100 1101 # Create a string representing an optimized pattern (a la Friedl) 1102 # that matches a substring delimited by any of the specified characters 1103 # (in this case: any type of quote or a slash) 1104 1105 $patstring = gen_delimited_pat(q{'"`/}); 1106 1107 # Generate a reference to an anonymous sub that is just like extract_tagged 1108 # but pre-compiled and optimized for a specific pair of tags, and 1109 # consequently much faster (i.e. 3 times faster). It uses qr// for better 1110 # performance on repeated calls. 1111 1112 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1113 ($extracted, $remainder) = $extract_head->($text); 1114 1115=head1 DESCRIPTION 1116 1117The various C<extract_...> subroutines may be used to 1118extract a delimited substring, possibly after skipping a 1119specified prefix string. By default, that prefix is 1120optional whitespace (C</\s*/>), but you can change it to whatever 1121you wish (see below). 1122 1123The substring to be extracted must appear at the 1124current C<pos> location of the string's variable 1125(or at index zero, if no C<pos> position is defined). 1126In other words, the C<extract_...> subroutines I<don't> 1127extract the first occurrence of a substring anywhere 1128in a string (like an unanchored regex would). Rather, 1129they extract an occurrence of the substring appearing 1130immediately at the current matching position in the 1131string (like a C<\G>-anchored regex would). 1132 1133=head2 General Behaviour in List Contexts 1134 1135In a list context, all the subroutines return a list, the first three 1136elements of which are always: 1137 1138=over 4 1139 1140=item [0] 1141 1142The extracted string, including the specified delimiters. 1143If the extraction fails C<undef> is returned. 1144 1145=item [1] 1146 1147The remainder of the input string (i.e. the characters after the 1148extracted string). On failure, the entire string is returned. 1149 1150=item [2] 1151 1152The skipped prefix (i.e. the characters before the extracted string). 1153On failure, C<undef> is returned. 1154 1155=back 1156 1157Note that in a list context, the contents of the original input text (the first 1158argument) are not modified in any way. 1159 1160However, if the input text was passed in a variable, that variable's 1161C<pos> value is updated to point at the first character after the 1162extracted text. That means that in a list context the various 1163subroutines can be used much like regular expressions. For example: 1164 1165 while ( $next = (extract_quotelike($text))[0] ) 1166 { 1167 # process next quote-like (in $next) 1168 } 1169 1170=head2 General Behaviour in Scalar and Void Contexts 1171 1172In a scalar context, the extracted string is returned, having first been 1173removed from the input text. Thus, the following code also processes 1174each quote-like operation, but actually removes them from $text: 1175 1176 while ( $next = extract_quotelike($text) ) 1177 { 1178 # process next quote-like (in $next) 1179 } 1180 1181Note that if the input text is a read-only string (i.e. a literal), 1182no attempt is made to remove the extracted text. 1183 1184In a void context the behaviour of the extraction subroutines is 1185exactly the same as in a scalar context, except (of course) that the 1186extracted substring is not returned. 1187 1188=head2 A Note About Prefixes 1189 1190Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1191This can bite you if you're expecting a prefix specification like 1192'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1193pattern will only succeed if the <H1> tag is on the current line, since 1194. normally doesn't match newlines. 1195 1196To overcome this limitation, you need to turn on /s matching within 1197the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1198 1199=head2 Functions 1200 1201=over 4 1202 1203=item C<extract_delimited> 1204 1205The C<extract_delimited> function formalizes the common idiom 1206of extracting a single-character-delimited substring from the start of 1207a string. For example, to extract a single-quote delimited string, the 1208following code is typically used: 1209 1210 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1211 $extracted = $1; 1212 1213but with C<extract_delimited> it can be simplified to: 1214 1215 ($extracted,$remainder) = extract_delimited($text, "'"); 1216 1217C<extract_delimited> takes up to four scalars (the input text, the 1218delimiters, a prefix pattern to be skipped, and any escape characters) 1219and extracts the initial substring of the text that 1220is appropriately delimited. If the delimiter string has multiple 1221characters, the first one encountered in the text is taken to delimit 1222the substring. 1223The third argument specifies a prefix pattern that is to be skipped 1224(but must be present!) before the substring is extracted. 1225The final argument specifies the escape character to be used for each 1226delimiter. 1227 1228All arguments are optional. If the escape characters are not specified, 1229every delimiter is escaped with a backslash (C<\>). 1230If the prefix is not specified, the 1231pattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1232is also not specified, the set C</["'`]/> is used. If the text to be processed 1233is not specified either, C<$_> is used. 1234 1235In list context, C<extract_delimited> returns a array of three 1236elements, the extracted substring (I<including the surrounding 1237delimiters>), the remainder of the text, and the skipped prefix (if 1238any). If a suitable delimited substring is not found, the first 1239element of the array is the empty string, the second is the complete 1240original text, and the prefix returned in the third element is an 1241empty string. 1242 1243In a scalar context, just the extracted substring is returned. In 1244a void context, the extracted substring (and any prefix) are simply 1245removed from the beginning of the first argument. 1246 1247Examples: 1248 1249 # Remove a single-quoted substring from the very beginning of $text: 1250 1251 $substring = extract_delimited($text, "'", ''); 1252 1253 # Remove a single-quoted Pascalish substring (i.e. one in which 1254 # doubling the quote character escapes it) from the very 1255 # beginning of $text: 1256 1257 $substring = extract_delimited($text, "'", '', "'"); 1258 1259 # Extract a single- or double- quoted substring from the 1260 # beginning of $text, optionally after some whitespace 1261 # (note the list context to protect $text from modification): 1262 1263 ($substring) = extract_delimited $text, q{"'}; 1264 1265 # Delete the substring delimited by the first '/' in $text: 1266 1267 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1268 1269Note that this last example is I<not> the same as deleting the first 1270quote-like pattern. For instance, if C<$text> contained the string: 1271 1272 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1273 1274then after the deletion it would contain: 1275 1276 "if ('.$UNIXCMD/s) { $cmd = $1; }" 1277 1278not: 1279 1280 "if ('./cmd' =~ ms) { $cmd = $1; }" 1281 1282See L<"extract_quotelike"> for a (partial) solution to this problem. 1283 1284=item C<extract_bracketed> 1285 1286Like C<"extract_delimited">, the C<extract_bracketed> function takes 1287up to three optional scalar arguments: a string to extract from, a delimiter 1288specifier, and a prefix pattern. As before, a missing prefix defaults to 1289optional whitespace and a missing text defaults to C<$_>. However, a missing 1290delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1291 1292C<extract_bracketed> extracts a balanced-bracket-delimited 1293substring (using any one (or more) of the user-specified delimiter 1294brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1295respect quoted unbalanced brackets (see below). 1296 1297A "delimiter bracket" is a bracket in list of delimiters passed as 1298C<extract_bracketed>'s second argument. Delimiter brackets are 1299specified by giving either the left or right (or both!) versions 1300of the required bracket(s). Note that the order in which 1301two or more delimiter brackets are specified is not significant. 1302 1303A "balanced-bracket-delimited substring" is a substring bounded by 1304matched brackets, such that any other (left or right) delimiter 1305bracket I<within> the substring is also matched by an opposite 1306(right or left) delimiter bracket I<at the same level of nesting>. Any 1307type of bracket not in the delimiter list is treated as an ordinary 1308character. 1309 1310In other words, each type of bracket specified as a delimiter must be 1311balanced and correctly nested within the substring, and any other kind of 1312("non-delimiter") bracket in the substring is ignored. 1313 1314For example, given the string: 1315 1316 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1317 1318then a call to C<extract_bracketed> in a list context: 1319 1320 @result = extract_bracketed( $text, '{}' ); 1321 1322would return: 1323 1324 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1325 1326since both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1327(In a scalar context just the first element of the array would be returned. In 1328a void context, C<$text> would be replaced by an empty string.) 1329 1330Likewise the call in: 1331 1332 @result = extract_bracketed( $text, '{[' ); 1333 1334would return the same result, since all sets of both types of specified 1335delimiter brackets are correctly nested and balanced. 1336 1337However, the call in: 1338 1339 @result = extract_bracketed( $text, '{([<' ); 1340 1341would fail, returning: 1342 1343 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1344 1345because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1346the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1347return an empty string. In a void context, C<$text> would be unchanged.) 1348 1349Note that the embedded single-quotes in the string don't help in this 1350case, since they have not been specified as acceptable delimiters and are 1351therefore treated as non-delimiter characters (and ignored). 1352 1353However, if a particular species of quote character is included in the 1354delimiter specification, then that type of quote will be correctly handled. 1355for example, if C<$text> is: 1356 1357 $text = '<A HREF=">>>>">link</A>'; 1358 1359then 1360 1361 @result = extract_bracketed( $text, '<">' ); 1362 1363returns: 1364 1365 ( '<A HREF=">>>>">', 'link</A>', "" ) 1366 1367as expected. Without the specification of C<"> as an embedded quoter: 1368 1369 @result = extract_bracketed( $text, '<>' ); 1370 1371the result would be: 1372 1373 ( '<A HREF=">', '>>>">link</A>', "" ) 1374 1375In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1376quoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1377letter 'q' as a delimiter. Hence: 1378 1379 @result = extract_bracketed( $text, '<q>' ); 1380 1381would correctly match something like this: 1382 1383 $text = '<leftop: conj /and/ conj>'; 1384 1385See also: C<"extract_quotelike"> and C<"extract_codeblock">. 1386 1387=item C<extract_variable> 1388 1389C<extract_variable> extracts any valid Perl variable or 1390variable-involved expression, including scalars, arrays, hashes, array 1391accesses, hash look-ups, method calls through objects, subroutine calls 1392through subroutine references, etc. 1393 1394The subroutine takes up to two optional arguments: 1395 1396=over 4 1397 1398=item 1. 1399 1400A string to be processed (C<$_> if the string is omitted or C<undef>) 1401 1402=item 2. 1403 1404A string specifying a pattern to be matched as a prefix (which is to be 1405skipped). If omitted, optional whitespace is skipped. 1406 1407=back 1408 1409On success in a list context, an array of 3 elements is returned. The 1410elements are: 1411 1412=over 4 1413 1414=item [0] 1415 1416the extracted variable, or variablish expression 1417 1418=item [1] 1419 1420the remainder of the input text, 1421 1422=item [2] 1423 1424the prefix substring (if any), 1425 1426=back 1427 1428On failure, all of these values (except the remaining text) are C<undef>. 1429 1430In a scalar context, C<extract_variable> returns just the complete 1431substring that matched a variablish expression. C<undef> is returned on 1432failure. In addition, the original input text has the returned substring 1433(and any prefix) removed from it. 1434 1435In a void context, the input text just has the matched substring (and 1436any specified prefix) removed. 1437 1438=item C<extract_tagged> 1439 1440C<extract_tagged> extracts and segments text between (balanced) 1441specified tags. 1442 1443The subroutine takes up to five optional arguments: 1444 1445=over 4 1446 1447=item 1. 1448 1449A string to be processed (C<$_> if the string is omitted or C<undef>) 1450 1451=item 2. 1452 1453A string specifying a pattern to be matched as the opening tag. 1454If the pattern string is omitted (or C<undef>) then a pattern 1455that matches any standard XML tag is used. 1456 1457=item 3. 1458 1459A string specifying a pattern to be matched at the closing tag. 1460If the pattern string is omitted (or C<undef>) then the closing 1461tag is constructed by inserting a C</> after any leading bracket 1462characters in the actual opening tag that was matched (I<not> the pattern 1463that matched the tag). For example, if the opening tag pattern 1464is specified as C<'{{\w+}}'> and actually matched the opening tag 1465C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1466 1467=item 4. 1468 1469A string specifying a pattern to be matched as a prefix (which is to be 1470skipped). If omitted, optional whitespace is skipped. 1471 1472=item 5. 1473 1474A hash reference containing various parsing options (see below) 1475 1476=back 1477 1478The various options that can be specified are: 1479 1480=over 4 1481 1482=item C<reject =E<gt> $listref> 1483 1484The list reference contains one or more strings specifying patterns 1485that must I<not> appear within the tagged text. 1486 1487For example, to extract 1488an HTML link (which should not contain nested links) use: 1489 1490 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1491 1492=item C<ignore =E<gt> $listref> 1493 1494The list reference contains one or more strings specifying patterns 1495that are I<not> to be treated as nested tags within the tagged text 1496(even if they would match the start tag pattern). 1497 1498For example, to extract an arbitrary XML tag, but ignore "empty" elements: 1499 1500 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1501 1502(also see L<"gen_delimited_pat"> below). 1503 1504=item C<fail =E<gt> $str> 1505 1506The C<fail> option indicates the action to be taken if a matching end 1507tag is not encountered (i.e. before the end of the string or some 1508C<reject> pattern matches). By default, a failure to match a closing 1509tag causes C<extract_tagged> to immediately fail. 1510 1511However, if the string value associated with <reject> is "MAX", then 1512C<extract_tagged> returns the complete text up to the point of failure. 1513If the string is "PARA", C<extract_tagged> returns only the first paragraph 1514after the tag (up to the first line that is either empty or contains 1515only whitespace characters). 1516If the string is "", the default behaviour (i.e. failure) is reinstated. 1517 1518For example, suppose the start tag "/para" introduces a paragraph, which then 1519continues until the next "/endpara" tag or until another "/para" tag is 1520encountered: 1521 1522 $text = "/para line 1\n\nline 3\n/para line 4"; 1523 1524 extract_tagged($text, '/para', '/endpara', undef, 1525 {reject => '/para', fail => MAX ); 1526 1527 # EXTRACTED: "/para line 1\n\nline 3\n" 1528 1529Suppose instead, that if no matching "/endpara" tag is found, the "/para" 1530tag refers only to the immediately following paragraph: 1531 1532 $text = "/para line 1\n\nline 3\n/para line 4"; 1533 1534 extract_tagged($text, '/para', '/endpara', undef, 1535 {reject => '/para', fail => MAX ); 1536 1537 # EXTRACTED: "/para line 1\n" 1538 1539Note that the specified C<fail> behaviour applies to nested tags as well. 1540 1541=back 1542 1543On success in a list context, an array of 6 elements is returned. The elements are: 1544 1545=over 4 1546 1547=item [0] 1548 1549the extracted tagged substring (including the outermost tags), 1550 1551=item [1] 1552 1553the remainder of the input text, 1554 1555=item [2] 1556 1557the prefix substring (if any), 1558 1559=item [3] 1560 1561the opening tag 1562 1563=item [4] 1564 1565the text between the opening and closing tags 1566 1567=item [5] 1568 1569the closing tag (or "" if no closing tag was found) 1570 1571=back 1572 1573On failure, all of these values (except the remaining text) are C<undef>. 1574 1575In a scalar context, C<extract_tagged> returns just the complete 1576substring that matched a tagged text (including the start and end 1577tags). C<undef> is returned on failure. In addition, the original input 1578text has the returned substring (and any prefix) removed from it. 1579 1580In a void context, the input text just has the matched substring (and 1581any specified prefix) removed. 1582 1583=item C<gen_extract_tagged> 1584 1585C<gen_extract_tagged> generates a new anonymous subroutine which 1586extracts text between (balanced) specified tags. In other words, 1587it generates a function identical in function to C<extract_tagged>. 1588 1589The difference between C<extract_tagged> and the anonymous 1590subroutines generated by 1591C<gen_extract_tagged>, is that those generated subroutines: 1592 1593=over 4 1594 1595=item * 1596 1597do not have to reparse tag specification or parsing options every time 1598they are called (whereas C<extract_tagged> has to effectively rebuild 1599its tag parser on every call); 1600 1601=item * 1602 1603make use of the new qr// construct to pre-compile the regexes they use 1604(whereas C<extract_tagged> uses standard string variable interpolation 1605to create tag-matching patterns). 1606 1607=back 1608 1609The subroutine takes up to four optional arguments (the same set as 1610C<extract_tagged> except for the string to be processed). It returns 1611a reference to a subroutine which in turn takes a single argument (the text to 1612be extracted from). 1613 1614In other words, the implementation of C<extract_tagged> is exactly 1615equivalent to: 1616 1617 sub extract_tagged 1618 { 1619 my $text = shift; 1620 $extractor = gen_extract_tagged(@_); 1621 return $extractor->($text); 1622 } 1623 1624(although C<extract_tagged> is not currently implemented that way). 1625 1626Using C<gen_extract_tagged> to create extraction functions for specific tags 1627is a good idea if those functions are going to be called more than once, since 1628their performance is typically twice as good as the more general-purpose 1629C<extract_tagged>. 1630 1631=item C<extract_quotelike> 1632 1633C<extract_quotelike> attempts to recognize, extract, and segment any 1634one of the various Perl quotes and quotelike operators (see 1635L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1636delimiters (for the quotelike operators), and trailing modifiers are 1637all caught. For example, in: 1638 1639 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1640 1641 extract_quotelike ' "You said, \"Use sed\"." ' 1642 1643 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1644 1645 extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1646 1647the full Perl quotelike operations are all extracted correctly. 1648 1649Note too that, when using the /x modifier on a regex, any comment 1650containing the current pattern delimiter will cause the regex to be 1651immediately terminated. In other words: 1652 1653 'm / 1654 (?i) # CASE INSENSITIVE 1655 [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1656 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1657 /x' 1658 1659will be extracted as if it were: 1660 1661 'm / 1662 (?i) # CASE INSENSITIVE 1663 [a-z_] # LEADING ALPHABETIC/' 1664 1665This behaviour is identical to that of the actual compiler. 1666 1667C<extract_quotelike> takes two arguments: the text to be processed and 1668a prefix to be matched at the very beginning of the text. If no prefix 1669is specified, optional whitespace is the default. If no text is given, 1670C<$_> is used. 1671 1672In a list context, an array of 11 elements is returned. The elements are: 1673 1674=over 4 1675 1676=item [0] 1677 1678the extracted quotelike substring (including trailing modifiers), 1679 1680=item [1] 1681 1682the remainder of the input text, 1683 1684=item [2] 1685 1686the prefix substring (if any), 1687 1688=item [3] 1689 1690the name of the quotelike operator (if any), 1691 1692=item [4] 1693 1694the left delimiter of the first block of the operation, 1695 1696=item [5] 1697 1698the text of the first block of the operation 1699(that is, the contents of 1700a quote, the regex of a match or substitution or the target list of a 1701translation), 1702 1703=item [6] 1704 1705the right delimiter of the first block of the operation, 1706 1707=item [7] 1708 1709the left delimiter of the second block of the operation 1710(that is, if it is a C<s>, C<tr>, or C<y>), 1711 1712=item [8] 1713 1714the text of the second block of the operation 1715(that is, the replacement of a substitution or the translation list 1716of a translation), 1717 1718=item [9] 1719 1720the right delimiter of the second block of the operation (if any), 1721 1722=item [10] 1723 1724the trailing modifiers on the operation (if any). 1725 1726=back 1727 1728For each of the fields marked "(if any)" the default value on success is 1729an empty string. 1730On failure, all of these values (except the remaining text) are C<undef>. 1731 1732In a scalar context, C<extract_quotelike> returns just the complete substring 1733that matched a quotelike operation (or C<undef> on failure). In a scalar or 1734void context, the input text has the same substring (and any specified 1735prefix) removed. 1736 1737Examples: 1738 1739 # Remove the first quotelike literal that appears in text 1740 1741 $quotelike = extract_quotelike($text,'.*?'); 1742 1743 # Replace one or more leading whitespace-separated quotelike 1744 # literals in $_ with "<QLL>" 1745 1746 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1747 1748 1749 # Isolate the search pattern in a quotelike operation from $text 1750 1751 ($op,$pat) = (extract_quotelike $text)[3,5]; 1752 if ($op =~ /[ms]/) 1753 { 1754 print "search pattern: $pat\n"; 1755 } 1756 else 1757 { 1758 print "$op is not a pattern matching operation\n"; 1759 } 1760 1761=item C<extract_quotelike> 1762 1763C<extract_quotelike> can successfully extract "here documents" from an input 1764string, but with an important caveat in list contexts. 1765 1766Unlike other types of quote-like literals, a here document is rarely 1767a contiguous substring. For example, a typical piece of code using 1768here document might look like this: 1769 1770 <<'EOMSG' || die; 1771 This is the message. 1772 EOMSG 1773 exit; 1774 1775Given this as an input string in a scalar context, C<extract_quotelike> 1776would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1777leaving the string " || die;\nexit;" in the original variable. In other words, 1778the two separate pieces of the here document are successfully extracted and 1779concatenated. 1780 1781In a list context, C<extract_quotelike> would return the list 1782 1783=over 4 1784 1785=item [0] 1786 1787"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1788including fore and aft delimiters), 1789 1790=item [1] 1791 1792" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1793 1794=item [2] 1795 1796"" (i.e. the prefix substring -- trivial in this case), 1797 1798=item [3] 1799 1800"<<" (i.e. the "name" of the quotelike operator) 1801 1802=item [4] 1803 1804"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1805 1806=item [5] 1807 1808"This is the message.\n" (i.e. the text of the here document), 1809 1810=item [6] 1811 1812"EOMSG" (i.e. the right delimiter of the here document), 1813 1814=item [7..10] 1815 1816"" (a here document has no second left delimiter, second text, second right 1817delimiter, or trailing modifiers). 1818 1819=back 1820 1821However, the matching position of the input variable would be set to 1822"exit;" (i.e. I<after> the closing delimiter of the here document), 1823which would cause the earlier " || die;\nexit;" to be skipped in any 1824sequence of code fragment extractions. 1825 1826To avoid this problem, when it encounters a here document whilst 1827extracting from a modifiable string, C<extract_quotelike> silently 1828rearranges the string to an equivalent piece of Perl: 1829 1830 <<'EOMSG' 1831 This is the message. 1832 EOMSG 1833 || die; 1834 exit; 1835 1836in which the here document I<is> contiguous. It still leaves the 1837matching position after the here document, but now the rest of the line 1838on which the here document starts is not skipped. 1839 1840To prevent <extract_quotelike> from mucking about with the input in this way 1841(this is the only case where a list-context C<extract_quotelike> does so), 1842you can pass the input variable as an interpolated literal: 1843 1844 $quotelike = extract_quotelike("$var"); 1845 1846=item C<extract_codeblock> 1847 1848C<extract_codeblock> attempts to recognize and extract a balanced 1849bracket delimited substring that may contain unbalanced brackets 1850inside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1851is like a combination of C<"extract_bracketed"> and 1852C<"extract_quotelike">. 1853 1854C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1855a text to process, a set of delimiter brackets to look for, and a prefix to 1856match first. It also takes an optional fourth parameter, which allows the 1857outermost delimiter brackets to be specified separately (see below). 1858 1859Omitting the first argument (input text) means process C<$_> instead. 1860Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1861Omitting the third argument (prefix argument) implies optional whitespace at the start. 1862Omitting the fourth argument (outermost delimiter brackets) indicates that the 1863value of the second argument is to be used for the outermost delimiters. 1864 1865Once the prefix and the outermost opening delimiter bracket have been 1866recognized, code blocks are extracted by stepping through the input text and 1867trying the following alternatives in sequence: 1868 1869=over 4 1870 1871=item 1. 1872 1873Try and match a closing delimiter bracket. If the bracket was the same 1874species as the last opening bracket, return the substring to that 1875point. If the bracket was mismatched, return an error. 1876 1877=item 2. 1878 1879Try to match a quote or quotelike operator. If found, call 1880C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1881the error it returned. Otherwise go back to step 1. 1882 1883=item 3. 1884 1885Try to match an opening delimiter bracket. If found, call 1886C<extract_codeblock> recursively to eat the embedded block. If the 1887recursive call fails, return an error. Otherwise, go back to step 1. 1888 1889=item 4. 1890 1891Unconditionally match a bareword or any other single character, and 1892then go back to step 1. 1893 1894=back 1895 1896Examples: 1897 1898 # Find a while loop in the text 1899 1900 if ($text =~ s/.*?while\s*\{/{/) 1901 { 1902 $loop = "while " . extract_codeblock($text); 1903 } 1904 1905 # Remove the first round-bracketed list (which may include 1906 # round- or curly-bracketed code blocks or quotelike operators) 1907 1908 extract_codeblock $text, "(){}", '[^(]*'; 1909 1910 1911The ability to specify a different outermost delimiter bracket is useful 1912in some circumstances. For example, in the Parse::RecDescent module, 1913parser actions which are to be performed only on a successful parse 1914are specified using a C<E<lt>defer:...E<gt>> directive. For example: 1915 1916 sentence: subject verb object 1917 <defer: {$::theVerb = $item{verb}} > 1918 1919Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1920within the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1921 1922A deferred action like this: 1923 1924 <defer: {if ($count>10) {$count--}} > 1925 1926will be incorrectly parsed as: 1927 1928 <defer: {if ($count> 1929 1930because the "less than" operator is interpreted as a closing delimiter. 1931 1932But, by extracting the directive using 1933S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1934the '>' character is only treated as a delimited at the outermost 1935level of the code block, so the directive is parsed correctly. 1936 1937=item C<extract_multiple> 1938 1939The C<extract_multiple> subroutine takes a string to be processed and a 1940list of extractors (subroutines or regular expressions) to apply to that string. 1941 1942In an array context C<extract_multiple> returns an array of substrings 1943of the original string, as extracted by the specified extractors. 1944In a scalar context, C<extract_multiple> returns the first 1945substring successfully extracted from the original string. In both 1946scalar and void contexts the original string has the first successfully 1947extracted substring removed from it. In all contexts 1948C<extract_multiple> starts at the current C<pos> of the string, and 1949sets that C<pos> appropriately after it matches. 1950 1951Hence, the aim of a call to C<extract_multiple> in a list context 1952is to split the processed string into as many non-overlapping fields as 1953possible, by repeatedly applying each of the specified extractors 1954to the remainder of the string. Thus C<extract_multiple> is 1955a generalized form of Perl's C<split> subroutine. 1956 1957The subroutine takes up to four optional arguments: 1958 1959=over 4 1960 1961=item 1. 1962 1963A string to be processed (C<$_> if the string is omitted or C<undef>) 1964 1965=item 2. 1966 1967A reference to a list of subroutine references and/or qr// objects and/or 1968literal strings and/or hash references, specifying the extractors 1969to be used to split the string. If this argument is omitted (or 1970C<undef>) the list: 1971 1972 [ 1973 sub { extract_variable($_[0], '') }, 1974 sub { extract_quotelike($_[0],'') }, 1975 sub { extract_codeblock($_[0],'{}','') }, 1976 ] 1977 1978is used. 1979 1980=item 3. 1981 1982An number specifying the maximum number of fields to return. If this 1983argument is omitted (or C<undef>), split continues as long as possible. 1984 1985If the third argument is I<N>, then extraction continues until I<N> fields 1986have been successfully extracted, or until the string has been completely 1987processed. 1988 1989Note that in scalar and void contexts the value of this argument is 1990automatically reset to 1 (under C<-w>, a warning is issued if the argument 1991has to be reset). 1992 1993=item 4. 1994 1995A value indicating whether unmatched substrings (see below) within the 1996text should be skipped or returned as fields. If the value is true, 1997such substrings are skipped. Otherwise, they are returned. 1998 1999=back 2000 2001The extraction process works by applying each extractor in 2002sequence to the text string. 2003 2004If the extractor is a subroutine it is called in a list context and is 2005expected to return a list of a single element, namely the extracted 2006text. It may optionally also return two further arguments: a string 2007representing the text left after extraction (like $' for a pattern 2008match), and a string representing any prefix skipped before the 2009extraction (like $` in a pattern match). Note that this is designed 2010to facilitate the use of other Text::Balanced subroutines with 2011C<extract_multiple>. Note too that the value returned by an extractor 2012subroutine need not bear any relationship to the corresponding substring 2013of the original text (see examples below). 2014 2015If the extractor is a precompiled regular expression or a string, 2016it is matched against the text in a scalar context with a leading 2017'\G' and the gc modifiers enabled. The extracted value is either 2018$1 if that variable is defined after the match, or else the 2019complete match (i.e. $&). 2020 2021If the extractor is a hash reference, it must contain exactly one element. 2022The value of that element is one of the 2023above extractor types (subroutine reference, regular expression, or string). 2024The key of that element is the name of a class into which the successful 2025return value of the extractor will be blessed. 2026 2027If an extractor returns a defined value, that value is immediately 2028treated as the next extracted field and pushed onto the list of fields. 2029If the extractor was specified in a hash reference, the field is also 2030blessed into the appropriate class, 2031 2032If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2033assumed to have failed to extract. 2034If none of the extractor subroutines succeeds, then one 2035character is extracted from the start of the text and the extraction 2036subroutines reapplied. Characters which are thus removed are accumulated and 2037eventually become the next field (unless the fourth argument is true, in which 2038case they are discarded). 2039 2040For example, the following extracts substrings that are valid Perl variables: 2041 2042 @fields = extract_multiple($text, 2043 [ sub { extract_variable($_[0]) } ], 2044 undef, 1); 2045 2046This example separates a text into fields which are quote delimited, 2047curly bracketed, and anything else. The delimited and bracketed 2048parts are also blessed to identify them (the "anything else" is unblessed): 2049 2050 @fields = extract_multiple($text, 2051 [ 2052 { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2053 { Brack => sub { extract_bracketed($_[0],'{}') } }, 2054 ]); 2055 2056This call extracts the next single substring that is a valid Perl quotelike 2057operator (and removes it from $text): 2058 2059 $quotelike = extract_multiple($text, 2060 [ 2061 sub { extract_quotelike($_[0]) }, 2062 ], undef, 1); 2063 2064Finally, here is yet another way to do comma-separated value parsing: 2065 2066 @fields = extract_multiple($csv_text, 2067 [ 2068 sub { extract_delimited($_[0],q{'"}) }, 2069 qr/([^,]+)(.*)/, 2070 ], 2071 undef,1); 2072 2073The list in the second argument means: 2074I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2075The undef third argument means: 2076I<"...as many times as possible...">, 2077and the true value in the fourth argument means 2078I<"...discarding anything else that appears (i.e. the commas)">. 2079 2080If you wanted the commas preserved as separate fields (i.e. like split 2081does if your split pattern has capturing parentheses), you would 2082just make the last parameter undefined (or remove it). 2083 2084=item C<gen_delimited_pat> 2085 2086The C<gen_delimited_pat> subroutine takes a single (string) argument and 2087 > builds a Friedl-style optimized regex that matches a string delimited 2088by any one of the characters in the single argument. For example: 2089 2090 gen_delimited_pat(q{'"}) 2091 2092returns the regex: 2093 2094 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2095 2096Note that the specified delimiters are automatically quotemeta'd. 2097 2098A typical use of C<gen_delimited_pat> would be to build special purpose tags 2099for C<extract_tagged>. For example, to properly ignore "empty" XML elements 2100(which might contain quoted strings): 2101 2102 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2103 2104 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2105 2106C<gen_delimited_pat> may also be called with an optional second argument, 2107which specifies the "escape" character(s) to be used for each delimiter. 2108For example to match a Pascal-style string (where ' is the delimiter 2109and '' is a literal ' within the string): 2110 2111 gen_delimited_pat(q{'},q{'}); 2112 2113Different escape characters can be specified for different delimiters. 2114For example, to specify that '/' is the escape for single quotes 2115and '%' is the escape for double quotes: 2116 2117 gen_delimited_pat(q{'"},q{/%}); 2118 2119If more delimiters than escape chars are specified, the last escape char 2120is used for the remaining delimiters. 2121If no escape char is specified for a given specified delimiter, '\' is used. 2122 2123=item C<delimited_pat> 2124 2125Note that C<gen_delimited_pat> was previously called C<delimited_pat>. 2126That name may still be used, but is now deprecated. 2127 2128=back 2129 2130=head1 DIAGNOSTICS 2131 2132In a list context, all the functions return C<(undef,$original_text)> 2133on failure. In a scalar context, failure is indicated by returning C<undef> 2134(in this case the input text is not modified in any way). 2135 2136In addition, on failure in I<any> context, the C<$@> variable is set. 2137Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2138below. 2139Accessing C<$@-E<gt>{pos}> returns the offset into the original string at 2140which the error was detected (although not necessarily where it occurred!) 2141Printing C<$@> directly produces the error message, with the offset appended. 2142On success, the C<$@> variable is guaranteed to be C<undef>. 2143 2144The available diagnostics are: 2145 2146=over 4 2147 2148=item C<Did not find a suitable bracket: "%s"> 2149 2150The delimiter provided to C<extract_bracketed> was not one of 2151C<'()[]E<lt>E<gt>{}'>. 2152 2153=item C<Did not find prefix: /%s/> 2154 2155A non-optional prefix was specified but wasn't found at the start of the text. 2156 2157=item C<Did not find opening bracket after prefix: "%s"> 2158 2159C<extract_bracketed> or C<extract_codeblock> was expecting a 2160particular kind of bracket at the start of the text, and didn't find it. 2161 2162=item C<No quotelike operator found after prefix: "%s"> 2163 2164C<extract_quotelike> didn't find one of the quotelike operators C<q>, 2165C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2166it was extracting. 2167 2168=item C<Unmatched closing bracket: "%c"> 2169 2170C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2171a closing bracket where none was expected. 2172 2173=item C<Unmatched opening bracket(s): "%s"> 2174 2175C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2176out of characters in the text before closing one or more levels of nested 2177brackets. 2178 2179=item C<Unmatched embedded quote (%s)> 2180 2181C<extract_bracketed> attempted to match an embedded quoted substring, but 2182failed to find a closing quote to match it. 2183 2184=item C<Did not find closing delimiter to match '%s'> 2185 2186C<extract_quotelike> was unable to find a closing delimiter to match the 2187one that opened the quote-like operation. 2188 2189=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2190 2191C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2192a valid bracket delimiter, but it was the wrong species. This usually 2193indicates a nesting error, but may indicate incorrect quoting or escaping. 2194 2195=item C<No block delimiter found after quotelike "%s"> 2196 2197C<extract_quotelike> or C<extract_codeblock> found one of the 2198quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2199without a suitable block after it. 2200 2201=item C<Did not find leading dereferencer> 2202 2203C<extract_variable> was expecting one of '$', '@', or '%' at the start of 2204a variable, but didn't find any of them. 2205 2206=item C<Bad identifier after dereferencer> 2207 2208C<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2209character was not followed by a legal Perl identifier. 2210 2211=item C<Did not find expected opening bracket at %s> 2212 2213C<extract_codeblock> failed to find any of the outermost opening brackets 2214that were specified. 2215 2216=item C<Improperly nested codeblock at %s> 2217 2218A nested code block was found that started with a delimiter that was specified 2219as being only to be used as an outermost bracket. 2220 2221=item C<Missing second block for quotelike "%s"> 2222 2223C<extract_codeblock> or C<extract_quotelike> found one of the 2224quotelike operators C<s>, C<tr> or C<y> followed by only one block. 2225 2226=item C<No match found for opening bracket> 2227 2228C<extract_codeblock> failed to find a closing bracket to match the outermost 2229opening bracket. 2230 2231=item C<Did not find opening tag: /%s/> 2232 2233C<extract_tagged> did not find a suitable opening tag (after any specified 2234prefix was removed). 2235 2236=item C<Unable to construct closing tag to match: /%s/> 2237 2238C<extract_tagged> matched the specified opening tag and tried to 2239modify the matched text to produce a matching closing tag (because 2240none was specified). It failed to generate the closing tag, almost 2241certainly because the opening tag did not start with a 2242bracket of some kind. 2243 2244=item C<Found invalid nested tag: %s> 2245 2246C<extract_tagged> found a nested tag that appeared in the "reject" list 2247(and the failure mode was not "MAX" or "PARA"). 2248 2249=item C<Found unbalanced nested tag: %s> 2250 2251C<extract_tagged> found a nested opening tag that was not matched by a 2252corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2253 2254=item C<Did not find closing tag> 2255 2256C<extract_tagged> reached the end of the text without finding a closing tag 2257to match the original opening tag (and the failure mode was not 2258"MAX" or "PARA"). 2259 2260=back 2261 2262=head1 EXPORTS 2263 2264The following symbols are, or can be, exported by this module: 2265 2266=over 4 2267 2268=item Default Exports 2269 2270I<None>. 2271 2272=item Optional Exports 2273 2274C<extract_delimited>, 2275C<extract_bracketed>, 2276C<extract_quotelike>, 2277C<extract_codeblock>, 2278C<extract_variable>, 2279C<extract_tagged>, 2280C<extract_multiple>, 2281C<gen_delimited_pat>, 2282C<gen_extract_tagged>, 2283C<delimited_pat>. 2284 2285=item Export Tags 2286 2287=over 4 2288 2289=item C<:ALL> 2290 2291C<extract_delimited>, 2292C<extract_bracketed>, 2293C<extract_quotelike>, 2294C<extract_codeblock>, 2295C<extract_variable>, 2296C<extract_tagged>, 2297C<extract_multiple>, 2298C<gen_delimited_pat>, 2299C<gen_extract_tagged>, 2300C<delimited_pat>. 2301 2302=back 2303 2304=back 2305 2306=head1 KNOWN BUGS 2307 2308See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>. 2309 2310=head1 FEEDBACK 2311 2312Patches, bug reports, suggestions or any other feedback is welcome. 2313 2314Patches can be sent as GitHub pull requests at 2315L<https://github.com/steve-m-hay/Text-Balanced/pulls>. 2316 2317Bug reports and suggestions can be made on the CPAN Request Tracker at 2318L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>. 2319 2320Currently active requests on the CPAN Request Tracker can be viewed at 2321L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>. 2322 2323Please test this distribution. See CPAN Testers Reports at 2324L<https://www.cpantesters.org/> for details of how to get involved. 2325 2326Previous test results on CPAN Testers Reports can be viewed at 2327L<https://www.cpantesters.org/distro/T/Text-Balanced.html>. 2328 2329Please rate this distribution on CPAN Ratings at 2330L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>. 2331 2332=head1 AVAILABILITY 2333 2334The latest version of this module is available from CPAN (see 2335L<perlmodlib/"CPAN"> for details) at 2336 2337L<https://metacpan.org/release/Text-Balanced> or 2338 2339L<https://www.cpan.org/authors/id/S/SH/SHAY/> or 2340 2341L<https://www.cpan.org/modules/by-module/Text/>. 2342 2343The latest source code is available from GitHub at 2344L<https://github.com/steve-m-hay/Text-Balanced>. 2345 2346=head1 INSTALLATION 2347 2348See the F<INSTALL> file. 2349 2350=head1 AUTHOR 2351 2352Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>. 2353 2354Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining 2355Text::Balanced as of version 2.03. 2356 2357=head1 COPYRIGHT 2358 2359Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2360 2361Copyright (C) 2009 Adam Kennedy. 2362 2363Copyright (C) 2015, 2020 Steve Hay. All rights reserved. 2364 2365=head1 LICENCE 2366 2367This module is free software; you can redistribute it and/or modify it under the 2368same terms as Perl itself, i.e. under the terms of either the GNU General Public 2369License or the Artistic License, as specified in the F<LICENCE> file. 2370 2371=head1 VERSION 2372 2373Version 2.04 2374 2375=head1 DATE 2376 237711 Dec 2020 2378 2379=head1 HISTORY 2380 2381See the F<Changes> file. 2382 2383=cut 2384