1# Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2# Copyright (C) 2009 Adam Kennedy. 3# Copyright (C) 2015, 2022 Steve Hay and other contributors. All rights 4# reserved. 5 6# This module is free software; you can redistribute it and/or modify it under 7# the same terms as Perl itself, i.e. under the terms of either the GNU General 8# Public License or the Artistic License, as specified in the F<LICENCE> file. 9 10package Text::Balanced; 11 12# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. 13# FOR FULL DOCUMENTATION SEE Balanced.pod 14 15use 5.008001; 16use strict; 17use Exporter (); 18 19use vars qw { $VERSION @ISA %EXPORT_TAGS }; 20BEGIN { 21 $VERSION = '2.06'; 22 @ISA = 'Exporter'; 23 %EXPORT_TAGS = ( 24 ALL => [ qw{ 25 &extract_delimited 26 &extract_bracketed 27 &extract_quotelike 28 &extract_codeblock 29 &extract_variable 30 &extract_tagged 31 &extract_multiple 32 &gen_delimited_pat 33 &gen_extract_tagged 34 &delimited_pat 35 } ], 36 ); 37} 38 39Exporter::export_ok_tags('ALL'); 40 41our $RE_PREREGEX_PAT = qr#( 42 [!=]~ 43 | split|grep|map 44 | not|and|or|xor 45)#x; 46our $RE_EXPR_PAT = qr#( 47 (?:\*\*|&&|\|\||<<|>>|//|[-+*x%^&|.])=? 48 | /(?:[^/]) 49 | =(?!>) 50 | return 51 | [\(\[] 52)#x; 53our $RE_NUM = qr/\s*[+\-.0-9][+\-.0-9e]*/i; # numerical constant 54 55our %ref2slashvalid; # is quotelike /.../ pattern valid here for given textref? 56our %ref2qmarkvalid; # is quotelike ?...? pattern valid here for given textref? 57 58# HANDLE RETURN VALUES IN VARIOUS CONTEXTS 59 60sub _failmsg { 61 my ($message, $pos) = @_; 62 $@ = bless { 63 error => $message, 64 pos => $pos, 65 }, 'Text::Balanced::ErrorMsg'; 66} 67 68sub _fail { 69 my ($wantarray, $textref, $message, $pos) = @_; 70 _failmsg $message, $pos if $message; 71 return (undef, $$textref, undef) if $wantarray; 72 return; 73} 74 75sub _succeed { 76 $@ = undef; 77 my ($wantarray,$textref) = splice @_, 0, 2; 78 my ($extrapos, $extralen) = @_ > 18 79 ? splice(@_, -2, 2) 80 : (0, 0); 81 my ($startlen, $oppos) = @_[5,6]; 82 my $remainderpos = $_[2]; 83 if ( $wantarray ) { 84 my @res; 85 while (my ($from, $len) = splice @_, 0, 2) { 86 push @res, substr($$textref, $from, $len); 87 } 88 if ( $extralen ) { # CORRECT FILLET 89 my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); 90 $res[1] = "$extra$res[1]"; 91 eval { substr($$textref,$remainderpos,0) = $extra; 92 substr($$textref,$extrapos,$extralen,"\n")} ; 93 #REARRANGE HERE DOC AND FILLET IF POSSIBLE 94 pos($$textref) = $remainderpos-$extralen+1; # RESET \G 95 } else { 96 pos($$textref) = $remainderpos; # RESET \G 97 } 98 return @res; 99 } else { 100 my $match = substr($$textref,$_[0],$_[1]); 101 substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; 102 my $extra = $extralen 103 ? substr($$textref, $extrapos, $extralen)."\n" : ""; 104 eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE 105 pos($$textref) = $_[4]; # RESET \G 106 return $match; 107 } 108} 109 110# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING 111## no critic (Subroutines::ProhibitSubroutinePrototypes) 112 113sub gen_delimited_pat($;$) # ($delimiters;$escapes) 114{ 115 my ($dels, $escs) = @_; 116 return "" unless $dels =~ /\S/; 117 $escs = '\\' unless $escs; 118 $escs .= substr($escs,-1) x (length($dels)-length($escs)); 119 my @pat = (); 120 my $i; 121 for ($i=0; $i<length $dels; $i++) 122 { 123 my $del = quotemeta substr($dels,$i,1); 124 my $esc = quotemeta substr($escs,$i,1); 125 if ($del eq $esc) 126 { 127 push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; 128 } 129 else 130 { 131 push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; 132 } 133 } 134 my $pat = join '|', @pat; 135 return "(?:$pat)"; 136} 137 138*delimited_pat = \&gen_delimited_pat; 139 140# THE EXTRACTION FUNCTIONS 141 142sub extract_delimited (;$$$$) 143{ 144 my $textref = defined $_[0] ? \$_[0] : \$_; 145 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 146 my $wantarray = wantarray; 147 my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; 148 my $pre = defined $_[2] ? $_[2] : '\s*'; 149 my $esc = defined $_[3] ? $_[3] : qq{\\}; 150 my $pat = gen_delimited_pat($del, $esc); 151 my $startpos = pos $$textref || 0; 152 return _fail($wantarray, $textref, "Not a delimited pattern", 0) 153 unless $$textref =~ m/\G($pre)($pat)/gc; 154 my $prelen = length($1); 155 my $matchpos = $startpos+$prelen; 156 my $endpos = pos $$textref; 157 return _succeed $wantarray, $textref, 158 $matchpos, $endpos-$matchpos, # MATCH 159 $endpos, length($$textref)-$endpos, # REMAINDER 160 $startpos, $prelen; # PREFIX 161} 162 163my %eb_delim_cache; 164sub _eb_delims { 165 my ($ldel_orig) = @_; 166 return @{ $eb_delim_cache{$ldel_orig} } if $eb_delim_cache{$ldel_orig}; 167 my $qdel = ""; 168 my $quotelike; 169 my $ldel = $ldel_orig; 170 $ldel =~ s/'//g and $qdel .= q{'}; 171 $ldel =~ s/"//g and $qdel .= q{"}; 172 $ldel =~ s/`//g and $qdel .= q{`}; 173 $ldel =~ s/q//g and $quotelike = 1; 174 $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; 175 my $rdel = $ldel; 176 return @{ $eb_delim_cache{$ldel_orig} = [] } unless $rdel =~ tr/[({</])}>/; 177 my $posbug = pos; 178 $ldel = join('|', map { quotemeta $_ } split('', $ldel)); 179 $rdel = join('|', map { quotemeta $_ } split('', $rdel)); 180 pos = $posbug; 181 @{ $eb_delim_cache{$ldel_orig} = [ 182 qr/\G($ldel)/, $qdel && qr/\G([$qdel])/, $quotelike, qr/\G($rdel)/ 183 ] }; 184} 185sub extract_bracketed (;$$$) 186{ 187 my $textref = defined $_[0] ? \$_[0] : \$_; 188 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 189 my $ldel = defined $_[1] ? $_[1] : '{([<'; 190 my $pre = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/; 191 my $wantarray = wantarray; 192 my @ret = _eb_delims($ldel); 193 unless (@ret) 194 { 195 return _fail $wantarray, $textref, 196 "Did not find a suitable bracket in delimiter: \"$_[1]\"", 197 0; 198 } 199 200 my $startpos = pos $$textref || 0; 201 my @match = _match_bracketed($textref, $pre, @ret); 202 203 return _fail ($wantarray, $textref) unless @match; 204 205 return _succeed ( $wantarray, $textref, 206 $match[2], $match[5]+2, # MATCH 207 @match[8,9], # REMAINDER 208 @match[0,1], # PREFIX 209 ); 210} 211 212sub _match_bracketed # $textref, $pre, $ldel, $qdel, $quotelike, $rdel 213{ 214 my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; 215 my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); 216 unless ($$textref =~ m/$pre/gc) 217 { 218 _failmsg "Did not find prefix: /$pre/", $startpos; 219 return; 220 } 221 222 $ldelpos = pos $$textref; 223 224 unless ($$textref =~ m/$ldel/gc) 225 { 226 _failmsg "Did not find opening bracket after prefix: \"$pre\"", 227 pos $$textref; 228 pos $$textref = $startpos; 229 return; 230 } 231 232 my @nesting = ( $1 ); 233 my $textlen = length $$textref; 234 while (pos $$textref < $textlen) 235 { 236 next if $$textref =~ m/\G\\./gcs; 237 238 if ($$textref =~ m/$ldel/gc) 239 { 240 push @nesting, $1; 241 } 242 elsif ($$textref =~ m/$rdel/gc) 243 { 244 my ($found, $brackettype) = ($1, $1); 245 if ($#nesting < 0) 246 { 247 _failmsg "Unmatched closing bracket: \"$found\"", 248 pos $$textref; 249 pos $$textref = $startpos; 250 return; 251 } 252 my $expected = pop(@nesting); 253 $expected =~ tr/({[</)}]>/; 254 if ($expected ne $brackettype) 255 { 256 _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, 257 pos $$textref; 258 pos $$textref = $startpos; 259 return; 260 } 261 last if $#nesting < 0; 262 } 263 elsif ($qdel && $$textref =~ m/$qdel/gc) 264 { 265 $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; 266 _failmsg "Unmatched embedded quote ($1)", 267 pos $$textref; 268 pos $$textref = $startpos; 269 return; 270 } 271 elsif ($quotelike && _match_quotelike($textref,qr/\G()/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref})) 272 { 273 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; # back-compat 274 next; 275 } 276 277 else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } 278 } 279 if ($#nesting>=0) 280 { 281 _failmsg "Unmatched opening bracket(s): " 282 . join("..",@nesting)."..", 283 pos $$textref; 284 pos $$textref = $startpos; 285 return; 286 } 287 288 $endpos = pos $$textref; 289 290 return ( 291 $startpos, $ldelpos-$startpos, # PREFIX 292 $ldelpos, 1, # OPENING BRACKET 293 $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS 294 $endpos-1, 1, # CLOSING BRACKET 295 $endpos, length($$textref)-$endpos, # REMAINDER 296 ); 297} 298 299sub _revbracket($) 300{ 301 my $brack = reverse $_[0]; 302 $brack =~ tr/[({</])}>/; 303 return $brack; 304} 305 306my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; 307 308my $et_default_ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; 309sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) 310{ 311 my $textref = defined $_[0] ? \$_[0] : \$_; 312 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 313 my $ldel = $_[1]; 314 my $rdel = $_[2]; 315 my $pre = defined $_[3] ? qr/\G$_[3]/ : qr/\G\s*/; 316 my %options = defined $_[4] ? %{$_[4]} : (); 317 my $omode = defined $options{fail} ? $options{fail} : ''; 318 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 319 : defined($options{reject}) ? $options{reject} 320 : '' 321 ; 322 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 323 : defined($options{ignore}) ? $options{ignore} 324 : '' 325 ; 326 327 $ldel = $et_default_ldel if !defined $ldel; 328 $@ = undef; 329 330 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 331 332 return _fail(wantarray, $textref) unless @match; 333 return _succeed wantarray, $textref, 334 $match[2], $match[3]+$match[5]+$match[7], # MATCH 335 @match[8..9,0..1,2..7]; # REM, PRE, BITS 336} 337 338sub _match_tagged # ($$$$$$$) 339{ 340 my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; 341 my $rdelspec; 342 343 my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); 344 345 unless ($$textref =~ m/$pre/gc) 346 { 347 _failmsg "Did not find prefix: /$pre/", pos $$textref; 348 goto failed; 349 } 350 351 $opentagpos = pos($$textref); 352 353 unless ($$textref =~ m/\G$ldel/gc) 354 { 355 _failmsg "Did not find opening tag: /$ldel/", pos $$textref; 356 goto failed; 357 } 358 359 $textpos = pos($$textref); 360 361 if (!defined $rdel) 362 { 363 $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); 364 unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) 365 { 366 _failmsg "Unable to construct closing tag to match: $rdel", 367 pos $$textref; 368 goto failed; 369 } 370 } 371 else 372 { 373 ## no critic (BuiltinFunctions::ProhibitStringyEval) 374 $rdelspec = eval "qq{$rdel}" || do { 375 my $del; 376 for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) 377 { next if $rdel =~ /\Q$_/; $del = $_; last } 378 unless ($del) { 379 use Carp; 380 croak "Can't interpolate right delimiter $rdel" 381 } 382 eval "qq$del$rdel$del"; 383 }; 384 } 385 386 while (pos($$textref) < length($$textref)) 387 { 388 next if $$textref =~ m/\G\\./gc; 389 390 if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) 391 { 392 $parapos = pos($$textref) - length($1) 393 unless defined $parapos; 394 } 395 elsif ($$textref =~ m/\G($rdelspec)/gc ) 396 { 397 $closetagpos = pos($$textref)-length($1); 398 goto matched; 399 } 400 elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) 401 { 402 next; 403 } 404 elsif ($bad && $$textref =~ m/\G($bad)/gcs) 405 { 406 pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS 407 goto short if ($omode eq 'PARA' || $omode eq 'MAX'); 408 _failmsg "Found invalid nested tag: $1", pos $$textref; 409 goto failed; 410 } 411 elsif ($$textref =~ m/\G($ldel)/gc) 412 { 413 my $tag = $1; 414 pos($$textref) -= length($tag); # REWIND TO NESTED TAG 415 unless (_match_tagged(@_)) # MATCH NESTED TAG 416 { 417 goto short if $omode eq 'PARA' || $omode eq 'MAX'; 418 _failmsg "Found unbalanced nested tag: $tag", 419 pos $$textref; 420 goto failed; 421 } 422 } 423 else { $$textref =~ m/./gcs } 424 } 425 426short: 427 $closetagpos = pos($$textref); 428 goto matched if $omode eq 'MAX'; 429 goto failed unless $omode eq 'PARA'; 430 431 if (defined $parapos) { pos($$textref) = $parapos } 432 else { $parapos = pos($$textref) } 433 434 return ( 435 $startpos, $opentagpos-$startpos, # PREFIX 436 $opentagpos, $textpos-$opentagpos, # OPENING TAG 437 $textpos, $parapos-$textpos, # TEXT 438 $parapos, 0, # NO CLOSING TAG 439 $parapos, length($$textref)-$parapos, # REMAINDER 440 ); 441 442matched: 443 $endpos = pos($$textref); 444 return ( 445 $startpos, $opentagpos-$startpos, # PREFIX 446 $opentagpos, $textpos-$opentagpos, # OPENING TAG 447 $textpos, $closetagpos-$textpos, # TEXT 448 $closetagpos, $endpos-$closetagpos, # CLOSING TAG 449 $endpos, length($$textref)-$endpos, # REMAINDER 450 ); 451 452failed: 453 _failmsg "Did not find closing tag", pos $$textref unless $@; 454 pos($$textref) = $startpos; 455 return; 456} 457 458sub extract_variable (;$$) 459{ 460 my $textref = defined $_[0] ? \$_[0] : \$_; 461 return ("","","") unless defined $$textref; 462 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 463 my $pre = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/; 464 465 my @match = _match_variable($textref,$pre); 466 467 return _fail wantarray, $textref unless @match; 468 469 return _succeed wantarray, $textref, 470 @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX 471} 472 473sub _match_variable 474{ 475# $# 476# $^ 477# $$ 478 my ($textref, $pre) = @_; 479 my $startpos = pos($$textref) = pos($$textref)||0; 480 unless ($$textref =~ m/$pre/gc) 481 { 482 _failmsg "Did not find prefix: /$pre/", pos $$textref; 483 return; 484 } 485 my $varpos = pos($$textref); 486 unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) 487 { 488 unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) 489 { 490 _failmsg "Did not find leading dereferencer", pos $$textref; 491 pos $$textref = $startpos; 492 return; 493 } 494 my $deref = $1; 495 496 unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci 497 or _match_codeblock($textref, qr/\G()/, '\{', qr/\G\s*(\})/, '\{', '\}', 0, 1) 498 or $deref eq '$#' or $deref eq '$$' 499 or pos($$textref) == length $$textref ) 500 { 501 _failmsg "Bad identifier after dereferencer", pos $$textref; 502 pos $$textref = $startpos; 503 return; 504 } 505 } 506 507 while (1) 508 { 509 next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; 510 next if _match_codeblock($textref, 511 qr/\G\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, 512 qr/[({[]/, qr/\G\s*([)}\]])/, 513 qr/[({[]/, qr/[)}\]]/, 0, 1); 514 next if _match_codeblock($textref, 515 qr/\G\s*/, qr/[{[]/, qr/\G\s*([}\]])/, 516 qr/[{[]/, qr/[}\]]/, 0, 1); 517 next if _match_variable($textref,qr/\G\s*->\s*/); 518 next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; 519 last; 520 } 521 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 522 523 my $endpos = pos($$textref); 524 return ($startpos, $varpos-$startpos, 525 $varpos, $endpos-$varpos, 526 $endpos, length($$textref)-$endpos 527 ); 528} 529 530my %ec_delim_cache; 531sub _ec_delims { 532 my ($ldel_inner, $ldel_outer) = @_; 533 return @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} } 534 if $ec_delim_cache{$ldel_outer}{$ldel_inner}; 535 my $rdel_inner = $ldel_inner; 536 my $rdel_outer = $ldel_outer; 537 my $posbug = pos; 538 for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } 539 for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } 540 for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) 541 { 542 $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' 543 } 544 pos = $posbug; 545 @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} = [ 546 $ldel_outer, qr/\G\s*($rdel_outer)/, $ldel_inner, $rdel_inner 547 ] }; 548} 549sub extract_codeblock (;$$$$$) 550{ 551 my $textref = defined $_[0] ? \$_[0] : \$_; 552 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 553 my $wantarray = wantarray; 554 my $ldel_inner = defined $_[1] ? $_[1] : '{'; 555 my $pre = !defined $_[2] ? qr/\G\s*/ : qr/\G$_[2]/; 556 my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; 557 my $rd = $_[4]; 558 my @delims = _ec_delims($ldel_inner, $ldel_outer); 559 560 my @match = _match_codeblock($textref, $pre, @delims, $rd, 1); 561 return _fail($wantarray, $textref) unless @match; 562 return _succeed($wantarray, $textref, 563 @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX 564 ); 565} 566 567sub _match_codeblock 568{ 569 my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd, $no_backcompat) = @_; 570 $rdel_outer = qr/\G\s*($rdel_outer)/ if !$no_backcompat; # Switch calls this func directly 571 my $startpos = pos($$textref) = pos($$textref) || 0; 572 unless ($$textref =~ m/$pre/gc) 573 { 574 _failmsg qq{Did not match prefix /$pre/ at"} . 575 substr($$textref,pos($$textref),20) . 576 q{..."}, 577 pos $$textref; 578 return; 579 } 580 my $codepos = pos($$textref); 581 unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER 582 { 583 _failmsg qq{Did not find expected opening bracket at "} . 584 substr($$textref,pos($$textref),20) . 585 q{..."}, 586 pos $$textref; 587 pos $$textref = $startpos; 588 return; 589 } 590 my $closing = $1; 591 $closing =~ tr/([<{/)]>}/; 592 my $matched; 593 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 594 if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset 595 while (pos($$textref) < length($$textref)) 596 { 597 if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) 598 { 599 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 600 next; 601 } 602 603 if ($$textref =~ m/\G\s*#.*/gc) 604 { 605 next; 606 } 607 608 if ($$textref =~ m/$rdel_outer/gc) 609 { 610 unless ($matched = ($closing && $1 eq $closing) ) 611 { 612 next if $1 eq '>'; # MIGHT BE A "LESS THAN" 613 _failmsg q{Mismatched closing bracket at "} . 614 substr($$textref,pos($$textref),20) . 615 qq{...". Expected '$closing'}, 616 pos $$textref; 617 } 618 last; 619 } 620 621 if (_match_variable($textref,qr/\G\s*/) || 622 _match_quotelike($textref,qr/\G\s*/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}) ) 623 { 624 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 625 next; 626 } 627 628 if ($$textref =~ m#\G\s*(?!$ldel_inner)(?:$RE_PREREGEX_PAT|$RE_EXPR_PAT)#gc) 629 { 630 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 631 next; 632 } 633 634 if ( _match_codeblock($textref, qr/\G\s*/, $ldel_inner, qr/\G\s*($rdel_inner)/, $ldel_inner, $rdel_inner, $rd, 1) ) 635 { 636 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 637 next; 638 } 639 640 if ($$textref =~ m/\G\s*$ldel_outer/gc) 641 { 642 _failmsg q{Improperly nested codeblock at "} . 643 substr($$textref,pos($$textref),20) . 644 q{..."}, 645 pos $$textref; 646 last; 647 } 648 649 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 650 $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; 651 } 652 continue { $@ = undef } 653 654 unless ($matched) 655 { 656 _failmsg 'No match found for opening bracket', pos $$textref 657 unless $@; 658 return; 659 } 660 661 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = undef; 662 my $endpos = pos($$textref); 663 return ( $startpos, $codepos-$startpos, 664 $codepos, $endpos-$codepos, 665 $endpos, length($$textref)-$endpos, 666 ); 667} 668 669 670my %mods = ( 671 'none' => '[cgimsox]*', 672 'm' => '[cgimsox]*', 673 's' => '[cegimsox]*', 674 'tr' => '[cds]*', 675 'y' => '[cds]*', 676 'qq' => '', 677 'qx' => '', 678 'qw' => '', 679 'qr' => '[imsx]*', 680 'q' => '', 681); 682 683sub extract_quotelike (;$$) 684{ 685 my $textref = $_[0] ? \$_[0] : \$_; 686 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 687 my $wantarray = wantarray; 688 my $pre = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/; 689 690 my @match = _match_quotelike($textref,$pre,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}); 691 return _fail($wantarray, $textref) unless @match; 692 return _succeed($wantarray, $textref, 693 $match[2], $match[18]-$match[2], # MATCH 694 @match[18,19], # REMAINDER 695 @match[0,1], # PREFIX 696 @match[2..17], # THE BITS 697 @match[20,21], # ANY FILLET? 698 ); 699}; 700 701my %maybe_quote = map +($_=>1), qw(" ' `); 702sub _match_quotelike 703{ 704 my ($textref, $pre, $allow_slash_match, $allow_qmark_match) = @_; 705 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 706 if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset 707 708 my ($textlen,$startpos, 709 $preld1pos,$ld1pos,$str1pos,$rd1pos, 710 $preld2pos,$ld2pos,$str2pos,$rd2pos, 711 $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); 712 713 unless ($$textref =~ m/$pre/gc) 714 { 715 _failmsg qq{Did not find prefix /$pre/ at "} . 716 substr($$textref, pos($$textref), 20) . 717 q{..."}, 718 pos $$textref; 719 return; 720 } 721 my $oppos = pos($$textref); 722 my $initial = substr($$textref,$oppos,1); 723 if ($initial && $maybe_quote{$initial} 724 || $allow_slash_match && $initial eq '/' 725 || $allow_qmark_match && $initial eq '?') 726 { 727 unless ($$textref =~ m/\G \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) 728 { 729 _failmsg qq{Did not find closing delimiter to match '$initial' at "} . 730 substr($$textref, $oppos, 20) . 731 q{..."}, 732 pos $$textref; 733 pos $$textref = $startpos; 734 return; 735 } 736 $modpos= pos($$textref); 737 $rd1pos = $modpos-1; 738 739 if ($initial eq '/' || $initial eq '?') 740 { 741 $$textref =~ m/\G$mods{none}/gc 742 } 743 744 my $endpos = pos($$textref); 745 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0; 746 return ( 747 $startpos, $oppos-$startpos, # PREFIX 748 $oppos, 0, # NO OPERATOR 749 $oppos, 1, # LEFT DEL 750 $oppos+1, $rd1pos-$oppos-1, # STR/PAT 751 $rd1pos, 1, # RIGHT DEL 752 $modpos, 0, # NO 2ND LDEL 753 $modpos, 0, # NO 2ND STR 754 $modpos, 0, # NO 2ND RDEL 755 $modpos, $endpos-$modpos, # MODIFIERS 756 $endpos, $textlen-$endpos, # REMAINDER 757 ); 758 } 759 760 unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=[a-zA-Z]|\s*['"`;,]))}gc) 761 { 762 _failmsg q{No quotelike operator found after prefix at "} . 763 substr($$textref, pos($$textref), 20) . 764 q{..."}, 765 pos $$textref; 766 pos $$textref = $startpos; 767 return; 768 } 769 770 my $op = $1; 771 $preld1pos = pos($$textref); 772 if ($op eq '<<') { 773 $ld1pos = pos($$textref); 774 my $label; 775 if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { 776 $label = $1; 777 } 778 elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' 779 | \G " ([^"\\]* (?:\\.[^"\\]*)*) " 780 | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` 781 }gcsx) { 782 $label = $+; 783 } 784 else { 785 $label = ""; 786 } 787 my $extrapos = pos($$textref); 788 $$textref =~ m{.*\n}gc; 789 $str1pos = pos($$textref)--; 790 unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { 791 _failmsg qq{Missing here doc terminator ('$label') after "} . 792 substr($$textref, $startpos, 20) . 793 q{..."}, 794 pos $$textref; 795 pos $$textref = $startpos; 796 return; 797 } 798 $rd1pos = pos($$textref); 799 $$textref =~ m{\Q$label\E\n}gc; 800 $ld2pos = pos($$textref); 801 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0; 802 return ( 803 $startpos, $oppos-$startpos, # PREFIX 804 $oppos, length($op), # OPERATOR 805 $ld1pos, $extrapos-$ld1pos, # LEFT DEL 806 $str1pos, $rd1pos-$str1pos, # STR/PAT 807 $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL 808 $ld2pos, 0, # NO 2ND LDEL 809 $ld2pos, 0, # NO 2ND STR 810 $ld2pos, 0, # NO 2ND RDEL 811 $ld2pos, 0, # NO MODIFIERS 812 $ld2pos, $textlen-$ld2pos, # REMAINDER 813 $extrapos, $str1pos-$extrapos, # FILLETED BIT 814 ); 815 } 816 817 $$textref =~ m/\G\s*/gc; 818 $ld1pos = pos($$textref); 819 $str1pos = $ld1pos+1; 820 821 if ($$textref !~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD 822 { 823 _failmsg "No block delimiter found after quotelike $op", 824 pos $$textref; 825 pos $$textref = $startpos; 826 return; 827 } 828 elsif (substr($$textref, $ld1pos, 2) eq '=>') 829 { 830 _failmsg "quotelike $op was actually quoted by '=>'", 831 pos $$textref; 832 pos $$textref = $startpos; 833 return; 834 } 835 pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN 836 my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); 837 if ($ldel1 =~ /[[(<{]/) 838 { 839 $rdel1 =~ tr/[({</])}>/; 840 defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel1)/,"","",qr/\G($rdel1)/)) 841 || do { pos $$textref = $startpos; return }; 842 $ld2pos = pos($$textref); 843 $rd1pos = $ld2pos-1; 844 } 845 else 846 { 847 $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs 848 || do { pos $$textref = $startpos; return }; 849 $ld2pos = $rd1pos = pos($$textref)-1; 850 } 851 852 my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; 853 if ($second_arg) 854 { 855 my ($ldel2, $rdel2); 856 if ($ldel1 =~ /[[(<{]/) 857 { 858 unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD 859 { 860 _failmsg "Missing second block for quotelike $op", 861 pos $$textref; 862 pos $$textref = $startpos; 863 return; 864 } 865 $ldel2 = $rdel2 = "\Q$1"; 866 $rdel2 =~ tr/[({</])}>/; 867 } 868 else 869 { 870 $ldel2 = $rdel2 = $ldel1; 871 } 872 $str2pos = $ld2pos+1; 873 874 if ($ldel2 =~ /[[(<{]/) 875 { 876 pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD 877 defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel2)/,"","",qr/\G($rdel2)/)) 878 || do { pos $$textref = $startpos; return }; 879 } 880 else 881 { 882 $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs 883 || do { pos $$textref = $startpos; return }; 884 } 885 $rd2pos = pos($$textref)-1; 886 } 887 else 888 { 889 $ld2pos = $str2pos = $rd2pos = $rd1pos; 890 } 891 892 $modpos = pos $$textref; 893 894 $$textref =~ m/\G($mods{$op})/gc; 895 my $endpos = pos $$textref; 896 $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = undef; 897 898 return ( 899 $startpos, $oppos-$startpos, # PREFIX 900 $oppos, length($op), # OPERATOR 901 $ld1pos, 1, # LEFT DEL 902 $str1pos, $rd1pos-$str1pos, # STR/PAT 903 $rd1pos, 1, # RIGHT DEL 904 $ld2pos, $second_arg, # 2ND LDEL (MAYBE) 905 $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) 906 $rd2pos, $second_arg, # 2ND RDEL (MAYBE) 907 $modpos, $endpos-$modpos, # MODIFIERS 908 $endpos, $textlen-$endpos, # REMAINDER 909 ); 910} 911 912my $def_func = [ 913 sub { extract_variable($_[0], '') }, 914 sub { extract_quotelike($_[0],'') }, 915 sub { extract_codeblock($_[0],'{}','') }, 916]; 917my %ref_not_regex = map +($_=>1), qw(CODE Text::Balanced::Extractor); 918 919sub _update_patvalid { 920 my ($textref, $text) = @_; 921 if ($ref2slashvalid{$textref} && $text =~ m/(?:$RE_NUM|[\)\]])\s*$/) 922 { 923 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0; 924 } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_PREREGEX_PAT\s*$/) 925 { 926 $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; 927 } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_EXPR_PAT\s*$/) 928 { 929 $ref2slashvalid{$textref} = 1; 930 $ref2qmarkvalid{$textref} = 0; 931 } 932} 933sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) 934{ 935 my $textref = defined($_[0]) ? \$_[0] : \$_; 936 $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset 937 my $posbug = pos; 938 my ($lastpos, $firstpos); 939 my @fields = (); 940 941 #for ($$textref) 942 { 943 my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; 944 my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; 945 my $igunk = $_[3]; 946 947 pos $$textref ||= 0; 948 949 unless (wantarray) 950 { 951 use Carp; 952 carp "extract_multiple reset maximal count to 1 in scalar context" 953 if $^W && defined($_[2]) && $max > 1; 954 $max = 1 955 } 956 957 my @class; 958 foreach my $func ( @func ) 959 { 960 push @class, undef; 961 ($class[-1], $func) = %$func if ref($func) eq 'HASH'; 962 $func = qr/\G$func/ if !$ref_not_regex{ref $func}; 963 } 964 965 my $unkpos; 966 FIELD: while (pos($$textref) < length($$textref)) 967 { 968 foreach my $i ( 0..$#func ) 969 { 970 my ($field, $pref); 971 my ($class, $func) = ($class[$i], $func[$i]); 972 $lastpos = pos $$textref; 973 if (ref($func) eq 'CODE') 974 { ($field,undef,$pref) = $func->($$textref) } 975 elsif (ref($func) eq 'Text::Balanced::Extractor') 976 { $field = $func->extract($$textref) } 977 elsif( $$textref =~ m/$func[$i]/gc ) 978 { $field = defined($1) 979 ? $1 980 : substr($$textref, $-[0], $+[0] - $-[0]) 981 } 982 $pref ||= ""; 983 if (defined($field) && length($field)) 984 { 985 if (!$igunk) { 986 $unkpos = $lastpos 987 if length($pref) && !defined($unkpos); 988 if (defined $unkpos) 989 { 990 push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; 991 $firstpos = $unkpos unless defined $firstpos; 992 undef $unkpos; 993 last FIELD if @fields == $max; 994 } 995 } 996 push @fields, $class ? bless(\$field, $class) : $field; 997 _update_patvalid($textref, $fields[-1]); 998 $firstpos = $lastpos unless defined $firstpos; 999 $lastpos = pos $$textref; 1000 last FIELD if @fields == $max; 1001 next FIELD; 1002 } 1003 } 1004 if ($$textref =~ /\G(.)/gcs) 1005 { 1006 $unkpos = pos($$textref)-1 1007 unless $igunk || defined $unkpos; 1008 _update_patvalid($textref, substr $$textref, $unkpos, pos($$textref)-$unkpos); 1009 } 1010 } 1011 1012 if (defined $unkpos) 1013 { 1014 push @fields, substr($$textref, $unkpos); 1015 $firstpos = $unkpos unless defined $firstpos; 1016 $lastpos = length $$textref; 1017 } 1018 last; 1019 } 1020 1021 pos $$textref = $lastpos; 1022 return @fields if wantarray; 1023 1024 $firstpos ||= 0; 1025 eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; 1026 pos $$textref = $firstpos }; 1027 return $fields[0]; 1028} 1029 1030sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) 1031{ 1032 my $ldel = $_[0]; 1033 my $rdel = $_[1]; 1034 my $pre = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/; 1035 my %options = defined $_[3] ? %{$_[3]} : (); 1036 my $omode = defined $options{fail} ? $options{fail} : ''; 1037 my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) 1038 : defined($options{reject}) ? $options{reject} 1039 : '' 1040 ; 1041 my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) 1042 : defined($options{ignore}) ? $options{ignore} 1043 : '' 1044 ; 1045 1046 $ldel = $et_default_ldel if !defined $ldel; 1047 1048 my $posbug = pos; 1049 for ($ldel, $bad, $ignore) { $_ = qr/$_/ if $_ } 1050 pos = $posbug; 1051 1052 my $closure = sub 1053 { 1054 my $textref = defined $_[0] ? \$_[0] : \$_; 1055 my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); 1056 1057 return _fail(wantarray, $textref) unless @match; 1058 return _succeed wantarray, $textref, 1059 $match[2], $match[3]+$match[5]+$match[7], # MATCH 1060 @match[8..9,0..1,2..7]; # REM, PRE, BITS 1061 }; 1062 1063 bless $closure, 'Text::Balanced::Extractor'; 1064} 1065 1066package Text::Balanced::Extractor; 1067 1068sub extract($$) # ($self, $text) 1069{ 1070 &{$_[0]}($_[1]); 1071} 1072 1073package Text::Balanced::ErrorMsg; 1074 1075use overload 1076 '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }, 1077 fallback => 1; 1078 10791; 1080 1081__END__ 1082 1083=pod 1084 1085=head1 NAME 1086 1087Text::Balanced - Extract delimited text sequences from strings. 1088 1089=head1 SYNOPSIS 1090 1091 use Text::Balanced qw ( 1092 extract_delimited 1093 extract_bracketed 1094 extract_quotelike 1095 extract_codeblock 1096 extract_variable 1097 extract_tagged 1098 extract_multiple 1099 gen_delimited_pat 1100 gen_extract_tagged 1101 ); 1102 1103 # Extract the initial substring of $text that is delimited by 1104 # two (unescaped) instances of the first character in $delim. 1105 1106 ($extracted, $remainder) = extract_delimited($text,$delim); 1107 1108 # Extract the initial substring of $text that is bracketed 1109 # with a delimiter(s) specified by $delim (where the string 1110 # in $delim contains one or more of '(){}[]<>'). 1111 1112 ($extracted, $remainder) = extract_bracketed($text,$delim); 1113 1114 # Extract the initial substring of $text that is bounded by 1115 # an XML tag. 1116 1117 ($extracted, $remainder) = extract_tagged($text); 1118 1119 # Extract the initial substring of $text that is bounded by 1120 # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags 1121 1122 ($extracted, $remainder) = 1123 extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); 1124 1125 # Extract the initial substring of $text that represents a 1126 # Perl "quote or quote-like operation" 1127 1128 ($extracted, $remainder) = extract_quotelike($text); 1129 1130 # Extract the initial substring of $text that represents a block 1131 # of Perl code, bracketed by any of character(s) specified by $delim 1132 # (where the string $delim contains one or more of '(){}[]<>'). 1133 1134 ($extracted, $remainder) = extract_codeblock($text,$delim); 1135 1136 # Extract the initial substrings of $text that would be extracted by 1137 # one or more sequential applications of the specified functions 1138 # or regular expressions 1139 1140 @extracted = extract_multiple($text, 1141 [ \&extract_bracketed, 1142 \&extract_quotelike, 1143 \&some_other_extractor_sub, 1144 qr/[xyz]*/, 1145 'literal', 1146 ]); 1147 1148 # Create a string representing an optimized pattern (a la Friedl) 1149 # that matches a substring delimited by any of the specified characters 1150 # (in this case: any type of quote or a slash) 1151 1152 $patstring = gen_delimited_pat(q{'"`/}); 1153 1154 # Generate a reference to an anonymous sub that is just like extract_tagged 1155 # but pre-compiled and optimized for a specific pair of tags, and 1156 # consequently much faster (i.e. 3 times faster). It uses qr// for better 1157 # performance on repeated calls. 1158 1159 $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); 1160 ($extracted, $remainder) = $extract_head->($text); 1161 1162=head1 DESCRIPTION 1163 1164The various C<extract_...> subroutines may be used to 1165extract a delimited substring, possibly after skipping a 1166specified prefix string. By default, that prefix is 1167optional whitespace (C</\s*/>), but you can change it to whatever 1168you wish (see below). 1169 1170The substring to be extracted must appear at the 1171current C<pos> location of the string's variable 1172(or at index zero, if no C<pos> position is defined). 1173In other words, the C<extract_...> subroutines I<don't> 1174extract the first occurrence of a substring anywhere 1175in a string (like an unanchored regex would). Rather, 1176they extract an occurrence of the substring appearing 1177immediately at the current matching position in the 1178string (like a C<\G>-anchored regex would). 1179 1180=head2 General Behaviour in List Contexts 1181 1182In a list context, all the subroutines return a list, the first three 1183elements of which are always: 1184 1185=over 4 1186 1187=item [0] 1188 1189The extracted string, including the specified delimiters. 1190If the extraction fails C<undef> is returned. 1191 1192=item [1] 1193 1194The remainder of the input string (i.e. the characters after the 1195extracted string). On failure, the entire string is returned. 1196 1197=item [2] 1198 1199The skipped prefix (i.e. the characters before the extracted string). 1200On failure, C<undef> is returned. 1201 1202=back 1203 1204Note that in a list context, the contents of the original input text (the first 1205argument) are not modified in any way. 1206 1207However, if the input text was passed in a variable, that variable's 1208C<pos> value is updated to point at the first character after the 1209extracted text. That means that in a list context the various 1210subroutines can be used much like regular expressions. For example: 1211 1212 while ( $next = (extract_quotelike($text))[0] ) 1213 { 1214 # process next quote-like (in $next) 1215 } 1216 1217=head2 General Behaviour in Scalar and Void Contexts 1218 1219In a scalar context, the extracted string is returned, having first been 1220removed from the input text. Thus, the following code also processes 1221each quote-like operation, but actually removes them from $text: 1222 1223 while ( $next = extract_quotelike($text) ) 1224 { 1225 # process next quote-like (in $next) 1226 } 1227 1228Note that if the input text is a read-only string (i.e. a literal), 1229no attempt is made to remove the extracted text. 1230 1231In a void context the behaviour of the extraction subroutines is 1232exactly the same as in a scalar context, except (of course) that the 1233extracted substring is not returned. 1234 1235=head2 A Note About Prefixes 1236 1237Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) 1238This can bite you if you're expecting a prefix specification like 1239'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix 1240pattern will only succeed if the <H1> tag is on the current line, since 1241. normally doesn't match newlines. 1242 1243To overcome this limitation, you need to turn on /s matching within 1244the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' 1245 1246=head2 Functions 1247 1248=over 4 1249 1250=item C<extract_delimited> 1251 1252The C<extract_delimited> function formalizes the common idiom 1253of extracting a single-character-delimited substring from the start of 1254a string. For example, to extract a single-quote delimited string, the 1255following code is typically used: 1256 1257 ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; 1258 $extracted = $1; 1259 1260but with C<extract_delimited> it can be simplified to: 1261 1262 ($extracted,$remainder) = extract_delimited($text, "'"); 1263 1264C<extract_delimited> takes up to four scalars (the input text, the 1265delimiters, a prefix pattern to be skipped, and any escape characters) 1266and extracts the initial substring of the text that 1267is appropriately delimited. If the delimiter string has multiple 1268characters, the first one encountered in the text is taken to delimit 1269the substring. 1270The third argument specifies a prefix pattern that is to be skipped 1271(but must be present!) before the substring is extracted. 1272The final argument specifies the escape character to be used for each 1273delimiter. 1274 1275All arguments are optional. If the escape characters are not specified, 1276every delimiter is escaped with a backslash (C<\>). 1277If the prefix is not specified, the 1278pattern C<'\s*'> - optional whitespace - is used. If the delimiter set 1279is also not specified, the set C</["'`]/> is used. If the text to be processed 1280is not specified either, C<$_> is used. 1281 1282In list context, C<extract_delimited> returns a array of three 1283elements, the extracted substring (I<including the surrounding 1284delimiters>), the remainder of the text, and the skipped prefix (if 1285any). If a suitable delimited substring is not found, the first 1286element of the array is the empty string, the second is the complete 1287original text, and the prefix returned in the third element is an 1288empty string. 1289 1290In a scalar context, just the extracted substring is returned. In 1291a void context, the extracted substring (and any prefix) are simply 1292removed from the beginning of the first argument. 1293 1294Examples: 1295 1296 # Remove a single-quoted substring from the very beginning of $text: 1297 1298 $substring = extract_delimited($text, "'", ''); 1299 1300 # Remove a single-quoted Pascalish substring (i.e. one in which 1301 # doubling the quote character escapes it) from the very 1302 # beginning of $text: 1303 1304 $substring = extract_delimited($text, "'", '', "'"); 1305 1306 # Extract a single- or double- quoted substring from the 1307 # beginning of $text, optionally after some whitespace 1308 # (note the list context to protect $text from modification): 1309 1310 ($substring) = extract_delimited $text, q{"'}; 1311 1312 # Delete the substring delimited by the first '/' in $text: 1313 1314 $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; 1315 1316Note that this last example is I<not> the same as deleting the first 1317quote-like pattern. For instance, if C<$text> contained the string: 1318 1319 "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" 1320 1321then after the deletion it would contain: 1322 1323 "if ('.$UNIXCMD/s) { $cmd = $1; }" 1324 1325not: 1326 1327 "if ('./cmd' =~ ms) { $cmd = $1; }" 1328 1329See L<"extract_quotelike"> for a (partial) solution to this problem. 1330 1331=item C<extract_bracketed> 1332 1333Like C<"extract_delimited">, the C<extract_bracketed> function takes 1334up to three optional scalar arguments: a string to extract from, a delimiter 1335specifier, and a prefix pattern. As before, a missing prefix defaults to 1336optional whitespace and a missing text defaults to C<$_>. However, a missing 1337delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). 1338 1339C<extract_bracketed> extracts a balanced-bracket-delimited 1340substring (using any one (or more) of the user-specified delimiter 1341brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also 1342respect quoted unbalanced brackets (see below). 1343 1344A "delimiter bracket" is a bracket in list of delimiters passed as 1345C<extract_bracketed>'s second argument. Delimiter brackets are 1346specified by giving either the left or right (or both!) versions 1347of the required bracket(s). Note that the order in which 1348two or more delimiter brackets are specified is not significant. 1349 1350A "balanced-bracket-delimited substring" is a substring bounded by 1351matched brackets, such that any other (left or right) delimiter 1352bracket I<within> the substring is also matched by an opposite 1353(right or left) delimiter bracket I<at the same level of nesting>. Any 1354type of bracket not in the delimiter list is treated as an ordinary 1355character. 1356 1357In other words, each type of bracket specified as a delimiter must be 1358balanced and correctly nested within the substring, and any other kind of 1359("non-delimiter") bracket in the substring is ignored. 1360 1361For example, given the string: 1362 1363 $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; 1364 1365then a call to C<extract_bracketed> in a list context: 1366 1367 @result = extract_bracketed( $text, '{}' ); 1368 1369would return: 1370 1371 ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) 1372 1373since both sets of C<'{..}'> brackets are properly nested and evenly balanced. 1374(In a scalar context just the first element of the array would be returned. In 1375a void context, C<$text> would be replaced by an empty string.) 1376 1377Likewise the call in: 1378 1379 @result = extract_bracketed( $text, '{[' ); 1380 1381would return the same result, since all sets of both types of specified 1382delimiter brackets are correctly nested and balanced. 1383 1384However, the call in: 1385 1386 @result = extract_bracketed( $text, '{([<' ); 1387 1388would fail, returning: 1389 1390 ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); 1391 1392because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and 1393the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would 1394return an empty string. In a void context, C<$text> would be unchanged.) 1395 1396Note that the embedded single-quotes in the string don't help in this 1397case, since they have not been specified as acceptable delimiters and are 1398therefore treated as non-delimiter characters (and ignored). 1399 1400However, if a particular species of quote character is included in the 1401delimiter specification, then that type of quote will be correctly handled. 1402for example, if C<$text> is: 1403 1404 $text = '<A HREF=">>>>">link</A>'; 1405 1406then 1407 1408 @result = extract_bracketed( $text, '<">' ); 1409 1410returns: 1411 1412 ( '<A HREF=">>>>">', 'link</A>', "" ) 1413 1414as expected. Without the specification of C<"> as an embedded quoter: 1415 1416 @result = extract_bracketed( $text, '<>' ); 1417 1418the result would be: 1419 1420 ( '<A HREF=">', '>>>">link</A>', "" ) 1421 1422In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like 1423quoting (i.e. q{string}, qq{string}, etc) can be specified by including the 1424letter 'q' as a delimiter. Hence: 1425 1426 @result = extract_bracketed( $text, '<q>' ); 1427 1428would correctly match something like this: 1429 1430 $text = '<leftop: conj /and/ conj>'; 1431 1432See also: C<"extract_quotelike"> and C<"extract_codeblock">. 1433 1434=item C<extract_variable> 1435 1436C<extract_variable> extracts any valid Perl variable or 1437variable-involved expression, including scalars, arrays, hashes, array 1438accesses, hash look-ups, method calls through objects, subroutine calls 1439through subroutine references, etc. 1440 1441The subroutine takes up to two optional arguments: 1442 1443=over 4 1444 1445=item 1. 1446 1447A string to be processed (C<$_> if the string is omitted or C<undef>) 1448 1449=item 2. 1450 1451A string specifying a pattern to be matched as a prefix (which is to be 1452skipped). If omitted, optional whitespace is skipped. 1453 1454=back 1455 1456On success in a list context, an array of 3 elements is returned. The 1457elements are: 1458 1459=over 4 1460 1461=item [0] 1462 1463the extracted variable, or variablish expression 1464 1465=item [1] 1466 1467the remainder of the input text, 1468 1469=item [2] 1470 1471the prefix substring (if any), 1472 1473=back 1474 1475On failure, all of these values (except the remaining text) are C<undef>. 1476 1477In a scalar context, C<extract_variable> returns just the complete 1478substring that matched a variablish expression. C<undef> is returned on 1479failure. In addition, the original input text has the returned substring 1480(and any prefix) removed from it. 1481 1482In a void context, the input text just has the matched substring (and 1483any specified prefix) removed. 1484 1485=item C<extract_tagged> 1486 1487C<extract_tagged> extracts and segments text between (balanced) 1488specified tags. 1489 1490The subroutine takes up to five optional arguments: 1491 1492=over 4 1493 1494=item 1. 1495 1496A string to be processed (C<$_> if the string is omitted or C<undef>) 1497 1498=item 2. 1499 1500A string specifying a pattern (i.e. regex) to be matched as the opening tag. 1501If the pattern string is omitted (or C<undef>) then a pattern 1502that matches any standard XML tag is used. 1503 1504=item 3. 1505 1506A string specifying a pattern to be matched at the closing tag. 1507If the pattern string is omitted (or C<undef>) then the closing 1508tag is constructed by inserting a C</> after any leading bracket 1509characters in the actual opening tag that was matched (I<not> the pattern 1510that matched the tag). For example, if the opening tag pattern 1511is specified as C<'{{\w+}}'> and actually matched the opening tag 1512C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. 1513 1514=item 4. 1515 1516A string specifying a pattern to be matched as a prefix (which is to be 1517skipped). If omitted, optional whitespace is skipped. 1518 1519=item 5. 1520 1521A hash reference containing various parsing options (see below) 1522 1523=back 1524 1525The various options that can be specified are: 1526 1527=over 4 1528 1529=item C<reject =E<gt> $listref> 1530 1531The list reference contains one or more strings specifying patterns 1532that must I<not> appear within the tagged text. 1533 1534For example, to extract 1535an HTML link (which should not contain nested links) use: 1536 1537 extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); 1538 1539=item C<ignore =E<gt> $listref> 1540 1541The list reference contains one or more strings specifying patterns 1542that are I<not> to be treated as nested tags within the tagged text 1543(even if they would match the start tag pattern). 1544 1545For example, to extract an arbitrary XML tag, but ignore "empty" elements: 1546 1547 extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); 1548 1549(also see L<"gen_delimited_pat"> below). 1550 1551=item C<fail =E<gt> $str> 1552 1553The C<fail> option indicates the action to be taken if a matching end 1554tag is not encountered (i.e. before the end of the string or some 1555C<reject> pattern matches). By default, a failure to match a closing 1556tag causes C<extract_tagged> to immediately fail. 1557 1558However, if the string value associated with <reject> is "MAX", then 1559C<extract_tagged> returns the complete text up to the point of failure. 1560If the string is "PARA", C<extract_tagged> returns only the first paragraph 1561after the tag (up to the first line that is either empty or contains 1562only whitespace characters). 1563If the string is "", the default behaviour (i.e. failure) is reinstated. 1564 1565For example, suppose the start tag "/para" introduces a paragraph, which then 1566continues until the next "/endpara" tag or until another "/para" tag is 1567encountered: 1568 1569 $text = "/para line 1\n\nline 3\n/para line 4"; 1570 1571 extract_tagged($text, '/para', '/endpara', undef, 1572 {reject => '/para', fail => MAX ); 1573 1574 # EXTRACTED: "/para line 1\n\nline 3\n" 1575 1576Suppose instead, that if no matching "/endpara" tag is found, the "/para" 1577tag refers only to the immediately following paragraph: 1578 1579 $text = "/para line 1\n\nline 3\n/para line 4"; 1580 1581 extract_tagged($text, '/para', '/endpara', undef, 1582 {reject => '/para', fail => MAX ); 1583 1584 # EXTRACTED: "/para line 1\n" 1585 1586Note that the specified C<fail> behaviour applies to nested tags as well. 1587 1588=back 1589 1590On success in a list context, an array of 6 elements is returned. The elements are: 1591 1592=over 4 1593 1594=item [0] 1595 1596the extracted tagged substring (including the outermost tags), 1597 1598=item [1] 1599 1600the remainder of the input text, 1601 1602=item [2] 1603 1604the prefix substring (if any), 1605 1606=item [3] 1607 1608the opening tag 1609 1610=item [4] 1611 1612the text between the opening and closing tags 1613 1614=item [5] 1615 1616the closing tag (or "" if no closing tag was found) 1617 1618=back 1619 1620On failure, all of these values (except the remaining text) are C<undef>. 1621 1622In a scalar context, C<extract_tagged> returns just the complete 1623substring that matched a tagged text (including the start and end 1624tags). C<undef> is returned on failure. In addition, the original input 1625text has the returned substring (and any prefix) removed from it. 1626 1627In a void context, the input text just has the matched substring (and 1628any specified prefix) removed. 1629 1630=item C<gen_extract_tagged> 1631 1632C<gen_extract_tagged> generates a new anonymous subroutine which 1633extracts text between (balanced) specified tags. In other words, 1634it generates a function identical in function to C<extract_tagged>. 1635 1636The difference between C<extract_tagged> and the anonymous 1637subroutines generated by 1638C<gen_extract_tagged>, is that those generated subroutines: 1639 1640=over 4 1641 1642=item * 1643 1644do not have to reparse tag specification or parsing options every time 1645they are called (whereas C<extract_tagged> has to effectively rebuild 1646its tag parser on every call); 1647 1648=item * 1649 1650make use of the new qr// construct to pre-compile the regexes they use 1651(whereas C<extract_tagged> uses standard string variable interpolation 1652to create tag-matching patterns). 1653 1654=back 1655 1656The subroutine takes up to four optional arguments (the same set as 1657C<extract_tagged> except for the string to be processed). It returns 1658a reference to a subroutine which in turn takes a single argument (the text to 1659be extracted from). 1660 1661In other words, the implementation of C<extract_tagged> is exactly 1662equivalent to: 1663 1664 sub extract_tagged 1665 { 1666 my $text = shift; 1667 $extractor = gen_extract_tagged(@_); 1668 return $extractor->($text); 1669 } 1670 1671(although C<extract_tagged> is not currently implemented that way). 1672 1673Using C<gen_extract_tagged> to create extraction functions for specific tags 1674is a good idea if those functions are going to be called more than once, since 1675their performance is typically twice as good as the more general-purpose 1676C<extract_tagged>. 1677 1678=item C<extract_quotelike> 1679 1680C<extract_quotelike> attempts to recognize, extract, and segment any 1681one of the various Perl quotes and quotelike operators (see 1682L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket 1683delimiters (for the quotelike operators), and trailing modifiers are 1684all caught. For example, in: 1685 1686 extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' 1687 1688 extract_quotelike ' "You said, \"Use sed\"." ' 1689 1690 extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' 1691 1692 extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' 1693 1694the full Perl quotelike operations are all extracted correctly. 1695 1696Note too that, when using the /x modifier on a regex, any comment 1697containing the current pattern delimiter will cause the regex to be 1698immediately terminated. In other words: 1699 1700 'm / 1701 (?i) # CASE INSENSITIVE 1702 [a-z_] # LEADING ALPHABETIC/UNDERSCORE 1703 [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS 1704 /x' 1705 1706will be extracted as if it were: 1707 1708 'm / 1709 (?i) # CASE INSENSITIVE 1710 [a-z_] # LEADING ALPHABETIC/' 1711 1712This behaviour is identical to that of the actual compiler. 1713 1714C<extract_quotelike> takes two arguments: the text to be processed and 1715a prefix to be matched at the very beginning of the text. If no prefix 1716is specified, optional whitespace is the default. If no text is given, 1717C<$_> is used. 1718 1719In a list context, an array of 11 elements is returned. The elements are: 1720 1721=over 4 1722 1723=item [0] 1724 1725the extracted quotelike substring (including trailing modifiers), 1726 1727=item [1] 1728 1729the remainder of the input text, 1730 1731=item [2] 1732 1733the prefix substring (if any), 1734 1735=item [3] 1736 1737the name of the quotelike operator (if any), 1738 1739=item [4] 1740 1741the left delimiter of the first block of the operation, 1742 1743=item [5] 1744 1745the text of the first block of the operation 1746(that is, the contents of 1747a quote, the regex of a match or substitution or the target list of a 1748translation), 1749 1750=item [6] 1751 1752the right delimiter of the first block of the operation, 1753 1754=item [7] 1755 1756the left delimiter of the second block of the operation 1757(that is, if it is a C<s>, C<tr>, or C<y>), 1758 1759=item [8] 1760 1761the text of the second block of the operation 1762(that is, the replacement of a substitution or the translation list 1763of a translation), 1764 1765=item [9] 1766 1767the right delimiter of the second block of the operation (if any), 1768 1769=item [10] 1770 1771the trailing modifiers on the operation (if any). 1772 1773=back 1774 1775For each of the fields marked "(if any)" the default value on success is 1776an empty string. 1777On failure, all of these values (except the remaining text) are C<undef>. 1778 1779In a scalar context, C<extract_quotelike> returns just the complete substring 1780that matched a quotelike operation (or C<undef> on failure). In a scalar or 1781void context, the input text has the same substring (and any specified 1782prefix) removed. 1783 1784Examples: 1785 1786 # Remove the first quotelike literal that appears in text 1787 1788 $quotelike = extract_quotelike($text,'.*?'); 1789 1790 # Replace one or more leading whitespace-separated quotelike 1791 # literals in $_ with "<QLL>" 1792 1793 do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; 1794 1795 1796 # Isolate the search pattern in a quotelike operation from $text 1797 1798 ($op,$pat) = (extract_quotelike $text)[3,5]; 1799 if ($op =~ /[ms]/) 1800 { 1801 print "search pattern: $pat\n"; 1802 } 1803 else 1804 { 1805 print "$op is not a pattern matching operation\n"; 1806 } 1807 1808=item C<extract_quotelike> 1809 1810C<extract_quotelike> can successfully extract "here documents" from an input 1811string, but with an important caveat in list contexts. 1812 1813Unlike other types of quote-like literals, a here document is rarely 1814a contiguous substring. For example, a typical piece of code using 1815here document might look like this: 1816 1817 <<'EOMSG' || die; 1818 This is the message. 1819 EOMSG 1820 exit; 1821 1822Given this as an input string in a scalar context, C<extract_quotelike> 1823would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", 1824leaving the string " || die;\nexit;" in the original variable. In other words, 1825the two separate pieces of the here document are successfully extracted and 1826concatenated. 1827 1828In a list context, C<extract_quotelike> would return the list 1829 1830=over 4 1831 1832=item [0] 1833 1834"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, 1835including fore and aft delimiters), 1836 1837=item [1] 1838 1839" || die;\nexit;" (i.e. the remainder of the input text, concatenated), 1840 1841=item [2] 1842 1843"" (i.e. the prefix substring -- trivial in this case), 1844 1845=item [3] 1846 1847"<<" (i.e. the "name" of the quotelike operator) 1848 1849=item [4] 1850 1851"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), 1852 1853=item [5] 1854 1855"This is the message.\n" (i.e. the text of the here document), 1856 1857=item [6] 1858 1859"EOMSG" (i.e. the right delimiter of the here document), 1860 1861=item [7..10] 1862 1863"" (a here document has no second left delimiter, second text, second right 1864delimiter, or trailing modifiers). 1865 1866=back 1867 1868However, the matching position of the input variable would be set to 1869"exit;" (i.e. I<after> the closing delimiter of the here document), 1870which would cause the earlier " || die;\nexit;" to be skipped in any 1871sequence of code fragment extractions. 1872 1873To avoid this problem, when it encounters a here document whilst 1874extracting from a modifiable string, C<extract_quotelike> silently 1875rearranges the string to an equivalent piece of Perl: 1876 1877 <<'EOMSG' 1878 This is the message. 1879 EOMSG 1880 || die; 1881 exit; 1882 1883in which the here document I<is> contiguous. It still leaves the 1884matching position after the here document, but now the rest of the line 1885on which the here document starts is not skipped. 1886 1887To prevent <extract_quotelike> from mucking about with the input in this way 1888(this is the only case where a list-context C<extract_quotelike> does so), 1889you can pass the input variable as an interpolated literal: 1890 1891 $quotelike = extract_quotelike("$var"); 1892 1893=item C<extract_codeblock> 1894 1895C<extract_codeblock> attempts to recognize and extract a balanced 1896bracket delimited substring that may contain unbalanced brackets 1897inside Perl quotes or quotelike operations. That is, C<extract_codeblock> 1898is like a combination of C<"extract_bracketed"> and 1899C<"extract_quotelike">. 1900 1901C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: 1902a text to process, a set of delimiter brackets to look for, and a prefix to 1903match first. It also takes an optional fourth parameter, which allows the 1904outermost delimiter brackets to be specified separately (see below), 1905and a fifth parameter used only by L<Parse::RecDescent>. 1906 1907Omitting the first argument (input text) means process C<$_> instead. 1908Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. 1909Omitting the third argument (prefix argument) implies optional whitespace at the start. 1910Omitting the fourth argument (outermost delimiter brackets) indicates that the 1911value of the second argument is to be used for the outermost delimiters. 1912 1913Once the prefix and the outermost opening delimiter bracket have been 1914recognized, code blocks are extracted by stepping through the input text and 1915trying the following alternatives in sequence: 1916 1917=over 4 1918 1919=item 1. 1920 1921Try and match a closing delimiter bracket. If the bracket was the same 1922species as the last opening bracket, return the substring to that 1923point. If the bracket was mismatched, return an error. 1924 1925=item 2. 1926 1927Try to match a quote or quotelike operator. If found, call 1928C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return 1929the error it returned. Otherwise go back to step 1. 1930 1931=item 3. 1932 1933Try to match an opening delimiter bracket. If found, call 1934C<extract_codeblock> recursively to eat the embedded block. If the 1935recursive call fails, return an error. Otherwise, go back to step 1. 1936 1937=item 4. 1938 1939Unconditionally match a bareword or any other single character, and 1940then go back to step 1. 1941 1942=back 1943 1944Examples: 1945 1946 # Find a while loop in the text 1947 1948 if ($text =~ s/.*?while\s*\{/{/) 1949 { 1950 $loop = "while " . extract_codeblock($text); 1951 } 1952 1953 # Remove the first round-bracketed list (which may include 1954 # round- or curly-bracketed code blocks or quotelike operators) 1955 1956 extract_codeblock $text, "(){}", '[^(]*'; 1957 1958 1959The ability to specify a different outermost delimiter bracket is useful 1960in some circumstances. For example, in the Parse::RecDescent module, 1961parser actions which are to be performed only on a successful parse 1962are specified using a C<E<lt>defer:...E<gt>> directive. For example: 1963 1964 sentence: subject verb object 1965 <defer: {$::theVerb = $item{verb}} > 1966 1967Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code 1968within the C<E<lt>defer:...E<gt>> directive, but there's a problem. 1969 1970A deferred action like this: 1971 1972 <defer: {if ($count>10) {$count--}} > 1973 1974will be incorrectly parsed as: 1975 1976 <defer: {if ($count> 1977 1978because the "less than" operator is interpreted as a closing delimiter. 1979 1980But, by extracting the directive using 1981S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> 1982the '>' character is only treated as a delimited at the outermost 1983level of the code block, so the directive is parsed correctly. 1984 1985=item C<extract_multiple> 1986 1987The C<extract_multiple> subroutine takes a string to be processed and a 1988list of extractors (subroutines or regular expressions) to apply to that string. 1989 1990In an array context C<extract_multiple> returns an array of substrings 1991of the original string, as extracted by the specified extractors. 1992In a scalar context, C<extract_multiple> returns the first 1993substring successfully extracted from the original string. In both 1994scalar and void contexts the original string has the first successfully 1995extracted substring removed from it. In all contexts 1996C<extract_multiple> starts at the current C<pos> of the string, and 1997sets that C<pos> appropriately after it matches. 1998 1999Hence, the aim of a call to C<extract_multiple> in a list context 2000is to split the processed string into as many non-overlapping fields as 2001possible, by repeatedly applying each of the specified extractors 2002to the remainder of the string. Thus C<extract_multiple> is 2003a generalized form of Perl's C<split> subroutine. 2004 2005The subroutine takes up to four optional arguments: 2006 2007=over 4 2008 2009=item 1. 2010 2011A string to be processed (C<$_> if the string is omitted or C<undef>) 2012 2013=item 2. 2014 2015A reference to a list of subroutine references and/or qr// objects and/or 2016literal strings and/or hash references, specifying the extractors 2017to be used to split the string. If this argument is omitted (or 2018C<undef>) the list: 2019 2020 [ 2021 sub { extract_variable($_[0], '') }, 2022 sub { extract_quotelike($_[0],'') }, 2023 sub { extract_codeblock($_[0],'{}','') }, 2024 ] 2025 2026is used. 2027 2028=item 3. 2029 2030An number specifying the maximum number of fields to return. If this 2031argument is omitted (or C<undef>), split continues as long as possible. 2032 2033If the third argument is I<N>, then extraction continues until I<N> fields 2034have been successfully extracted, or until the string has been completely 2035processed. 2036 2037Note that in scalar and void contexts the value of this argument is 2038automatically reset to 1 (under C<-w>, a warning is issued if the argument 2039has to be reset). 2040 2041=item 4. 2042 2043A value indicating whether unmatched substrings (see below) within the 2044text should be skipped or returned as fields. If the value is true, 2045such substrings are skipped. Otherwise, they are returned. 2046 2047=back 2048 2049The extraction process works by applying each extractor in 2050sequence to the text string. 2051 2052If the extractor is a subroutine it is called in a list context and is 2053expected to return a list of a single element, namely the extracted 2054text. It may optionally also return two further arguments: a string 2055representing the text left after extraction (like $' for a pattern 2056match), and a string representing any prefix skipped before the 2057extraction (like $` in a pattern match). Note that this is designed 2058to facilitate the use of other Text::Balanced subroutines with 2059C<extract_multiple>. Note too that the value returned by an extractor 2060subroutine need not bear any relationship to the corresponding substring 2061of the original text (see examples below). 2062 2063If the extractor is a precompiled regular expression or a string, 2064it is matched against the text in a scalar context with a leading 2065'\G' and the gc modifiers enabled. The extracted value is either 2066$1 if that variable is defined after the match, or else the 2067complete match (i.e. $&). 2068 2069If the extractor is a hash reference, it must contain exactly one element. 2070The value of that element is one of the 2071above extractor types (subroutine reference, regular expression, or string). 2072The key of that element is the name of a class into which the successful 2073return value of the extractor will be blessed. 2074 2075If an extractor returns a defined value, that value is immediately 2076treated as the next extracted field and pushed onto the list of fields. 2077If the extractor was specified in a hash reference, the field is also 2078blessed into the appropriate class, 2079 2080If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is 2081assumed to have failed to extract. 2082If none of the extractor subroutines succeeds, then one 2083character is extracted from the start of the text and the extraction 2084subroutines reapplied. Characters which are thus removed are accumulated and 2085eventually become the next field (unless the fourth argument is true, in which 2086case they are discarded). 2087 2088For example, the following extracts substrings that are valid Perl variables: 2089 2090 @fields = extract_multiple($text, 2091 [ sub { extract_variable($_[0]) } ], 2092 undef, 1); 2093 2094This example separates a text into fields which are quote delimited, 2095curly bracketed, and anything else. The delimited and bracketed 2096parts are also blessed to identify them (the "anything else" is unblessed): 2097 2098 @fields = extract_multiple($text, 2099 [ 2100 { Delim => sub { extract_delimited($_[0],q{'"}) } }, 2101 { Brack => sub { extract_bracketed($_[0],'{}') } }, 2102 ]); 2103 2104This call extracts the next single substring that is a valid Perl quotelike 2105operator (and removes it from $text): 2106 2107 $quotelike = extract_multiple($text, 2108 [ 2109 sub { extract_quotelike($_[0]) }, 2110 ], undef, 1); 2111 2112Finally, here is yet another way to do comma-separated value parsing: 2113 2114 $csv_text = "a,'x b',c"; 2115 @fields = extract_multiple($csv_text, 2116 [ 2117 sub { extract_delimited($_[0],q{'"}) }, 2118 qr/([^,]+)/, 2119 ], 2120 undef,1); 2121 # @fields is now ('a', "'x b'", 'c') 2122 2123The list in the second argument means: 2124I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. 2125The undef third argument means: 2126I<"...as many times as possible...">, 2127and the true value in the fourth argument means 2128I<"...discarding anything else that appears (i.e. the commas)">. 2129 2130If you wanted the commas preserved as separate fields (i.e. like split 2131does if your split pattern has capturing parentheses), you would 2132just make the last parameter undefined (or remove it). 2133 2134=item C<gen_delimited_pat> 2135 2136The C<gen_delimited_pat> subroutine takes a single (string) argument and 2137builds a Friedl-style optimized regex that matches a string delimited 2138by any one of the characters in the single argument. For example: 2139 2140 gen_delimited_pat(q{'"}) 2141 2142returns the regex: 2143 2144 (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') 2145 2146Note that the specified delimiters are automatically quotemeta'd. 2147 2148A typical use of C<gen_delimited_pat> would be to build special purpose tags 2149for C<extract_tagged>. For example, to properly ignore "empty" XML elements 2150(which might contain quoted strings): 2151 2152 my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; 2153 2154 extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); 2155 2156C<gen_delimited_pat> may also be called with an optional second argument, 2157which specifies the "escape" character(s) to be used for each delimiter. 2158For example to match a Pascal-style string (where ' is the delimiter 2159and '' is a literal ' within the string): 2160 2161 gen_delimited_pat(q{'},q{'}); 2162 2163Different escape characters can be specified for different delimiters. 2164For example, to specify that '/' is the escape for single quotes 2165and '%' is the escape for double quotes: 2166 2167 gen_delimited_pat(q{'"},q{/%}); 2168 2169If more delimiters than escape chars are specified, the last escape char 2170is used for the remaining delimiters. 2171If no escape char is specified for a given specified delimiter, '\' is used. 2172 2173=item C<delimited_pat> 2174 2175Note that C<gen_delimited_pat> was previously called C<delimited_pat>. 2176That name may still be used, but is now deprecated. 2177 2178=back 2179 2180=head1 DIAGNOSTICS 2181 2182In a list context, all the functions return C<(undef,$original_text)> 2183on failure. In a scalar context, failure is indicated by returning C<undef> 2184(in this case the input text is not modified in any way). 2185 2186In addition, on failure in I<any> context, the C<$@> variable is set. 2187Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed 2188below. 2189Accessing C<$@-E<gt>{pos}> returns the offset into the original string at 2190which the error was detected (although not necessarily where it occurred!) 2191Printing C<$@> directly produces the error message, with the offset appended. 2192On success, the C<$@> variable is guaranteed to be C<undef>. 2193 2194The available diagnostics are: 2195 2196=over 4 2197 2198=item C<Did not find a suitable bracket: "%s"> 2199 2200The delimiter provided to C<extract_bracketed> was not one of 2201C<'()[]E<lt>E<gt>{}'>. 2202 2203=item C<Did not find prefix: /%s/> 2204 2205A non-optional prefix was specified but wasn't found at the start of the text. 2206 2207=item C<Did not find opening bracket after prefix: "%s"> 2208 2209C<extract_bracketed> or C<extract_codeblock> was expecting a 2210particular kind of bracket at the start of the text, and didn't find it. 2211 2212=item C<No quotelike operator found after prefix: "%s"> 2213 2214C<extract_quotelike> didn't find one of the quotelike operators C<q>, 2215C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring 2216it was extracting. 2217 2218=item C<Unmatched closing bracket: "%c"> 2219 2220C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered 2221a closing bracket where none was expected. 2222 2223=item C<Unmatched opening bracket(s): "%s"> 2224 2225C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 2226out of characters in the text before closing one or more levels of nested 2227brackets. 2228 2229=item C<Unmatched embedded quote (%s)> 2230 2231C<extract_bracketed> attempted to match an embedded quoted substring, but 2232failed to find a closing quote to match it. 2233 2234=item C<Did not find closing delimiter to match '%s'> 2235 2236C<extract_quotelike> was unable to find a closing delimiter to match the 2237one that opened the quote-like operation. 2238 2239=item C<Mismatched closing bracket: expected "%c" but found "%s"> 2240 2241C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found 2242a valid bracket delimiter, but it was the wrong species. This usually 2243indicates a nesting error, but may indicate incorrect quoting or escaping. 2244 2245=item C<No block delimiter found after quotelike "%s"> 2246 2247C<extract_quotelike> or C<extract_codeblock> found one of the 2248quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> 2249without a suitable block after it. 2250 2251=item C<Did not find leading dereferencer> 2252 2253C<extract_variable> was expecting one of '$', '@', or '%' at the start of 2254a variable, but didn't find any of them. 2255 2256=item C<Bad identifier after dereferencer> 2257 2258C<extract_variable> found a '$', '@', or '%' indicating a variable, but that 2259character was not followed by a legal Perl identifier. 2260 2261=item C<Did not find expected opening bracket at %s> 2262 2263C<extract_codeblock> failed to find any of the outermost opening brackets 2264that were specified. 2265 2266=item C<Improperly nested codeblock at %s> 2267 2268A nested code block was found that started with a delimiter that was specified 2269as being only to be used as an outermost bracket. 2270 2271=item C<Missing second block for quotelike "%s"> 2272 2273C<extract_codeblock> or C<extract_quotelike> found one of the 2274quotelike operators C<s>, C<tr> or C<y> followed by only one block. 2275 2276=item C<No match found for opening bracket> 2277 2278C<extract_codeblock> failed to find a closing bracket to match the outermost 2279opening bracket. 2280 2281=item C<Did not find opening tag: /%s/> 2282 2283C<extract_tagged> did not find a suitable opening tag (after any specified 2284prefix was removed). 2285 2286=item C<Unable to construct closing tag to match: /%s/> 2287 2288C<extract_tagged> matched the specified opening tag and tried to 2289modify the matched text to produce a matching closing tag (because 2290none was specified). It failed to generate the closing tag, almost 2291certainly because the opening tag did not start with a 2292bracket of some kind. 2293 2294=item C<Found invalid nested tag: %s> 2295 2296C<extract_tagged> found a nested tag that appeared in the "reject" list 2297(and the failure mode was not "MAX" or "PARA"). 2298 2299=item C<Found unbalanced nested tag: %s> 2300 2301C<extract_tagged> found a nested opening tag that was not matched by a 2302corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). 2303 2304=item C<Did not find closing tag> 2305 2306C<extract_tagged> reached the end of the text without finding a closing tag 2307to match the original opening tag (and the failure mode was not 2308"MAX" or "PARA"). 2309 2310=back 2311 2312=head1 EXPORTS 2313 2314The following symbols are, or can be, exported by this module: 2315 2316=over 4 2317 2318=item Default Exports 2319 2320I<None>. 2321 2322=item Optional Exports 2323 2324C<extract_delimited>, 2325C<extract_bracketed>, 2326C<extract_quotelike>, 2327C<extract_codeblock>, 2328C<extract_variable>, 2329C<extract_tagged>, 2330C<extract_multiple>, 2331C<gen_delimited_pat>, 2332C<gen_extract_tagged>, 2333C<delimited_pat>. 2334 2335=item Export Tags 2336 2337=over 4 2338 2339=item C<:ALL> 2340 2341C<extract_delimited>, 2342C<extract_bracketed>, 2343C<extract_quotelike>, 2344C<extract_codeblock>, 2345C<extract_variable>, 2346C<extract_tagged>, 2347C<extract_multiple>, 2348C<gen_delimited_pat>, 2349C<gen_extract_tagged>, 2350C<delimited_pat>. 2351 2352=back 2353 2354=back 2355 2356=head1 KNOWN BUGS 2357 2358See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>. 2359 2360=head1 FEEDBACK 2361 2362Patches, bug reports, suggestions or any other feedback is welcome. 2363 2364Patches can be sent as GitHub pull requests at 2365L<https://github.com/steve-m-hay/Text-Balanced/pulls>. 2366 2367Bug reports and suggestions can be made on the CPAN Request Tracker at 2368L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>. 2369 2370Currently active requests on the CPAN Request Tracker can be viewed at 2371L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>. 2372 2373Please test this distribution. See CPAN Testers Reports at 2374L<https://www.cpantesters.org/> for details of how to get involved. 2375 2376Previous test results on CPAN Testers Reports can be viewed at 2377L<https://www.cpantesters.org/distro/T/Text-Balanced.html>. 2378 2379Please rate this distribution on CPAN Ratings at 2380L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>. 2381 2382=head1 AVAILABILITY 2383 2384The latest version of this module is available from CPAN (see 2385L<perlmodlib/"CPAN"> for details) at 2386 2387L<https://metacpan.org/release/Text-Balanced> or 2388 2389L<https://www.cpan.org/authors/id/S/SH/SHAY/> or 2390 2391L<https://www.cpan.org/modules/by-module/Text/>. 2392 2393The latest source code is available from GitHub at 2394L<https://github.com/steve-m-hay/Text-Balanced>. 2395 2396=head1 INSTALLATION 2397 2398See the F<INSTALL> file. 2399 2400=head1 AUTHOR 2401 2402Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>. 2403 2404Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining 2405Text::Balanced as of version 2.03. 2406 2407=head1 COPYRIGHT 2408 2409Copyright (C) 1997-2001 Damian Conway. All rights reserved. 2410 2411Copyright (C) 2009 Adam Kennedy. 2412 2413Copyright (C) 2015, 2020, 2022 Steve Hay and other contributors. All rights 2414reserved. 2415 2416=head1 LICENCE 2417 2418This module is free software; you can redistribute it and/or modify it under the 2419same terms as Perl itself, i.e. under the terms of either the GNU General Public 2420License or the Artistic License, as specified in the F<LICENCE> file. 2421 2422=head1 VERSION 2423 2424Version 2.06 2425 2426=head1 DATE 2427 242805 Jun 2022 2429 2430=head1 HISTORY 2431 2432See the F<Changes> file. 2433 2434=cut 2435