1# Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2# Copyright (C) 2009 Adam Kennedy.
3# Copyright (C) 2015, 2022 Steve Hay and other contributors.  All rights
4# reserved.
5
6# This module is free software; you can redistribute it and/or modify it under
7# the same terms as Perl itself, i.e. under the terms of either the GNU General
8# Public License or the Artistic License, as specified in the F<LICENCE> file.
9
10package Text::Balanced;
11
12# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
13# FOR FULL DOCUMENTATION SEE Balanced.pod
14
15use 5.008001;
16use strict;
17use Exporter ();
18
19use vars qw { $VERSION @ISA %EXPORT_TAGS };
20BEGIN {
21    $VERSION     = '2.06';
22    @ISA         = 'Exporter';
23    %EXPORT_TAGS = (
24        ALL => [ qw{
25            &extract_delimited
26            &extract_bracketed
27            &extract_quotelike
28            &extract_codeblock
29            &extract_variable
30            &extract_tagged
31            &extract_multiple
32            &gen_delimited_pat
33            &gen_extract_tagged
34            &delimited_pat
35        } ],
36    );
37}
38
39Exporter::export_ok_tags('ALL');
40
41our $RE_PREREGEX_PAT = qr#(
42    [!=]~
43    | split|grep|map
44    | not|and|or|xor
45)#x;
46our $RE_EXPR_PAT = qr#(
47    (?:\*\*|&&|\|\||<<|>>|//|[-+*x%^&|.])=?
48    | /(?:[^/])
49    | =(?!>)
50    | return
51    | [\(\[]
52)#x;
53our $RE_NUM = qr/\s*[+\-.0-9][+\-.0-9e]*/i; # numerical constant
54
55our %ref2slashvalid; # is quotelike /.../ pattern valid here for given textref?
56our %ref2qmarkvalid; # is quotelike ?...? pattern valid here for given textref?
57
58# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
59
60sub _failmsg {
61    my ($message, $pos) = @_;
62    $@ = bless {
63        error => $message,
64        pos   => $pos,
65    }, 'Text::Balanced::ErrorMsg';
66}
67
68sub _fail {
69    my ($wantarray, $textref, $message, $pos) = @_;
70    _failmsg $message, $pos if $message;
71    return (undef, $$textref, undef) if $wantarray;
72    return;
73}
74
75sub _succeed {
76    $@ = undef;
77    my ($wantarray,$textref) = splice @_, 0, 2;
78    my ($extrapos, $extralen) = @_ > 18
79        ? splice(@_, -2, 2)
80        : (0, 0);
81    my ($startlen, $oppos) = @_[5,6];
82    my $remainderpos = $_[2];
83    if ( $wantarray ) {
84        my @res;
85        while (my ($from, $len) = splice @_, 0, 2) {
86            push @res, substr($$textref, $from, $len);
87        }
88        if ( $extralen ) { # CORRECT FILLET
89            my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
90            $res[1] = "$extra$res[1]";
91            eval { substr($$textref,$remainderpos,0) = $extra;
92                   substr($$textref,$extrapos,$extralen,"\n")} ;
93                    #REARRANGE HERE DOC AND FILLET IF POSSIBLE
94            pos($$textref) = $remainderpos-$extralen+1; # RESET \G
95        } else {
96            pos($$textref) = $remainderpos;             # RESET \G
97        }
98        return @res;
99    } else {
100        my $match = substr($$textref,$_[0],$_[1]);
101        substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
102        my $extra = $extralen
103            ? substr($$textref, $extrapos, $extralen)."\n" : "";
104        eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
105        pos($$textref) = $_[4];                         # RESET \G
106        return $match;
107    }
108}
109
110# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
111## no critic (Subroutines::ProhibitSubroutinePrototypes)
112
113sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
114{
115    my ($dels, $escs) = @_;
116    return "" unless $dels =~ /\S/;
117    $escs = '\\' unless $escs;
118    $escs .= substr($escs,-1) x (length($dels)-length($escs));
119    my @pat = ();
120    my $i;
121    for ($i=0; $i<length $dels; $i++)
122    {
123        my $del = quotemeta substr($dels,$i,1);
124        my $esc = quotemeta substr($escs,$i,1);
125        if ($del eq $esc)
126        {
127            push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
128        }
129        else
130        {
131            push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
132        }
133    }
134    my $pat = join '|', @pat;
135    return "(?:$pat)";
136}
137
138*delimited_pat = \&gen_delimited_pat;
139
140# THE EXTRACTION FUNCTIONS
141
142sub extract_delimited (;$$$$)
143{
144    my $textref = defined $_[0] ? \$_[0] : \$_;
145    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
146    my $wantarray = wantarray;
147    my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
148    my $pre  = defined $_[2] ? $_[2] : '\s*';
149    my $esc  = defined $_[3] ? $_[3] : qq{\\};
150    my $pat = gen_delimited_pat($del, $esc);
151    my $startpos = pos $$textref || 0;
152    return _fail($wantarray, $textref, "Not a delimited pattern", 0)
153        unless $$textref =~ m/\G($pre)($pat)/gc;
154    my $prelen = length($1);
155    my $matchpos = $startpos+$prelen;
156    my $endpos = pos $$textref;
157    return _succeed $wantarray, $textref,
158                    $matchpos, $endpos-$matchpos,               # MATCH
159                    $endpos,   length($$textref)-$endpos,       # REMAINDER
160                    $startpos, $prelen;                         # PREFIX
161}
162
163my %eb_delim_cache;
164sub _eb_delims {
165    my ($ldel_orig) = @_;
166    return @{ $eb_delim_cache{$ldel_orig} } if $eb_delim_cache{$ldel_orig};
167    my $qdel = "";
168    my $quotelike;
169    my $ldel = $ldel_orig;
170    $ldel =~ s/'//g and $qdel .= q{'};
171    $ldel =~ s/"//g and $qdel .= q{"};
172    $ldel =~ s/`//g and $qdel .= q{`};
173    $ldel =~ s/q//g and $quotelike = 1;
174    $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
175    my $rdel = $ldel;
176    return @{ $eb_delim_cache{$ldel_orig} = [] } unless $rdel =~ tr/[({</])}>/;
177    my $posbug = pos;
178    $ldel = join('|', map { quotemeta $_ } split('', $ldel));
179    $rdel = join('|', map { quotemeta $_ } split('', $rdel));
180    pos = $posbug;
181    @{ $eb_delim_cache{$ldel_orig} = [
182        qr/\G($ldel)/, $qdel && qr/\G([$qdel])/, $quotelike, qr/\G($rdel)/
183    ] };
184}
185sub extract_bracketed (;$$$)
186{
187    my $textref = defined $_[0] ? \$_[0] : \$_;
188    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
189    my $ldel = defined $_[1] ? $_[1] : '{([<';
190    my $pre  = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/;
191    my $wantarray = wantarray;
192    my @ret = _eb_delims($ldel);
193    unless (@ret)
194    {
195        return _fail $wantarray, $textref,
196                     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
197                     0;
198    }
199
200    my $startpos = pos $$textref || 0;
201    my @match = _match_bracketed($textref, $pre, @ret);
202
203    return _fail ($wantarray, $textref) unless @match;
204
205    return _succeed ( $wantarray, $textref,
206                      $match[2], $match[5]+2,           # MATCH
207                      @match[8,9],                      # REMAINDER
208                      @match[0,1],                      # PREFIX
209                    );
210}
211
212sub _match_bracketed    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
213{
214    my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
215    my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
216    unless ($$textref =~ m/$pre/gc)
217    {
218        _failmsg "Did not find prefix: /$pre/", $startpos;
219        return;
220    }
221
222    $ldelpos = pos $$textref;
223
224    unless ($$textref =~ m/$ldel/gc)
225    {
226        _failmsg "Did not find opening bracket after prefix: \"$pre\"",
227                 pos $$textref;
228        pos $$textref = $startpos;
229        return;
230    }
231
232    my @nesting = ( $1 );
233    my $textlen = length $$textref;
234    while (pos $$textref < $textlen)
235    {
236        next if $$textref =~ m/\G\\./gcs;
237
238        if ($$textref =~ m/$ldel/gc)
239        {
240            push @nesting, $1;
241        }
242        elsif ($$textref =~ m/$rdel/gc)
243        {
244            my ($found, $brackettype) = ($1, $1);
245            if ($#nesting < 0)
246            {
247                _failmsg "Unmatched closing bracket: \"$found\"",
248                         pos $$textref;
249                pos $$textref = $startpos;
250                return;
251            }
252            my $expected = pop(@nesting);
253            $expected =~ tr/({[</)}]>/;
254            if ($expected ne $brackettype)
255            {
256                _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
257                         pos $$textref;
258                pos $$textref = $startpos;
259                return;
260            }
261            last if $#nesting < 0;
262        }
263        elsif ($qdel && $$textref =~ m/$qdel/gc)
264        {
265            $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
266            _failmsg "Unmatched embedded quote ($1)",
267                     pos $$textref;
268            pos $$textref = $startpos;
269            return;
270        }
271        elsif ($quotelike && _match_quotelike($textref,qr/\G()/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}))
272        {
273            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1; # back-compat
274            next;
275        }
276
277        else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
278    }
279    if ($#nesting>=0)
280    {
281        _failmsg "Unmatched opening bracket(s): "
282                     . join("..",@nesting)."..",
283                 pos $$textref;
284        pos $$textref = $startpos;
285        return;
286    }
287
288    $endpos = pos $$textref;
289
290    return (
291        $startpos,  $ldelpos-$startpos,         # PREFIX
292        $ldelpos,   1,                          # OPENING BRACKET
293        $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
294        $endpos-1,  1,                          # CLOSING BRACKET
295        $endpos,    length($$textref)-$endpos,  # REMAINDER
296    );
297}
298
299sub _revbracket($)
300{
301    my $brack = reverse $_[0];
302    $brack =~ tr/[({</])}>/;
303    return $brack;
304}
305
306my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
307
308my $et_default_ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>';
309sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
310{
311    my $textref = defined $_[0] ? \$_[0] : \$_;
312    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
313    my $ldel    = $_[1];
314    my $rdel    = $_[2];
315    my $pre     = defined $_[3] ? qr/\G$_[3]/ : qr/\G\s*/;
316    my %options = defined $_[4] ? %{$_[4]} : ();
317    my $omode   = defined $options{fail} ? $options{fail} : '';
318    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
319                : defined($options{reject})        ? $options{reject}
320                :                                    ''
321                ;
322    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
323                : defined($options{ignore})        ? $options{ignore}
324                :                                    ''
325                ;
326
327    $ldel = $et_default_ldel if !defined $ldel;
328    $@ = undef;
329
330    my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
331
332    return _fail(wantarray, $textref) unless @match;
333    return _succeed wantarray, $textref,
334            $match[2], $match[3]+$match[5]+$match[7],    # MATCH
335            @match[8..9,0..1,2..7];                      # REM, PRE, BITS
336}
337
338sub _match_tagged       # ($$$$$$$)
339{
340    my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
341    my $rdelspec;
342
343    my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
344
345    unless ($$textref =~ m/$pre/gc)
346    {
347        _failmsg "Did not find prefix: /$pre/", pos $$textref;
348        goto failed;
349    }
350
351    $opentagpos = pos($$textref);
352
353    unless ($$textref =~ m/\G$ldel/gc)
354    {
355        _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
356        goto failed;
357    }
358
359    $textpos = pos($$textref);
360
361    if (!defined $rdel)
362    {
363        $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
364        unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
365        {
366            _failmsg "Unable to construct closing tag to match: $rdel",
367                     pos $$textref;
368            goto failed;
369        }
370    }
371    else
372    {
373        ## no critic (BuiltinFunctions::ProhibitStringyEval)
374        $rdelspec = eval "qq{$rdel}" || do {
375            my $del;
376            for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
377                { next if $rdel =~ /\Q$_/; $del = $_; last }
378            unless ($del) {
379                use Carp;
380                croak "Can't interpolate right delimiter $rdel"
381            }
382            eval "qq$del$rdel$del";
383        };
384    }
385
386    while (pos($$textref) < length($$textref))
387    {
388        next if $$textref =~ m/\G\\./gc;
389
390        if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
391        {
392            $parapos = pos($$textref) - length($1)
393                unless defined $parapos;
394        }
395        elsif ($$textref =~ m/\G($rdelspec)/gc )
396        {
397            $closetagpos = pos($$textref)-length($1);
398            goto matched;
399        }
400        elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
401        {
402            next;
403        }
404        elsif ($bad && $$textref =~ m/\G($bad)/gcs)
405        {
406            pos($$textref) -= length($1);       # CUT OFF WHATEVER CAUSED THE SHORTNESS
407            goto short if ($omode eq 'PARA' || $omode eq 'MAX');
408            _failmsg "Found invalid nested tag: $1", pos $$textref;
409            goto failed;
410        }
411        elsif ($$textref =~ m/\G($ldel)/gc)
412        {
413            my $tag = $1;
414            pos($$textref) -= length($tag);     # REWIND TO NESTED TAG
415            unless (_match_tagged(@_))  # MATCH NESTED TAG
416            {
417                goto short if $omode eq 'PARA' || $omode eq 'MAX';
418                _failmsg "Found unbalanced nested tag: $tag",
419                         pos $$textref;
420                goto failed;
421            }
422        }
423        else { $$textref =~ m/./gcs }
424    }
425
426short:
427    $closetagpos = pos($$textref);
428    goto matched if $omode eq 'MAX';
429    goto failed unless $omode eq 'PARA';
430
431    if (defined $parapos) { pos($$textref) = $parapos }
432    else                  { $parapos = pos($$textref) }
433
434    return (
435        $startpos,    $opentagpos-$startpos,            # PREFIX
436        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
437        $textpos,     $parapos-$textpos,                # TEXT
438        $parapos,     0,                                # NO CLOSING TAG
439        $parapos,     length($$textref)-$parapos,       # REMAINDER
440    );
441
442matched:
443    $endpos = pos($$textref);
444    return (
445        $startpos,    $opentagpos-$startpos,            # PREFIX
446        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
447        $textpos,     $closetagpos-$textpos,            # TEXT
448        $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
449        $endpos,      length($$textref)-$endpos,        # REMAINDER
450    );
451
452failed:
453    _failmsg "Did not find closing tag", pos $$textref unless $@;
454    pos($$textref) = $startpos;
455    return;
456}
457
458sub extract_variable (;$$)
459{
460    my $textref = defined $_[0] ? \$_[0] : \$_;
461    return ("","","") unless defined $$textref;
462    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
463    my $pre  = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/;
464
465    my @match = _match_variable($textref,$pre);
466
467    return _fail wantarray, $textref unless @match;
468
469    return _succeed wantarray, $textref,
470                    @match[2..3,4..5,0..1];        # MATCH, REMAINDER, PREFIX
471}
472
473sub _match_variable
474{
475#  $#
476#  $^
477#  $$
478    my ($textref, $pre) = @_;
479    my $startpos = pos($$textref) = pos($$textref)||0;
480    unless ($$textref =~ m/$pre/gc)
481    {
482        _failmsg "Did not find prefix: /$pre/", pos $$textref;
483        return;
484    }
485    my $varpos = pos($$textref);
486    unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
487    {
488        unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
489        {
490            _failmsg "Did not find leading dereferencer", pos $$textref;
491            pos $$textref = $startpos;
492            return;
493        }
494        my $deref = $1;
495
496        unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
497            or _match_codeblock($textref, qr/\G()/, '\{', qr/\G\s*(\})/, '\{', '\}', 0, 1)
498            or $deref eq '$#' or $deref eq '$$'
499            or pos($$textref) == length $$textref )
500        {
501            _failmsg "Bad identifier after dereferencer", pos $$textref;
502            pos $$textref = $startpos;
503            return;
504        }
505    }
506
507    while (1)
508    {
509        next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
510        next if _match_codeblock($textref,
511                                 qr/\G\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
512                                 qr/[({[]/, qr/\G\s*([)}\]])/,
513                                 qr/[({[]/, qr/[)}\]]/, 0, 1);
514        next if _match_codeblock($textref,
515                                 qr/\G\s*/, qr/[{[]/, qr/\G\s*([}\]])/,
516                                 qr/[{[]/, qr/[}\]]/, 0, 1);
517        next if _match_variable($textref,qr/\G\s*->\s*/);
518        next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
519        last;
520    }
521    $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
522
523    my $endpos = pos($$textref);
524    return ($startpos, $varpos-$startpos,
525            $varpos,   $endpos-$varpos,
526            $endpos,   length($$textref)-$endpos
527    );
528}
529
530my %ec_delim_cache;
531sub _ec_delims {
532    my ($ldel_inner, $ldel_outer) = @_;
533    return @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} }
534        if $ec_delim_cache{$ldel_outer}{$ldel_inner};
535    my $rdel_inner = $ldel_inner;
536    my $rdel_outer = $ldel_outer;
537    my $posbug = pos;
538    for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
539    for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
540    for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
541    {
542        $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
543    }
544    pos = $posbug;
545    @{ $ec_delim_cache{$ldel_outer}{$ldel_inner} = [
546        $ldel_outer, qr/\G\s*($rdel_outer)/, $ldel_inner, $rdel_inner
547    ] };
548}
549sub extract_codeblock (;$$$$$)
550{
551    my $textref = defined $_[0] ? \$_[0] : \$_;
552    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
553    my $wantarray = wantarray;
554    my $ldel_inner = defined $_[1] ? $_[1] : '{';
555    my $pre = !defined $_[2] ? qr/\G\s*/ : qr/\G$_[2]/;
556    my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
557    my $rd         = $_[4];
558    my @delims = _ec_delims($ldel_inner, $ldel_outer);
559
560    my @match = _match_codeblock($textref, $pre, @delims, $rd, 1);
561    return _fail($wantarray, $textref) unless @match;
562    return _succeed($wantarray, $textref,
563                    @match[2..3,4..5,0..1]    # MATCH, REMAINDER, PREFIX
564    );
565}
566
567sub _match_codeblock
568{
569    my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd, $no_backcompat) = @_;
570    $rdel_outer = qr/\G\s*($rdel_outer)/ if !$no_backcompat; # Switch calls this func directly
571    my $startpos = pos($$textref) = pos($$textref) || 0;
572    unless ($$textref =~ m/$pre/gc)
573    {
574        _failmsg qq{Did not match prefix /$pre/ at"} .
575                     substr($$textref,pos($$textref),20) .
576                     q{..."},
577                 pos $$textref;
578        return;
579    }
580    my $codepos = pos($$textref);
581    unless ($$textref =~ m/\G($ldel_outer)/gc)  # OUTERMOST DELIMITER
582    {
583        _failmsg qq{Did not find expected opening bracket at "} .
584                     substr($$textref,pos($$textref),20) .
585                     q{..."},
586                 pos $$textref;
587        pos $$textref = $startpos;
588        return;
589    }
590    my $closing = $1;
591       $closing =~ tr/([<{/)]>}/;
592    my $matched;
593    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0
594      if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset
595    while (pos($$textref) < length($$textref))
596    {
597        if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
598        {
599            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
600            next;
601        }
602
603        if ($$textref =~ m/\G\s*#.*/gc)
604        {
605            next;
606        }
607
608        if ($$textref =~ m/$rdel_outer/gc)
609        {
610            unless ($matched = ($closing && $1 eq $closing) )
611            {
612                next if $1 eq '>';      # MIGHT BE A "LESS THAN"
613                _failmsg q{Mismatched closing bracket at "} .
614                             substr($$textref,pos($$textref),20) .
615                             qq{...". Expected '$closing'},
616                         pos $$textref;
617            }
618            last;
619        }
620
621        if (_match_variable($textref,qr/\G\s*/) ||
622            _match_quotelike($textref,qr/\G\s*/,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref}) )
623        {
624            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
625            next;
626        }
627
628        if ($$textref =~ m#\G\s*(?!$ldel_inner)(?:$RE_PREREGEX_PAT|$RE_EXPR_PAT)#gc)
629        {
630            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
631            next;
632        }
633
634        if ( _match_codeblock($textref, qr/\G\s*/, $ldel_inner, qr/\G\s*($rdel_inner)/, $ldel_inner, $rdel_inner, $rd, 1) )
635        {
636            $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
637            next;
638        }
639
640        if ($$textref =~ m/\G\s*$ldel_outer/gc)
641        {
642            _failmsg q{Improperly nested codeblock at "} .
643                         substr($$textref,pos($$textref),20) .
644                         q{..."},
645                     pos $$textref;
646            last;
647        }
648
649        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
650        $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
651    }
652    continue { $@ = undef }
653
654    unless ($matched)
655    {
656        _failmsg 'No match found for opening bracket', pos $$textref
657                unless $@;
658        return;
659    }
660
661    $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = undef;
662    my $endpos = pos($$textref);
663    return ( $startpos, $codepos-$startpos,
664             $codepos, $endpos-$codepos,
665             $endpos,  length($$textref)-$endpos,
666    );
667}
668
669
670my %mods   = (
671    'none' => '[cgimsox]*',
672    'm'    => '[cgimsox]*',
673    's'    => '[cegimsox]*',
674    'tr'   => '[cds]*',
675    'y'    => '[cds]*',
676    'qq'   => '',
677    'qx'   => '',
678    'qw'   => '',
679    'qr'   => '[imsx]*',
680    'q'    => '',
681);
682
683sub extract_quotelike (;$$)
684{
685    my $textref = $_[0] ? \$_[0] : \$_;
686    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
687    my $wantarray = wantarray;
688    my $pre  = defined $_[1] ? qr/\G$_[1]/ : qr/\G\s*/;
689
690    my @match = _match_quotelike($textref,$pre,$ref2slashvalid{$textref},$ref2qmarkvalid{$textref});
691    return _fail($wantarray, $textref) unless @match;
692    return _succeed($wantarray, $textref,
693                    $match[2], $match[18]-$match[2],    # MATCH
694                    @match[18,19],                      # REMAINDER
695                    @match[0,1],                        # PREFIX
696                    @match[2..17],                      # THE BITS
697                    @match[20,21],                      # ANY FILLET?
698    );
699};
700
701my %maybe_quote = map +($_=>1), qw(" ' `);
702sub _match_quotelike
703{
704    my ($textref, $pre, $allow_slash_match, $allow_qmark_match) = @_;
705    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0
706      if !pos($$textref) or !defined $ref2slashvalid{$textref}; # default, or reset
707
708    my ($textlen,$startpos,
709        $preld1pos,$ld1pos,$str1pos,$rd1pos,
710        $preld2pos,$ld2pos,$str2pos,$rd2pos,
711        $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
712
713    unless ($$textref =~ m/$pre/gc)
714    {
715        _failmsg qq{Did not find prefix /$pre/ at "} .
716                     substr($$textref, pos($$textref), 20) .
717                     q{..."},
718                 pos $$textref;
719        return;
720    }
721    my $oppos = pos($$textref);
722    my $initial = substr($$textref,$oppos,1);
723    if ($initial && $maybe_quote{$initial}
724                 || $allow_slash_match && $initial eq '/'
725                 || $allow_qmark_match && $initial eq '?')
726    {
727        unless ($$textref =~ m/\G \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
728        {
729            _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
730                         substr($$textref, $oppos, 20) .
731                         q{..."},
732                     pos $$textref;
733            pos $$textref = $startpos;
734            return;
735        }
736        $modpos= pos($$textref);
737        $rd1pos = $modpos-1;
738
739        if ($initial eq '/' || $initial eq '?')
740        {
741            $$textref =~ m/\G$mods{none}/gc
742        }
743
744        my $endpos = pos($$textref);
745        $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0;
746        return (
747            $startpos,  $oppos-$startpos,       # PREFIX
748            $oppos,     0,                      # NO OPERATOR
749            $oppos,     1,                      # LEFT DEL
750            $oppos+1,   $rd1pos-$oppos-1,       # STR/PAT
751            $rd1pos,    1,                      # RIGHT DEL
752            $modpos,    0,                      # NO 2ND LDEL
753            $modpos,    0,                      # NO 2ND STR
754            $modpos,    0,                      # NO 2ND RDEL
755            $modpos,    $endpos-$modpos,        # MODIFIERS
756            $endpos,    $textlen-$endpos,       # REMAINDER
757        );
758    }
759
760    unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<(?=[a-zA-Z]|\s*['"`;,]))}gc)
761    {
762        _failmsg q{No quotelike operator found after prefix at "} .
763                     substr($$textref, pos($$textref), 20) .
764                     q{..."},
765                 pos $$textref;
766        pos $$textref = $startpos;
767        return;
768    }
769
770    my $op = $1;
771    $preld1pos = pos($$textref);
772    if ($op eq '<<') {
773        $ld1pos = pos($$textref);
774        my $label;
775        if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
776            $label = $1;
777        }
778        elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
779                             | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
780                             | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
781                             }gcsx) {
782            $label = $+;
783        }
784        else {
785            $label = "";
786        }
787        my $extrapos = pos($$textref);
788        $$textref =~ m{.*\n}gc;
789        $str1pos = pos($$textref)--;
790        unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
791            _failmsg qq{Missing here doc terminator ('$label') after "} .
792                         substr($$textref, $startpos, 20) .
793                         q{..."},
794                     pos $$textref;
795            pos $$textref = $startpos;
796            return;
797        }
798        $rd1pos = pos($$textref);
799        $$textref =~ m{\Q$label\E\n}gc;
800        $ld2pos = pos($$textref);
801        $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = 0;
802        return (
803            $startpos,  $oppos-$startpos,       # PREFIX
804            $oppos,     length($op),            # OPERATOR
805            $ld1pos,    $extrapos-$ld1pos,      # LEFT DEL
806            $str1pos,   $rd1pos-$str1pos,       # STR/PAT
807            $rd1pos,    $ld2pos-$rd1pos,        # RIGHT DEL
808            $ld2pos,    0,                      # NO 2ND LDEL
809            $ld2pos,    0,                      # NO 2ND STR
810            $ld2pos,    0,                      # NO 2ND RDEL
811            $ld2pos,    0,                      # NO MODIFIERS
812            $ld2pos,    $textlen-$ld2pos,       # REMAINDER
813            $extrapos,  $str1pos-$extrapos,     # FILLETED BIT
814        );
815    }
816
817    $$textref =~ m/\G\s*/gc;
818    $ld1pos = pos($$textref);
819    $str1pos = $ld1pos+1;
820
821    if ($$textref !~ m/\G(\S)/gc)   # SHOULD USE LOOKAHEAD
822    {
823        _failmsg "No block delimiter found after quotelike $op",
824                 pos $$textref;
825        pos $$textref = $startpos;
826        return;
827    }
828    elsif (substr($$textref, $ld1pos, 2) eq '=>')
829    {
830        _failmsg "quotelike $op was actually quoted by '=>'",
831                 pos $$textref;
832        pos $$textref = $startpos;
833        return;
834    }
835    pos($$textref) = $ld1pos;   # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
836    my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
837    if ($ldel1 =~ /[[(<{]/)
838    {
839        $rdel1 =~ tr/[({</])}>/;
840        defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel1)/,"","",qr/\G($rdel1)/))
841            || do { pos $$textref = $startpos; return };
842        $ld2pos = pos($$textref);
843        $rd1pos = $ld2pos-1;
844    }
845    else
846    {
847        $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
848            || do { pos $$textref = $startpos; return };
849        $ld2pos = $rd1pos = pos($$textref)-1;
850    }
851
852    my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
853    if ($second_arg)
854    {
855        my ($ldel2, $rdel2);
856        if ($ldel1 =~ /[[(<{]/)
857        {
858            unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
859            {
860                _failmsg "Missing second block for quotelike $op",
861                         pos $$textref;
862                pos $$textref = $startpos;
863                return;
864            }
865            $ldel2 = $rdel2 = "\Q$1";
866            $rdel2 =~ tr/[({</])}>/;
867        }
868        else
869        {
870            $ldel2 = $rdel2 = $ldel1;
871        }
872        $str2pos = $ld2pos+1;
873
874        if ($ldel2 =~ /[[(<{]/)
875        {
876            pos($$textref)--;   # OVERCOME BROKEN LOOKAHEAD
877            defined(_match_bracketed($textref,qr/\G/,qr/\G($ldel2)/,"","",qr/\G($rdel2)/))
878                || do { pos $$textref = $startpos; return };
879        }
880        else
881        {
882            $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
883                || do { pos $$textref = $startpos; return };
884        }
885        $rd2pos = pos($$textref)-1;
886    }
887    else
888    {
889        $ld2pos = $str2pos = $rd2pos = $rd1pos;
890    }
891
892    $modpos = pos $$textref;
893
894    $$textref =~ m/\G($mods{$op})/gc;
895    my $endpos = pos $$textref;
896    $ref2qmarkvalid{$textref} = $ref2slashvalid{$textref} = undef;
897
898    return (
899        $startpos,      $oppos-$startpos,       # PREFIX
900        $oppos,         length($op),            # OPERATOR
901        $ld1pos,        1,                      # LEFT DEL
902        $str1pos,       $rd1pos-$str1pos,       # STR/PAT
903        $rd1pos,        1,                      # RIGHT DEL
904        $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
905        $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
906        $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
907        $modpos,        $endpos-$modpos,        # MODIFIERS
908        $endpos,        $textlen-$endpos,       # REMAINDER
909    );
910}
911
912my $def_func = [
913    sub { extract_variable($_[0], '') },
914    sub { extract_quotelike($_[0],'') },
915    sub { extract_codeblock($_[0],'{}','') },
916];
917my %ref_not_regex = map +($_=>1), qw(CODE Text::Balanced::Extractor);
918
919sub _update_patvalid {
920    my ($textref, $text) = @_;
921    if ($ref2slashvalid{$textref} && $text =~ m/(?:$RE_NUM|[\)\]])\s*$/)
922    {
923        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 0;
924    } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_PREREGEX_PAT\s*$/)
925    {
926        $ref2slashvalid{$textref} = $ref2qmarkvalid{$textref} = 1;
927    } elsif (!$ref2slashvalid{$textref} && $text =~ m/$RE_EXPR_PAT\s*$/)
928    {
929        $ref2slashvalid{$textref} = 1;
930        $ref2qmarkvalid{$textref} = 0;
931    }
932}
933sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
934{
935    my $textref = defined($_[0]) ? \$_[0] : \$_;
936    $ref2slashvalid{$textref} = 1, $ref2qmarkvalid{$textref} = 0 if !pos($$textref); # reset
937    my $posbug = pos;
938    my ($lastpos, $firstpos);
939    my @fields = ();
940
941    #for ($$textref)
942    {
943        my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
944        my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
945        my $igunk = $_[3];
946
947        pos $$textref ||= 0;
948
949        unless (wantarray)
950        {
951            use Carp;
952            carp "extract_multiple reset maximal count to 1 in scalar context"
953                    if $^W && defined($_[2]) && $max > 1;
954            $max = 1
955        }
956
957        my @class;
958        foreach my $func ( @func )
959        {
960            push @class, undef;
961            ($class[-1], $func) = %$func if ref($func) eq 'HASH';
962            $func = qr/\G$func/ if !$ref_not_regex{ref $func};
963        }
964
965        my $unkpos;
966        FIELD: while (pos($$textref) < length($$textref))
967        {
968            foreach my $i ( 0..$#func )
969            {
970                my ($field, $pref);
971                my ($class, $func) = ($class[$i], $func[$i]);
972                $lastpos = pos $$textref;
973                if (ref($func) eq 'CODE')
974                    { ($field,undef,$pref) = $func->($$textref) }
975                elsif (ref($func) eq 'Text::Balanced::Extractor')
976                    { $field = $func->extract($$textref) }
977                elsif( $$textref =~ m/$func[$i]/gc )
978                    { $field = defined($1)
979                        ? $1
980                        : substr($$textref, $-[0], $+[0] - $-[0])
981                    }
982                $pref ||= "";
983                if (defined($field) && length($field))
984                {
985                    if (!$igunk) {
986                        $unkpos = $lastpos
987                            if length($pref) && !defined($unkpos);
988                        if (defined $unkpos)
989                        {
990                            push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
991                            $firstpos = $unkpos unless defined $firstpos;
992                            undef $unkpos;
993                            last FIELD if @fields == $max;
994                        }
995                    }
996                    push @fields, $class ? bless(\$field, $class) : $field;
997                    _update_patvalid($textref, $fields[-1]);
998                    $firstpos = $lastpos unless defined $firstpos;
999                    $lastpos = pos $$textref;
1000                    last FIELD if @fields == $max;
1001                    next FIELD;
1002                }
1003            }
1004            if ($$textref =~ /\G(.)/gcs)
1005            {
1006                $unkpos = pos($$textref)-1
1007                    unless $igunk || defined $unkpos;
1008                _update_patvalid($textref, substr $$textref, $unkpos, pos($$textref)-$unkpos);
1009            }
1010        }
1011
1012        if (defined $unkpos)
1013        {
1014            push @fields, substr($$textref, $unkpos);
1015            $firstpos = $unkpos unless defined $firstpos;
1016            $lastpos = length $$textref;
1017        }
1018        last;
1019    }
1020
1021    pos $$textref = $lastpos;
1022    return @fields if wantarray;
1023
1024    $firstpos ||= 0;
1025    eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
1026           pos $$textref = $firstpos };
1027    return $fields[0];
1028}
1029
1030sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
1031{
1032    my $ldel    = $_[0];
1033    my $rdel    = $_[1];
1034    my $pre     = defined $_[2] ? qr/\G$_[2]/ : qr/\G\s*/;
1035    my %options = defined $_[3] ? %{$_[3]} : ();
1036    my $omode   = defined $options{fail} ? $options{fail} : '';
1037    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
1038                : defined($options{reject})        ? $options{reject}
1039                :                                    ''
1040                ;
1041    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
1042                : defined($options{ignore})        ? $options{ignore}
1043                :                                    ''
1044                ;
1045
1046    $ldel = $et_default_ldel if !defined $ldel;
1047
1048    my $posbug = pos;
1049    for ($ldel, $bad, $ignore) { $_ = qr/$_/ if $_ }
1050    pos = $posbug;
1051
1052    my $closure = sub
1053    {
1054        my $textref = defined $_[0] ? \$_[0] : \$_;
1055        my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1056
1057        return _fail(wantarray, $textref) unless @match;
1058        return _succeed wantarray, $textref,
1059                        $match[2], $match[3]+$match[5]+$match[7],   # MATCH
1060                        @match[8..9,0..1,2..7];                     # REM, PRE, BITS
1061    };
1062
1063    bless $closure, 'Text::Balanced::Extractor';
1064}
1065
1066package Text::Balanced::Extractor;
1067
1068sub extract($$) # ($self, $text)
1069{
1070    &{$_[0]}($_[1]);
1071}
1072
1073package Text::Balanced::ErrorMsg;
1074
1075use overload
1076    '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" },
1077    fallback => 1;
1078
10791;
1080
1081__END__
1082
1083=pod
1084
1085=head1 NAME
1086
1087Text::Balanced - Extract delimited text sequences from strings.
1088
1089=head1 SYNOPSIS
1090
1091    use Text::Balanced qw (
1092        extract_delimited
1093        extract_bracketed
1094        extract_quotelike
1095        extract_codeblock
1096        extract_variable
1097        extract_tagged
1098        extract_multiple
1099        gen_delimited_pat
1100        gen_extract_tagged
1101    );
1102
1103    # Extract the initial substring of $text that is delimited by
1104    # two (unescaped) instances of the first character in $delim.
1105
1106    ($extracted, $remainder) = extract_delimited($text,$delim);
1107
1108    # Extract the initial substring of $text that is bracketed
1109    # with a delimiter(s) specified by $delim (where the string
1110    # in $delim contains one or more of '(){}[]<>').
1111
1112    ($extracted, $remainder) = extract_bracketed($text,$delim);
1113
1114    # Extract the initial substring of $text that is bounded by
1115    # an XML tag.
1116
1117    ($extracted, $remainder) = extract_tagged($text);
1118
1119    # Extract the initial substring of $text that is bounded by
1120    # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1121
1122    ($extracted, $remainder) =
1123        extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1124
1125    # Extract the initial substring of $text that represents a
1126    # Perl "quote or quote-like operation"
1127
1128    ($extracted, $remainder) = extract_quotelike($text);
1129
1130    # Extract the initial substring of $text that represents a block
1131    # of Perl code, bracketed by any of character(s) specified by $delim
1132    # (where the string $delim contains one or more of '(){}[]<>').
1133
1134    ($extracted, $remainder) = extract_codeblock($text,$delim);
1135
1136    # Extract the initial substrings of $text that would be extracted by
1137    # one or more sequential applications of the specified functions
1138    # or regular expressions
1139
1140    @extracted = extract_multiple($text,
1141                                  [ \&extract_bracketed,
1142                                    \&extract_quotelike,
1143                                    \&some_other_extractor_sub,
1144                                    qr/[xyz]*/,
1145                                    'literal',
1146                                  ]);
1147
1148    # Create a string representing an optimized pattern (a la Friedl)
1149    # that matches a substring delimited by any of the specified characters
1150    # (in this case: any type of quote or a slash)
1151
1152    $patstring = gen_delimited_pat(q{'"`/});
1153
1154    # Generate a reference to an anonymous sub that is just like extract_tagged
1155    # but pre-compiled and optimized for a specific pair of tags, and
1156    # consequently much faster (i.e. 3 times faster). It uses qr// for better
1157    # performance on repeated calls.
1158
1159    $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1160    ($extracted, $remainder) = $extract_head->($text);
1161
1162=head1 DESCRIPTION
1163
1164The various C<extract_...> subroutines may be used to
1165extract a delimited substring, possibly after skipping a
1166specified prefix string. By default, that prefix is
1167optional whitespace (C</\s*/>), but you can change it to whatever
1168you wish (see below).
1169
1170The substring to be extracted must appear at the
1171current C<pos> location of the string's variable
1172(or at index zero, if no C<pos> position is defined).
1173In other words, the C<extract_...> subroutines I<don't>
1174extract the first occurrence of a substring anywhere
1175in a string (like an unanchored regex would). Rather,
1176they extract an occurrence of the substring appearing
1177immediately at the current matching position in the
1178string (like a C<\G>-anchored regex would).
1179
1180=head2 General Behaviour in List Contexts
1181
1182In a list context, all the subroutines return a list, the first three
1183elements of which are always:
1184
1185=over 4
1186
1187=item [0]
1188
1189The extracted string, including the specified delimiters.
1190If the extraction fails C<undef> is returned.
1191
1192=item [1]
1193
1194The remainder of the input string (i.e. the characters after the
1195extracted string). On failure, the entire string is returned.
1196
1197=item [2]
1198
1199The skipped prefix (i.e. the characters before the extracted string).
1200On failure, C<undef> is returned.
1201
1202=back
1203
1204Note that in a list context, the contents of the original input text (the first
1205argument) are not modified in any way.
1206
1207However, if the input text was passed in a variable, that variable's
1208C<pos> value is updated to point at the first character after the
1209extracted text. That means that in a list context the various
1210subroutines can be used much like regular expressions. For example:
1211
1212    while ( $next = (extract_quotelike($text))[0] )
1213    {
1214        # process next quote-like (in $next)
1215    }
1216
1217=head2 General Behaviour in Scalar and Void Contexts
1218
1219In a scalar context, the extracted string is returned, having first been
1220removed from the input text. Thus, the following code also processes
1221each quote-like operation, but actually removes them from $text:
1222
1223    while ( $next = extract_quotelike($text) )
1224    {
1225        # process next quote-like (in $next)
1226    }
1227
1228Note that if the input text is a read-only string (i.e. a literal),
1229no attempt is made to remove the extracted text.
1230
1231In a void context the behaviour of the extraction subroutines is
1232exactly the same as in a scalar context, except (of course) that the
1233extracted substring is not returned.
1234
1235=head2 A Note About Prefixes
1236
1237Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1238This can bite you if you're expecting a prefix specification like
1239'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1240pattern will only succeed if the <H1> tag is on the current line, since
1241. normally doesn't match newlines.
1242
1243To overcome this limitation, you need to turn on /s matching within
1244the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1245
1246=head2 Functions
1247
1248=over 4
1249
1250=item C<extract_delimited>
1251
1252The C<extract_delimited> function formalizes the common idiom
1253of extracting a single-character-delimited substring from the start of
1254a string. For example, to extract a single-quote delimited string, the
1255following code is typically used:
1256
1257    ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1258    $extracted = $1;
1259
1260but with C<extract_delimited> it can be simplified to:
1261
1262    ($extracted,$remainder) = extract_delimited($text, "'");
1263
1264C<extract_delimited> takes up to four scalars (the input text, the
1265delimiters, a prefix pattern to be skipped, and any escape characters)
1266and extracts the initial substring of the text that
1267is appropriately delimited. If the delimiter string has multiple
1268characters, the first one encountered in the text is taken to delimit
1269the substring.
1270The third argument specifies a prefix pattern that is to be skipped
1271(but must be present!) before the substring is extracted.
1272The final argument specifies the escape character to be used for each
1273delimiter.
1274
1275All arguments are optional. If the escape characters are not specified,
1276every delimiter is escaped with a backslash (C<\>).
1277If the prefix is not specified, the
1278pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1279is also not specified, the set C</["'`]/> is used. If the text to be processed
1280is not specified either, C<$_> is used.
1281
1282In list context, C<extract_delimited> returns a array of three
1283elements, the extracted substring (I<including the surrounding
1284delimiters>), the remainder of the text, and the skipped prefix (if
1285any). If a suitable delimited substring is not found, the first
1286element of the array is the empty string, the second is the complete
1287original text, and the prefix returned in the third element is an
1288empty string.
1289
1290In a scalar context, just the extracted substring is returned. In
1291a void context, the extracted substring (and any prefix) are simply
1292removed from the beginning of the first argument.
1293
1294Examples:
1295
1296    # Remove a single-quoted substring from the very beginning of $text:
1297
1298        $substring = extract_delimited($text, "'", '');
1299
1300    # Remove a single-quoted Pascalish substring (i.e. one in which
1301    # doubling the quote character escapes it) from the very
1302    # beginning of $text:
1303
1304        $substring = extract_delimited($text, "'", '', "'");
1305
1306    # Extract a single- or double- quoted substring from the
1307    # beginning of $text, optionally after some whitespace
1308    # (note the list context to protect $text from modification):
1309
1310        ($substring) = extract_delimited $text, q{"'};
1311
1312    # Delete the substring delimited by the first '/' in $text:
1313
1314        $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1315
1316Note that this last example is I<not> the same as deleting the first
1317quote-like pattern. For instance, if C<$text> contained the string:
1318
1319    "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1320
1321then after the deletion it would contain:
1322
1323    "if ('.$UNIXCMD/s) { $cmd = $1; }"
1324
1325not:
1326
1327    "if ('./cmd' =~ ms) { $cmd = $1; }"
1328
1329See L<"extract_quotelike"> for a (partial) solution to this problem.
1330
1331=item C<extract_bracketed>
1332
1333Like C<"extract_delimited">, the C<extract_bracketed> function takes
1334up to three optional scalar arguments: a string to extract from, a delimiter
1335specifier, and a prefix pattern. As before, a missing prefix defaults to
1336optional whitespace and a missing text defaults to C<$_>. However, a missing
1337delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1338
1339C<extract_bracketed> extracts a balanced-bracket-delimited
1340substring (using any one (or more) of the user-specified delimiter
1341brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1342respect quoted unbalanced brackets (see below).
1343
1344A "delimiter bracket" is a bracket in list of delimiters passed as
1345C<extract_bracketed>'s second argument. Delimiter brackets are
1346specified by giving either the left or right (or both!) versions
1347of the required bracket(s). Note that the order in which
1348two or more delimiter brackets are specified is not significant.
1349
1350A "balanced-bracket-delimited substring" is a substring bounded by
1351matched brackets, such that any other (left or right) delimiter
1352bracket I<within> the substring is also matched by an opposite
1353(right or left) delimiter bracket I<at the same level of nesting>. Any
1354type of bracket not in the delimiter list is treated as an ordinary
1355character.
1356
1357In other words, each type of bracket specified as a delimiter must be
1358balanced and correctly nested within the substring, and any other kind of
1359("non-delimiter") bracket in the substring is ignored.
1360
1361For example, given the string:
1362
1363    $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1364
1365then a call to C<extract_bracketed> in a list context:
1366
1367    @result = extract_bracketed( $text, '{}' );
1368
1369would return:
1370
1371    ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1372
1373since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1374(In a scalar context just the first element of the array would be returned. In
1375a void context, C<$text> would be replaced by an empty string.)
1376
1377Likewise the call in:
1378
1379    @result = extract_bracketed( $text, '{[' );
1380
1381would return the same result, since all sets of both types of specified
1382delimiter brackets are correctly nested and balanced.
1383
1384However, the call in:
1385
1386    @result = extract_bracketed( $text, '{([<' );
1387
1388would fail, returning:
1389
1390    ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1391
1392because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1393the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1394return an empty string. In a void context, C<$text> would be unchanged.)
1395
1396Note that the embedded single-quotes in the string don't help in this
1397case, since they have not been specified as acceptable delimiters and are
1398therefore treated as non-delimiter characters (and ignored).
1399
1400However, if a particular species of quote character is included in the
1401delimiter specification, then that type of quote will be correctly handled.
1402for example, if C<$text> is:
1403
1404    $text = '<A HREF=">>>>">link</A>';
1405
1406then
1407
1408    @result = extract_bracketed( $text, '<">' );
1409
1410returns:
1411
1412    ( '<A HREF=">>>>">', 'link</A>', "" )
1413
1414as expected. Without the specification of C<"> as an embedded quoter:
1415
1416    @result = extract_bracketed( $text, '<>' );
1417
1418the result would be:
1419
1420    ( '<A HREF=">', '>>>">link</A>', "" )
1421
1422In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1423quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1424letter 'q' as a delimiter. Hence:
1425
1426    @result = extract_bracketed( $text, '<q>' );
1427
1428would correctly match something like this:
1429
1430    $text = '<leftop: conj /and/ conj>';
1431
1432See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1433
1434=item C<extract_variable>
1435
1436C<extract_variable> extracts any valid Perl variable or
1437variable-involved expression, including scalars, arrays, hashes, array
1438accesses, hash look-ups, method calls through objects, subroutine calls
1439through subroutine references, etc.
1440
1441The subroutine takes up to two optional arguments:
1442
1443=over 4
1444
1445=item 1.
1446
1447A string to be processed (C<$_> if the string is omitted or C<undef>)
1448
1449=item 2.
1450
1451A string specifying a pattern to be matched as a prefix (which is to be
1452skipped). If omitted, optional whitespace is skipped.
1453
1454=back
1455
1456On success in a list context, an array of 3 elements is returned. The
1457elements are:
1458
1459=over 4
1460
1461=item [0]
1462
1463the extracted variable, or variablish expression
1464
1465=item [1]
1466
1467the remainder of the input text,
1468
1469=item [2]
1470
1471the prefix substring (if any),
1472
1473=back
1474
1475On failure, all of these values (except the remaining text) are C<undef>.
1476
1477In a scalar context, C<extract_variable> returns just the complete
1478substring that matched a variablish expression. C<undef> is returned on
1479failure. In addition, the original input text has the returned substring
1480(and any prefix) removed from it.
1481
1482In a void context, the input text just has the matched substring (and
1483any specified prefix) removed.
1484
1485=item C<extract_tagged>
1486
1487C<extract_tagged> extracts and segments text between (balanced)
1488specified tags.
1489
1490The subroutine takes up to five optional arguments:
1491
1492=over 4
1493
1494=item 1.
1495
1496A string to be processed (C<$_> if the string is omitted or C<undef>)
1497
1498=item 2.
1499
1500A string specifying a pattern (i.e. regex) to be matched as the opening tag.
1501If the pattern string is omitted (or C<undef>) then a pattern
1502that matches any standard XML tag is used.
1503
1504=item 3.
1505
1506A string specifying a pattern to be matched at the closing tag.
1507If the pattern string is omitted (or C<undef>) then the closing
1508tag is constructed by inserting a C</> after any leading bracket
1509characters in the actual opening tag that was matched (I<not> the pattern
1510that matched the tag). For example, if the opening tag pattern
1511is specified as C<'{{\w+}}'> and actually matched the opening tag
1512C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1513
1514=item 4.
1515
1516A string specifying a pattern to be matched as a prefix (which is to be
1517skipped). If omitted, optional whitespace is skipped.
1518
1519=item 5.
1520
1521A hash reference containing various parsing options (see below)
1522
1523=back
1524
1525The various options that can be specified are:
1526
1527=over 4
1528
1529=item C<reject =E<gt> $listref>
1530
1531The list reference contains one or more strings specifying patterns
1532that must I<not> appear within the tagged text.
1533
1534For example, to extract
1535an HTML link (which should not contain nested links) use:
1536
1537        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1538
1539=item C<ignore =E<gt> $listref>
1540
1541The list reference contains one or more strings specifying patterns
1542that are I<not> to be treated as nested tags within the tagged text
1543(even if they would match the start tag pattern).
1544
1545For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1546
1547        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1548
1549(also see L<"gen_delimited_pat"> below).
1550
1551=item C<fail =E<gt> $str>
1552
1553The C<fail> option indicates the action to be taken if a matching end
1554tag is not encountered (i.e. before the end of the string or some
1555C<reject> pattern matches). By default, a failure to match a closing
1556tag causes C<extract_tagged> to immediately fail.
1557
1558However, if the string value associated with <reject> is "MAX", then
1559C<extract_tagged> returns the complete text up to the point of failure.
1560If the string is "PARA", C<extract_tagged> returns only the first paragraph
1561after the tag (up to the first line that is either empty or contains
1562only whitespace characters).
1563If the string is "", the default behaviour (i.e. failure) is reinstated.
1564
1565For example, suppose the start tag "/para" introduces a paragraph, which then
1566continues until the next "/endpara" tag or until another "/para" tag is
1567encountered:
1568
1569        $text = "/para line 1\n\nline 3\n/para line 4";
1570
1571        extract_tagged($text, '/para', '/endpara', undef,
1572                                {reject => '/para', fail => MAX );
1573
1574        # EXTRACTED: "/para line 1\n\nline 3\n"
1575
1576Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1577tag refers only to the immediately following paragraph:
1578
1579        $text = "/para line 1\n\nline 3\n/para line 4";
1580
1581        extract_tagged($text, '/para', '/endpara', undef,
1582                        {reject => '/para', fail => MAX );
1583
1584        # EXTRACTED: "/para line 1\n"
1585
1586Note that the specified C<fail> behaviour applies to nested tags as well.
1587
1588=back
1589
1590On success in a list context, an array of 6 elements is returned. The elements are:
1591
1592=over 4
1593
1594=item [0]
1595
1596the extracted tagged substring (including the outermost tags),
1597
1598=item [1]
1599
1600the remainder of the input text,
1601
1602=item [2]
1603
1604the prefix substring (if any),
1605
1606=item [3]
1607
1608the opening tag
1609
1610=item [4]
1611
1612the text between the opening and closing tags
1613
1614=item [5]
1615
1616the closing tag (or "" if no closing tag was found)
1617
1618=back
1619
1620On failure, all of these values (except the remaining text) are C<undef>.
1621
1622In a scalar context, C<extract_tagged> returns just the complete
1623substring that matched a tagged text (including the start and end
1624tags). C<undef> is returned on failure. In addition, the original input
1625text has the returned substring (and any prefix) removed from it.
1626
1627In a void context, the input text just has the matched substring (and
1628any specified prefix) removed.
1629
1630=item C<gen_extract_tagged>
1631
1632C<gen_extract_tagged> generates a new anonymous subroutine which
1633extracts text between (balanced) specified tags. In other words,
1634it generates a function identical in function to C<extract_tagged>.
1635
1636The difference between C<extract_tagged> and the anonymous
1637subroutines generated by
1638C<gen_extract_tagged>, is that those generated subroutines:
1639
1640=over 4
1641
1642=item *
1643
1644do not have to reparse tag specification or parsing options every time
1645they are called (whereas C<extract_tagged> has to effectively rebuild
1646its tag parser on every call);
1647
1648=item *
1649
1650make use of the new qr// construct to pre-compile the regexes they use
1651(whereas C<extract_tagged> uses standard string variable interpolation
1652to create tag-matching patterns).
1653
1654=back
1655
1656The subroutine takes up to four optional arguments (the same set as
1657C<extract_tagged> except for the string to be processed). It returns
1658a reference to a subroutine which in turn takes a single argument (the text to
1659be extracted from).
1660
1661In other words, the implementation of C<extract_tagged> is exactly
1662equivalent to:
1663
1664        sub extract_tagged
1665        {
1666                my $text = shift;
1667                $extractor = gen_extract_tagged(@_);
1668                return $extractor->($text);
1669        }
1670
1671(although C<extract_tagged> is not currently implemented that way).
1672
1673Using C<gen_extract_tagged> to create extraction functions for specific tags
1674is a good idea if those functions are going to be called more than once, since
1675their performance is typically twice as good as the more general-purpose
1676C<extract_tagged>.
1677
1678=item C<extract_quotelike>
1679
1680C<extract_quotelike> attempts to recognize, extract, and segment any
1681one of the various Perl quotes and quotelike operators (see
1682L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1683delimiters (for the quotelike operators), and trailing modifiers are
1684all caught. For example, in:
1685
1686        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1687
1688        extract_quotelike '  "You said, \"Use sed\"."  '
1689
1690        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1691
1692        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1693
1694the full Perl quotelike operations are all extracted correctly.
1695
1696Note too that, when using the /x modifier on a regex, any comment
1697containing the current pattern delimiter will cause the regex to be
1698immediately terminated. In other words:
1699
1700        'm /
1701                (?i)            # CASE INSENSITIVE
1702                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1703                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1704           /x'
1705
1706will be extracted as if it were:
1707
1708        'm /
1709                (?i)            # CASE INSENSITIVE
1710                [a-z_]          # LEADING ALPHABETIC/'
1711
1712This behaviour is identical to that of the actual compiler.
1713
1714C<extract_quotelike> takes two arguments: the text to be processed and
1715a prefix to be matched at the very beginning of the text. If no prefix
1716is specified, optional whitespace is the default. If no text is given,
1717C<$_> is used.
1718
1719In a list context, an array of 11 elements is returned. The elements are:
1720
1721=over 4
1722
1723=item [0]
1724
1725the extracted quotelike substring (including trailing modifiers),
1726
1727=item [1]
1728
1729the remainder of the input text,
1730
1731=item [2]
1732
1733the prefix substring (if any),
1734
1735=item [3]
1736
1737the name of the quotelike operator (if any),
1738
1739=item [4]
1740
1741the left delimiter of the first block of the operation,
1742
1743=item [5]
1744
1745the text of the first block of the operation
1746(that is, the contents of
1747a quote, the regex of a match or substitution or the target list of a
1748translation),
1749
1750=item [6]
1751
1752the right delimiter of the first block of the operation,
1753
1754=item [7]
1755
1756the left delimiter of the second block of the operation
1757(that is, if it is a C<s>, C<tr>, or C<y>),
1758
1759=item [8]
1760
1761the text of the second block of the operation
1762(that is, the replacement of a substitution or the translation list
1763of a translation),
1764
1765=item [9]
1766
1767the right delimiter of the second block of the operation (if any),
1768
1769=item [10]
1770
1771the trailing modifiers on the operation (if any).
1772
1773=back
1774
1775For each of the fields marked "(if any)" the default value on success is
1776an empty string.
1777On failure, all of these values (except the remaining text) are C<undef>.
1778
1779In a scalar context, C<extract_quotelike> returns just the complete substring
1780that matched a quotelike operation (or C<undef> on failure). In a scalar or
1781void context, the input text has the same substring (and any specified
1782prefix) removed.
1783
1784Examples:
1785
1786        # Remove the first quotelike literal that appears in text
1787
1788                $quotelike = extract_quotelike($text,'.*?');
1789
1790        # Replace one or more leading whitespace-separated quotelike
1791        # literals in $_ with "<QLL>"
1792
1793                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1794
1795
1796        # Isolate the search pattern in a quotelike operation from $text
1797
1798                ($op,$pat) = (extract_quotelike $text)[3,5];
1799                if ($op =~ /[ms]/)
1800                {
1801                        print "search pattern: $pat\n";
1802                }
1803                else
1804                {
1805                        print "$op is not a pattern matching operation\n";
1806                }
1807
1808=item C<extract_quotelike>
1809
1810C<extract_quotelike> can successfully extract "here documents" from an input
1811string, but with an important caveat in list contexts.
1812
1813Unlike other types of quote-like literals, a here document is rarely
1814a contiguous substring. For example, a typical piece of code using
1815here document might look like this:
1816
1817        <<'EOMSG' || die;
1818        This is the message.
1819        EOMSG
1820        exit;
1821
1822Given this as an input string in a scalar context, C<extract_quotelike>
1823would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1824leaving the string " || die;\nexit;" in the original variable. In other words,
1825the two separate pieces of the here document are successfully extracted and
1826concatenated.
1827
1828In a list context, C<extract_quotelike> would return the list
1829
1830=over 4
1831
1832=item [0]
1833
1834"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1835including fore and aft delimiters),
1836
1837=item [1]
1838
1839" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1840
1841=item [2]
1842
1843"" (i.e. the prefix substring -- trivial in this case),
1844
1845=item [3]
1846
1847"<<" (i.e. the "name" of the quotelike operator)
1848
1849=item [4]
1850
1851"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1852
1853=item [5]
1854
1855"This is the message.\n" (i.e. the text of the here document),
1856
1857=item [6]
1858
1859"EOMSG" (i.e. the right delimiter of the here document),
1860
1861=item [7..10]
1862
1863"" (a here document has no second left delimiter, second text, second right
1864delimiter, or trailing modifiers).
1865
1866=back
1867
1868However, the matching position of the input variable would be set to
1869"exit;" (i.e. I<after> the closing delimiter of the here document),
1870which would cause the earlier " || die;\nexit;" to be skipped in any
1871sequence of code fragment extractions.
1872
1873To avoid this problem, when it encounters a here document whilst
1874extracting from a modifiable string, C<extract_quotelike> silently
1875rearranges the string to an equivalent piece of Perl:
1876
1877        <<'EOMSG'
1878        This is the message.
1879        EOMSG
1880        || die;
1881        exit;
1882
1883in which the here document I<is> contiguous. It still leaves the
1884matching position after the here document, but now the rest of the line
1885on which the here document starts is not skipped.
1886
1887To prevent <extract_quotelike> from mucking about with the input in this way
1888(this is the only case where a list-context C<extract_quotelike> does so),
1889you can pass the input variable as an interpolated literal:
1890
1891        $quotelike = extract_quotelike("$var");
1892
1893=item C<extract_codeblock>
1894
1895C<extract_codeblock> attempts to recognize and extract a balanced
1896bracket delimited substring that may contain unbalanced brackets
1897inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1898is like a combination of C<"extract_bracketed"> and
1899C<"extract_quotelike">.
1900
1901C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1902a text to process, a set of delimiter brackets to look for, and a prefix to
1903match first. It also takes an optional fourth parameter, which allows the
1904outermost delimiter brackets to be specified separately (see below),
1905and a fifth parameter used only by L<Parse::RecDescent>.
1906
1907Omitting the first argument (input text) means process C<$_> instead.
1908Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1909Omitting the third argument (prefix argument) implies optional whitespace at the start.
1910Omitting the fourth argument (outermost delimiter brackets) indicates that the
1911value of the second argument is to be used for the outermost delimiters.
1912
1913Once the prefix and the outermost opening delimiter bracket have been
1914recognized, code blocks are extracted by stepping through the input text and
1915trying the following alternatives in sequence:
1916
1917=over 4
1918
1919=item 1.
1920
1921Try and match a closing delimiter bracket. If the bracket was the same
1922species as the last opening bracket, return the substring to that
1923point. If the bracket was mismatched, return an error.
1924
1925=item 2.
1926
1927Try to match a quote or quotelike operator. If found, call
1928C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1929the error it returned. Otherwise go back to step 1.
1930
1931=item 3.
1932
1933Try to match an opening delimiter bracket. If found, call
1934C<extract_codeblock> recursively to eat the embedded block. If the
1935recursive call fails, return an error. Otherwise, go back to step 1.
1936
1937=item 4.
1938
1939Unconditionally match a bareword or any other single character, and
1940then go back to step 1.
1941
1942=back
1943
1944Examples:
1945
1946        # Find a while loop in the text
1947
1948                if ($text =~ s/.*?while\s*\{/{/)
1949                {
1950                        $loop = "while " . extract_codeblock($text);
1951                }
1952
1953        # Remove the first round-bracketed list (which may include
1954        # round- or curly-bracketed code blocks or quotelike operators)
1955
1956                extract_codeblock $text, "(){}", '[^(]*';
1957
1958
1959The ability to specify a different outermost delimiter bracket is useful
1960in some circumstances. For example, in the Parse::RecDescent module,
1961parser actions which are to be performed only on a successful parse
1962are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1963
1964        sentence: subject verb object
1965                        <defer: {$::theVerb = $item{verb}} >
1966
1967Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1968within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1969
1970A deferred action like this:
1971
1972                        <defer: {if ($count>10) {$count--}} >
1973
1974will be incorrectly parsed as:
1975
1976                        <defer: {if ($count>
1977
1978because the "less than" operator is interpreted as a closing delimiter.
1979
1980But, by extracting the directive using
1981S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1982the '>' character is only treated as a delimited at the outermost
1983level of the code block, so the directive is parsed correctly.
1984
1985=item C<extract_multiple>
1986
1987The C<extract_multiple> subroutine takes a string to be processed and a
1988list of extractors (subroutines or regular expressions) to apply to that string.
1989
1990In an array context C<extract_multiple> returns an array of substrings
1991of the original string, as extracted by the specified extractors.
1992In a scalar context, C<extract_multiple> returns the first
1993substring successfully extracted from the original string. In both
1994scalar and void contexts the original string has the first successfully
1995extracted substring removed from it. In all contexts
1996C<extract_multiple> starts at the current C<pos> of the string, and
1997sets that C<pos> appropriately after it matches.
1998
1999Hence, the aim of a call to C<extract_multiple> in a list context
2000is to split the processed string into as many non-overlapping fields as
2001possible, by repeatedly applying each of the specified extractors
2002to the remainder of the string. Thus C<extract_multiple> is
2003a generalized form of Perl's C<split> subroutine.
2004
2005The subroutine takes up to four optional arguments:
2006
2007=over 4
2008
2009=item 1.
2010
2011A string to be processed (C<$_> if the string is omitted or C<undef>)
2012
2013=item 2.
2014
2015A reference to a list of subroutine references and/or qr// objects and/or
2016literal strings and/or hash references, specifying the extractors
2017to be used to split the string. If this argument is omitted (or
2018C<undef>) the list:
2019
2020        [
2021                sub { extract_variable($_[0], '') },
2022                sub { extract_quotelike($_[0],'') },
2023                sub { extract_codeblock($_[0],'{}','') },
2024        ]
2025
2026is used.
2027
2028=item 3.
2029
2030An number specifying the maximum number of fields to return. If this
2031argument is omitted (or C<undef>), split continues as long as possible.
2032
2033If the third argument is I<N>, then extraction continues until I<N> fields
2034have been successfully extracted, or until the string has been completely
2035processed.
2036
2037Note that in scalar and void contexts the value of this argument is
2038automatically reset to 1 (under C<-w>, a warning is issued if the argument
2039has to be reset).
2040
2041=item 4.
2042
2043A value indicating whether unmatched substrings (see below) within the
2044text should be skipped or returned as fields. If the value is true,
2045such substrings are skipped. Otherwise, they are returned.
2046
2047=back
2048
2049The extraction process works by applying each extractor in
2050sequence to the text string.
2051
2052If the extractor is a subroutine it is called in a list context and is
2053expected to return a list of a single element, namely the extracted
2054text. It may optionally also return two further arguments: a string
2055representing the text left after extraction (like $' for a pattern
2056match), and a string representing any prefix skipped before the
2057extraction (like $` in a pattern match). Note that this is designed
2058to facilitate the use of other Text::Balanced subroutines with
2059C<extract_multiple>. Note too that the value returned by an extractor
2060subroutine need not bear any relationship to the corresponding substring
2061of the original text (see examples below).
2062
2063If the extractor is a precompiled regular expression or a string,
2064it is matched against the text in a scalar context with a leading
2065'\G' and the gc modifiers enabled. The extracted value is either
2066$1 if that variable is defined after the match, or else the
2067complete match (i.e. $&).
2068
2069If the extractor is a hash reference, it must contain exactly one element.
2070The value of that element is one of the
2071above extractor types (subroutine reference, regular expression, or string).
2072The key of that element is the name of a class into which the successful
2073return value of the extractor will be blessed.
2074
2075If an extractor returns a defined value, that value is immediately
2076treated as the next extracted field and pushed onto the list of fields.
2077If the extractor was specified in a hash reference, the field is also
2078blessed into the appropriate class,
2079
2080If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2081assumed to have failed to extract.
2082If none of the extractor subroutines succeeds, then one
2083character is extracted from the start of the text and the extraction
2084subroutines reapplied. Characters which are thus removed are accumulated and
2085eventually become the next field (unless the fourth argument is true, in which
2086case they are discarded).
2087
2088For example, the following extracts substrings that are valid Perl variables:
2089
2090        @fields = extract_multiple($text,
2091                                   [ sub { extract_variable($_[0]) } ],
2092                                   undef, 1);
2093
2094This example separates a text into fields which are quote delimited,
2095curly bracketed, and anything else. The delimited and bracketed
2096parts are also blessed to identify them (the "anything else" is unblessed):
2097
2098        @fields = extract_multiple($text,
2099                   [
2100                        { Delim => sub { extract_delimited($_[0],q{'"}) } },
2101                        { Brack => sub { extract_bracketed($_[0],'{}') } },
2102                   ]);
2103
2104This call extracts the next single substring that is a valid Perl quotelike
2105operator (and removes it from $text):
2106
2107        $quotelike = extract_multiple($text,
2108                                      [
2109                                        sub { extract_quotelike($_[0]) },
2110                                      ], undef, 1);
2111
2112Finally, here is yet another way to do comma-separated value parsing:
2113
2114        $csv_text = "a,'x b',c";
2115        @fields = extract_multiple($csv_text,
2116                                  [
2117                                        sub { extract_delimited($_[0],q{'"}) },
2118                                        qr/([^,]+)/,
2119                                  ],
2120                                  undef,1);
2121        # @fields is now ('a', "'x b'", 'c')
2122
2123The list in the second argument means:
2124I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2125The undef third argument means:
2126I<"...as many times as possible...">,
2127and the true value in the fourth argument means
2128I<"...discarding anything else that appears (i.e. the commas)">.
2129
2130If you wanted the commas preserved as separate fields (i.e. like split
2131does if your split pattern has capturing parentheses), you would
2132just make the last parameter undefined (or remove it).
2133
2134=item C<gen_delimited_pat>
2135
2136The C<gen_delimited_pat> subroutine takes a single (string) argument and
2137builds a Friedl-style optimized regex that matches a string delimited
2138by any one of the characters in the single argument. For example:
2139
2140        gen_delimited_pat(q{'"})
2141
2142returns the regex:
2143
2144        (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2145
2146Note that the specified delimiters are automatically quotemeta'd.
2147
2148A typical use of C<gen_delimited_pat> would be to build special purpose tags
2149for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2150(which might contain quoted strings):
2151
2152        my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2153
2154        extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2155
2156C<gen_delimited_pat> may also be called with an optional second argument,
2157which specifies the "escape" character(s) to be used for each delimiter.
2158For example to match a Pascal-style string (where ' is the delimiter
2159and '' is a literal ' within the string):
2160
2161        gen_delimited_pat(q{'},q{'});
2162
2163Different escape characters can be specified for different delimiters.
2164For example, to specify that '/' is the escape for single quotes
2165and '%' is the escape for double quotes:
2166
2167        gen_delimited_pat(q{'"},q{/%});
2168
2169If more delimiters than escape chars are specified, the last escape char
2170is used for the remaining delimiters.
2171If no escape char is specified for a given specified delimiter, '\' is used.
2172
2173=item C<delimited_pat>
2174
2175Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2176That name may still be used, but is now deprecated.
2177
2178=back
2179
2180=head1 DIAGNOSTICS
2181
2182In a list context, all the functions return C<(undef,$original_text)>
2183on failure. In a scalar context, failure is indicated by returning C<undef>
2184(in this case the input text is not modified in any way).
2185
2186In addition, on failure in I<any> context, the C<$@> variable is set.
2187Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2188below.
2189Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2190which the error was detected (although not necessarily where it occurred!)
2191Printing C<$@> directly produces the error message, with the offset appended.
2192On success, the C<$@> variable is guaranteed to be C<undef>.
2193
2194The available diagnostics are:
2195
2196=over 4
2197
2198=item  C<Did not find a suitable bracket: "%s">
2199
2200The delimiter provided to C<extract_bracketed> was not one of
2201C<'()[]E<lt>E<gt>{}'>.
2202
2203=item  C<Did not find prefix: /%s/>
2204
2205A non-optional prefix was specified but wasn't found at the start of the text.
2206
2207=item  C<Did not find opening bracket after prefix: "%s">
2208
2209C<extract_bracketed> or C<extract_codeblock> was expecting a
2210particular kind of bracket at the start of the text, and didn't find it.
2211
2212=item  C<No quotelike operator found after prefix: "%s">
2213
2214C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2215C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2216it was extracting.
2217
2218=item  C<Unmatched closing bracket: "%c">
2219
2220C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2221a closing bracket where none was expected.
2222
2223=item  C<Unmatched opening bracket(s): "%s">
2224
2225C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2226out of characters in the text before closing one or more levels of nested
2227brackets.
2228
2229=item C<Unmatched embedded quote (%s)>
2230
2231C<extract_bracketed> attempted to match an embedded quoted substring, but
2232failed to find a closing quote to match it.
2233
2234=item C<Did not find closing delimiter to match '%s'>
2235
2236C<extract_quotelike> was unable to find a closing delimiter to match the
2237one that opened the quote-like operation.
2238
2239=item  C<Mismatched closing bracket: expected "%c" but found "%s">
2240
2241C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2242a valid bracket delimiter, but it was the wrong species. This usually
2243indicates a nesting error, but may indicate incorrect quoting or escaping.
2244
2245=item  C<No block delimiter found after quotelike "%s">
2246
2247C<extract_quotelike> or C<extract_codeblock> found one of the
2248quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2249without a suitable block after it.
2250
2251=item C<Did not find leading dereferencer>
2252
2253C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2254a variable, but didn't find any of them.
2255
2256=item C<Bad identifier after dereferencer>
2257
2258C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2259character was not followed by a legal Perl identifier.
2260
2261=item C<Did not find expected opening bracket at %s>
2262
2263C<extract_codeblock> failed to find any of the outermost opening brackets
2264that were specified.
2265
2266=item C<Improperly nested codeblock at %s>
2267
2268A nested code block was found that started with a delimiter that was specified
2269as being only to be used as an outermost bracket.
2270
2271=item  C<Missing second block for quotelike "%s">
2272
2273C<extract_codeblock> or C<extract_quotelike> found one of the
2274quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2275
2276=item C<No match found for opening bracket>
2277
2278C<extract_codeblock> failed to find a closing bracket to match the outermost
2279opening bracket.
2280
2281=item C<Did not find opening tag: /%s/>
2282
2283C<extract_tagged> did not find a suitable opening tag (after any specified
2284prefix was removed).
2285
2286=item C<Unable to construct closing tag to match: /%s/>
2287
2288C<extract_tagged> matched the specified opening tag and tried to
2289modify the matched text to produce a matching closing tag (because
2290none was specified). It failed to generate the closing tag, almost
2291certainly because the opening tag did not start with a
2292bracket of some kind.
2293
2294=item C<Found invalid nested tag: %s>
2295
2296C<extract_tagged> found a nested tag that appeared in the "reject" list
2297(and the failure mode was not "MAX" or "PARA").
2298
2299=item C<Found unbalanced nested tag: %s>
2300
2301C<extract_tagged> found a nested opening tag that was not matched by a
2302corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2303
2304=item C<Did not find closing tag>
2305
2306C<extract_tagged> reached the end of the text without finding a closing tag
2307to match the original opening tag (and the failure mode was not
2308"MAX" or "PARA").
2309
2310=back
2311
2312=head1 EXPORTS
2313
2314The following symbols are, or can be, exported by this module:
2315
2316=over 4
2317
2318=item Default Exports
2319
2320I<None>.
2321
2322=item Optional Exports
2323
2324C<extract_delimited>,
2325C<extract_bracketed>,
2326C<extract_quotelike>,
2327C<extract_codeblock>,
2328C<extract_variable>,
2329C<extract_tagged>,
2330C<extract_multiple>,
2331C<gen_delimited_pat>,
2332C<gen_extract_tagged>,
2333C<delimited_pat>.
2334
2335=item Export Tags
2336
2337=over 4
2338
2339=item C<:ALL>
2340
2341C<extract_delimited>,
2342C<extract_bracketed>,
2343C<extract_quotelike>,
2344C<extract_codeblock>,
2345C<extract_variable>,
2346C<extract_tagged>,
2347C<extract_multiple>,
2348C<gen_delimited_pat>,
2349C<gen_extract_tagged>,
2350C<delimited_pat>.
2351
2352=back
2353
2354=back
2355
2356=head1 KNOWN BUGS
2357
2358See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>.
2359
2360=head1 FEEDBACK
2361
2362Patches, bug reports, suggestions or any other feedback is welcome.
2363
2364Patches can be sent as GitHub pull requests at
2365L<https://github.com/steve-m-hay/Text-Balanced/pulls>.
2366
2367Bug reports and suggestions can be made on the CPAN Request Tracker at
2368L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>.
2369
2370Currently active requests on the CPAN Request Tracker can be viewed at
2371L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>.
2372
2373Please test this distribution.  See CPAN Testers Reports at
2374L<https://www.cpantesters.org/> for details of how to get involved.
2375
2376Previous test results on CPAN Testers Reports can be viewed at
2377L<https://www.cpantesters.org/distro/T/Text-Balanced.html>.
2378
2379Please rate this distribution on CPAN Ratings at
2380L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>.
2381
2382=head1 AVAILABILITY
2383
2384The latest version of this module is available from CPAN (see
2385L<perlmodlib/"CPAN"> for details) at
2386
2387L<https://metacpan.org/release/Text-Balanced> or
2388
2389L<https://www.cpan.org/authors/id/S/SH/SHAY/> or
2390
2391L<https://www.cpan.org/modules/by-module/Text/>.
2392
2393The latest source code is available from GitHub at
2394L<https://github.com/steve-m-hay/Text-Balanced>.
2395
2396=head1 INSTALLATION
2397
2398See the F<INSTALL> file.
2399
2400=head1 AUTHOR
2401
2402Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>.
2403
2404Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining
2405Text::Balanced as of version 2.03.
2406
2407=head1 COPYRIGHT
2408
2409Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2410
2411Copyright (C) 2009 Adam Kennedy.
2412
2413Copyright (C) 2015, 2020, 2022 Steve Hay and other contributors.  All rights
2414reserved.
2415
2416=head1 LICENCE
2417
2418This module is free software; you can redistribute it and/or modify it under the
2419same terms as Perl itself, i.e. under the terms of either the GNU General Public
2420License or the Artistic License, as specified in the F<LICENCE> file.
2421
2422=head1 VERSION
2423
2424Version 2.06
2425
2426=head1 DATE
2427
242805 Jun 2022
2429
2430=head1 HISTORY
2431
2432See the F<Changes> file.
2433
2434=cut
2435