1# Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2# Copyright (C) 2009 Adam Kennedy.
3# Copyright (C) 2015 Steve Hay.  All rights reserved.
4
5# This module is free software; you can redistribute it and/or modify it under
6# the same terms as Perl itself, i.e. under the terms of either the GNU General
7# Public License or the Artistic License, as specified in the F<LICENCE> file.
8
9package Text::Balanced;
10
11# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
12# FOR FULL DOCUMENTATION SEE Balanced.pod
13
14use 5.008001;
15use strict;
16use Exporter ();
17
18use vars qw { $VERSION @ISA %EXPORT_TAGS };
19BEGIN {
20    $VERSION     = '2.04';
21    @ISA         = 'Exporter';
22    %EXPORT_TAGS = (
23        ALL => [ qw{
24            &extract_delimited
25            &extract_bracketed
26            &extract_quotelike
27            &extract_codeblock
28            &extract_variable
29            &extract_tagged
30            &extract_multiple
31            &gen_delimited_pat
32            &gen_extract_tagged
33            &delimited_pat
34        } ],
35    );
36}
37
38Exporter::export_ok_tags('ALL');
39
40## no critic (Subroutines::ProhibitSubroutinePrototypes)
41
42# PROTOTYPES
43
44sub _match_bracketed($$$$$$);
45sub _match_variable($$);
46sub _match_codeblock($$$$$$$);
47sub _match_quotelike($$$$);
48
49# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
50
51sub _failmsg {
52    my ($message, $pos) = @_;
53    $@ = bless {
54        error => $message,
55        pos   => $pos,
56    }, 'Text::Balanced::ErrorMsg';
57}
58
59sub _fail {
60    my ($wantarray, $textref, $message, $pos) = @_;
61    _failmsg $message, $pos if $message;
62    return (undef, $$textref, undef) if $wantarray;
63    return;
64}
65
66sub _succeed {
67    $@ = undef;
68    my ($wantarray,$textref) = splice @_, 0, 2;
69    my ($extrapos, $extralen) = @_ > 18
70        ? splice(@_, -2, 2)
71        : (0, 0);
72    my ($startlen, $oppos) = @_[5,6];
73    my $remainderpos = $_[2];
74    if ( $wantarray ) {
75        my @res;
76        while (my ($from, $len) = splice @_, 0, 2) {
77            push @res, substr($$textref, $from, $len);
78        }
79        if ( $extralen ) { # CORRECT FILLET
80            my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
81            $res[1] = "$extra$res[1]";
82            eval { substr($$textref,$remainderpos,0) = $extra;
83                   substr($$textref,$extrapos,$extralen,"\n")} ;
84                    #REARRANGE HERE DOC AND FILLET IF POSSIBLE
85            pos($$textref) = $remainderpos-$extralen+1; # RESET \G
86        } else {
87            pos($$textref) = $remainderpos;             # RESET \G
88        }
89        return @res;
90    } else {
91        my $match = substr($$textref,$_[0],$_[1]);
92        substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
93        my $extra = $extralen
94            ? substr($$textref, $extrapos, $extralen)."\n" : "";
95        eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;     #CHOP OUT PREFIX & MATCH, IF POSSIBLE
96        pos($$textref) = $_[4];                         # RESET \G
97        return $match;
98    }
99}
100
101# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
102
103sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
104{
105    my ($dels, $escs) = @_;
106    return "" unless $dels =~ /\S/;
107    $escs = '\\' unless $escs;
108    $escs .= substr($escs,-1) x (length($dels)-length($escs));
109    my @pat = ();
110    my $i;
111    for ($i=0; $i<length $dels; $i++)
112    {
113        my $del = quotemeta substr($dels,$i,1);
114        my $esc = quotemeta substr($escs,$i,1);
115        if ($del eq $esc)
116        {
117            push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
118        }
119        else
120        {
121            push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
122        }
123    }
124    my $pat = join '|', @pat;
125    return "(?:$pat)";
126}
127
128*delimited_pat = \&gen_delimited_pat;
129
130# THE EXTRACTION FUNCTIONS
131
132sub extract_delimited (;$$$$)
133{
134    my $textref = defined $_[0] ? \$_[0] : \$_;
135    my $wantarray = wantarray;
136    my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
137    my $pre  = defined $_[2] ? $_[2] : '\s*';
138    my $esc  = defined $_[3] ? $_[3] : qq{\\};
139    my $pat = gen_delimited_pat($del, $esc);
140    my $startpos = pos $$textref || 0;
141    return _fail($wantarray, $textref, "Not a delimited pattern", 0)
142        unless $$textref =~ m/\G($pre)($pat)/gc;
143    my $prelen = length($1);
144    my $matchpos = $startpos+$prelen;
145    my $endpos = pos $$textref;
146    return _succeed $wantarray, $textref,
147                    $matchpos, $endpos-$matchpos,               # MATCH
148                    $endpos,   length($$textref)-$endpos,       # REMAINDER
149                    $startpos, $prelen;                         # PREFIX
150}
151
152sub extract_bracketed (;$$$)
153{
154    my $textref = defined $_[0] ? \$_[0] : \$_;
155    my $ldel = defined $_[1] ? $_[1] : '{([<';
156    my $pre  = defined $_[2] ? $_[2] : '\s*';
157    my $wantarray = wantarray;
158    my $qdel = "";
159    my $quotelike;
160    $ldel =~ s/'//g and $qdel .= q{'};
161    $ldel =~ s/"//g and $qdel .= q{"};
162    $ldel =~ s/`//g and $qdel .= q{`};
163    $ldel =~ s/q//g and $quotelike = 1;
164    $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
165    my $rdel = $ldel;
166    unless ($rdel =~ tr/[({</])}>/)
167    {
168        return _fail $wantarray, $textref,
169                     "Did not find a suitable bracket in delimiter: \"$_[1]\"",
170                     0;
171    }
172    my $posbug = pos;
173    $ldel = join('|', map { quotemeta $_ } split('', $ldel));
174    $rdel = join('|', map { quotemeta $_ } split('', $rdel));
175    pos = $posbug;
176
177    my $startpos = pos $$textref || 0;
178    my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
179
180    return _fail ($wantarray, $textref) unless @match;
181
182    return _succeed ( $wantarray, $textref,
183                      $match[2], $match[5]+2,           # MATCH
184                      @match[8,9],                      # REMAINDER
185                      @match[0,1],                      # PREFIX
186                    );
187}
188
189sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
190{
191    my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
192    my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
193    unless ($$textref =~ m/\G$pre/gc)
194    {
195        _failmsg "Did not find prefix: /$pre/", $startpos;
196        return;
197    }
198
199    $ldelpos = pos $$textref;
200
201    unless ($$textref =~ m/\G($ldel)/gc)
202    {
203        _failmsg "Did not find opening bracket after prefix: \"$pre\"",
204                 pos $$textref;
205        pos $$textref = $startpos;
206        return;
207    }
208
209    my @nesting = ( $1 );
210    my $textlen = length $$textref;
211    while (pos $$textref < $textlen)
212    {
213        next if $$textref =~ m/\G\\./gcs;
214
215        if ($$textref =~ m/\G($ldel)/gc)
216        {
217            push @nesting, $1;
218        }
219        elsif ($$textref =~ m/\G($rdel)/gc)
220        {
221            my ($found, $brackettype) = ($1, $1);
222            if ($#nesting < 0)
223            {
224                _failmsg "Unmatched closing bracket: \"$found\"",
225                         pos $$textref;
226                pos $$textref = $startpos;
227                return;
228            }
229            my $expected = pop(@nesting);
230            $expected =~ tr/({[</)}]>/;
231            if ($expected ne $brackettype)
232            {
233                _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
234                         pos $$textref;
235                pos $$textref = $startpos;
236                return;
237            }
238            last if $#nesting < 0;
239        }
240        elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
241        {
242            $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
243            _failmsg "Unmatched embedded quote ($1)",
244                     pos $$textref;
245            pos $$textref = $startpos;
246            return;
247        }
248        elsif ($quotelike && _match_quotelike($textref,"",1,0))
249        {
250            next;
251        }
252
253        else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
254    }
255    if ($#nesting>=0)
256    {
257        _failmsg "Unmatched opening bracket(s): "
258                     . join("..",@nesting)."..",
259                 pos $$textref;
260        pos $$textref = $startpos;
261        return;
262    }
263
264    $endpos = pos $$textref;
265
266    return (
267        $startpos,  $ldelpos-$startpos,         # PREFIX
268        $ldelpos,   1,                          # OPENING BRACKET
269        $ldelpos+1, $endpos-$ldelpos-2,         # CONTENTS
270        $endpos-1,  1,                          # CLOSING BRACKET
271        $endpos,    length($$textref)-$endpos,  # REMAINDER
272    );
273}
274
275sub _revbracket($)
276{
277    my $brack = reverse $_[0];
278    $brack =~ tr/[({</])}>/;
279    return $brack;
280}
281
282my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
283
284sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
285{
286    my $textref = defined $_[0] ? \$_[0] : \$_;
287    my $ldel    = $_[1];
288    my $rdel    = $_[2];
289    my $pre     = defined $_[3] ? $_[3] : '\s*';
290    my %options = defined $_[4] ? %{$_[4]} : ();
291    my $omode   = defined $options{fail} ? $options{fail} : '';
292    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
293                : defined($options{reject})        ? $options{reject}
294                :                                    ''
295                ;
296    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
297                : defined($options{ignore})        ? $options{ignore}
298                :                                    ''
299                ;
300
301    if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
302    $@ = undef;
303
304    my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
305
306    return _fail(wantarray, $textref) unless @match;
307    return _succeed wantarray, $textref,
308            $match[2], $match[3]+$match[5]+$match[7],    # MATCH
309            @match[8..9,0..1,2..7];                      # REM, PRE, BITS
310}
311
312sub _match_tagged       # ($$$$$$$)
313{
314    my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
315    my $rdelspec;
316
317    my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
318
319    unless ($$textref =~ m/\G($pre)/gc)
320    {
321        _failmsg "Did not find prefix: /$pre/", pos $$textref;
322        goto failed;
323    }
324
325    $opentagpos = pos($$textref);
326
327    unless ($$textref =~ m/\G$ldel/gc)
328    {
329        _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
330        goto failed;
331    }
332
333    $textpos = pos($$textref);
334
335    if (!defined $rdel)
336    {
337        $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
338        unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
339        {
340            _failmsg "Unable to construct closing tag to match: $rdel",
341                     pos $$textref;
342            goto failed;
343        }
344    }
345    else
346    {
347        ## no critic (BuiltinFunctions::ProhibitStringyEval)
348        $rdelspec = eval "qq{$rdel}" || do {
349            my $del;
350            for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
351                { next if $rdel =~ /\Q$_/; $del = $_; last }
352            unless ($del) {
353                use Carp;
354                croak "Can't interpolate right delimiter $rdel"
355            }
356            eval "qq$del$rdel$del";
357        };
358    }
359
360    while (pos($$textref) < length($$textref))
361    {
362        next if $$textref =~ m/\G\\./gc;
363
364        if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
365        {
366            $parapos = pos($$textref) - length($1)
367                unless defined $parapos;
368        }
369        elsif ($$textref =~ m/\G($rdelspec)/gc )
370        {
371            $closetagpos = pos($$textref)-length($1);
372            goto matched;
373        }
374        elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
375        {
376            next;
377        }
378        elsif ($bad && $$textref =~ m/\G($bad)/gcs)
379        {
380            pos($$textref) -= length($1);       # CUT OFF WHATEVER CAUSED THE SHORTNESS
381            goto short if ($omode eq 'PARA' || $omode eq 'MAX');
382            _failmsg "Found invalid nested tag: $1", pos $$textref;
383            goto failed;
384        }
385        elsif ($$textref =~ m/\G($ldel)/gc)
386        {
387            my $tag = $1;
388            pos($$textref) -= length($tag);     # REWIND TO NESTED TAG
389            unless (_match_tagged(@_))  # MATCH NESTED TAG
390            {
391                goto short if $omode eq 'PARA' || $omode eq 'MAX';
392                _failmsg "Found unbalanced nested tag: $tag",
393                         pos $$textref;
394                goto failed;
395            }
396        }
397        else { $$textref =~ m/./gcs }
398    }
399
400short:
401    $closetagpos = pos($$textref);
402    goto matched if $omode eq 'MAX';
403    goto failed unless $omode eq 'PARA';
404
405    if (defined $parapos) { pos($$textref) = $parapos }
406    else                  { $parapos = pos($$textref) }
407
408    return (
409        $startpos,    $opentagpos-$startpos,            # PREFIX
410        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
411        $textpos,     $parapos-$textpos,                # TEXT
412        $parapos,     0,                                # NO CLOSING TAG
413        $parapos,     length($$textref)-$parapos,       # REMAINDER
414    );
415
416matched:
417    $endpos = pos($$textref);
418    return (
419        $startpos,    $opentagpos-$startpos,            # PREFIX
420        $opentagpos,  $textpos-$opentagpos,             # OPENING TAG
421        $textpos,     $closetagpos-$textpos,            # TEXT
422        $closetagpos, $endpos-$closetagpos,             # CLOSING TAG
423        $endpos,      length($$textref)-$endpos,        # REMAINDER
424    );
425
426failed:
427    _failmsg "Did not find closing tag", pos $$textref unless $@;
428    pos($$textref) = $startpos;
429    return;
430}
431
432sub extract_variable (;$$)
433{
434    my $textref = defined $_[0] ? \$_[0] : \$_;
435    return ("","","") unless defined $$textref;
436    my $pre  = defined $_[1] ? $_[1] : '\s*';
437
438    my @match = _match_variable($textref,$pre);
439
440    return _fail wantarray, $textref unless @match;
441
442    return _succeed wantarray, $textref,
443                    @match[2..3,4..5,0..1];        # MATCH, REMAINDER, PREFIX
444}
445
446sub _match_variable($$)
447{
448#  $#
449#  $^
450#  $$
451    my ($textref, $pre) = @_;
452    my $startpos = pos($$textref) = pos($$textref)||0;
453    unless ($$textref =~ m/\G($pre)/gc)
454    {
455        _failmsg "Did not find prefix: /$pre/", pos $$textref;
456        return;
457    }
458    my $varpos = pos($$textref);
459    unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
460    {
461        unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
462        {
463            _failmsg "Did not find leading dereferencer", pos $$textref;
464            pos $$textref = $startpos;
465            return;
466        }
467        my $deref = $1;
468
469        unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
470            or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
471            or $deref eq '$#' or $deref eq '$$' )
472        {
473            _failmsg "Bad identifier after dereferencer", pos $$textref;
474            pos $$textref = $startpos;
475            return;
476        }
477    }
478
479    while (1)
480    {
481        next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
482        next if _match_codeblock($textref,
483                                 qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
484                                 qr/[({[]/, qr/[)}\]]/,
485                                 qr/[({[]/, qr/[)}\]]/, 0);
486        next if _match_codeblock($textref,
487                                 qr/\s*/, qr/[{[]/, qr/[}\]]/,
488                                 qr/[{[]/, qr/[}\]]/, 0);
489        next if _match_variable($textref,'\s*->\s*');
490        next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
491        last;
492    }
493
494    my $endpos = pos($$textref);
495    return ($startpos, $varpos-$startpos,
496            $varpos,   $endpos-$varpos,
497            $endpos,   length($$textref)-$endpos
498    );
499}
500
501sub extract_codeblock (;$$$$$)
502{
503    my $textref = defined $_[0] ? \$_[0] : \$_;
504    my $wantarray = wantarray;
505    my $ldel_inner = defined $_[1] ? $_[1] : '{';
506    my $pre        = defined $_[2] ? $_[2] : '\s*';
507    my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
508    my $rd         = $_[4];
509    my $rdel_inner = $ldel_inner;
510    my $rdel_outer = $ldel_outer;
511    my $posbug = pos;
512    for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
513    for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
514    for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
515    {
516        $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
517    }
518    pos = $posbug;
519
520    my @match = _match_codeblock($textref, $pre,
521                                 $ldel_outer, $rdel_outer,
522                                 $ldel_inner, $rdel_inner,
523                                 $rd);
524    return _fail($wantarray, $textref) unless @match;
525    return _succeed($wantarray, $textref,
526                    @match[2..3,4..5,0..1]    # MATCH, REMAINDER, PREFIX
527    );
528
529}
530
531sub _match_codeblock($$$$$$$)
532{
533    my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
534    my $startpos = pos($$textref) = pos($$textref) || 0;
535    unless ($$textref =~ m/\G($pre)/gc)
536    {
537        _failmsg qq{Did not match prefix /$pre/ at"} .
538                     substr($$textref,pos($$textref),20) .
539                     q{..."},
540                 pos $$textref;
541        return;
542    }
543    my $codepos = pos($$textref);
544    unless ($$textref =~ m/\G($ldel_outer)/gc)  # OUTERMOST DELIMITER
545    {
546        _failmsg qq{Did not find expected opening bracket at "} .
547                     substr($$textref,pos($$textref),20) .
548                     q{..."},
549                 pos $$textref;
550        pos $$textref = $startpos;
551        return;
552    }
553    my $closing = $1;
554       $closing =~ tr/([<{/)]>}/;
555    my $matched;
556    my $patvalid = 1;
557    while (pos($$textref) < length($$textref))
558    {
559        $matched = '';
560        if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
561        {
562            $patvalid = 0;
563            next;
564        }
565
566        if ($$textref =~ m/\G\s*#.*/gc)
567        {
568            next;
569        }
570
571        if ($$textref =~ m/\G\s*($rdel_outer)/gc)
572        {
573            unless ($matched = ($closing && $1 eq $closing) )
574            {
575                next if $1 eq '>';      # MIGHT BE A "LESS THAN"
576                _failmsg q{Mismatched closing bracket at "} .
577                             substr($$textref,pos($$textref),20) .
578                             qq{...". Expected '$closing'},
579                         pos $$textref;
580            }
581            last;
582        }
583
584        if (_match_variable($textref,'\s*') ||
585            _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
586        {
587            $patvalid = 0;
588            next;
589        }
590
591
592        # NEED TO COVER MANY MORE CASES HERE!!!
593        if ($$textref =~ m#\G\s*(?!$ldel_inner)
594                                ( [-+*x/%^&|.]=?
595                                | [!=]~
596                                | =(?!>)
597                                | (\*\*|&&|\|\||<<|>>)=?
598                                | split|grep|map|return
599                                | [([]
600                                )#gcx)
601        {
602            $patvalid = 1;
603            next;
604        }
605
606        if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
607        {
608            $patvalid = 1;
609            next;
610        }
611
612        if ($$textref =~ m/\G\s*$ldel_outer/gc)
613        {
614            _failmsg q{Improperly nested codeblock at "} .
615                         substr($$textref,pos($$textref),20) .
616                         q{..."},
617                     pos $$textref;
618            last;
619        }
620
621        $patvalid = 0;
622        $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
623    }
624    continue { $@ = undef }
625
626    unless ($matched)
627    {
628        _failmsg 'No match found for opening bracket', pos $$textref
629                unless $@;
630        return;
631    }
632
633    my $endpos = pos($$textref);
634    return ( $startpos, $codepos-$startpos,
635             $codepos, $endpos-$codepos,
636             $endpos,  length($$textref)-$endpos,
637    );
638}
639
640
641my %mods   = (
642    'none' => '[cgimsox]*',
643    'm'    => '[cgimsox]*',
644    's'    => '[cegimsox]*',
645    'tr'   => '[cds]*',
646    'y'    => '[cds]*',
647    'qq'   => '',
648    'qx'   => '',
649    'qw'   => '',
650    'qr'   => '[imsx]*',
651    'q'    => '',
652);
653
654sub extract_quotelike (;$$)
655{
656    my $textref = $_[0] ? \$_[0] : \$_;
657    my $wantarray = wantarray;
658    my $pre  = defined $_[1] ? $_[1] : '\s*';
659
660    my @match = _match_quotelike($textref,$pre,1,0);
661    return _fail($wantarray, $textref) unless @match;
662    return _succeed($wantarray, $textref,
663                    $match[2], $match[18]-$match[2],    # MATCH
664                    @match[18,19],                      # REMAINDER
665                    @match[0,1],                        # PREFIX
666                    @match[2..17],                      # THE BITS
667                    @match[20,21],                      # ANY FILLET?
668    );
669};
670
671sub _match_quotelike($$$$)      # ($textref, $prepat, $allow_raw_match)
672{
673    my ($textref, $pre, $rawmatch, $qmark) = @_;
674
675    my ($textlen,$startpos,
676        $oppos,
677        $preld1pos,$ld1pos,$str1pos,$rd1pos,
678        $preld2pos,$ld2pos,$str2pos,$rd2pos,
679        $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
680
681    unless ($$textref =~ m/\G($pre)/gc)
682    {
683        _failmsg qq{Did not find prefix /$pre/ at "} .
684                     substr($$textref, pos($$textref), 20) .
685                     q{..."},
686                 pos $$textref;
687        return;
688    }
689    $oppos = pos($$textref);
690
691    my $initial = substr($$textref,$oppos,1);
692
693    if ($initial && $initial =~ m|^[\"\'\`]|
694                 || $rawmatch && $initial =~ m|^/|
695                 || $qmark && $initial =~ m|^\?|)
696    {
697        unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
698        {
699            _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
700                         substr($$textref, $oppos, 20) .
701                         q{..."},
702                     pos $$textref;
703            pos $$textref = $startpos;
704            return;
705        }
706        $modpos= pos($$textref);
707        $rd1pos = $modpos-1;
708
709        if ($initial eq '/' || $initial eq '?')
710        {
711            $$textref =~ m/\G$mods{none}/gc
712        }
713
714        my $endpos = pos($$textref);
715        return (
716            $startpos,  $oppos-$startpos,       # PREFIX
717            $oppos,     0,                      # NO OPERATOR
718            $oppos,     1,                      # LEFT DEL
719            $oppos+1,   $rd1pos-$oppos-1,       # STR/PAT
720            $rd1pos,    1,                      # RIGHT DEL
721            $modpos,    0,                      # NO 2ND LDEL
722            $modpos,    0,                      # NO 2ND STR
723            $modpos,    0,                      # NO 2ND RDEL
724            $modpos,    $endpos-$modpos,        # MODIFIERS
725            $endpos,    $textlen-$endpos,       # REMAINDER
726        );
727    }
728
729    unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
730    {
731        _failmsg q{No quotelike operator found after prefix at "} .
732                     substr($$textref, pos($$textref), 20) .
733                     q{..."},
734                 pos $$textref;
735        pos $$textref = $startpos;
736        return;
737    }
738
739    my $op = $1;
740    $preld1pos = pos($$textref);
741    if ($op eq '<<') {
742        $ld1pos = pos($$textref);
743        my $label;
744        if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
745            $label = $1;
746        }
747        elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
748                             | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
749                             | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
750                             }gcsx) {
751            $label = $+;
752        }
753        else {
754            $label = "";
755        }
756        my $extrapos = pos($$textref);
757        $$textref =~ m{.*\n}gc;
758        $str1pos = pos($$textref)--;
759        unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
760            _failmsg qq{Missing here doc terminator ('$label') after "} .
761                         substr($$textref, $startpos, 20) .
762                         q{..."},
763                     pos $$textref;
764            pos $$textref = $startpos;
765            return;
766        }
767        $rd1pos = pos($$textref);
768        $$textref =~ m{\Q$label\E\n}gc;
769        $ld2pos = pos($$textref);
770        return (
771            $startpos,  $oppos-$startpos,       # PREFIX
772            $oppos,     length($op),            # OPERATOR
773            $ld1pos,    $extrapos-$ld1pos,      # LEFT DEL
774            $str1pos,   $rd1pos-$str1pos,       # STR/PAT
775            $rd1pos,    $ld2pos-$rd1pos,        # RIGHT DEL
776            $ld2pos,    0,                      # NO 2ND LDEL
777            $ld2pos,    0,                      # NO 2ND STR
778            $ld2pos,    0,                      # NO 2ND RDEL
779            $ld2pos,    0,                      # NO MODIFIERS
780            $ld2pos,    $textlen-$ld2pos,       # REMAINDER
781            $extrapos,  $str1pos-$extrapos,     # FILLETED BIT
782        );
783    }
784
785    $$textref =~ m/\G\s*/gc;
786    $ld1pos = pos($$textref);
787    $str1pos = $ld1pos+1;
788
789    unless ($$textref =~ m/\G(\S)/gc)   # SHOULD USE LOOKAHEAD
790    {
791        _failmsg "No block delimiter found after quotelike $op",
792                 pos $$textref;
793        pos $$textref = $startpos;
794        return;
795    }
796    pos($$textref) = $ld1pos;   # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
797    my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
798    if ($ldel1 =~ /[[(<{]/)
799    {
800        $rdel1 =~ tr/[({</])}>/;
801        defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
802            || do { pos $$textref = $startpos; return };
803        $ld2pos = pos($$textref);
804        $rd1pos = $ld2pos-1;
805    }
806    else
807    {
808        $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
809            || do { pos $$textref = $startpos; return };
810        $ld2pos = $rd1pos = pos($$textref)-1;
811    }
812
813    my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
814    if ($second_arg)
815    {
816        my ($ldel2, $rdel2);
817        if ($ldel1 =~ /[[(<{]/)
818        {
819            unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
820            {
821                _failmsg "Missing second block for quotelike $op",
822                         pos $$textref;
823                pos $$textref = $startpos;
824                return;
825            }
826            $ldel2 = $rdel2 = "\Q$1";
827            $rdel2 =~ tr/[({</])}>/;
828        }
829        else
830        {
831            $ldel2 = $rdel2 = $ldel1;
832        }
833        $str2pos = $ld2pos+1;
834
835        if ($ldel2 =~ /[[(<{]/)
836        {
837            pos($$textref)--;   # OVERCOME BROKEN LOOKAHEAD
838            defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
839                || do { pos $$textref = $startpos; return };
840        }
841        else
842        {
843            $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
844                || do { pos $$textref = $startpos; return };
845        }
846        $rd2pos = pos($$textref)-1;
847    }
848    else
849    {
850        $ld2pos = $str2pos = $rd2pos = $rd1pos;
851    }
852
853    $modpos = pos $$textref;
854
855    $$textref =~ m/\G($mods{$op})/gc;
856    my $endpos = pos $$textref;
857
858    return (
859        $startpos,      $oppos-$startpos,       # PREFIX
860        $oppos,         length($op),            # OPERATOR
861        $ld1pos,        1,                      # LEFT DEL
862        $str1pos,       $rd1pos-$str1pos,       # STR/PAT
863        $rd1pos,        1,                      # RIGHT DEL
864        $ld2pos,        $second_arg,            # 2ND LDEL (MAYBE)
865        $str2pos,       $rd2pos-$str2pos,       # 2ND STR (MAYBE)
866        $rd2pos,        $second_arg,            # 2ND RDEL (MAYBE)
867        $modpos,        $endpos-$modpos,        # MODIFIERS
868        $endpos,        $textlen-$endpos,       # REMAINDER
869    );
870}
871
872my $def_func = [
873    sub { extract_variable($_[0], '') },
874    sub { extract_quotelike($_[0],'') },
875    sub { extract_codeblock($_[0],'{}','') },
876];
877
878sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
879{
880    my $textref = defined($_[0]) ? \$_[0] : \$_;
881    my $posbug = pos;
882    my ($lastpos, $firstpos);
883    my @fields = ();
884
885    #for ($$textref)
886    {
887        my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
888        my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
889        my $igunk = $_[3];
890
891        pos $$textref ||= 0;
892
893        unless (wantarray)
894        {
895            use Carp;
896            carp "extract_multiple reset maximal count to 1 in scalar context"
897                    if $^W && defined($_[2]) && $max > 1;
898            $max = 1
899        }
900
901        my $unkpos;
902        my $class;
903
904        my @class;
905        foreach my $func ( @func )
906        {
907            if (ref($func) eq 'HASH')
908            {
909                push @class, (keys %$func)[0];
910                $func = (values %$func)[0];
911            }
912            else
913            {
914                push @class, undef;
915            }
916        }
917
918        FIELD: while (pos($$textref) < length($$textref))
919        {
920            my ($field, $rem);
921            my @bits;
922            foreach my $i ( 0..$#func )
923            {
924                my $pref;
925                my $func = $func[$i];
926                $class = $class[$i];
927                $lastpos = pos $$textref;
928                if (ref($func) eq 'CODE')
929                    { ($field,$rem,$pref) = @bits = $func->($$textref) }
930                elsif (ref($func) eq 'Text::Balanced::Extractor')
931                    { @bits = $field = $func->extract($$textref) }
932                elsif( $$textref =~ m/\G$func/gc )
933                    { @bits = $field = defined($1)
934                        ? $1
935                        : substr($$textref, $-[0], $+[0] - $-[0])
936                    }
937                $pref ||= "";
938                if (defined($field) && length($field))
939                {
940                    if (!$igunk) {
941                        $unkpos = $lastpos
942                            if length($pref) && !defined($unkpos);
943                        if (defined $unkpos)
944                        {
945                            push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
946                            $firstpos = $unkpos unless defined $firstpos;
947                            undef $unkpos;
948                            last FIELD if @fields == $max;
949                        }
950                    }
951                    push @fields, $class
952                            ? bless (\$field, $class)
953                            : $field;
954                    $firstpos = $lastpos unless defined $firstpos;
955                    $lastpos = pos $$textref;
956                    last FIELD if @fields == $max;
957                    next FIELD;
958                }
959            }
960            if ($$textref =~ /\G(.)/gcs)
961            {
962                $unkpos = pos($$textref)-1
963                    unless $igunk || defined $unkpos;
964            }
965        }
966
967        if (defined $unkpos)
968        {
969            push @fields, substr($$textref, $unkpos);
970            $firstpos = $unkpos unless defined $firstpos;
971            $lastpos = length $$textref;
972        }
973        last;
974    }
975
976    pos $$textref = $lastpos;
977    return @fields if wantarray;
978
979    $firstpos ||= 0;
980    eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
981           pos $$textref = $firstpos };
982    return $fields[0];
983}
984
985sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
986{
987    my $ldel    = $_[0];
988    my $rdel    = $_[1];
989    my $pre     = defined $_[2] ? $_[2] : '\s*';
990    my %options = defined $_[3] ? %{$_[3]} : ();
991    my $omode   = defined $options{fail} ? $options{fail} : '';
992    my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
993                : defined($options{reject})        ? $options{reject}
994                :                                    ''
995                ;
996    my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
997                : defined($options{ignore})        ? $options{ignore}
998                :                                    ''
999                ;
1000
1001    if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
1002
1003    my $posbug = pos;
1004    for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1005    pos = $posbug;
1006
1007    my $closure = sub
1008    {
1009        my $textref = defined $_[0] ? \$_[0] : \$_;
1010        my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1011
1012        return _fail(wantarray, $textref) unless @match;
1013        return _succeed wantarray, $textref,
1014                        $match[2], $match[3]+$match[5]+$match[7],   # MATCH
1015                        @match[8..9,0..1,2..7];                     # REM, PRE, BITS
1016    };
1017
1018    bless $closure, 'Text::Balanced::Extractor';
1019}
1020
1021package Text::Balanced::Extractor;
1022
1023sub extract($$) # ($self, $text)
1024{
1025    &{$_[0]}($_[1]);
1026}
1027
1028package Text::Balanced::ErrorMsg;
1029
1030use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1031
10321;
1033
1034__END__
1035
1036=pod
1037
1038=head1 NAME
1039
1040Text::Balanced - Extract delimited text sequences from strings.
1041
1042=head1 SYNOPSIS
1043
1044    use Text::Balanced qw (
1045        extract_delimited
1046        extract_bracketed
1047        extract_quotelike
1048        extract_codeblock
1049        extract_variable
1050        extract_tagged
1051        extract_multiple
1052        gen_delimited_pat
1053        gen_extract_tagged
1054    );
1055
1056    # Extract the initial substring of $text that is delimited by
1057    # two (unescaped) instances of the first character in $delim.
1058
1059    ($extracted, $remainder) = extract_delimited($text,$delim);
1060
1061    # Extract the initial substring of $text that is bracketed
1062    # with a delimiter(s) specified by $delim (where the string
1063    # in $delim contains one or more of '(){}[]<>').
1064
1065    ($extracted, $remainder) = extract_bracketed($text,$delim);
1066
1067    # Extract the initial substring of $text that is bounded by
1068    # an XML tag.
1069
1070    ($extracted, $remainder) = extract_tagged($text);
1071
1072    # Extract the initial substring of $text that is bounded by
1073    # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1074
1075    ($extracted, $remainder) =
1076        extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1077
1078    # Extract the initial substring of $text that represents a
1079    # Perl "quote or quote-like operation"
1080
1081    ($extracted, $remainder) = extract_quotelike($text);
1082
1083    # Extract the initial substring of $text that represents a block
1084    # of Perl code, bracketed by any of character(s) specified by $delim
1085    # (where the string $delim contains one or more of '(){}[]<>').
1086
1087    ($extracted, $remainder) = extract_codeblock($text,$delim);
1088
1089    # Extract the initial substrings of $text that would be extracted by
1090    # one or more sequential applications of the specified functions
1091    # or regular expressions
1092
1093    @extracted = extract_multiple($text,
1094                                  [ \&extract_bracketed,
1095                                    \&extract_quotelike,
1096                                    \&some_other_extractor_sub,
1097                                    qr/[xyz]*/,
1098                                    'literal',
1099                                  ]);
1100
1101    # Create a string representing an optimized pattern (a la Friedl)
1102    # that matches a substring delimited by any of the specified characters
1103    # (in this case: any type of quote or a slash)
1104
1105    $patstring = gen_delimited_pat(q{'"`/});
1106
1107    # Generate a reference to an anonymous sub that is just like extract_tagged
1108    # but pre-compiled and optimized for a specific pair of tags, and
1109    # consequently much faster (i.e. 3 times faster). It uses qr// for better
1110    # performance on repeated calls.
1111
1112    $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1113    ($extracted, $remainder) = $extract_head->($text);
1114
1115=head1 DESCRIPTION
1116
1117The various C<extract_...> subroutines may be used to
1118extract a delimited substring, possibly after skipping a
1119specified prefix string. By default, that prefix is
1120optional whitespace (C</\s*/>), but you can change it to whatever
1121you wish (see below).
1122
1123The substring to be extracted must appear at the
1124current C<pos> location of the string's variable
1125(or at index zero, if no C<pos> position is defined).
1126In other words, the C<extract_...> subroutines I<don't>
1127extract the first occurrence of a substring anywhere
1128in a string (like an unanchored regex would). Rather,
1129they extract an occurrence of the substring appearing
1130immediately at the current matching position in the
1131string (like a C<\G>-anchored regex would).
1132
1133=head2 General Behaviour in List Contexts
1134
1135In a list context, all the subroutines return a list, the first three
1136elements of which are always:
1137
1138=over 4
1139
1140=item [0]
1141
1142The extracted string, including the specified delimiters.
1143If the extraction fails C<undef> is returned.
1144
1145=item [1]
1146
1147The remainder of the input string (i.e. the characters after the
1148extracted string). On failure, the entire string is returned.
1149
1150=item [2]
1151
1152The skipped prefix (i.e. the characters before the extracted string).
1153On failure, C<undef> is returned.
1154
1155=back
1156
1157Note that in a list context, the contents of the original input text (the first
1158argument) are not modified in any way.
1159
1160However, if the input text was passed in a variable, that variable's
1161C<pos> value is updated to point at the first character after the
1162extracted text. That means that in a list context the various
1163subroutines can be used much like regular expressions. For example:
1164
1165    while ( $next = (extract_quotelike($text))[0] )
1166    {
1167        # process next quote-like (in $next)
1168    }
1169
1170=head2 General Behaviour in Scalar and Void Contexts
1171
1172In a scalar context, the extracted string is returned, having first been
1173removed from the input text. Thus, the following code also processes
1174each quote-like operation, but actually removes them from $text:
1175
1176    while ( $next = extract_quotelike($text) )
1177    {
1178        # process next quote-like (in $next)
1179    }
1180
1181Note that if the input text is a read-only string (i.e. a literal),
1182no attempt is made to remove the extracted text.
1183
1184In a void context the behaviour of the extraction subroutines is
1185exactly the same as in a scalar context, except (of course) that the
1186extracted substring is not returned.
1187
1188=head2 A Note About Prefixes
1189
1190Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1191This can bite you if you're expecting a prefix specification like
1192'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1193pattern will only succeed if the <H1> tag is on the current line, since
1194. normally doesn't match newlines.
1195
1196To overcome this limitation, you need to turn on /s matching within
1197the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1198
1199=head2 Functions
1200
1201=over 4
1202
1203=item C<extract_delimited>
1204
1205The C<extract_delimited> function formalizes the common idiom
1206of extracting a single-character-delimited substring from the start of
1207a string. For example, to extract a single-quote delimited string, the
1208following code is typically used:
1209
1210    ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1211    $extracted = $1;
1212
1213but with C<extract_delimited> it can be simplified to:
1214
1215    ($extracted,$remainder) = extract_delimited($text, "'");
1216
1217C<extract_delimited> takes up to four scalars (the input text, the
1218delimiters, a prefix pattern to be skipped, and any escape characters)
1219and extracts the initial substring of the text that
1220is appropriately delimited. If the delimiter string has multiple
1221characters, the first one encountered in the text is taken to delimit
1222the substring.
1223The third argument specifies a prefix pattern that is to be skipped
1224(but must be present!) before the substring is extracted.
1225The final argument specifies the escape character to be used for each
1226delimiter.
1227
1228All arguments are optional. If the escape characters are not specified,
1229every delimiter is escaped with a backslash (C<\>).
1230If the prefix is not specified, the
1231pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1232is also not specified, the set C</["'`]/> is used. If the text to be processed
1233is not specified either, C<$_> is used.
1234
1235In list context, C<extract_delimited> returns a array of three
1236elements, the extracted substring (I<including the surrounding
1237delimiters>), the remainder of the text, and the skipped prefix (if
1238any). If a suitable delimited substring is not found, the first
1239element of the array is the empty string, the second is the complete
1240original text, and the prefix returned in the third element is an
1241empty string.
1242
1243In a scalar context, just the extracted substring is returned. In
1244a void context, the extracted substring (and any prefix) are simply
1245removed from the beginning of the first argument.
1246
1247Examples:
1248
1249    # Remove a single-quoted substring from the very beginning of $text:
1250
1251        $substring = extract_delimited($text, "'", '');
1252
1253    # Remove a single-quoted Pascalish substring (i.e. one in which
1254    # doubling the quote character escapes it) from the very
1255    # beginning of $text:
1256
1257        $substring = extract_delimited($text, "'", '', "'");
1258
1259    # Extract a single- or double- quoted substring from the
1260    # beginning of $text, optionally after some whitespace
1261    # (note the list context to protect $text from modification):
1262
1263        ($substring) = extract_delimited $text, q{"'};
1264
1265    # Delete the substring delimited by the first '/' in $text:
1266
1267        $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1268
1269Note that this last example is I<not> the same as deleting the first
1270quote-like pattern. For instance, if C<$text> contained the string:
1271
1272    "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1273
1274then after the deletion it would contain:
1275
1276    "if ('.$UNIXCMD/s) { $cmd = $1; }"
1277
1278not:
1279
1280    "if ('./cmd' =~ ms) { $cmd = $1; }"
1281
1282See L<"extract_quotelike"> for a (partial) solution to this problem.
1283
1284=item C<extract_bracketed>
1285
1286Like C<"extract_delimited">, the C<extract_bracketed> function takes
1287up to three optional scalar arguments: a string to extract from, a delimiter
1288specifier, and a prefix pattern. As before, a missing prefix defaults to
1289optional whitespace and a missing text defaults to C<$_>. However, a missing
1290delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1291
1292C<extract_bracketed> extracts a balanced-bracket-delimited
1293substring (using any one (or more) of the user-specified delimiter
1294brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1295respect quoted unbalanced brackets (see below).
1296
1297A "delimiter bracket" is a bracket in list of delimiters passed as
1298C<extract_bracketed>'s second argument. Delimiter brackets are
1299specified by giving either the left or right (or both!) versions
1300of the required bracket(s). Note that the order in which
1301two or more delimiter brackets are specified is not significant.
1302
1303A "balanced-bracket-delimited substring" is a substring bounded by
1304matched brackets, such that any other (left or right) delimiter
1305bracket I<within> the substring is also matched by an opposite
1306(right or left) delimiter bracket I<at the same level of nesting>. Any
1307type of bracket not in the delimiter list is treated as an ordinary
1308character.
1309
1310In other words, each type of bracket specified as a delimiter must be
1311balanced and correctly nested within the substring, and any other kind of
1312("non-delimiter") bracket in the substring is ignored.
1313
1314For example, given the string:
1315
1316    $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1317
1318then a call to C<extract_bracketed> in a list context:
1319
1320    @result = extract_bracketed( $text, '{}' );
1321
1322would return:
1323
1324    ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1325
1326since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1327(In a scalar context just the first element of the array would be returned. In
1328a void context, C<$text> would be replaced by an empty string.)
1329
1330Likewise the call in:
1331
1332    @result = extract_bracketed( $text, '{[' );
1333
1334would return the same result, since all sets of both types of specified
1335delimiter brackets are correctly nested and balanced.
1336
1337However, the call in:
1338
1339    @result = extract_bracketed( $text, '{([<' );
1340
1341would fail, returning:
1342
1343    ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1344
1345because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1346the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1347return an empty string. In a void context, C<$text> would be unchanged.)
1348
1349Note that the embedded single-quotes in the string don't help in this
1350case, since they have not been specified as acceptable delimiters and are
1351therefore treated as non-delimiter characters (and ignored).
1352
1353However, if a particular species of quote character is included in the
1354delimiter specification, then that type of quote will be correctly handled.
1355for example, if C<$text> is:
1356
1357    $text = '<A HREF=">>>>">link</A>';
1358
1359then
1360
1361    @result = extract_bracketed( $text, '<">' );
1362
1363returns:
1364
1365    ( '<A HREF=">>>>">', 'link</A>', "" )
1366
1367as expected. Without the specification of C<"> as an embedded quoter:
1368
1369    @result = extract_bracketed( $text, '<>' );
1370
1371the result would be:
1372
1373    ( '<A HREF=">', '>>>">link</A>', "" )
1374
1375In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1376quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1377letter 'q' as a delimiter. Hence:
1378
1379    @result = extract_bracketed( $text, '<q>' );
1380
1381would correctly match something like this:
1382
1383    $text = '<leftop: conj /and/ conj>';
1384
1385See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1386
1387=item C<extract_variable>
1388
1389C<extract_variable> extracts any valid Perl variable or
1390variable-involved expression, including scalars, arrays, hashes, array
1391accesses, hash look-ups, method calls through objects, subroutine calls
1392through subroutine references, etc.
1393
1394The subroutine takes up to two optional arguments:
1395
1396=over 4
1397
1398=item 1.
1399
1400A string to be processed (C<$_> if the string is omitted or C<undef>)
1401
1402=item 2.
1403
1404A string specifying a pattern to be matched as a prefix (which is to be
1405skipped). If omitted, optional whitespace is skipped.
1406
1407=back
1408
1409On success in a list context, an array of 3 elements is returned. The
1410elements are:
1411
1412=over 4
1413
1414=item [0]
1415
1416the extracted variable, or variablish expression
1417
1418=item [1]
1419
1420the remainder of the input text,
1421
1422=item [2]
1423
1424the prefix substring (if any),
1425
1426=back
1427
1428On failure, all of these values (except the remaining text) are C<undef>.
1429
1430In a scalar context, C<extract_variable> returns just the complete
1431substring that matched a variablish expression. C<undef> is returned on
1432failure. In addition, the original input text has the returned substring
1433(and any prefix) removed from it.
1434
1435In a void context, the input text just has the matched substring (and
1436any specified prefix) removed.
1437
1438=item C<extract_tagged>
1439
1440C<extract_tagged> extracts and segments text between (balanced)
1441specified tags.
1442
1443The subroutine takes up to five optional arguments:
1444
1445=over 4
1446
1447=item 1.
1448
1449A string to be processed (C<$_> if the string is omitted or C<undef>)
1450
1451=item 2.
1452
1453A string specifying a pattern to be matched as the opening tag.
1454If the pattern string is omitted (or C<undef>) then a pattern
1455that matches any standard XML tag is used.
1456
1457=item 3.
1458
1459A string specifying a pattern to be matched at the closing tag.
1460If the pattern string is omitted (or C<undef>) then the closing
1461tag is constructed by inserting a C</> after any leading bracket
1462characters in the actual opening tag that was matched (I<not> the pattern
1463that matched the tag). For example, if the opening tag pattern
1464is specified as C<'{{\w+}}'> and actually matched the opening tag
1465C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1466
1467=item 4.
1468
1469A string specifying a pattern to be matched as a prefix (which is to be
1470skipped). If omitted, optional whitespace is skipped.
1471
1472=item 5.
1473
1474A hash reference containing various parsing options (see below)
1475
1476=back
1477
1478The various options that can be specified are:
1479
1480=over 4
1481
1482=item C<reject =E<gt> $listref>
1483
1484The list reference contains one or more strings specifying patterns
1485that must I<not> appear within the tagged text.
1486
1487For example, to extract
1488an HTML link (which should not contain nested links) use:
1489
1490        extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1491
1492=item C<ignore =E<gt> $listref>
1493
1494The list reference contains one or more strings specifying patterns
1495that are I<not> to be treated as nested tags within the tagged text
1496(even if they would match the start tag pattern).
1497
1498For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1499
1500        extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1501
1502(also see L<"gen_delimited_pat"> below).
1503
1504=item C<fail =E<gt> $str>
1505
1506The C<fail> option indicates the action to be taken if a matching end
1507tag is not encountered (i.e. before the end of the string or some
1508C<reject> pattern matches). By default, a failure to match a closing
1509tag causes C<extract_tagged> to immediately fail.
1510
1511However, if the string value associated with <reject> is "MAX", then
1512C<extract_tagged> returns the complete text up to the point of failure.
1513If the string is "PARA", C<extract_tagged> returns only the first paragraph
1514after the tag (up to the first line that is either empty or contains
1515only whitespace characters).
1516If the string is "", the default behaviour (i.e. failure) is reinstated.
1517
1518For example, suppose the start tag "/para" introduces a paragraph, which then
1519continues until the next "/endpara" tag or until another "/para" tag is
1520encountered:
1521
1522        $text = "/para line 1\n\nline 3\n/para line 4";
1523
1524        extract_tagged($text, '/para', '/endpara', undef,
1525                                {reject => '/para', fail => MAX );
1526
1527        # EXTRACTED: "/para line 1\n\nline 3\n"
1528
1529Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1530tag refers only to the immediately following paragraph:
1531
1532        $text = "/para line 1\n\nline 3\n/para line 4";
1533
1534        extract_tagged($text, '/para', '/endpara', undef,
1535                        {reject => '/para', fail => MAX );
1536
1537        # EXTRACTED: "/para line 1\n"
1538
1539Note that the specified C<fail> behaviour applies to nested tags as well.
1540
1541=back
1542
1543On success in a list context, an array of 6 elements is returned. The elements are:
1544
1545=over 4
1546
1547=item [0]
1548
1549the extracted tagged substring (including the outermost tags),
1550
1551=item [1]
1552
1553the remainder of the input text,
1554
1555=item [2]
1556
1557the prefix substring (if any),
1558
1559=item [3]
1560
1561the opening tag
1562
1563=item [4]
1564
1565the text between the opening and closing tags
1566
1567=item [5]
1568
1569the closing tag (or "" if no closing tag was found)
1570
1571=back
1572
1573On failure, all of these values (except the remaining text) are C<undef>.
1574
1575In a scalar context, C<extract_tagged> returns just the complete
1576substring that matched a tagged text (including the start and end
1577tags). C<undef> is returned on failure. In addition, the original input
1578text has the returned substring (and any prefix) removed from it.
1579
1580In a void context, the input text just has the matched substring (and
1581any specified prefix) removed.
1582
1583=item C<gen_extract_tagged>
1584
1585C<gen_extract_tagged> generates a new anonymous subroutine which
1586extracts text between (balanced) specified tags. In other words,
1587it generates a function identical in function to C<extract_tagged>.
1588
1589The difference between C<extract_tagged> and the anonymous
1590subroutines generated by
1591C<gen_extract_tagged>, is that those generated subroutines:
1592
1593=over 4
1594
1595=item *
1596
1597do not have to reparse tag specification or parsing options every time
1598they are called (whereas C<extract_tagged> has to effectively rebuild
1599its tag parser on every call);
1600
1601=item *
1602
1603make use of the new qr// construct to pre-compile the regexes they use
1604(whereas C<extract_tagged> uses standard string variable interpolation
1605to create tag-matching patterns).
1606
1607=back
1608
1609The subroutine takes up to four optional arguments (the same set as
1610C<extract_tagged> except for the string to be processed). It returns
1611a reference to a subroutine which in turn takes a single argument (the text to
1612be extracted from).
1613
1614In other words, the implementation of C<extract_tagged> is exactly
1615equivalent to:
1616
1617        sub extract_tagged
1618        {
1619                my $text = shift;
1620                $extractor = gen_extract_tagged(@_);
1621                return $extractor->($text);
1622        }
1623
1624(although C<extract_tagged> is not currently implemented that way).
1625
1626Using C<gen_extract_tagged> to create extraction functions for specific tags
1627is a good idea if those functions are going to be called more than once, since
1628their performance is typically twice as good as the more general-purpose
1629C<extract_tagged>.
1630
1631=item C<extract_quotelike>
1632
1633C<extract_quotelike> attempts to recognize, extract, and segment any
1634one of the various Perl quotes and quotelike operators (see
1635L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1636delimiters (for the quotelike operators), and trailing modifiers are
1637all caught. For example, in:
1638
1639        extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1640
1641        extract_quotelike '  "You said, \"Use sed\"."  '
1642
1643        extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1644
1645        extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1646
1647the full Perl quotelike operations are all extracted correctly.
1648
1649Note too that, when using the /x modifier on a regex, any comment
1650containing the current pattern delimiter will cause the regex to be
1651immediately terminated. In other words:
1652
1653        'm /
1654                (?i)            # CASE INSENSITIVE
1655                [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1656                [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1657           /x'
1658
1659will be extracted as if it were:
1660
1661        'm /
1662                (?i)            # CASE INSENSITIVE
1663                [a-z_]          # LEADING ALPHABETIC/'
1664
1665This behaviour is identical to that of the actual compiler.
1666
1667C<extract_quotelike> takes two arguments: the text to be processed and
1668a prefix to be matched at the very beginning of the text. If no prefix
1669is specified, optional whitespace is the default. If no text is given,
1670C<$_> is used.
1671
1672In a list context, an array of 11 elements is returned. The elements are:
1673
1674=over 4
1675
1676=item [0]
1677
1678the extracted quotelike substring (including trailing modifiers),
1679
1680=item [1]
1681
1682the remainder of the input text,
1683
1684=item [2]
1685
1686the prefix substring (if any),
1687
1688=item [3]
1689
1690the name of the quotelike operator (if any),
1691
1692=item [4]
1693
1694the left delimiter of the first block of the operation,
1695
1696=item [5]
1697
1698the text of the first block of the operation
1699(that is, the contents of
1700a quote, the regex of a match or substitution or the target list of a
1701translation),
1702
1703=item [6]
1704
1705the right delimiter of the first block of the operation,
1706
1707=item [7]
1708
1709the left delimiter of the second block of the operation
1710(that is, if it is a C<s>, C<tr>, or C<y>),
1711
1712=item [8]
1713
1714the text of the second block of the operation
1715(that is, the replacement of a substitution or the translation list
1716of a translation),
1717
1718=item [9]
1719
1720the right delimiter of the second block of the operation (if any),
1721
1722=item [10]
1723
1724the trailing modifiers on the operation (if any).
1725
1726=back
1727
1728For each of the fields marked "(if any)" the default value on success is
1729an empty string.
1730On failure, all of these values (except the remaining text) are C<undef>.
1731
1732In a scalar context, C<extract_quotelike> returns just the complete substring
1733that matched a quotelike operation (or C<undef> on failure). In a scalar or
1734void context, the input text has the same substring (and any specified
1735prefix) removed.
1736
1737Examples:
1738
1739        # Remove the first quotelike literal that appears in text
1740
1741                $quotelike = extract_quotelike($text,'.*?');
1742
1743        # Replace one or more leading whitespace-separated quotelike
1744        # literals in $_ with "<QLL>"
1745
1746                do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1747
1748
1749        # Isolate the search pattern in a quotelike operation from $text
1750
1751                ($op,$pat) = (extract_quotelike $text)[3,5];
1752                if ($op =~ /[ms]/)
1753                {
1754                        print "search pattern: $pat\n";
1755                }
1756                else
1757                {
1758                        print "$op is not a pattern matching operation\n";
1759                }
1760
1761=item C<extract_quotelike>
1762
1763C<extract_quotelike> can successfully extract "here documents" from an input
1764string, but with an important caveat in list contexts.
1765
1766Unlike other types of quote-like literals, a here document is rarely
1767a contiguous substring. For example, a typical piece of code using
1768here document might look like this:
1769
1770        <<'EOMSG' || die;
1771        This is the message.
1772        EOMSG
1773        exit;
1774
1775Given this as an input string in a scalar context, C<extract_quotelike>
1776would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1777leaving the string " || die;\nexit;" in the original variable. In other words,
1778the two separate pieces of the here document are successfully extracted and
1779concatenated.
1780
1781In a list context, C<extract_quotelike> would return the list
1782
1783=over 4
1784
1785=item [0]
1786
1787"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1788including fore and aft delimiters),
1789
1790=item [1]
1791
1792" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1793
1794=item [2]
1795
1796"" (i.e. the prefix substring -- trivial in this case),
1797
1798=item [3]
1799
1800"<<" (i.e. the "name" of the quotelike operator)
1801
1802=item [4]
1803
1804"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1805
1806=item [5]
1807
1808"This is the message.\n" (i.e. the text of the here document),
1809
1810=item [6]
1811
1812"EOMSG" (i.e. the right delimiter of the here document),
1813
1814=item [7..10]
1815
1816"" (a here document has no second left delimiter, second text, second right
1817delimiter, or trailing modifiers).
1818
1819=back
1820
1821However, the matching position of the input variable would be set to
1822"exit;" (i.e. I<after> the closing delimiter of the here document),
1823which would cause the earlier " || die;\nexit;" to be skipped in any
1824sequence of code fragment extractions.
1825
1826To avoid this problem, when it encounters a here document whilst
1827extracting from a modifiable string, C<extract_quotelike> silently
1828rearranges the string to an equivalent piece of Perl:
1829
1830        <<'EOMSG'
1831        This is the message.
1832        EOMSG
1833        || die;
1834        exit;
1835
1836in which the here document I<is> contiguous. It still leaves the
1837matching position after the here document, but now the rest of the line
1838on which the here document starts is not skipped.
1839
1840To prevent <extract_quotelike> from mucking about with the input in this way
1841(this is the only case where a list-context C<extract_quotelike> does so),
1842you can pass the input variable as an interpolated literal:
1843
1844        $quotelike = extract_quotelike("$var");
1845
1846=item C<extract_codeblock>
1847
1848C<extract_codeblock> attempts to recognize and extract a balanced
1849bracket delimited substring that may contain unbalanced brackets
1850inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1851is like a combination of C<"extract_bracketed"> and
1852C<"extract_quotelike">.
1853
1854C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1855a text to process, a set of delimiter brackets to look for, and a prefix to
1856match first. It also takes an optional fourth parameter, which allows the
1857outermost delimiter brackets to be specified separately (see below).
1858
1859Omitting the first argument (input text) means process C<$_> instead.
1860Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1861Omitting the third argument (prefix argument) implies optional whitespace at the start.
1862Omitting the fourth argument (outermost delimiter brackets) indicates that the
1863value of the second argument is to be used for the outermost delimiters.
1864
1865Once the prefix and the outermost opening delimiter bracket have been
1866recognized, code blocks are extracted by stepping through the input text and
1867trying the following alternatives in sequence:
1868
1869=over 4
1870
1871=item 1.
1872
1873Try and match a closing delimiter bracket. If the bracket was the same
1874species as the last opening bracket, return the substring to that
1875point. If the bracket was mismatched, return an error.
1876
1877=item 2.
1878
1879Try to match a quote or quotelike operator. If found, call
1880C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1881the error it returned. Otherwise go back to step 1.
1882
1883=item 3.
1884
1885Try to match an opening delimiter bracket. If found, call
1886C<extract_codeblock> recursively to eat the embedded block. If the
1887recursive call fails, return an error. Otherwise, go back to step 1.
1888
1889=item 4.
1890
1891Unconditionally match a bareword or any other single character, and
1892then go back to step 1.
1893
1894=back
1895
1896Examples:
1897
1898        # Find a while loop in the text
1899
1900                if ($text =~ s/.*?while\s*\{/{/)
1901                {
1902                        $loop = "while " . extract_codeblock($text);
1903                }
1904
1905        # Remove the first round-bracketed list (which may include
1906        # round- or curly-bracketed code blocks or quotelike operators)
1907
1908                extract_codeblock $text, "(){}", '[^(]*';
1909
1910
1911The ability to specify a different outermost delimiter bracket is useful
1912in some circumstances. For example, in the Parse::RecDescent module,
1913parser actions which are to be performed only on a successful parse
1914are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1915
1916        sentence: subject verb object
1917                        <defer: {$::theVerb = $item{verb}} >
1918
1919Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1920within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1921
1922A deferred action like this:
1923
1924                        <defer: {if ($count>10) {$count--}} >
1925
1926will be incorrectly parsed as:
1927
1928                        <defer: {if ($count>
1929
1930because the "less than" operator is interpreted as a closing delimiter.
1931
1932But, by extracting the directive using
1933S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1934the '>' character is only treated as a delimited at the outermost
1935level of the code block, so the directive is parsed correctly.
1936
1937=item C<extract_multiple>
1938
1939The C<extract_multiple> subroutine takes a string to be processed and a
1940list of extractors (subroutines or regular expressions) to apply to that string.
1941
1942In an array context C<extract_multiple> returns an array of substrings
1943of the original string, as extracted by the specified extractors.
1944In a scalar context, C<extract_multiple> returns the first
1945substring successfully extracted from the original string. In both
1946scalar and void contexts the original string has the first successfully
1947extracted substring removed from it. In all contexts
1948C<extract_multiple> starts at the current C<pos> of the string, and
1949sets that C<pos> appropriately after it matches.
1950
1951Hence, the aim of a call to C<extract_multiple> in a list context
1952is to split the processed string into as many non-overlapping fields as
1953possible, by repeatedly applying each of the specified extractors
1954to the remainder of the string. Thus C<extract_multiple> is
1955a generalized form of Perl's C<split> subroutine.
1956
1957The subroutine takes up to four optional arguments:
1958
1959=over 4
1960
1961=item 1.
1962
1963A string to be processed (C<$_> if the string is omitted or C<undef>)
1964
1965=item 2.
1966
1967A reference to a list of subroutine references and/or qr// objects and/or
1968literal strings and/or hash references, specifying the extractors
1969to be used to split the string. If this argument is omitted (or
1970C<undef>) the list:
1971
1972        [
1973                sub { extract_variable($_[0], '') },
1974                sub { extract_quotelike($_[0],'') },
1975                sub { extract_codeblock($_[0],'{}','') },
1976        ]
1977
1978is used.
1979
1980=item 3.
1981
1982An number specifying the maximum number of fields to return. If this
1983argument is omitted (or C<undef>), split continues as long as possible.
1984
1985If the third argument is I<N>, then extraction continues until I<N> fields
1986have been successfully extracted, or until the string has been completely
1987processed.
1988
1989Note that in scalar and void contexts the value of this argument is
1990automatically reset to 1 (under C<-w>, a warning is issued if the argument
1991has to be reset).
1992
1993=item 4.
1994
1995A value indicating whether unmatched substrings (see below) within the
1996text should be skipped or returned as fields. If the value is true,
1997such substrings are skipped. Otherwise, they are returned.
1998
1999=back
2000
2001The extraction process works by applying each extractor in
2002sequence to the text string.
2003
2004If the extractor is a subroutine it is called in a list context and is
2005expected to return a list of a single element, namely the extracted
2006text. It may optionally also return two further arguments: a string
2007representing the text left after extraction (like $' for a pattern
2008match), and a string representing any prefix skipped before the
2009extraction (like $` in a pattern match). Note that this is designed
2010to facilitate the use of other Text::Balanced subroutines with
2011C<extract_multiple>. Note too that the value returned by an extractor
2012subroutine need not bear any relationship to the corresponding substring
2013of the original text (see examples below).
2014
2015If the extractor is a precompiled regular expression or a string,
2016it is matched against the text in a scalar context with a leading
2017'\G' and the gc modifiers enabled. The extracted value is either
2018$1 if that variable is defined after the match, or else the
2019complete match (i.e. $&).
2020
2021If the extractor is a hash reference, it must contain exactly one element.
2022The value of that element is one of the
2023above extractor types (subroutine reference, regular expression, or string).
2024The key of that element is the name of a class into which the successful
2025return value of the extractor will be blessed.
2026
2027If an extractor returns a defined value, that value is immediately
2028treated as the next extracted field and pushed onto the list of fields.
2029If the extractor was specified in a hash reference, the field is also
2030blessed into the appropriate class,
2031
2032If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2033assumed to have failed to extract.
2034If none of the extractor subroutines succeeds, then one
2035character is extracted from the start of the text and the extraction
2036subroutines reapplied. Characters which are thus removed are accumulated and
2037eventually become the next field (unless the fourth argument is true, in which
2038case they are discarded).
2039
2040For example, the following extracts substrings that are valid Perl variables:
2041
2042        @fields = extract_multiple($text,
2043                                   [ sub { extract_variable($_[0]) } ],
2044                                   undef, 1);
2045
2046This example separates a text into fields which are quote delimited,
2047curly bracketed, and anything else. The delimited and bracketed
2048parts are also blessed to identify them (the "anything else" is unblessed):
2049
2050        @fields = extract_multiple($text,
2051                   [
2052                        { Delim => sub { extract_delimited($_[0],q{'"}) } },
2053                        { Brack => sub { extract_bracketed($_[0],'{}') } },
2054                   ]);
2055
2056This call extracts the next single substring that is a valid Perl quotelike
2057operator (and removes it from $text):
2058
2059        $quotelike = extract_multiple($text,
2060                                      [
2061                                        sub { extract_quotelike($_[0]) },
2062                                      ], undef, 1);
2063
2064Finally, here is yet another way to do comma-separated value parsing:
2065
2066        @fields = extract_multiple($csv_text,
2067                                  [
2068                                        sub { extract_delimited($_[0],q{'"}) },
2069                                        qr/([^,]+)(.*)/,
2070                                  ],
2071                                  undef,1);
2072
2073The list in the second argument means:
2074I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2075The undef third argument means:
2076I<"...as many times as possible...">,
2077and the true value in the fourth argument means
2078I<"...discarding anything else that appears (i.e. the commas)">.
2079
2080If you wanted the commas preserved as separate fields (i.e. like split
2081does if your split pattern has capturing parentheses), you would
2082just make the last parameter undefined (or remove it).
2083
2084=item C<gen_delimited_pat>
2085
2086The C<gen_delimited_pat> subroutine takes a single (string) argument and
2087   > builds a Friedl-style optimized regex that matches a string delimited
2088by any one of the characters in the single argument. For example:
2089
2090        gen_delimited_pat(q{'"})
2091
2092returns the regex:
2093
2094        (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2095
2096Note that the specified delimiters are automatically quotemeta'd.
2097
2098A typical use of C<gen_delimited_pat> would be to build special purpose tags
2099for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2100(which might contain quoted strings):
2101
2102        my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2103
2104        extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2105
2106C<gen_delimited_pat> may also be called with an optional second argument,
2107which specifies the "escape" character(s) to be used for each delimiter.
2108For example to match a Pascal-style string (where ' is the delimiter
2109and '' is a literal ' within the string):
2110
2111        gen_delimited_pat(q{'},q{'});
2112
2113Different escape characters can be specified for different delimiters.
2114For example, to specify that '/' is the escape for single quotes
2115and '%' is the escape for double quotes:
2116
2117        gen_delimited_pat(q{'"},q{/%});
2118
2119If more delimiters than escape chars are specified, the last escape char
2120is used for the remaining delimiters.
2121If no escape char is specified for a given specified delimiter, '\' is used.
2122
2123=item C<delimited_pat>
2124
2125Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2126That name may still be used, but is now deprecated.
2127
2128=back
2129
2130=head1 DIAGNOSTICS
2131
2132In a list context, all the functions return C<(undef,$original_text)>
2133on failure. In a scalar context, failure is indicated by returning C<undef>
2134(in this case the input text is not modified in any way).
2135
2136In addition, on failure in I<any> context, the C<$@> variable is set.
2137Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2138below.
2139Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2140which the error was detected (although not necessarily where it occurred!)
2141Printing C<$@> directly produces the error message, with the offset appended.
2142On success, the C<$@> variable is guaranteed to be C<undef>.
2143
2144The available diagnostics are:
2145
2146=over 4
2147
2148=item  C<Did not find a suitable bracket: "%s">
2149
2150The delimiter provided to C<extract_bracketed> was not one of
2151C<'()[]E<lt>E<gt>{}'>.
2152
2153=item  C<Did not find prefix: /%s/>
2154
2155A non-optional prefix was specified but wasn't found at the start of the text.
2156
2157=item  C<Did not find opening bracket after prefix: "%s">
2158
2159C<extract_bracketed> or C<extract_codeblock> was expecting a
2160particular kind of bracket at the start of the text, and didn't find it.
2161
2162=item  C<No quotelike operator found after prefix: "%s">
2163
2164C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2165C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2166it was extracting.
2167
2168=item  C<Unmatched closing bracket: "%c">
2169
2170C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2171a closing bracket where none was expected.
2172
2173=item  C<Unmatched opening bracket(s): "%s">
2174
2175C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
2176out of characters in the text before closing one or more levels of nested
2177brackets.
2178
2179=item C<Unmatched embedded quote (%s)>
2180
2181C<extract_bracketed> attempted to match an embedded quoted substring, but
2182failed to find a closing quote to match it.
2183
2184=item C<Did not find closing delimiter to match '%s'>
2185
2186C<extract_quotelike> was unable to find a closing delimiter to match the
2187one that opened the quote-like operation.
2188
2189=item  C<Mismatched closing bracket: expected "%c" but found "%s">
2190
2191C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2192a valid bracket delimiter, but it was the wrong species. This usually
2193indicates a nesting error, but may indicate incorrect quoting or escaping.
2194
2195=item  C<No block delimiter found after quotelike "%s">
2196
2197C<extract_quotelike> or C<extract_codeblock> found one of the
2198quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2199without a suitable block after it.
2200
2201=item C<Did not find leading dereferencer>
2202
2203C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2204a variable, but didn't find any of them.
2205
2206=item C<Bad identifier after dereferencer>
2207
2208C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2209character was not followed by a legal Perl identifier.
2210
2211=item C<Did not find expected opening bracket at %s>
2212
2213C<extract_codeblock> failed to find any of the outermost opening brackets
2214that were specified.
2215
2216=item C<Improperly nested codeblock at %s>
2217
2218A nested code block was found that started with a delimiter that was specified
2219as being only to be used as an outermost bracket.
2220
2221=item  C<Missing second block for quotelike "%s">
2222
2223C<extract_codeblock> or C<extract_quotelike> found one of the
2224quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2225
2226=item C<No match found for opening bracket>
2227
2228C<extract_codeblock> failed to find a closing bracket to match the outermost
2229opening bracket.
2230
2231=item C<Did not find opening tag: /%s/>
2232
2233C<extract_tagged> did not find a suitable opening tag (after any specified
2234prefix was removed).
2235
2236=item C<Unable to construct closing tag to match: /%s/>
2237
2238C<extract_tagged> matched the specified opening tag and tried to
2239modify the matched text to produce a matching closing tag (because
2240none was specified). It failed to generate the closing tag, almost
2241certainly because the opening tag did not start with a
2242bracket of some kind.
2243
2244=item C<Found invalid nested tag: %s>
2245
2246C<extract_tagged> found a nested tag that appeared in the "reject" list
2247(and the failure mode was not "MAX" or "PARA").
2248
2249=item C<Found unbalanced nested tag: %s>
2250
2251C<extract_tagged> found a nested opening tag that was not matched by a
2252corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2253
2254=item C<Did not find closing tag>
2255
2256C<extract_tagged> reached the end of the text without finding a closing tag
2257to match the original opening tag (and the failure mode was not
2258"MAX" or "PARA").
2259
2260=back
2261
2262=head1 EXPORTS
2263
2264The following symbols are, or can be, exported by this module:
2265
2266=over 4
2267
2268=item Default Exports
2269
2270I<None>.
2271
2272=item Optional Exports
2273
2274C<extract_delimited>,
2275C<extract_bracketed>,
2276C<extract_quotelike>,
2277C<extract_codeblock>,
2278C<extract_variable>,
2279C<extract_tagged>,
2280C<extract_multiple>,
2281C<gen_delimited_pat>,
2282C<gen_extract_tagged>,
2283C<delimited_pat>.
2284
2285=item Export Tags
2286
2287=over 4
2288
2289=item C<:ALL>
2290
2291C<extract_delimited>,
2292C<extract_bracketed>,
2293C<extract_quotelike>,
2294C<extract_codeblock>,
2295C<extract_variable>,
2296C<extract_tagged>,
2297C<extract_multiple>,
2298C<gen_delimited_pat>,
2299C<gen_extract_tagged>,
2300C<delimited_pat>.
2301
2302=back
2303
2304=back
2305
2306=head1 KNOWN BUGS
2307
2308See L<https://rt.cpan.org/Dist/Display.html?Status=Active&Queue=Text-Balanced>.
2309
2310=head1 FEEDBACK
2311
2312Patches, bug reports, suggestions or any other feedback is welcome.
2313
2314Patches can be sent as GitHub pull requests at
2315L<https://github.com/steve-m-hay/Text-Balanced/pulls>.
2316
2317Bug reports and suggestions can be made on the CPAN Request Tracker at
2318L<https://rt.cpan.org/Public/Bug/Report.html?Queue=Text-Balanced>.
2319
2320Currently active requests on the CPAN Request Tracker can be viewed at
2321L<https://rt.cpan.org/Public/Dist/Display.html?Status=Active;Queue=Text-Balanced>.
2322
2323Please test this distribution.  See CPAN Testers Reports at
2324L<https://www.cpantesters.org/> for details of how to get involved.
2325
2326Previous test results on CPAN Testers Reports can be viewed at
2327L<https://www.cpantesters.org/distro/T/Text-Balanced.html>.
2328
2329Please rate this distribution on CPAN Ratings at
2330L<https://cpanratings.perl.org/rate/?distribution=Text-Balanced>.
2331
2332=head1 AVAILABILITY
2333
2334The latest version of this module is available from CPAN (see
2335L<perlmodlib/"CPAN"> for details) at
2336
2337L<https://metacpan.org/release/Text-Balanced> or
2338
2339L<https://www.cpan.org/authors/id/S/SH/SHAY/> or
2340
2341L<https://www.cpan.org/modules/by-module/Text/>.
2342
2343The latest source code is available from GitHub at
2344L<https://github.com/steve-m-hay/Text-Balanced>.
2345
2346=head1 INSTALLATION
2347
2348See the F<INSTALL> file.
2349
2350=head1 AUTHOR
2351
2352Damian Conway E<lt>L<damian@conway.org|mailto:damian@conway.org>E<gt>.
2353
2354Steve Hay E<lt>L<shay@cpan.org|mailto:shay@cpan.org>E<gt> is now maintaining
2355Text::Balanced as of version 2.03.
2356
2357=head1 COPYRIGHT
2358
2359Copyright (C) 1997-2001 Damian Conway.  All rights reserved.
2360
2361Copyright (C) 2009 Adam Kennedy.
2362
2363Copyright (C) 2015, 2020 Steve Hay.  All rights reserved.
2364
2365=head1 LICENCE
2366
2367This module is free software; you can redistribute it and/or modify it under the
2368same terms as Perl itself, i.e. under the terms of either the GNU General Public
2369License or the Artistic License, as specified in the F<LICENCE> file.
2370
2371=head1 VERSION
2372
2373Version 2.04
2374
2375=head1 DATE
2376
237711 Dec 2020
2378
2379=head1 HISTORY
2380
2381See the F<Changes> file.
2382
2383=cut
2384