1package Text::ParseWords;
2
3use strict;
4use warnings;
5require 5.006;
6our $VERSION = "3.31";
7
8
9use Exporter;
10our @ISA = qw(Exporter);
11our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
12our @EXPORT_OK = qw(old_shellwords);
13our $PERL_SINGLE_QUOTE;
14
15
16sub shellwords {
17    my (@lines) = @_;
18    my @allwords;
19
20    foreach my $line (@lines) {
21	$line =~ s/^\s+//;
22	my @words = parse_line('\s+', 0, $line);
23	pop @words if (@words and !defined $words[-1]);
24	return() unless (@words || !length($line));
25	push(@allwords, @words);
26    }
27    return(@allwords);
28}
29
30
31
32sub quotewords {
33    my($delim, $keep, @lines) = @_;
34    my($line, @words, @allwords);
35
36    foreach $line (@lines) {
37	@words = parse_line($delim, $keep, $line);
38	return() unless (@words || !length($line));
39	push(@allwords, @words);
40    }
41    return(@allwords);
42}
43
44
45
46sub nested_quotewords {
47    my($delim, $keep, @lines) = @_;
48    my($i, @allwords);
49
50    for ($i = 0; $i < @lines; $i++) {
51	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
52	return() unless (@{$allwords[$i]} || !length($lines[$i]));
53    }
54    return(@allwords);
55}
56
57
58
59sub parse_line {
60    my($delimiter, $keep, $line) = @_;
61    my($word, @pieces);
62
63    no warnings 'uninitialized';	# we will be testing undef strings
64
65    while (length($line)) {
66        # This pattern is optimised to be stack conservative on older perls.
67        # Do not refactor without being careful and testing it on very long strings.
68        # See Perl bug #42980 for an example of a stack busting input.
69        $line =~ s/^
70                    (?:
71                        # double quoted string
72                        (")                             # $quote
73                        ((?>[^\\"]*(?:\\.[^\\"]*)*))"   # $quoted
74		    |	# --OR--
75                        # singe quoted string
76                        (')                             # $quote
77                        ((?>[^\\']*(?:\\.[^\\']*)*))'   # $quoted
78                    |   # --OR--
79                        # unquoted string
80		        (                               # $unquoted
81                            (?:\\.|[^\\"'])*?
82                        )
83                        # followed by
84		        (                               # $delim
85                            \Z(?!\n)                    # EOL
86                        |   # --OR--
87                            (?-x:$delimiter)            # delimiter
88                        |   # --OR--
89                            (?!^)(?=["'])               # a quote
90                        )
91		    )//xs or return;		# extended layout
92        my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
93
94
95	return() unless( defined($quote) || length($unquoted) || length($delim));
96
97        if ($keep) {
98	    $quoted = "$quote$quoted$quote";
99	}
100        else {
101	    $unquoted =~ s/\\(.)/$1/sg;
102	    if (defined $quote) {
103		$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
104		$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
105            }
106	}
107        $word .= substr($line, 0, 0);	# leave results tainted
108        $word .= defined $quote ? $quoted : $unquoted;
109
110        if (length($delim)) {
111            push(@pieces, $word);
112            push(@pieces, $delim) if ($keep eq 'delimiters');
113            undef $word;
114        }
115        if (!length($line)) {
116            push(@pieces, $word);
117	}
118    }
119    return(@pieces);
120}
121
122
123
124sub old_shellwords {
125
126    # Usage:
127    #	use ParseWords;
128    #	@words = old_shellwords($line);
129    #	or
130    #	@words = old_shellwords(@lines);
131    #	or
132    #	@words = old_shellwords();	# defaults to $_ (and clobbers it)
133
134    no warnings 'uninitialized';	# we will be testing undef strings
135    local *_ = \join('', @_) if @_;
136    my (@words, $snippet);
137
138    s/\A\s+//;
139    while ($_ ne '') {
140	my $field = substr($_, 0, 0);	# leave results tainted
141	for (;;) {
142	    if (s/\A"(([^"\\]|\\.)*)"//s) {
143		($snippet = $1) =~ s#\\(.)#$1#sg;
144	    }
145	    elsif (/\A"/) {
146		require Carp;
147		Carp::carp("Unmatched double quote: $_");
148		return();
149	    }
150	    elsif (s/\A'(([^'\\]|\\.)*)'//s) {
151		($snippet = $1) =~ s#\\(.)#$1#sg;
152	    }
153	    elsif (/\A'/) {
154		require Carp;
155		Carp::carp("Unmatched single quote: $_");
156		return();
157	    }
158	    elsif (s/\A\\(.?)//s) {
159		$snippet = $1;
160	    }
161	    elsif (s/\A([^\s\\'"]+)//) {
162		$snippet = $1;
163	    }
164	    else {
165		s/\A\s+//;
166		last;
167	    }
168	    $field .= $snippet;
169	}
170	push(@words, $field);
171    }
172    return @words;
173}
174
1751;
176
177__END__
178
179=head1 NAME
180
181Text::ParseWords - parse text into an array of tokens or array of arrays
182
183=head1 SYNOPSIS
184
185  use Text::ParseWords;
186  @lists = nested_quotewords($delim, $keep, @lines);
187  @words = quotewords($delim, $keep, @lines);
188  @words = shellwords(@lines);
189  @words = parse_line($delim, $keep, $line);
190  @words = old_shellwords(@lines); # DEPRECATED!
191
192=head1 DESCRIPTION
193
194The C<nested_quotewords()> and C<quotewords()> functions accept a delimiter
195(which can be a regular expression)
196and a list of lines and then breaks those lines up into a list of
197words ignoring delimiters that appear inside quotes.  C<quotewords()>
198returns all of the tokens in a single long list, while C<nested_quotewords()>
199returns a list of token lists corresponding to the elements of C<@lines>.
200C<parse_line()> does tokenizing on a single string.  The C<*quotewords()>
201functions simply call C<parse_line()>, so if you're only splitting
202one line you can call C<parse_line()> directly and save a function
203call.
204
205The C<$keep> controls what happens with delimters and special characters:
206
207=over 4
208
209=item true
210
211If true, then the tokens are split on the specified delimiter,
212but all other characters (including quotes and backslashes)
213are kept in the tokens.
214
215=item false
216
217If $keep is false then the C<*quotewords()> functions
218remove all quotes and backslashes that are
219not themselves backslash-escaped or inside of single quotes (i.e.,
220C<quotewords()> tries to interpret these characters just like the Bourne
221shell).  NB: these semantics are significantly different from the
222original version of this module shipped with Perl 5.000 through 5.004.
223
224=item C<"delimiters">
225
226As an additional feature, $keep may be the keyword "delimiters" which
227causes the functions to preserve the delimiters in each string as
228tokens in the token lists, in addition to preserving quote and
229backslash characters.
230
231=back
232
233C<shellwords()> is written as a special case of C<quotewords()>, and it
234does token parsing with whitespace as a delimiter-- similar to most
235Unix shells.
236
237=head1 EXAMPLES
238
239The sample program:
240
241  use Text::ParseWords;
242  @words = quotewords('\s+', 0, q{this   is "a test" of\ quotewords \"for you});
243  $i = 0;
244  foreach (@words) {
245      print "$i: <$_>\n";
246      $i++;
247  }
248
249produces:
250
251  0: <this>
252  1: <is>
253  2: <a test>
254  3: <of quotewords>
255  4: <"for>
256  5: <you>
257
258demonstrating:
259
260=over 4
261
262=item 0Z<>
263
264a simple word
265
266=item 1Z<>
267
268multiple spaces are skipped because of our $delim
269
270=item 2Z<>
271
272use of quotes to include a space in a word
273
274=item 3Z<>
275
276use of a backslash to include a space in a word
277
278=item 4Z<>
279
280use of a backslash to remove the special meaning of a double-quote
281
282=item 5Z<>
283
284another simple word (note the lack of effect of the
285backslashed double-quote)
286
287=back
288
289Replacing C<quotewords('\s+', 0, q{this   is...})>
290with C<shellwords(q{this   is...})>
291is a simpler way to accomplish the same thing.
292
293=head1 SEE ALSO
294
295L<Text::CSV> - for parsing CSV files
296
297=head1 AUTHORS
298
299The original author is unknown,
300but presumably this evolved from C<shellwords.pl> in Perl 4.
301
302Much of the code for C<parse_line()>
303(including the primary regexp)
304came from Joerk Behrends E<lt>jbehrends@multimediaproduzenten.deE<gt>.
305
306Examples section and other documentation provided by
307John Heidemann E<lt>johnh@ISI.EDUE<gt>.
308
309Hal Pomeranz E<lt>pomeranz@netcom.comE<gt>
310maintained this from 1994 through 1999,
311and did the first CPAN release.
312
313Alexandr Ciornii E<lt>alexchornyATgmail.comE<gt>
314maintained this from 2008 to 2015.
315
316Many other people have contributed,
317with special thanks due to
318Michael Schwern E<lt>schwern@envirolink.orgE<gt>
319and
320Jeff Friedl E<lt>jfriedl@yahoo-inc.comE<gt>.
321
322=head1 COPYRIGHT AND LICENSE
323
324This library is free software; you may redistribute and/or modify it
325under the same terms as Perl itself.
326
327=cut
328