1package Text::ParseWords; 2 3use strict; 4use warnings; 5require 5.006; 6our $VERSION = "3.31"; 7 8 9use Exporter; 10our @ISA = qw(Exporter); 11our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line); 12our @EXPORT_OK = qw(old_shellwords); 13our $PERL_SINGLE_QUOTE; 14 15 16sub shellwords { 17 my (@lines) = @_; 18 my @allwords; 19 20 foreach my $line (@lines) { 21 $line =~ s/^\s+//; 22 my @words = parse_line('\s+', 0, $line); 23 pop @words if (@words and !defined $words[-1]); 24 return() unless (@words || !length($line)); 25 push(@allwords, @words); 26 } 27 return(@allwords); 28} 29 30 31 32sub quotewords { 33 my($delim, $keep, @lines) = @_; 34 my($line, @words, @allwords); 35 36 foreach $line (@lines) { 37 @words = parse_line($delim, $keep, $line); 38 return() unless (@words || !length($line)); 39 push(@allwords, @words); 40 } 41 return(@allwords); 42} 43 44 45 46sub nested_quotewords { 47 my($delim, $keep, @lines) = @_; 48 my($i, @allwords); 49 50 for ($i = 0; $i < @lines; $i++) { 51 @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); 52 return() unless (@{$allwords[$i]} || !length($lines[$i])); 53 } 54 return(@allwords); 55} 56 57 58 59sub parse_line { 60 my($delimiter, $keep, $line) = @_; 61 my($word, @pieces); 62 63 no warnings 'uninitialized'; # we will be testing undef strings 64 65 while (length($line)) { 66 # This pattern is optimised to be stack conservative on older perls. 67 # Do not refactor without being careful and testing it on very long strings. 68 # See Perl bug #42980 for an example of a stack busting input. 69 $line =~ s/^ 70 (?: 71 # double quoted string 72 (") # $quote 73 ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted 74 | # --OR-- 75 # singe quoted string 76 (') # $quote 77 ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted 78 | # --OR-- 79 # unquoted string 80 ( # $unquoted 81 (?:\\.|[^\\"'])*? 82 ) 83 # followed by 84 ( # $delim 85 \Z(?!\n) # EOL 86 | # --OR-- 87 (?-x:$delimiter) # delimiter 88 | # --OR-- 89 (?!^)(?=["']) # a quote 90 ) 91 )//xs or return; # extended layout 92 my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6); 93 94 95 return() unless( defined($quote) || length($unquoted) || length($delim)); 96 97 if ($keep) { 98 $quoted = "$quote$quoted$quote"; 99 } 100 else { 101 $unquoted =~ s/\\(.)/$1/sg; 102 if (defined $quote) { 103 $quoted =~ s/\\(.)/$1/sg if ($quote eq '"'); 104 $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); 105 } 106 } 107 $word .= substr($line, 0, 0); # leave results tainted 108 $word .= defined $quote ? $quoted : $unquoted; 109 110 if (length($delim)) { 111 push(@pieces, $word); 112 push(@pieces, $delim) if ($keep eq 'delimiters'); 113 undef $word; 114 } 115 if (!length($line)) { 116 push(@pieces, $word); 117 } 118 } 119 return(@pieces); 120} 121 122 123 124sub old_shellwords { 125 126 # Usage: 127 # use ParseWords; 128 # @words = old_shellwords($line); 129 # or 130 # @words = old_shellwords(@lines); 131 # or 132 # @words = old_shellwords(); # defaults to $_ (and clobbers it) 133 134 no warnings 'uninitialized'; # we will be testing undef strings 135 local *_ = \join('', @_) if @_; 136 my (@words, $snippet); 137 138 s/\A\s+//; 139 while ($_ ne '') { 140 my $field = substr($_, 0, 0); # leave results tainted 141 for (;;) { 142 if (s/\A"(([^"\\]|\\.)*)"//s) { 143 ($snippet = $1) =~ s#\\(.)#$1#sg; 144 } 145 elsif (/\A"/) { 146 require Carp; 147 Carp::carp("Unmatched double quote: $_"); 148 return(); 149 } 150 elsif (s/\A'(([^'\\]|\\.)*)'//s) { 151 ($snippet = $1) =~ s#\\(.)#$1#sg; 152 } 153 elsif (/\A'/) { 154 require Carp; 155 Carp::carp("Unmatched single quote: $_"); 156 return(); 157 } 158 elsif (s/\A\\(.?)//s) { 159 $snippet = $1; 160 } 161 elsif (s/\A([^\s\\'"]+)//) { 162 $snippet = $1; 163 } 164 else { 165 s/\A\s+//; 166 last; 167 } 168 $field .= $snippet; 169 } 170 push(@words, $field); 171 } 172 return @words; 173} 174 1751; 176 177__END__ 178 179=head1 NAME 180 181Text::ParseWords - parse text into an array of tokens or array of arrays 182 183=head1 SYNOPSIS 184 185 use Text::ParseWords; 186 @lists = nested_quotewords($delim, $keep, @lines); 187 @words = quotewords($delim, $keep, @lines); 188 @words = shellwords(@lines); 189 @words = parse_line($delim, $keep, $line); 190 @words = old_shellwords(@lines); # DEPRECATED! 191 192=head1 DESCRIPTION 193 194The C<nested_quotewords()> and C<quotewords()> functions accept a delimiter 195(which can be a regular expression) 196and a list of lines and then breaks those lines up into a list of 197words ignoring delimiters that appear inside quotes. C<quotewords()> 198returns all of the tokens in a single long list, while C<nested_quotewords()> 199returns a list of token lists corresponding to the elements of C<@lines>. 200C<parse_line()> does tokenizing on a single string. The C<*quotewords()> 201functions simply call C<parse_line()>, so if you're only splitting 202one line you can call C<parse_line()> directly and save a function 203call. 204 205The C<$keep> controls what happens with delimters and special characters: 206 207=over 4 208 209=item true 210 211If true, then the tokens are split on the specified delimiter, 212but all other characters (including quotes and backslashes) 213are kept in the tokens. 214 215=item false 216 217If $keep is false then the C<*quotewords()> functions 218remove all quotes and backslashes that are 219not themselves backslash-escaped or inside of single quotes (i.e., 220C<quotewords()> tries to interpret these characters just like the Bourne 221shell). NB: these semantics are significantly different from the 222original version of this module shipped with Perl 5.000 through 5.004. 223 224=item C<"delimiters"> 225 226As an additional feature, $keep may be the keyword "delimiters" which 227causes the functions to preserve the delimiters in each string as 228tokens in the token lists, in addition to preserving quote and 229backslash characters. 230 231=back 232 233C<shellwords()> is written as a special case of C<quotewords()>, and it 234does token parsing with whitespace as a delimiter-- similar to most 235Unix shells. 236 237=head1 EXAMPLES 238 239The sample program: 240 241 use Text::ParseWords; 242 @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); 243 $i = 0; 244 foreach (@words) { 245 print "$i: <$_>\n"; 246 $i++; 247 } 248 249produces: 250 251 0: <this> 252 1: <is> 253 2: <a test> 254 3: <of quotewords> 255 4: <"for> 256 5: <you> 257 258demonstrating: 259 260=over 4 261 262=item 0Z<> 263 264a simple word 265 266=item 1Z<> 267 268multiple spaces are skipped because of our $delim 269 270=item 2Z<> 271 272use of quotes to include a space in a word 273 274=item 3Z<> 275 276use of a backslash to include a space in a word 277 278=item 4Z<> 279 280use of a backslash to remove the special meaning of a double-quote 281 282=item 5Z<> 283 284another simple word (note the lack of effect of the 285backslashed double-quote) 286 287=back 288 289Replacing C<quotewords('\s+', 0, q{this is...})> 290with C<shellwords(q{this is...})> 291is a simpler way to accomplish the same thing. 292 293=head1 SEE ALSO 294 295L<Text::CSV> - for parsing CSV files 296 297=head1 AUTHORS 298 299The original author is unknown, 300but presumably this evolved from C<shellwords.pl> in Perl 4. 301 302Much of the code for C<parse_line()> 303(including the primary regexp) 304came from Joerk Behrends E<lt>jbehrends@multimediaproduzenten.deE<gt>. 305 306Examples section and other documentation provided by 307John Heidemann E<lt>johnh@ISI.EDUE<gt>. 308 309Hal Pomeranz E<lt>pomeranz@netcom.comE<gt> 310maintained this from 1994 through 1999, 311and did the first CPAN release. 312 313Alexandr Ciornii E<lt>alexchornyATgmail.comE<gt> 314maintained this from 2008 to 2015. 315 316Many other people have contributed, 317with special thanks due to 318Michael Schwern E<lt>schwern@envirolink.orgE<gt> 319and 320Jeff Friedl E<lt>jfriedl@yahoo-inc.comE<gt>. 321 322=head1 COPYRIGHT AND LICENSE 323 324This library is free software; you may redistribute and/or modify it 325under the same terms as Perl itself. 326 327=cut 328