1use strict; 2use warnings; 3use 5.010; 4 5package Email::Address::List; 6 7our $VERSION = '0.06'; 8use Email::Address; 9 10=head1 NAME 11 12Email::Address::List - RFC close address list parsing 13 14=head1 SYNOPSIS 15 16 use Email::Address::List; 17 18 my $header = <<'END'; 19 Foo Bar <simple@example.com>, (an obsolete comment),,, 20 a group: 21 a . weird . address @ 22 for-real .biz 23 ; invalid thingy, < 24 more@example.com 25 > 26 END 27 28 my @list = Email::Address::List->parse($header); 29 foreach my $e ( @list ) { 30 if ($e->{'type'} eq 'mailbox') { 31 print "an address: ", $e->{'value'}->format ,"\n"; 32 } 33 else { 34 print $e->{'type'}, "\n" 35 } 36 } 37 38 # prints: 39 # an address: "Foo Bar" <simple@example.com> 40 # comment 41 # group start 42 # an address: a.weird.address@forreal.biz 43 # group end 44 # unknown 45 # an address: more@example.com 46 47=head1 DESCRIPTION 48 49Parser for From, To, Cc, Bcc, Reply-To, Sender and 50previous prefixed with Resent- (eg Resent-From) headers. 51 52=head1 REASONING 53 54L<Email::Address> is good at parsing addresses out of any text 55even mentioned headers and this module is derived work 56from Email::Address. 57 58However, mentioned headers are structured and contain lists 59of addresses. Most of the time you want to parse such field 60from start to end keeping everything even if it's an invalid 61input. 62 63=head1 METHODS 64 65=head2 parse 66 67A class method that takes a header value (w/o name and :) and 68a set of named options, for example: 69 70 my @list = Email::Address::List->parse( $line, option => 1 ); 71 72Returns list of hashes. Each hash at least has 'type' key that 73describes the entry. Types: 74 75=over 4 76 77=item mailbox 78 79A mailbox entry with L<Email::Address> object under value key. 80 81If mailbox has obsolete parts then 'obsolete' is true. 82 83If address (not display-name/phrase or comments, but 84local-part@domain) contains not ASCII chars then 'not_ascii' is 85set to true. According to RFC 5322 not ASCII chars are not 86allowed within mailbox. However, there are no big problems if 87those are used and actually RFC 6532 extends a few rules 88from 5322 with UTF8-non-ascii. Either use the feature or just 89skip such addresses with skip_not_ascii option. 90 91=item group start 92 93Some headers with mailboxes may contain groupped addresses. This 94element is returned for position where group starts. Under value 95key you find name of the group. B<NOTE> that value is not post 96processed at the moment, so it may contain spaces, comments, 97quoted strings and other noise. Author willing to take patches 98and warns that this will be changed at some point without additional 99notifications, so if you need groups info then you better send a 100patch :) 101 102Groups can not be nested, but one field may have multiple groups or 103mix of addresses that are in a group and not in any. 104 105See skip_groups option. 106 107=item group end 108 109Returned when a group ends. 110 111=item comment 112 113Obsolete syntax allows one to use standalone comments between mailboxes 114that can not be addressed to any mailbox. In such situations a comment 115returned as an entry of this type. Comment itself is under value. 116 117=item unknown 118 119Returned if parser met something that shouldn't be there. Parser 120tries to recover by jumping over to next comma (or semicolon if inside 121group) that is out quoted string or comment, so "foo, bar, baz" string 122results in three unknown entries. Jumping over comments and quoted strings 123means that parser is very sensitive to unbalanced quotes and parens, 124but it's on purpose. 125 126=back 127 128It can be controlled which elements are skipped, for example: 129 130 Email::Address::List->parse($line, skip_unknown => 1, ...); 131 132=over 4 133 134=item skip_comments 135 136Skips comments between mailboxes. Comments inside and next to a mailbox 137are not skipped, but returned as part of mailbox entry. 138 139=item skip_not_ascii 140 141Skips mailboxes where address part has not ASCII characters. 142 143=item skip_groups 144 145Skips group starts and end elements, however emails within groups are 146still returned. 147 148=item skip_unknown 149 150Skip anything that is not recognizable. It still tries to recover as 151described earlier. 152 153=back 154 155=cut 156 157# mailbox = name-addr / addr-spec 158# display-name = phrase 159# 160# from = "From:" mailbox-list CRLF 161# sender = "Sender:" mailbox CRLF 162# reply-to = "Reply-To:" address-list CRLF 163# 164# to = "To:" address-list CRLF 165# cc = "Cc:" address-list CRLF 166# bcc = "Bcc:" [address-list / CFWS] CRLF 167# 168# resent-from = "Resent-From:" mailbox-list CRLF 169# resent-sender = "Resent-Sender:" mailbox CRLF 170# resent-to = "Resent-To:" address-list CRLF 171# resent-cc = "Resent-Cc:" address-list CRLF 172# resent-bcc = "Resent-Bcc:" [address-list / CFWS] CRLF 173# 174# obs-from = "From" *WSP ":" mailbox-list CRLF 175# obs-sender = "Sender" *WSP ":" mailbox CRLF 176# obs-reply-to = "Reply-To" *WSP ":" address-list CRLF 177# 178# obs-to = "To" *WSP ":" address-list CRLF 179# obs-cc = "Cc" *WSP ":" address-list CRLF 180# obs-bcc = "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF 181# 182# obs-resent-from = "Resent-From" *WSP ":" mailbox-list CRLF 183# obs-resent-send = "Resent-Sender" *WSP ":" mailbox CRLF 184# obs-resent-date = "Resent-Date" *WSP ":" date-time CRLF 185# obs-resent-to = "Resent-To" *WSP ":" address-list CRLF 186# obs-resent-cc = "Resent-Cc" *WSP ":" address-list CRLF 187# obs-resent-bcc = "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF 188# obs-resent-mid = "Resent-Message-ID" *WSP ":" msg-id CRLF 189# obs-resent-rply = "Resent-Reply-To" *WSP ":" address-list CRLF 190 191our $COMMENT_NEST_LEVEL ||= 2; 192 193our %RE; 194our %CRE; 195 196$RE{'CTL'} = q{\x00-\x1F\x7F}; 197$RE{'special'} = q{()<>\\[\\]:;@\\\\,."}; 198 199$RE{'text'} = qr/[^\x0A\x0D]/; 200 201$RE{'quoted_pair'} = qr/\\$RE{'text'}/; 202 203$RE{'atext'} = qr/[^$RE{'CTL'}$RE{'special'}\s]/; 204$RE{'ctext'} = qr/[^()\\]++/; 205$RE{'qtext'} = qr/[^\\"]/; 206$RE{'dtext'} = qr/[^\[\]\\]/; 207 208($RE{'ccontent'}, $RE{'comment'}) = (q{})x2; 209for (1 .. $COMMENT_NEST_LEVEL) { 210 $RE{'ccontent'} = qr/$RE{'ctext'}|$RE{'quoted_pair'}|$RE{'comment'}/; 211 $RE{'comment'} = qr/(?>\s*+\((?:\s*+$RE{'ccontent'})*+\s*+\)\s*+)/; 212} 213$RE{'cfws'} = qr/$RE{'comment'}++|\s*+/; 214 215$RE{'qcontent'} = qr/$RE{'qtext'}|$RE{'quoted_pair'}/; 216$RE{'quoted-string'} = qr/$RE{'cfws'}"$RE{'qcontent'}*+"$RE{'cfws'}/; 217 218$RE{'atom'} = qr/$RE{'cfws'}$RE{'atext'}++$RE{'cfws'}/; 219 220$RE{'word'} = qr/$RE{'atom'} | $RE{'quoted-string'}/x; 221$RE{'phrase'} = qr/$RE{'word'}+/x; 222$RE{'display-name'} = $RE{'phrase'}; 223 224$RE{'dot_atom_text'} = qr/$RE{'atext'}++(?:\.$RE{'atext'}++)*/; 225$RE{'dot_atom'} = qr/$RE{'cfws'}$RE{'dot_atom_text'}$RE{'cfws'}/; 226$RE{'local-part'} = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/; 227 228$RE{'dcontent'} = qr/$RE{'dtext'}|$RE{'quoted_pair'}/; 229$RE{'domain_literal'} = qr/$RE{'cfws'}\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}/; 230$RE{'domain'} = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/; 231 232$RE{'addr-spec'} = qr/$RE{'local-part'}\@$RE{'domain'}/; 233$RE{'angle-addr'} = qr/$RE{'cfws'} < $RE{'addr-spec'} > $RE{'cfws'}/x; 234 235$RE{'name-addr'} = qr/$RE{'display-name'}?$RE{'angle-addr'}/; 236$RE{'mailbox'} = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/; 237 238$CRE{'addr-spec'} = qr/($RE{'local-part'})\@($RE{'domain'})/; 239$CRE{'mailbox'} = qr/ 240 (?: 241 ($RE{'display-name'})?($RE{'cfws'})<$CRE{'addr-spec'}>($RE{'cfws'}) 242 |$CRE{'addr-spec'} 243 )($RE{'comment'}*+) 244/x; 245 246$RE{'dword'} = qr/$RE{'cfws'} (?: $RE{'atom'} | \. | "$RE{'qcontent'}++" ) $RE{'cfws'}/x; 247$RE{'obs-phrase'} = qr/$RE{'word'} $RE{'dword'}*+/x; 248$RE{'obs-display-name'} = $RE{'obs-phrase'}; 249$RE{'obs-route'} = qr/ 250 (?:$RE{'cfws'}|,)* 251 \@$RE{'domain'} 252 (?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)* 253 : 254/x; 255$RE{'obs-domain'} = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/; 256$RE{'obs-local-part'} = qr/$RE{'word'}(?:\.$RE{'word'})*/; 257$RE{'obs-addr-spec'} = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/; 258$CRE{'obs-addr-spec'} = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/; 259$CRE{'obs-mailbox'} = qr/ 260 (?: 261 ($RE{'obs-display-name'})? 262 ($RE{'cfws'})< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'}) 263 |$CRE{'obs-addr-spec'} 264 )($RE{'comment'}*+) 265/x; 266 267sub parse { 268 my $self = shift; 269 my %args = @_%2? (line => @_) : @_; 270 my $line = delete $args{'line'}; 271 272 my $in_group = 0; 273 274 my @res; 275 while ($line =~ /\S/) { 276 # in obs- case we have number of optional comments/spaces/ 277 # address-list = (address *("," address)) / obs-addr-list 278 # obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])) 279 if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) { 280 push @res, {type => 'comment', value => $1 } 281 if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/; 282 next; 283 } 284 $line =~ s/^\s+//o; 285 286 # now it's only comma separated address where address is: 287 # address = mailbox / group 288 289 # deal with groups 290 # group = display-name ":" [group-list] ";" [CFWS] 291 # group-list = mailbox-list / CFWS / obs-group-list 292 # obs-group-list = 1*([CFWS] ",") [CFWS]) 293 if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) { 294 push @res, {type => 'group start', value => $1 } 295 unless $args{'skip_groups'}; 296 $in_group = 1; next; 297 } 298 if ( $in_group && $line =~ s/^;// ) { 299 push @res, {type => 'group end'} unless $args{'skip_groups'}; 300 $in_group = 0; next; 301 } 302 303 # now we got rid of groups and cfws, 'address = mailbox' 304 # mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list 305 # obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])) 306 307 # so address-list is now comma separated list of mailboxes: 308 # address-list = (mailbox *("," mailbox)) 309 my $obsolete = 0; 310 if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o 311 || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1) 312 ) { 313 my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox( 314 $1,$2,$3,$4,$5,$6,$7,$8,$9 315 ); 316 my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0; 317 next if $not_ascii && $args{skip_not_ascii}; 318 319 push @res, { 320 type => 'mailbox', 321 value => Email::Address->new( 322 $phrase, "$user\@$host", join(' ', @comments), 323 $original, 324 ), 325 obsolete => $obsolete, 326 not_ascii => $not_ascii, 327 }; 328 next; 329 } 330 331 # if we got here then something unknown on our way 332 # try to recorver 333 if ($in_group) { 334 if ( $line =~ s/^([^;,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*+)*+)(?=;|,)//o ) { 335 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'}; 336 next; 337 } 338 } else { 339 if ( $line =~ s/^([^,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*+)*+)(?=,)//o ) { 340 push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'}; 341 next; 342 } 343 } 344 push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'}; 345 last; 346 } 347 return @res; 348} 349 350my $dequote = sub { 351 local $_ = shift; 352 s/^"//; s/"$//; s/\\(.)/$1/g; 353 return "$_"; 354}; 355my $quote = sub { 356 local $_ = shift; 357 s/([\\"])/\\$1/g; 358 return qq{"$_"}; 359}; 360 361sub _process_mailbox { 362 my $self = shift; 363 my $original = shift; 364 my @rest = (@_); 365 366 my @comments; 367 foreach ( grep defined, splice @rest ) { 368 s{ ($RE{'quoted-string'}) | ($RE{comment}) } 369 { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe; 370 s/^\s+//; s/\s+$//; 371 next unless length; 372 373 push @rest, $_; 374 } 375 my ($host, $user, $phrase) = reverse @rest; 376 377 # deal with spaces out of quoted strings 378 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe 379 foreach grep defined, $phrase; 380 s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe 381 foreach $user, $host; 382 383 # dequote 384 s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe 385 foreach grep defined, $phrase, $user; 386 $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/; 387 388 @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments; 389 return $original, $phrase, $user, $host, @comments; 390} 391 392 393=head1 AUTHOR 394 395Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt> 396 397=head1 LICENSE 398 399Under the same terms as Perl itself. 400 401=cut 402 4031; 404