1use strict;
2use warnings;
3use 5.010;
4
5package Email::Address::List;
6
7our $VERSION = '0.06';
8use Email::Address;
9
10=head1 NAME
11
12Email::Address::List - RFC close address list parsing
13
14=head1 SYNOPSIS
15
16    use Email::Address::List;
17
18    my $header = <<'END';
19    Foo Bar <simple@example.com>, (an obsolete comment),,,
20     a group:
21      a . weird . address @
22      for-real .biz
23     ; invalid thingy, <
24     more@example.com
25     >
26    END
27
28    my @list = Email::Address::List->parse($header);
29    foreach my $e ( @list ) {
30        if ($e->{'type'} eq 'mailbox') {
31            print "an address: ", $e->{'value'}->format ,"\n";
32        }
33        else {
34            print $e->{'type'}, "\n"
35        }
36    }
37
38    # prints:
39    # an address: "Foo Bar" <simple@example.com>
40    # comment
41    # group start
42    # an address: a.weird.address@forreal.biz
43    # group end
44    # unknown
45    # an address: more@example.com
46
47=head1 DESCRIPTION
48
49Parser for From, To, Cc, Bcc, Reply-To, Sender and
50previous prefixed with Resent- (eg Resent-From) headers.
51
52=head1 REASONING
53
54L<Email::Address> is good at parsing addresses out of any text
55even mentioned headers and this module is derived work
56from Email::Address.
57
58However, mentioned headers are structured and contain lists
59of addresses. Most of the time you want to parse such field
60from start to end keeping everything even if it's an invalid
61input.
62
63=head1 METHODS
64
65=head2 parse
66
67A class method that takes a header value (w/o name and :) and
68a set of named options, for example:
69
70    my @list = Email::Address::List->parse( $line, option => 1 );
71
72Returns list of hashes. Each hash at least has 'type' key that
73describes the entry. Types:
74
75=over 4
76
77=item mailbox
78
79A mailbox entry with L<Email::Address> object under value key.
80
81If mailbox has obsolete parts then 'obsolete' is true.
82
83If address (not display-name/phrase or comments, but
84local-part@domain) contains not ASCII chars then 'not_ascii' is
85set to true. According to RFC 5322 not ASCII chars are not
86allowed within mailbox. However, there are no big problems if
87those are used and actually RFC 6532 extends a few rules
88from 5322 with UTF8-non-ascii. Either use the feature or just
89skip such addresses with skip_not_ascii option.
90
91=item group start
92
93Some headers with mailboxes may contain groupped addresses. This
94element is returned for position where group starts. Under value
95key you find name of the group. B<NOTE> that value is not post
96processed at the moment, so it may contain spaces, comments,
97quoted strings and other noise. Author willing to take patches
98and warns that this will be changed at some point without additional
99notifications, so if you need groups info then you better send a
100patch :)
101
102Groups can not be nested, but one field may have multiple groups or
103mix of addresses that are in a group and not in any.
104
105See skip_groups option.
106
107=item group end
108
109Returned when a group ends.
110
111=item comment
112
113Obsolete syntax allows one to use standalone comments between mailboxes
114that can not be addressed to any mailbox. In such situations a comment
115returned as an entry of this type. Comment itself is under value.
116
117=item unknown
118
119Returned if parser met something that shouldn't be there. Parser
120tries to recover by jumping over to next comma (or semicolon if inside
121group) that is out quoted string or comment, so "foo, bar, baz" string
122results in three unknown entries. Jumping over comments and quoted strings
123means that parser is very sensitive to unbalanced quotes and parens,
124but it's on purpose.
125
126=back
127
128It can be controlled which elements are skipped, for example:
129
130    Email::Address::List->parse($line, skip_unknown => 1, ...);
131
132=over 4
133
134=item skip_comments
135
136Skips comments between mailboxes. Comments inside and next to a mailbox
137are not skipped, but returned as part of mailbox entry.
138
139=item skip_not_ascii
140
141Skips mailboxes where address part has not ASCII characters.
142
143=item skip_groups
144
145Skips group starts and end elements, however emails within groups are
146still returned.
147
148=item skip_unknown
149
150Skip anything that is not recognizable. It still tries to recover as
151described earlier.
152
153=back
154
155=cut
156
157#   mailbox         =   name-addr / addr-spec
158#   display-name    =   phrase
159#
160#   from            =   "From:" mailbox-list CRLF
161#   sender          =   "Sender:" mailbox CRLF
162#   reply-to        =   "Reply-To:" address-list CRLF
163#
164#   to              =   "To:" address-list CRLF
165#   cc              =   "Cc:" address-list CRLF
166#   bcc             =   "Bcc:" [address-list / CFWS] CRLF
167#
168#   resent-from     =   "Resent-From:" mailbox-list CRLF
169#   resent-sender   =   "Resent-Sender:" mailbox CRLF
170#   resent-to       =   "Resent-To:" address-list CRLF
171#   resent-cc       =   "Resent-Cc:" address-list CRLF
172#   resent-bcc      =   "Resent-Bcc:" [address-list / CFWS] CRLF
173#
174#   obs-from        =   "From" *WSP ":" mailbox-list CRLF
175#   obs-sender      =   "Sender" *WSP ":" mailbox CRLF
176#   obs-reply-to    =   "Reply-To" *WSP ":" address-list CRLF
177#
178#   obs-to          =   "To" *WSP ":" address-list CRLF
179#   obs-cc          =   "Cc" *WSP ":" address-list CRLF
180#   obs-bcc         =   "Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
181#
182#   obs-resent-from =   "Resent-From" *WSP ":" mailbox-list CRLF
183#   obs-resent-send =   "Resent-Sender" *WSP ":" mailbox CRLF
184#   obs-resent-date =   "Resent-Date" *WSP ":" date-time CRLF
185#   obs-resent-to   =   "Resent-To" *WSP ":" address-list CRLF
186#   obs-resent-cc   =   "Resent-Cc" *WSP ":" address-list CRLF
187#   obs-resent-bcc  =   "Resent-Bcc" *WSP ":" (address-list / (*([CFWS] ",") [CFWS])) CRLF
188#   obs-resent-mid  =   "Resent-Message-ID" *WSP ":" msg-id CRLF
189#   obs-resent-rply =   "Resent-Reply-To" *WSP ":" address-list CRLF
190
191our $COMMENT_NEST_LEVEL ||= 2;
192
193our %RE;
194our %CRE;
195
196$RE{'CTL'}            = q{\x00-\x1F\x7F};
197$RE{'special'}        = q{()<>\\[\\]:;@\\\\,."};
198
199$RE{'text'}           = qr/[^\x0A\x0D]/;
200
201$RE{'quoted_pair'}    = qr/\\$RE{'text'}/;
202
203$RE{'atext'}          = qr/[^$RE{'CTL'}$RE{'special'}\s]/;
204$RE{'ctext'}          = qr/[^()\\]++/;
205$RE{'qtext'}          = qr/[^\\"]/;
206$RE{'dtext'}          = qr/[^\[\]\\]/;
207
208($RE{'ccontent'}, $RE{'comment'}) = (q{})x2;
209for (1 .. $COMMENT_NEST_LEVEL) {
210  $RE{'ccontent'} = qr/$RE{'ctext'}|$RE{'quoted_pair'}|$RE{'comment'}/;
211  $RE{'comment'}  = qr/(?>\s*+\((?:\s*+$RE{'ccontent'})*+\s*+\)\s*+)/;
212}
213$RE{'cfws'}           = qr/$RE{'comment'}++|\s*+/;
214
215$RE{'qcontent'}       = qr/$RE{'qtext'}|$RE{'quoted_pair'}/;
216$RE{'quoted-string'}  = qr/$RE{'cfws'}"$RE{'qcontent'}*+"$RE{'cfws'}/;
217
218$RE{'atom'}           = qr/$RE{'cfws'}$RE{'atext'}++$RE{'cfws'}/;
219
220$RE{'word'}           = qr/$RE{'atom'} | $RE{'quoted-string'}/x;
221$RE{'phrase'}         = qr/$RE{'word'}+/x;
222$RE{'display-name'}   = $RE{'phrase'};
223
224$RE{'dot_atom_text'}  = qr/$RE{'atext'}++(?:\.$RE{'atext'}++)*/;
225$RE{'dot_atom'}       = qr/$RE{'cfws'}$RE{'dot_atom_text'}$RE{'cfws'}/;
226$RE{'local-part'}     = qr/$RE{'dot_atom'}|$RE{'quoted-string'}/;
227
228$RE{'dcontent'}       = qr/$RE{'dtext'}|$RE{'quoted_pair'}/;
229$RE{'domain_literal'} = qr/$RE{'cfws'}\[(?:\s*$RE{'dcontent'})*\s*\]$RE{'cfws'}/;
230$RE{'domain'}         = qr/$RE{'dot_atom'}|$RE{'domain_literal'}/;
231
232$RE{'addr-spec'}      = qr/$RE{'local-part'}\@$RE{'domain'}/;
233$RE{'angle-addr'}     = qr/$RE{'cfws'} < $RE{'addr-spec'} > $RE{'cfws'}/x;
234
235$RE{'name-addr'}      = qr/$RE{'display-name'}?$RE{'angle-addr'}/;
236$RE{'mailbox'}        = qr/(?:$RE{'name-addr'}|$RE{'addr-spec'})$RE{'comment'}*/;
237
238$CRE{'addr-spec'}      = qr/($RE{'local-part'})\@($RE{'domain'})/;
239$CRE{'mailbox'} = qr/
240    (?:
241        ($RE{'display-name'})?($RE{'cfws'})<$CRE{'addr-spec'}>($RE{'cfws'})
242        |$CRE{'addr-spec'}
243    )($RE{'comment'}*+)
244/x;
245
246$RE{'dword'}            = qr/$RE{'cfws'} (?: $RE{'atom'} | \. | "$RE{'qcontent'}++" ) $RE{'cfws'}/x;
247$RE{'obs-phrase'}       = qr/$RE{'word'} $RE{'dword'}*+/x;
248$RE{'obs-display-name'} = $RE{'obs-phrase'};
249$RE{'obs-route'}        = qr/
250    (?:$RE{'cfws'}|,)*
251    \@$RE{'domain'}
252    (?:,$RE{'cfws'}?(?:\@$RE{'domain'})?)*
253    :
254/x;
255$RE{'obs-domain'}       = qr/$RE{'atom'}(?:\.$RE{'atom'})*|$RE{'domain_literal'}/;
256$RE{'obs-local-part'}   = qr/$RE{'word'}(?:\.$RE{'word'})*/;
257$RE{'obs-addr-spec'}    = qr/$RE{'obs-local-part'}\@$RE{'obs-domain'}/;
258$CRE{'obs-addr-spec'}   = qr/($RE{'obs-local-part'})\@($RE{'obs-domain'})/;
259$CRE{'obs-mailbox'} = qr/
260    (?:
261        ($RE{'obs-display-name'})?
262        ($RE{'cfws'})< $RE{'obs-route'}? $CRE{'obs-addr-spec'} >($RE{'cfws'})
263        |$CRE{'obs-addr-spec'}
264    )($RE{'comment'}*+)
265/x;
266
267sub parse {
268    my $self = shift;
269    my %args = @_%2? (line => @_) : @_;
270    my $line = delete $args{'line'};
271
272    my $in_group = 0;
273
274    my @res;
275    while ($line =~ /\S/) {
276        # in obs- case we have number of optional comments/spaces/
277        # address-list    =   (address *("," address)) / obs-addr-list
278        # obs-addr-list   =   *([CFWS] ",") address *("," [address / CFWS]))
279        if ( $line =~ s/^(?:($RE{'cfws'})?,)//o ) {
280            push @res, {type => 'comment', value => $1 }
281                if $1 && !$args{'skip_comments'} && $1 =~ /($RE{'comment'})/;
282            next;
283        }
284        $line =~ s/^\s+//o;
285
286        # now it's only comma separated address where address is:
287        # address         =   mailbox / group
288
289        # deal with groups
290        # group           =   display-name ":" [group-list] ";" [CFWS]
291        # group-list      =   mailbox-list / CFWS / obs-group-list
292        # obs-group-list  =   1*([CFWS] ",") [CFWS])
293        if ( !$in_group && $line =~ s/^($RE{'display-name'})://o ) {
294            push @res, {type => 'group start', value => $1 }
295                unless $args{'skip_groups'};
296            $in_group = 1; next;
297        }
298        if ( $in_group && $line =~ s/^;// ) {
299            push @res, {type => 'group end'} unless $args{'skip_groups'};
300            $in_group = 0; next;
301        }
302
303        # now we got rid of groups and cfws, 'address = mailbox'
304        # mailbox-list    =   (mailbox *("," mailbox)) / obs-mbox-list
305        # obs-mbox-list   =   *([CFWS] ",") mailbox *("," [mailbox / CFWS]))
306
307        # so address-list is now comma separated list of mailboxes:
308        # address-list    = (mailbox *("," mailbox))
309        my $obsolete = 0;
310        if ( $line =~ s/^($CRE{'mailbox'})($RE{cfws}*)(?=,|;|$)//o
311            || ($line =~ s/^($CRE{'obs-mailbox'})($RE{cfws}*)(?=,|;|$)//o and $obsolete = 1)
312        ) {
313            my ($original, $phrase, $user, $host, @comments) = $self->_process_mailbox(
314                $1,$2,$3,$4,$5,$6,$7,$8,$9
315            );
316            my $not_ascii = "$user\@$host" =~ /\P{ASCII}/? 1 : 0;
317            next if $not_ascii && $args{skip_not_ascii};
318
319            push @res, {
320                type => 'mailbox',
321                value => Email::Address->new(
322                    $phrase, "$user\@$host", join(' ', @comments),
323                    $original,
324                ),
325                obsolete => $obsolete,
326                not_ascii => $not_ascii,
327            };
328            next;
329        }
330
331        # if we got here then something unknown on our way
332        # try to recorver
333        if ($in_group) {
334            if ( $line =~ s/^([^;,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^;,"\)]*+)*+)(?=;|,)//o ) {
335                push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
336                next;
337            }
338        } else {
339            if ( $line =~ s/^([^,"\)]*+(?:(?:$RE{'quoted-string'}|$RE{'comment'})[^,"\)]*+)*+)(?=,)//o ) {
340                push @res, { type => 'unknown', value => $1 } unless $args{'skip_unknown'};
341                next;
342            }
343        }
344        push @res, { type => 'unknown', value => $line } unless $args{'skip_unknown'};
345        last;
346    }
347    return @res;
348}
349
350my $dequote = sub {
351    local $_ = shift;
352    s/^"//; s/"$//; s/\\(.)/$1/g;
353    return "$_";
354};
355my $quote = sub {
356    local $_ = shift;
357    s/([\\"])/\\$1/g;
358    return qq{"$_"};
359};
360
361sub _process_mailbox {
362    my $self = shift;
363    my $original = shift;
364    my @rest = (@_);
365
366    my @comments;
367    foreach ( grep defined, splice @rest ) {
368        s{ ($RE{'quoted-string'}) | ($RE{comment}) }
369         { $1? $1 : do { push @comments, $2; $comments[-1] =~ /^\s|\s$/? ' ' : '' } }xgoe;
370        s/^\s+//; s/\s+$//;
371        next unless length;
372
373        push @rest, $_;
374    }
375    my ($host, $user, $phrase) = reverse @rest;
376
377    # deal with spaces out of quoted strings
378    s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : ' ' }xgoe
379        foreach grep defined, $phrase;
380    s{ ($RE{'quoted-string'}) | \s+ }{ $1? $1 : '' }xgoe
381        foreach $user, $host;
382
383    # dequote
384    s{ ($RE{'quoted-string'}) }{ $dequote->($1) }xgoe
385        foreach grep defined, $phrase, $user;
386    $user = $quote->($user) unless $user =~ /^$RE{'dot_atom'}$/;
387
388    @comments = grep length, map { s/^\s+//; s/\s+$//; $_ } grep defined, @comments;
389    return $original, $phrase, $user, $host, @comments;
390}
391
392
393=head1 AUTHOR
394
395Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>
396
397=head1 LICENSE
398
399Under the same terms as Perl itself.
400
401=cut
402
4031;
404