1"""Recognizes simple heuristically delimited bounces."""
2
3import re
4
5from email.iterators import body_line_iterator
6from email.quoprimime import unquote
7from enum import Enum
8from flufl.bounce.interfaces import IBounceDetector, NoTemporaryFailures
9from public import public
10from zope.interface import implementer
11
12
class ParseState(Enum):
    """States of the line scanner used while searching for bounce addresses."""

    # No start-of-block line has been matched yet.
    start = 0
    # A start line has matched; address lines are being collected.
    tag_seen = 1
16
17
def _unquote_match(match):
    """Return the unquoted form of the text matched by `match`.

    Intended as the replacement callback for `re.sub()`: the whole matched
    text (group 0) is handed to `email.quoprimime.unquote()`.
    """
    escape = match.group()
    return unquote(escape)
20
21
def _quopri_decode(address):
    """Decode quoted-printable escapes in `address` and return it as bytes.

    Some addresses come back with quopri encoded spaces.  Every ``=XX``
    escape (two hex digits) is decoded, and surrounding whitespace is then
    stripped from the result.

    The undocumented email.quoprimime.header_decode() can't be used here
    because it also turns underscores into spaces, which is not good for
    us.  The equally undocumented email.quoprimime.unquote() is applied to
    each escape instead.

    For compatibility with Python 3, the API requires byte addresses, so
    the decoded text is encoded as us-ascii before it is returned.
    """
    decoded = re.sub('=[a-fA-F0-9]{2}', _unquote_match, address)
    as_bytes = decoded.encode('us-ascii')
    return as_bytes.strip()
32
33
def _c(pattern):
    """Compile `pattern` as a case-insensitive regular expression."""
    compiled = re.compile(pattern, flags=re.IGNORECASE)
    return compiled
36
37
# This is a list of tuples of the form
#
#     (start cre, end cre, address cre)
#
# where 'cre' means compiled regular expression, start is the line just before
# the bouncing address block, end is the line just after the bouncing address
# block, and address cre is the regexp that will recognize the addresses.  It
# must have a group called 'addr' which will contain exactly and only the
# address that bounced.
#
# Every pattern that contains a backslash is written as a raw string so that
# regexp escapes such as \s and \( reach the re module untouched instead of
# being parsed as (invalid) Python string escape sequences.
PATTERNS = [
    # sdm.de
    (_c('here is your list of failed recipients'),
     _c('here is your returned mail'),
     _c(r'<(?P<addr>[^>]*)>')),
    # sz-sb.de, corridor.com, nfg.nl
    (_c('the following addresses had'),
     _c('transcript of session follows'),
     _c(r'^ *(\(expanded from: )?<?(?P<addr>[^\s@]+@[^\s@>]+?)>?\)?\s*$')),
    # robanal.demon.co.uk
    (_c('this message was created automatically by mail delivery software'),
     _c('original message follows'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # s1.com (InterScan E-Mail VirusWall NT ???)
    (_c('message from interscan e-mail viruswall nt'),
     _c('end of message'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # Smail
    (_c('failed addresses follow:'),
     _c('message text follows:'),
     _c(r'\s*(?P<addr>\S+@\S+)')),
    # newmail.ru
    (_c('This is the machine generated message from mail service.'),
     _c('--- Below the next line is a copy of the message.'),
     _c('<(?P<addr>[^>]*)>')),
    # turbosport.com runs something called `MDaemon 3.5.2' ???
    (_c('The following addresses did NOT receive a copy of your message:'),
     _c('--- Session Transcript ---'),
     _c(r'[>]\s*(?P<addr>.*)$')),
    # usa.net
    (_c(r'Intended recipient:\s*(?P<addr>.*)$'),
     _c('--------RETURNED MAIL FOLLOWS--------'),
     _c(r'Intended recipient:\s*(?P<addr>.*)$')),
    # hotpop.com
    (_c(r'Undeliverable Address:\s*(?P<addr>.*)$'),
     _c('Original message attached'),
     _c(r'Undeliverable Address:\s*(?P<addr>.*)$')),
    # Another demon.co.uk format
    (_c('This message was created automatically by mail delivery'),
     _c('^---- START OF RETURNED MESSAGE ----'),
     _c("addressed to '(?P<addr>[^']*)'")),
    # Prodigy.net full mailbox
    (_c("User's mailbox is full:"),
     _c('Unable to deliver mail.'),
     _c(r"User's mailbox is full:\s*<(?P<addr>[^>]*)>")),
    # Microsoft SMTPSVC
    (_c('The email below could not be delivered to the following user:'),
     _c('Old message:'),
     _c('<(?P<addr>[^>]*)>')),
    # Yahoo on behalf of other domains like sbcglobal.net
    (_c(r'Unable to deliver message to the following address\(es\)\.'),
     _c(r'--- Original message follows\.'),
     _c('<(?P<addr>[^>]*)>:')),
    # googlemail.com
    (_c('Delivery to the following recipient failed'),
     _c('----- Original message -----'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kundenserver.de
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c('<(?P<addr>[^>]*)>')),
    # another kundenserver.de
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@:]+):')),
    # thehartford.com / songbird
    (_c('Del(i|e)very to the following recipients (failed|was aborted)'),
     # this one may or may not have the original message, but there's nothing
     # unique to stop on, so stop on the first line of at least 3 characters
     # that doesn't start with 'D' (to not stop immediately) and has no '@'.
     # Also note that simple_30.txt contains an apparent misspelling in the
     # MTA's DSN section.
     _c('^[^D][^@]{2,}$'),
     _c(r'^[\s*]*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # and another thehartford.com/hartfordlife.com
    (_c(r'^Your message\s*$'),
     _c('^because:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kviv.be (InterScan NT)
    (_c('^Unable to deliver message to'),
     _c(r'\*+\s+End of message\s+\*+'),
     _c('<(?P<addr>[^>]*)>')),
    # earthlink.net supported domains
    (_c('^Sorry, unable to deliver your message to'),
     _c('^A copy of the original message'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s+')),
    # ademe.fr
    (_c('^A message could not be delivered to:'),
     _c('^Subject:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # andrew.ac.jp
    (_c('^Invalid final delivery userid:'),
     _c('^Original message follows.'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # E500_SMTP_Mail_Service@lerctr.org
    (_c('------ Failed Recipients ------'),
     _c('-------- Returned Mail --------'),
     _c('<(?P<addr>[^>]*)>')),
    # cynergycom.net
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c(r'(?P<addr>[^\s@]+@[^\s@)]+)')),
    # LSMTP for Windows
    (_c(r'^--> Error description:\s*$'),
     _c('^Error-End:'),
     _c(r'^Error-for:\s+(?P<addr>[^\s@]+@[^\s@]+)')),
    # Qmail with a tri-language intro beginning in spanish
    (_c('Your message could not be delivered'),
     _c('^-'),
     _c('<(?P<addr>[^>]*)>:')),
    # socgen.com
    (_c('Your message could not be delivered to'),
     _c(r'^\s*$'),
     _c(r'(?P<addr>[^\s@]+@[^\s@]+)')),
    # dadoservice.it
    (_c('Your message has encountered delivery problems'),
     _c('Your message reads'),
     _c(r'addressed to\s*(?P<addr>[^\s@]+@[^\s@)]+)')),
    # gomaps.com
    (_c('Did not reach the following recipient'),
     _c(r'^\s*$'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # EYOU MTA SYSTEM
    (_c('This is the deliver program at'),
     _c('^-'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@<>]+)')),
    # A non-standard qmail at ieo.it
    (_c('this is the email server at'),
     _c('^-'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)[\s,]')),
    # pla.net.py (MDaemon.PRO ?)
    (_c('- no such user here'),
     _c('There is no user'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@]+)\s')),
    # mxlogic.net
    (_c('The following address failed:'),
     _c('Included is a copy of the message header'),
     _c('<(?P<addr>[^>]+)>')),
    # fastdnsservers.com
    (_c(r'The following recipient\(s\) could not be reached'),
     _c(r'\s*Error Type'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@<>]+)')),
    # xxx.com (simple_36.txt)
    (_c('Could not deliver message to the following recipient'),
     _c(r'\s*-- The header'),
     _c(r'Failed Recipient: (?P<addr>[^\s@]+@[^\s@<>]+)')),
    # mta1.service.uci.edu
    (_c('Message not delivered to the following addresses'),
     _c('Error detail'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@)]+)')),
    # Dovecot LDA Over quota MDN (bogus - should be DSN).
    (_c('^Your message'),
     _c('^Reporting'),
     _c(r'Your message to <?(?P<addr>[^\s<@]+@[^\s@>]+)>? was automatically'
        ' rejected')),
    # mail.ru
    (_c('A message that you sent was rejected'),
     _c('This is a copy of your message'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # MailEnable
    (_c('Message could not be delivered to some recipients.'),
     _c('Message headers follow'),
     _c(r'Recipient: \[SMTP:(?P<addr>[^\s@]+@[^\s@]+)\]')),
    # Next one goes here...
    ]
212
213
@public
@implementer(IBounceDetector)
class SimpleMatch:
    """Recognizes simple heuristically delimited bounces."""

    # The (start cre, end cre, address cre) triples to try, in order.
    # process() reads this through self.
    PATTERNS = PATTERNS

    def process(self, msg):
        """See `IBounceDetector`.

        Scan the body lines of `msg` against each pattern triple in
        `self.PATTERNS`, in order, and stop at the first triple that yields
        at least one address.

        :param msg: The message to scan.  Its body lines are obtained with
            `email.iterators.body_line_iterator()`.
        :return: A 2-tuple of `NoTemporaryFailures` and the set of addresses
            found, each passed through `_quopri_decode()`.  The set is empty
            when no pattern triple produced an address.
        """
        addresses = set()
        # MAS: This is a mess. The outer loop used to be over the message
        # so we only looped through the message once.  Looping through the
        # lines for each set of patterns is obviously way more work, but
        # if we don't do it, problems arise because scre from the wrong
        # pattern set matches first and then acre doesn't match.  The
        # alternative is to split things into separate modules, but then
        # we process the message multiple times anyway.
        #
        # The body lines themselves don't change between pattern sets, so
        # extract them from the message once and rescan the cached list
        # rather than re-walking the message for every pattern set.
        lines = list(body_line_iterator(msg))
        for scre, ecre, acre in self.PATTERNS:
            state = ParseState.start
            for line in lines:
                if state is ParseState.start and scre.search(line):
                    state = ParseState.tag_seen
                # Deliberately `if` rather than `elif`: the line that matched
                # the start pattern is itself checked for an address.
                if state is ParseState.tag_seen:
                    mo = acre.search(line)
                    if mo:
                        address = mo.group('addr')
                        # Skip matches whose 'addr' group is empty.
                        if address:
                            addresses.add(_quopri_decode(address))
                    elif ecre.search(line):
                        # The end marker only counts on a line that holds no
                        # address.
                        break
            if addresses:
                # The first pattern set that finds anything wins.
                break
        return NoTemporaryFailures, addresses
248