"""Recognizes simple heuristically delimited bounces."""

import re

from email.iterators import body_line_iterator
from email.quoprimime import unquote
from enum import Enum
from flufl.bounce.interfaces import IBounceDetector, NoTemporaryFailures
from public import public
from zope.interface import implementer


class ParseState(Enum):
    """States of the line scanner used while looking for bounced addresses."""

    # The line that introduces the address block has not been seen yet.
    start = 0
    # The introductory line has been seen; address lines may follow.
    tag_seen = 1


def _unquote_match(match):
    """Return the decoded form of one matched ``=XX`` quopri escape."""
    escape = match.group(0)
    return unquote(escape)


def _quopri_decode(address):
    """Decode ``=XX`` escapes in `address` and return it as stripped bytes."""
    # Some addresses come back with quopri encoded spaces.  This will decode
    # them and strip the spaces.  We can't use the undocumented
    # email.quoprimime.header_decode() because that also turns underscores
    # into spaces, which is not good for us.  Instead we'll use the
    # undocumented email.quoprimime.unquote().
    decoded = re.sub('=[a-fA-F0-9]{2}', _unquote_match, address)
    # For compatibility with Python 3, the API requires byte addresses.
    as_bytes = decoded.encode('us-ascii')
    return as_bytes.strip()


def _c(pattern):
    """Compile `pattern` as a case-insensitive regular expression."""
    return re.compile(pattern, flags=re.IGNORECASE)


# This is a list of tuples of the form
#
#     (start cre, end cre, address cre)
#
# where 'cre' means compiled regular expression, start is the line just before
# the bouncing address block, end is the line just after the bouncing address
# block, and address cre is the regexp that will recognize the addresses.  It
# must have a group called 'addr' which will contain exactly and only the
# address that bounced.
# Every regular expression below that contains a backslash is written as a
# raw string, so that sequences such as \s, \( and \. reach the `re` module
# unchanged instead of being treated as (invalid) string escapes.
PATTERNS = [
    # sdm.de
    (_c('here is your list of failed recipients'),
     _c('here is your returned mail'),
     _c(r'<(?P<addr>[^>]*)>')),
    # sz-sb.de, corridor.com, nfg.nl
    (_c('the following addresses had'),
     _c('transcript of session follows'),
     _c(r'^ *(\(expanded from: )?<?(?P<addr>[^\s@]+@[^\s@>]+?)>?\)?\s*$')),
    # robanal.demon.co.uk
    (_c('this message was created automatically by mail delivery software'),
     _c('original message follows'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # s1.com (InterScan E-Mail VirusWall NT ???)
    (_c('message from interscan e-mail viruswall nt'),
     _c('end of message'),
     _c(r'rcpt to:\s*<(?P<addr>[^>]*)>')),
    # Smail
    (_c('failed addresses follow:'),
     _c('message text follows:'),
     _c(r'\s*(?P<addr>\S+@\S+)')),
    # newmail.ru
    (_c('This is the machine generated message from mail service.'),
     _c('--- Below the next line is a copy of the message.'),
     _c('<(?P<addr>[^>]*)>')),
    # turbosport.com runs something called `MDaemon 3.5.2' ???
    (_c('The following addresses did NOT receive a copy of your message:'),
     _c('--- Session Transcript ---'),
     _c(r'[>]\s*(?P<addr>.*)$')),
    # usa.net
    (_c(r'Intended recipient:\s*(?P<addr>.*)$'),
     _c('--------RETURNED MAIL FOLLOWS--------'),
     _c(r'Intended recipient:\s*(?P<addr>.*)$')),
    # hotpop.com
    (_c(r'Undeliverable Address:\s*(?P<addr>.*)$'),
     _c('Original message attached'),
     _c(r'Undeliverable Address:\s*(?P<addr>.*)$')),
    # Another demon.co.uk format
    (_c('This message was created automatically by mail delivery'),
     _c('^---- START OF RETURNED MESSAGE ----'),
     _c("addressed to '(?P<addr>[^']*)'")),
    # Prodigy.net full mailbox
    (_c("User's mailbox is full:"),
     _c('Unable to deliver mail.'),
     _c(r"User's mailbox is full:\s*<(?P<addr>[^>]*)>")),
    # Microsoft SMTPSVC
    (_c('The email below could not be delivered to the following user:'),
     _c('Old message:'),
     _c('<(?P<addr>[^>]*)>')),
    # Yahoo on behalf of other domains like sbcglobal.net
    (_c(r'Unable to deliver message to the following address\(es\)\.'),
     _c(r'--- Original message follows\.'),
     _c('<(?P<addr>[^>]*)>:')),
    # googlemail.com
    (_c('Delivery to the following recipient failed'),
     _c('----- Original message -----'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kundenserver.de
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c('<(?P<addr>[^>]*)>')),
    # another kundenserver.de
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@:]+):')),
    # thehartford.com / songbird
    (_c('Del(i|e)very to the following recipients (failed|was aborted)'),
     # this one may or may not have the original message, but there's nothing
     # unique to stop on, so stop on the first line of at least 3 characters
     # that doesn't start with 'D' (to not stop immediately) and has no '@'.
     # Also note that simple_30.txt contains an apparent misspelling in the
     # MTA's DSN section.
     _c('^[^D][^@]{2,}$'),
     _c(r'^[\s*]*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # and another thehartford.com/hartfordlife.com
    (_c(r'^Your message\s*$'),
     _c('^because:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # kviv.be (InterScan NT)
    (_c('^Unable to deliver message to'),
     _c(r'\*+\s+End of message\s+\*+'),
     _c('<(?P<addr>[^>]*)>')),
    # earthlink.net supported domains
    (_c('^Sorry, unable to deliver your message to'),
     _c('^A copy of the original message'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s+')),
    # ademe.fr
    (_c('^A message could not be delivered to:'),
     _c('^Subject:'),
     _c(r'^\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # andrew.ac.jp
    (_c('^Invalid final delivery userid:'),
     _c('^Original message follows.'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@]+)\s*$')),
    # E500_SMTP_Mail_Service@lerctr.org
    (_c('------ Failed Recipients ------'),
     _c('-------- Returned Mail --------'),
     _c('<(?P<addr>[^>]*)>')),
    # cynergycom.net
    (_c('A message that you sent could not be delivered'),
     _c('^---'),
     _c(r'(?P<addr>[^\s@]+@[^\s@)]+)')),
    # LSMTP for Windows
    (_c(r'^--> Error description:\s*$'),
     _c('^Error-End:'),
     _c(r'^Error-for:\s+(?P<addr>[^\s@]+@[^\s@]+)')),
    # Qmail with a tri-language intro beginning in spanish
    (_c('Your message could not be delivered'),
     _c('^-'),
     _c('<(?P<addr>[^>]*)>:')),
    # socgen.com
    (_c('Your message could not be delivered to'),
     _c(r'^\s*$'),
     _c(r'(?P<addr>[^\s@]+@[^\s@]+)')),
    # dadoservice.it
    (_c('Your message has encountered delivery problems'),
     _c('Your message reads'),
     _c(r'addressed to\s*(?P<addr>[^\s@]+@[^\s@)]+)')),
    # gomaps.com
    (_c('Did not reach the following recipient'),
     _c(r'^\s*$'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # EYOU MTA SYSTEM
    (_c('This is the deliver program at'),
     _c('^-'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@<>]+)')),
    # A non-standard qmail at ieo.it
    (_c('this is the email server at'),
     _c('^-'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)[\s,]')),
    # pla.net.py (MDaemon.PRO ?)
    (_c('- no such user here'),
     _c('There is no user'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@]+)\s')),
    # mxlogic.net
    (_c('The following address failed:'),
     _c('Included is a copy of the message header'),
     _c('<(?P<addr>[^>]+)>')),
    # fastdnsservers.com
    (_c(r'The following recipient\(s\) could not be reached'),
     _c(r'\s*Error Type'),
     _c(r'^(?P<addr>[^\s@]+@[^\s@<>]+)')),
    # xxx.com (simple_36.txt)
    (_c('Could not deliver message to the following recipient'),
     _c(r'\s*-- The header'),
     _c(r'Failed Recipient: (?P<addr>[^\s@]+@[^\s@<>]+)')),
    # mta1.service.uci.edu
    (_c('Message not delivered to the following addresses'),
     _c('Error detail'),
     _c(r'\s*(?P<addr>[^\s@]+@[^\s@)]+)')),
    # Dovecot LDA Over quota MDN (bogus - should be DSN).
    (_c('^Your message'),
     _c('^Reporting'),
     _c(r'Your message to <?(?P<addr>[^\s<@]+@[^\s@>]+)>? was automatically'
        r' rejected')),
    # mail.ru
    (_c('A message that you sent was rejected'),
     _c('This is a copy of your message'),
     _c(r'\s(?P<addr>[^\s@]+@[^\s@]+)')),
    # MailEnable
    (_c('Message could not be delivered to some recipients.'),
     _c('Message headers follow'),
     _c(r'Recipient: \[SMTP:(?P<addr>[^\s@]+@[^\s@]+)\]')),
    # Next one goes here...
    ]


@public
@implementer(IBounceDetector)
class SimpleMatch:
    """Recognizes simple heuristically delimited bounces."""

    # Exposed as a class attribute; process() reads the table through
    # self.PATTERNS.
    PATTERNS = PATTERNS

    def process(self, msg):
        """See `IBounceDetector`.

        Each (start, end, address) pattern triple in `self.PATTERNS` is tried
        in turn against the lines produced by `body_line_iterator(msg)`.
        Scanning stops at the first triple that yields at least one address.

        :param msg: The message to scan; it is handed unchanged to
            `email.iterators.body_line_iterator()`.
        :return: A 2-tuple of `NoTemporaryFailures` and the set of addresses
            found, each one passed through `_quopri_decode()`.  The set is
            empty when no pattern triple matched.
        """
        addresses = set()
        # MAS: This is a mess.  The outer loop used to be over the message
        # so we only looped through the message once.  Looping through the
        # message for each set of patterns is obviously way more work, but
        # if we don't do it, problems arise because scre from the wrong
        # pattern set matches first and then acre doesn't match.  The
        # alternative is to split things into separate modules, but then
        # we process the message multiple times anyway.
        for scre, ecre, acre in self.PATTERNS:
            state = ParseState.start
            for line in body_line_iterator(msg):
                if state is ParseState.start:
                    if scre.search(line):
                        state = ParseState.tag_seen
                # Deliberately `if`, not `elif`: the line that matches the
                # start pattern is itself checked for an address, which the
                # triples whose start and address patterns are identical
                # rely on.
                if state is ParseState.tag_seen:
                    mo = acre.search(line)
                    if mo:
                        address = mo.group('addr')
                        # Skip matches whose 'addr' group is empty.
                        if address:
                            addresses.add(_quopri_decode(address))
                    elif ecre.search(line):
                        # The end marker is only honored on lines that did
                        # not match the address pattern.
                        break
            if addresses:
                # The first pattern triple that finds anything wins.
                break
        return NoTemporaryFailures, addresses