# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
try:
    from future.builtins import int
except ImportError:
    # The `future` compatibility package is not importable; keep using the
    # builtin int so the module still works on its own.
    pass

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import time, calendar

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The first nine items have the layout accepted by time.mktime(); the
    tenth is the timezone offset from UTC in seconds.  A timezone that
    _parsedate_tz() reports as None (unknown, or given as -0000) is
    returned as 0 here.

    Returns None when the string cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return
    if res[9] is None:
        res[9] = 0
    return tuple(res)

def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-item list, or None when `data' cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input splits into nothing.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The timezone may be glued to the time field (e.g. 10:00:00+0100);
        # split it off, or append an empty placeholder zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Day and month may appear in either order.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    # Fields can be empty (e.g. after splitting a malformed RFC 850 date),
    # so use endswith()/slicing rather than indexing, which would raise.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # Year and timezone are swapped.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Neither hh.mm nor hh.mm.ss: not a time we understand.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name: leave the offset as None.
            pass
        if tzoffset==0 and tz.startswith('-'):
            # -0000 means "UTC, source timezone unknown".
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Same as parsedate_tz() with the timezone offset dropped.  Returns None
    when the string cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' may be any sequence with the parsedate_tz() layout.  When its
    tenth item is None the first eight items are interpreted as local time.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # (tuple() lets callers pass a list as well as a tuple).
        return time.mktime(tuple(data[:8]) + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # Backslashes first, so the ones added for the quotes are not doubled.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass(object):
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser walks `self.field' with the cursor `self.pos'; comments met
    along the way are collected in `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (line breaks are
        skipped but not included).
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (name, address) pair.  An address that yields nothing is recorded
        as ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing could be
        parsed, several entries for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # NOTE: this aliases the same list object, so restoring it below does
        # not discard comments collected in the meantime.
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller's loop advances.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domain: parsed only to move past it.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  Returns just the local part when no
        `@' follows it, and the empty string when the domain after the `@' is
        missing or invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; drop trailing whitespace.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when a second `@' is found where the domain
        should end.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@' like
                # `a@malicious.org@important.com'.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters; a backslash is dropped
        and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are moved to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in `self.addresslist'.
    """
    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]