1# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py.  This should eventually be rewritten.
7"""
8
9from __future__ import unicode_literals
10from __future__ import print_function
11from __future__ import division
12from __future__ import absolute_import
13from future.builtins import int
14
# Names exported by a star-import of this module.  The address-parsing
# classes defined further down are deliberately not part of this list.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
21
22import time, calendar
23
# String constants used as join separators by the parsing code in this module.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
# Lowercase month names: the twelve three-letter abbreviations followed by the
# twelve full names.  _parsedate_tz() looks a name up by list index and folds
# any index above 12 back into the range 1..12, so both spellings are accepted.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Lowercase three-letter day-of-week names; used only to recognise (and skip)
# a leading day name in a date string.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are signed offsets written in +/-HHMM form (e.g. -500 means five
# hours west of UTC); _parsedate_tz() converts them to seconds.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
49
50
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    This is a thin wrapper around _parsedate_tz().  The first nine items can
    be passed to time.mktime(); the tenth is the timezone offset in seconds.
    Where _parsedate_tz() reports the offset as None (unknown or explicitly
    "-0000"), this function reports 0 instead.

    Returns None when the string cannot be parsed.
    """
    fields = _parsedate_tz(data)
    if fields:
        # Callers of this public function always get a numeric offset.
        if fields[9] is None:
            fields[9] = 0
        return tuple(fields)
    return None
62
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    `data' is a date string such as 'Mon, 16 Nov 2009 13:32:02 +0100'.  The
    leading day name is optional, day and month may appear in either order,
    the deprecated RFC 850 'dd-mon-yy' form is accepted, and the time may use
    '.' instead of ':' as separator.

    Returns a 10-item list

        [year, month, day, hour, minute, second, 0, 1, -1, tzoffset]

    or None if the string cannot be parsed.  Items 6-8 are placeholders (the
    DST flag is -1 because DST is unknown).

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The offset is also None when the zone is
    missing or is neither a known name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is nothing to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("13:32:02+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the day and month are the other way round ("Nov 16").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    # endswith() rather than [-1] so that an empty field (possible with
    # malformed RFC 850 dates such as "-jan-09") cannot raise IndexError; an
    # empty day is rejected by the int() conversion below.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped ("... 13:32:02 2009").
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        # Nothing left where the year should be.
        return None
    if not yy[0].isdigit():
        # Year and zone are swapped ("... GMT 2009").
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = 0
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Too many dotted components; not a time we understand.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown or missing zone: leave the offset as None.
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000" explicitly means "no information about the local zone".
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
178
179
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    The string is parsed with parsedate_tz() and the trailing timezone offset
    is dropped.  If parsedate_tz() does not return a tuple (it returns None
    for unparseable input), that value is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
187
188
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' is a 10-item sequence whose first nine items form a time tuple and
    whose last item is the timezone offset in seconds, or None when the zone
    is unknown.  Both tuples and lists (as produced by _parsedate_tz()) are
    accepted.

    With a numeric offset the fields are interpreted as being in that zone
    and the matching UTC timestamp is returned.  With a None offset the
    fields are interpreted as local time.
    """
    offset = data[9]
    if offset is None:
        # No zone info, so localtime is better assumption than GMT.
        # time.mktime() only accepts a real tuple (or struct_time), so the
        # slice is converted explicitly; this also lets list input work.
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - offset
197
198
def quote(str):
    """Escape a string for use inside a quoted string.

    Every backslash and every double quote character is turned into a quoted
    pair (i.e. preceded by a backslash).  These are the only characters that
    need quoting inside a quoted string.  The surrounding double quotes are
    not added.
    """
    # Backslashes go first: doing the double quotes first would cause the
    # backslashes inserted for them to be doubled as well.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
207
208
class AddrlistClass(object):
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a hand-written scanner over `self.field'; `self.pos' is the
    index of the next unread character and every get*() method advances it.
    Comments met along the way are collected in `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (line-break
        characters are skipped but not included).  Comments are appended to
        `self.commentlist'.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (name, address) pair.  A position at which no address could be
        parsed contributes an ('', '') entry.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: one pair for a single
        mailbox, any number for a group, and an empty list if nothing could
        be parsed.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Keep a *copy* of the comments seen so far.  getphraselist() appends
        # to self.commentlist in place, so saving a mere reference would make
        # the restore below a no-op and every comment inside the addr-spec
        # would be recorded twice when the text is scanned again.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so that the caller makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned on a `<'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain of a source route is parsed and thrown away.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  If no `@' follows the local part,
        the local part alone is returned.  If an `@' is present but the
        domain is empty or invalid, the empty string is returned.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # White space around a dot is not significant.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is met while reading the
        domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@' like
                # `a@malicious.org@important.com'; accepting the first part
                # would let a crafted address masquerade as another domain.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters; a backslash is dropped
        and the character following it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, i.e. the current
        # character must be copied verbatim.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are appended to
        `self.commentlist'.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
500
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in `self.addresslist'.  The
    operators +, +=, - and -= combine two lists as set union / difference.
    """

    def __init__(self, field):
        """Parse `field' right away; a false `field' gives an empty list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Set union: a new list with own entries, then the unseen others."""
        union = AddressList(None)
        mine = self.addresslist
        union.addresslist = mine + [addr for addr in other.addresslist
                                    if addr not in mine]
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        """Set difference: a new list of own entries absent from `other'."""
        difference = AddressList(None)
        difference.addresslist = [addr for addr in self.addresslist
                                  if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        """Delegate to the underlying list.

        Make indexing, slices, and 'in' work.
        """
        return self.addresslist[index]
547