1r"""HTTP cookie handling for web clients.
2
3This is a backport of the Py3.3 ``http.cookiejar`` module for
4python-future.
5
6This module has (now fairly distant) origins in Gisle Aas' Perl module
7HTTP::Cookies, from the libwww-perl library.
8
9Docstrings, comments and debug strings in this code refer to the
10attributes of the HTTP cookie system as cookie-attributes, to distinguish
11them clearly from Python attributes.
12
13Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
14distributed with the Python standard library, but are available from
15http://wwwsearch.sf.net/):
16
17                        CookieJar____
18                        /     \      \
19            FileCookieJar      \      \
20             /    |   \         \      \
21 MozillaCookieJar | LWPCookieJar \      \
22                  |               |      \
23                  |   ---MSIEBase |       \
24                  |  /      |     |        \
25                  | /   MSIEDBCookieJar BSDDBCookieJar
26                  |/
27               MSIECookieJar
28
29"""
30
31from __future__ import unicode_literals
32from __future__ import print_function
33from __future__ import division
34from __future__ import absolute_import
35from future.builtins import filter, int, map, open, str
36from future.utils import as_native_str, PY2
37
38__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
39           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
40
41import copy
42import datetime
43import re
44if PY2:
45    re.ASCII = 0
46import time
47from future.backports.urllib.parse import urlparse, urlsplit, quote
48from future.backports.http.client import HTTP_PORT
49try:
50    import threading as _threading
51except ImportError:
52    import dummy_threading as _threading
53from calendar import timegm
54
# Set to True to enable debugging via the logging module.
debug = False
# Logger used by _debug(); created lazily on the first enabled call.
logger = None

def _debug(*args):
    """Log a debug message through the module logger, if debugging is on.

    The positional arguments are handed unchanged to ``Logger.debug``, so the
    first one is the format string and the rest are its arguments.  While the
    module-level ``debug`` flag is false the call does nothing and returns
    None.
    """
    global logger
    if debug:
        if not logger:
            # Import logging and look the logger up only once debugging is
            # actually in use.
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
66
67
# Default HTTP port in string form; request_port() returns it when the
# request host carries no explicit port.
DEFAULT_HTTP_PORT = str(HTTP_PORT)
# Message text for the case where no filename is available, either as an
# argument or from the CookieJar instance (presumably raised by the
# file-backed jars; the use site is not in this part of the module).
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
71
def _warn_unhandled_exception():
    """Issue a warning carrying the traceback of the exception being handled.

    Must be called from inside an ``except`` block.  The warning is attributed
    to the caller (``stacklevel=2``) and its text is "http.cookiejar bug!"
    followed by the formatted traceback.
    """
    # There are a few catch-all except: statements in this module, for
    # catching input that's bad in unexpected ways.  Warn if any
    # exceptions are caught there.
    import traceback
    import warnings
    # format_exc() returns the traceback as the interpreter's native string
    # type.  Printing into an io.StringIO instead fails on Python 2, where the
    # traceback module writes byte strings and io.StringIO accepts only
    # unicode.
    msg = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2)
81
82
83# Date/time conversion
84# -----------------------------------------------------------------------------
85
# Earliest year accepted by _timegm().
EPOCH_YEAR = 1970
def _timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or return None.

    Only the first six items of *tt* (year, month, day, hour, minute, second)
    are range-checked; the whole tuple is then passed to calendar.timegm().
    None is returned when any of those six fields is out of range.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    in_range = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= sec <= 61
    )
    if not in_range:
        return None
    return timegm(tt)
94
# Weekday abbreviations, indexed the way datetime.weekday() counts
# (Monday is 0).
DAYS = [
    "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
]
# Month abbreviations; list index + 1 is the calendar month number.
MONTHS = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
# Lower-cased copy of MONTHS, used for case-insensitive month lookups.
MONTHS_LOWER = []
for month in MONTHS:
    MONTHS_LOWER.append(month.lower())
100
def time2isoz(t=None):
    """Format a time given in seconds since the epoch as an ISO-like string.

    With no argument (or None) the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and is always expressed in
    Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    clock = datetime.datetime
    moment = clock.utcnow() if t is None else clock.utcfromtimestamp(t)
    fields = (moment.year, moment.month, moment.day,
              moment.hour, moment.minute, moment.second)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
119
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    The time is always expressed in UTC.

    """
    if t is None:
        dt = datetime.datetime.utcnow()
    else:
        dt = datetime.datetime.utcfromtimestamp(t)
    # The weekday abbreviation is followed by a comma, matching the format
    # documented above.
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
138
139
# Zone names that mean "zero offset"; only the keys are consulted.
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Numeric zone: optional sign, one or two hour digits, optional colon,
# optional two minute digits.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds named by *tz*, or None if unknown.

    Names listed in UTC_ZONES give 0.  Otherwise *tz* must be a numeric
    offset such as "+0100", "-08:00" or "5"; a leading "-" makes the result
    negative.  Any other string yields None.
    """
    if tz in UTC_ZONES:
        return 0
    parsed = TIMEZONE_RE.search(tz)
    if not parsed:
        return None
    sign, hours, minutes = parsed.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
156
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Turn parsed date fields into seconds since the epoch, or None.

    *mon* may be a month name (any case) or a month number; the clock fields
    may be None, meaning zero; *tz* may be None, meaning UTC.  Years below
    1000 are expanded to the century that puts them closest to the current
    year.  None is returned for an unknown month, an out-of-range field or an
    unrecognised timezone string.
    """
    # Month names are tried first; month numbers start with 1 (January).
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # Not a known name -- maybe it is already a number.
        try:
            numeric_month = int(mon)
        except ValueError:
            return None
        if not 1 <= numeric_month <= 12:
            return None
        mon = numeric_month

    # Absent clock elements count as zero.
    yr = int(yr)
    day = int(day)
    hr = int(0 if hr is None else hr)
    min = int(0 if min is None else min)
    sec = int(0 if sec is None else sec)

    if yr < 1000:
        # Find the "obvious" year: start in the current century, then move
        # one century either way if that lands more than 50 years off.
        this_year = time.localtime(time.time())[0]
        yy_now = this_year % 100
        gap = yy_now - yr
        yr = yr + this_year - yy_now
        if gap > 50:
            yr = yr + 100
        elif gap < -50:
            yr = yr - 100

    # Convert the UTC time tuple to seconds since epoch (not yet
    # timezone-adjusted).
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # Apply the timezone to get absolute time since epoch.
    zone = "UTC" if tz is None else tz
    offset = offset_from_tz_string(zone.upper())
    if offset is None:
        return None
    return t - offset
209
# Exact "Wed, 09 Feb 1994 22:23:32 GMT" layout, used by http2time() as a
# fast path.  Both literals are raw strings so the regex escapes are never
# interpreted (or warned about) as Python string escapes.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or spelled out) with optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
# Tolerant day/month/year pattern with optional clock and timezone.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Foo" passes), so an unknown name is reported as an
            # unrecognised date rather than raised to the caller.
            return None
        # Seconds are exactly two digits here; int() keeps the result an
        # integer, as documented.
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), int(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
287
# ISO 8601 style date with optional clock and timezone, used by iso2time().
# The pattern is a raw string so its regex escapes are never interpreted (or
# warned about) as Python string escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    Like http2time, but for the ISO 8601 family of formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is skipped.  None is returned when the text does not
    match.

    """
    found = ISO_DATE_RE.search(text.lstrip())
    if found is None:
        return None  # bad format

    # XXX the pattern captures an extra trailing piece of the timezone that
    #   is ignored here: is this the right thing to do?
    year, month, mday, hour, minute, second, zone, _ = found.groups()
    return _str2time(mday, month, year, hour, minute, second, zone)
332
333
334# Header parsing
335# -----------------------------------------------------------------------------
336
def unmatched(match):
    """Return the searched string with the matched span removed."""
    subject = match.string
    return subject[:match.start(0)] + subject[match.end(0):]
341
# Each pattern is anchored at the start of the text still to be parsed.
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash-escaped character inside a quoted value.
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    The argument must be a sequence of strings, not a single string.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        # Consume the header from the left until nothing remains.
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash from each quoted-pair
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk (raw string: "\s" is a regex escape, not a
                # Python string escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
430
# Characters that need a backslash in front of them inside a quoted value.
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build one header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within one
    inner list are joined with "; ", the inner lists themselves with ", ".
    A value of None produces the bare key; any value that is not purely word
    characters is escaped and wrapped in double quotes.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    rendered_headers = []
    for pairs in lists:
        items = []
        for key, value in pairs:
            if value is None:
                items.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \ before quoting
                escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
                value = '"%s"' % escaped
            items.append("%s=%s" % (key, value))
        if items:
            rendered_headers.append("; ".join(items))
    return ", ".join(rendered_headers)
456
def strip_quotes(text):
    """Remove one leading and one trailing double quote from text, if present.

    The two ends are handled independently, so an unbalanced quote is removed
    as well.
    """
    if text[:1] == '"':
        text = text[1:]
    if text[-1:] == '"':
        text = text[:-1]
    return text
463
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  The first
    pair is the cookie's own name and value and is left untouched.  In later
    pairs, known attribute names are lowercased, an "expires" value is
    converted to seconds since the epoch (None if it cannot be parsed), and a
    ("version", "0") pair is appended when the header carried no version
    attribute.  An attribute written without "=" has the value None.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.  A bare "version" with no
                    # "=" has no value to unquote.
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch; a bare
                    # "expires" with no "=" stays None
                    if v is not None:
                        v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
514
515
# Matches text whose last dot-separated label is all digits, which is how
# this module recognises an IPv4 address.
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name.

    Text is rejected when it looks like an IPv4 address, is empty, or starts
    or ends with a dot.
    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    looks_like_ip = IPV4_RE.search(text)
    if looks_like_ip or text == "":
        return False
    return not (text[0] == "." or text[-1] == ".")
531
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    # "A has the form NB" means A *ends* with B and something non-empty (N)
    # comes before it.  Merely containing B somewhere is not enough:
    # "www.acme.com.evil.org" must not match ".acme.com".
    if len(A) <= len(B) or not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
570
def liberal_is_HDN(text):
    """Return True if text is sort-of like a host domain name.

    For accepting/blocking domains.  Anything that does not look like an
    IPv4 address passes.

    """
    return not IPV4_RE.search(text)
580
def user_domain_match(A, B):
    """Domain comparison used for the blocking/accepting lists.

    A and B may be host domain names or IP addresses; the comparison is
    case-insensitive.  If either side looks like an IP address only exact
    equality matches.  Otherwise a B that starts with a dot matches any A
    ending in B, and a B without a leading dot matches only an identical A.

    """
    A = A.lower()
    B = B.lower()
    if not liberal_is_HDN(A) or not liberal_is_HDN(B):
        # At least one side is an IP address: only equal addresses match.
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
600
# A trailing ":<digits>" port suffix.
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the request's full URL, falling back to its
    "Host" header when the URL has no network location.  Any port suffix is
    removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    host = urlparse(request.get_full_url())[1]
    if host == "":
        host = request.get_header("Host", "")

    # drop the port, if present, then normalise case
    return cut_port_re.sub("", host, 1).lower()
617
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no dot
    and does not look like an IPv4 address; otherwise the two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
628
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is escaped with escape_path() and is
    given a leading "/" if it lacks one.
    """
    path = escape_path(urlsplit(request.get_full_url()).path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
638
def request_port(request):
    """Return the port of request.host as a string.

    The port is whatever follows the first ":" in ``request.host``.  When
    there is no colon DEFAULT_HTTP_PORT is returned; when the text after the
    colon is not a number the problem is logged via _debug() and None is
    returned.
    """
    host = request.host
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon + 1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
652
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
# A %XX escape; the two hex digits are captured.
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the matched %XX escape with its hex digits in upper case."""
    hex_digits = match.group(1)
    return "%%%s" % hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
672
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # H does not have the form A.B at all.
        return h
    # b is everything after the first label (the label itself is "A").
    b = h[first_dot + 1:]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
707
def is_third_party(request):
    """Return True if the request goes to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
723
724
class Cookie(object):
    """HTTP Cookie.

    One class serves for both Netscape and RFC 2965 cookies.

    The class is intentionally minimal: it only holds attributes, and nothing
    stops you from building an instance that does not comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it takes care of parsing, of supplying defaults, and of
    normalising to the representation used here.  CookiePolicy is responsible
    for deciding whether a cookie should be accepted from and returned to the
    server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted with int() unless they are None,
        domain is lowercased, and rest (the non-standard cookie-attributes)
        is shallow-copied.  Raises ValueError if port is None while
        port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  Whether the domain cookie-attribute was written with an
        # initial dot has to be remembered in order to follow RFC 2965 (as
        # clarified in draft errata).  Needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # Private copy, so later changes to the caller's mapping do not leak
        # into the cookie.
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is not after now.

        now defaults to the current time; a cookie whose expires is None
        never expires.
        """
        if now is None:
            now = time.time()
        if self.expires is None:
            return False
        if self.expires <= now:
            return True
        return False

    def __str__(self):
        port_suffix = "" if self.port is None else ":" + self.port
        limit = self.domain + port_suffix + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    @as_native_str()
    def __repr__(self):
        shown_attrs = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = []
        for name in shown_attrs:
            attr = getattr(self, name)
            ### Python-Future:
            # Avoid u'...' prefixes for unicode strings:
            if isinstance(attr, str):
                attr = str(attr)
            ###
            args.append(str("%s=%s") % (name, repr(attr)))
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
827
828
class CookiePolicy(object):
    """Decides which cookies are accepted from and returned to the server.

    A policy may also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """
    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Subclasses must override this; the base implementation raises
        NotImplementedError.

        """
        raise NotImplementedError

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Subclasses must override this; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base implementation always returns True.
        """
        return True
860
861
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    set_ok() and return_ok() each run a fixed sequence of
    set_ok_<check>() / return_ok_<check>() methods and fail as soon as one
    of them returns false.  domain_return_ok() and path_return_ok() are
    pre-filters that look only at a cookie domain or cookie path.

    return_ok_expires() reads self._now, which CookieJar assigns on the
    policy before consulting it.

    """

    # Bit flags that may be or-ed together in strict_ns_domain.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Ready-made values for strict_ns_domain.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None, allowed_domains=None,
                 netscape=True, rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only.

        blocked_domains and allowed_domains are copied into tuples;
        allowed_domains=None means "no allow-list" (see is_not_allowed()).
        Every other argument is stored unchanged on the attribute of the
        same name.

        """
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        if blocked_domains is not None:
            self._blocked_domains = tuple(blocked_domains)
        else:
            self._blocked_domains = ()

        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains
    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches an entry of the block-list.

        Matching is delegated to user_domain_match().

        """
        for blocked_domain in self._blocked_domains:
            if user_domain_match(domain, blocked_domain):
                return True
        return False

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains
    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        if allowed_domains is not None:
            allowed_domains = tuple(allowed_domains)
        self._allowed_domains = allowed_domains

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches no entry.

        With no allow-list (None) every domain is allowed, so the result is
        False.  Matching is delegated to user_domain_match().

        """
        if self._allowed_domains is None:
            return False
        for allowed_domain in self._allowed_domains:
            if user_domain_match(domain, allowed_domain):
                return False
        return True

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        # Each check is looked up by name so that subclasses can override
        # an individual set_ok_<check>() method.
        for n in "version", "verifiability", "name", "path", "domain", "port":
            fn_name = "set_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False

        return True

    def set_ok_version(self, cookie, request):
        """Reject cookies with no version or with a switched-off protocol."""
        if cookie.version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies on unverifiable requests, if strict.

        Only applies when request.unverifiable is true and
        is_third_party(request) holds; the strict_rfc2965_unverifiable and
        strict_ns_unverifiable switches select which cookie versions are
        refused.

        """
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during "
                             "unverifiable transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during "
                             "unverifiable transaction")
                return False
        return True

    def set_ok_name(self, cookie, request):
        """Reject version-0 names starting with "$" when so configured."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
            cookie.name.startswith("$")):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Reject an explicit cookie path that does not prefix the request path.

        Enforced for cookies with version > 0 always, and for version-0
        cookies only when strict_ns_set_path is true.

        """
        if cookie.path_specified:
            req_path = request_path(request)
            if ((cookie.version > 0 or
                 (cookie.version == 0 and self.strict_ns_set_path)) and
                not req_path.startswith(cookie.path)):
                _debug("   path attribute %s is not a prefix of request "
                       "path %s", cookie.path, req_path)
                return False
        return True

    def set_ok_domain(self, cookie, request):
        """Apply the block/allow lists and the domain-attribute rules.

        The list checks always run.  The remaining checks only apply when
        the cookie carried an explicit domain (cookie.domain_specified).

        """
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                # What is left of the request host once the cookie domain
                # has been cut off its end.
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True

    def set_ok_port(self, cookie, request):
        """Require the request port to appear in an explicit cookie port list.

        Also rejects the cookie if any entry examined in the comma-separated
        list is not numeric.  A request port of None is treated as "80".

        """
        if cookie.port_specified:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            else:
                req_port = str(req_port)
            for p in cookie.port.split(","):
                try:
                    int(p)
                except ValueError:
                    _debug("   bad port %s (not numeric)", p)
                    return False
                if p == req_port:
                    break
            else:
                _debug("   request port (%s) not found in %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        for n in "version", "verifiability", "secure", "expires", "port", "domain":
            fn_name = "return_ok_"+n
            fn = getattr(self, fn_name)
            if not fn(cookie, request):
                return False
        return True

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies whose protocol is switched off."""
        if cookie.version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        elif cookie.version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse third-party cookies on unverifiable requests, if strict."""
        if request.unverifiable and is_third_party(request):
            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
                _debug("   third-party RFC 2965 cookie during unverifiable "
                       "transaction")
                return False
            elif cookie.version == 0 and self.strict_ns_unverifiable:
                _debug("   third-party Netscape cookie during unverifiable "
                       "transaction")
                return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only when request.type is "https"."""
        if cookie.secure and request.type != "https":
            _debug("   secure cookie with non-secure request")
            return False
        return True

    def return_ok_expires(self, cookie, request):
        """Refuse cookies that cookie.is_expired() reports expired at self._now."""
        if cookie.is_expired(self._now):
            _debug("   cookie expired")
            return False
        return True

    def return_ok_port(self, cookie, request):
        """Require the request port to appear in the cookie's port list.

        Cookies with no port restriction pass.  A request port of None is
        treated as "80".

        """
        if cookie.port:
            req_port = request_port(request)
            if req_port is None:
                req_port = "80"
            for p in cookie.port.split(","):
                if p == req_port:
                    break
            else:
                _debug("   request port %s does not match cookie port %s",
                       req_port, cookie.port)
                return False
        return True

    def return_ok_domain(self, cookie, request):
        """Check that the request host is covered by the cookie domain."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Suffix comparisons below use the dotted form so that only whole
        # labels match: "example.com" must not match "badexample.com".
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Return false if no cookie stored under domain may be returned.

        The request host must end with the cookie domain on a label
        boundary, and the domain must pass the block and allow lists.

        """
        # Liberal check of domain.  This is here as an optimization to avoid
        # having to load lots of MSIE cookie files unless necessary.
        req_host, erhn = eff_request_host(request)
        if not req_host.startswith("."):
            req_host = "."+req_host
        if not erhn.startswith("."):
            erhn = "."+erhn
        # Compare against the dotted form of the cookie domain; a bare
        # suffix test would let "example.com" match "badexample.com".
        if domain and not domain.startswith("."):
            dotdomain = "." + domain
        else:
            dotdomain = domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            #_debug("   request domain %s does not match cookie domain %s",
            #       req_host, domain)
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return false if no cookie stored under path may be returned.

        The cookie path matches when it equals the request path, or when it
        is a prefix of it that ends on a "/" boundary (so "/foo" matches
        "/foo/bar" but not "/foobar").

        """
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        pathlen = len(path)
        if req_path == path:
            return True
        elif (req_path.startswith(path) and
              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
            return True

        _debug("  %s does not path-match %s", req_path, path)
        return False
1197
1198
def vals_sorted_by_key(adict):
    """Map adict.get over the sorted keys of adict.

    The result is whatever map() produces, yielding the values in order of
    their keys.

    """
    return map(adict.get, sorted(adict.keys()))
1202
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first.

    At every level the values are visited in sorted order of their keys.
    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; everything else is yielded as a leaf.

    """
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            is_leaf = True
        else:
            is_leaf = False
        if is_leaf:
            yield item
            continue
        for nested in deepvalues(item):
            yield nested
1218
1219
class Absent(object):
    """Sentinel passed as the second argument to dict.get().

    It distinguishes an absent dict key from one stored with a None value.

    """
1223
1224class CookieJar(object):
1225    """Collection of HTTP cookies.
1226
1227    You may not need to know about this class: try
1228    urllib.request.build_opener(HTTPCookieProcessor).open(url).
1229    """
1230
    # Pre-compiled patterns kept as class attributes.
    # non_word_re and quote_re are used by _cookie_attrs(): the first detects
    # values containing a non-word character, the second captures each
    # double quote or backslash so that it can be backslash-escaped.
    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    # An optional leading dot followed by a run of non-dot characters.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # A run of non-dot characters.
    domain_re = re.compile(r"[^.]*")
    # One or more dots at the start of the string.
    dots_re = re.compile(r"^\.+")

    # Matches a leading "#LWP-Cookies-<digits>.<digits>" marker and captures
    # the dotted number.
    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
1238
1239    def __init__(self, policy=None):
1240        if policy is None:
1241            policy = DefaultCookiePolicy()
1242        self._policy = policy
1243
1244        self._cookies_lock = _threading.RLock()
1245        self._cookies = {}
1246
1247    def set_policy(self, policy):
1248        self._policy = policy
1249
1250    def _cookies_for_domain(self, domain, request):
1251        cookies = []
1252        if not self._policy.domain_return_ok(domain, request):
1253            return []
1254        _debug("Checking %s for cookies to return", domain)
1255        cookies_by_path = self._cookies[domain]
1256        for path in cookies_by_path.keys():
1257            if not self._policy.path_return_ok(path, request):
1258                continue
1259            cookies_by_name = cookies_by_path[path]
1260            for cookie in cookies_by_name.values():
1261                if not self._policy.return_ok(cookie, request):
1262                    _debug("   not returning cookie")
1263                    continue
1264                _debug("   it's a match")
1265                cookies.append(cookie)
1266        return cookies
1267
1268    def _cookies_for_request(self, request):
1269        """Return a list of cookies to be returned to server."""
1270        cookies = []
1271        for domain in self._cookies.keys():
1272            cookies.extend(self._cookies_for_domain(domain, request))
1273        return cookies
1274
1275    def _cookie_attrs(self, cookies):
1276        """Return a list of cookie-attributes to be returned to server.
1277
1278        like ['foo="bar"; $Path="/"', ...]
1279
1280        The $Version attribute is also added when appropriate (currently only
1281        once per request).
1282
1283        """
1284        # add cookies in order of most specific (ie. longest) path first
1285        cookies.sort(key=lambda a: len(a.path), reverse=True)
1286
1287        version_set = False
1288
1289        attrs = []
1290        for cookie in cookies:
1291            # set version of Cookie header
1292            # XXX
1293            # What should it be if multiple matching Set-Cookie headers have
1294            #  different versions themselves?
1295            # Answer: there is no answer; was supposed to be settled by
1296            #  RFC 2965 errata, but that may never appear...
1297            version = cookie.version
1298            if not version_set:
1299                version_set = True
1300                if version > 0:
1301                    attrs.append("$Version=%s" % version)
1302
1303            # quote cookie value if necessary
1304            # (not for Netscape protocol, which already has any quotes
1305            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1306            if ((cookie.value is not None) and
1307                self.non_word_re.search(cookie.value) and version > 0):
1308                value = self.quote_re.sub(r"\\\1", cookie.value)
1309            else:
1310                value = cookie.value
1311
1312            # add cookie-attributes to be returned in Cookie header
1313            if cookie.value is None:
1314                attrs.append(cookie.name)
1315            else:
1316                attrs.append("%s=%s" % (cookie.name, value))
1317            if version > 0:
1318                if cookie.path_specified:
1319                    attrs.append('$Path="%s"' % cookie.path)
1320                if cookie.domain.startswith("."):
1321                    domain = cookie.domain
1322                    if (not cookie.domain_initial_dot and
1323                        domain.startswith(".")):
1324                        domain = domain[1:]
1325                    attrs.append('$Domain="%s"' % domain)
1326                if cookie.port is not None:
1327                    p = "$Port"
1328                    if cookie.port_specified:
1329                        p = p + ('="%s"' % cookie.port)
1330                    attrs.append(p)
1331
1332        return attrs
1333
1334    def add_cookie_header(self, request):
1335        """Add correct Cookie: header to request (urllib.request.Request object).
1336
1337        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1338
1339        """
1340        _debug("add_cookie_header")
1341        self._cookies_lock.acquire()
1342        try:
1343
1344            self._policy._now = self._now = int(time.time())
1345
1346            cookies = self._cookies_for_request(request)
1347
1348            attrs = self._cookie_attrs(cookies)
1349            if attrs:
1350                if not request.has_header("Cookie"):
1351                    request.add_unredirected_header(
1352                        "Cookie", "; ".join(attrs))
1353
1354            # if necessary, advertise that we know RFC 2965
1355            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1356                not request.has_header("Cookie2")):
1357                for cookie in cookies:
1358                    if cookie.version != 1:
1359                        request.add_unredirected_header("Cookie2", '$Version="1"')
1360                        break
1361
1362        finally:
1363            self._cookies_lock.release()
1364
1365        self.clear_expired_cookies()
1366
1367    def _normalized_cookie_tuples(self, attrs_set):
1368        """Return list of tuples containing normalised cookie information.
1369
1370        attrs_set is the list of lists of key,value pairs extracted from
1371        the Set-Cookie or Set-Cookie2 headers.
1372
1373        Tuples are name, value, standard, rest, where name and value are the
1374        cookie name and value, standard is a dictionary containing the standard
1375        cookie-attributes (discard, secure, version, expires or max-age,
1376        domain, path and port) and rest is a dictionary containing the rest of
1377        the cookie-attributes.
1378
1379        """
1380        cookie_tuples = []
1381
1382        boolean_attrs = "discard", "secure"
1383        value_attrs = ("version",
1384                       "expires", "max-age",
1385                       "domain", "path", "port",
1386                       "comment", "commenturl")
1387
1388        for cookie_attrs in attrs_set:
1389            name, value = cookie_attrs[0]
1390
1391            # Build dictionary of standard cookie-attributes (standard) and
1392            # dictionary of other cookie-attributes (rest).
1393
1394            # Note: expiry time is normalised to seconds since epoch.  V0
1395            # cookies should have the Expires cookie-attribute, and V1 cookies
1396            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1397            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1398            # accept either (but prefer Max-Age).
1399            max_age_set = False
1400
1401            bad_cookie = False
1402
1403            standard = {}
1404            rest = {}
1405            for k, v in cookie_attrs[1:]:
1406                lc = k.lower()
1407                # don't lose case distinction for unknown fields
1408                if lc in value_attrs or lc in boolean_attrs:
1409                    k = lc
1410                if k in boolean_attrs and v is None:
1411                    # boolean cookie-attribute is present, but has no value
1412                    # (like "discard", rather than "port=80")
1413                    v = True
1414                if k in standard:
1415                    # only first value is significant
1416                    continue
1417                if k == "domain":
1418                    if v is None:
1419                        _debug("   missing value for domain attribute")
1420                        bad_cookie = True
1421                        break
1422                    # RFC 2965 section 3.3.3
1423                    v = v.lower()
1424                if k == "expires":
1425                    if max_age_set:
1426                        # Prefer max-age to expires (like Mozilla)
1427                        continue
1428                    if v is None:
1429                        _debug("   missing or invalid value for expires "
1430                              "attribute: treating as session cookie")
1431                        continue
1432                if k == "max-age":
1433                    max_age_set = True
1434                    try:
1435                        v = int(v)
1436                    except ValueError:
1437                        _debug("   missing or invalid (non-numeric) value for "
1438                              "max-age attribute")
1439                        bad_cookie = True
1440                        break
1441                    # convert RFC 2965 Max-Age to seconds since epoch
1442                    # XXX Strictly you're supposed to follow RFC 2616
1443                    #   age-calculation rules.  Remember that zero Max-Age is a
1444                    #   is a request to discard (old and new) cookie, though.
1445                    k = "expires"
1446                    v = self._now + v
1447                if (k in value_attrs) or (k in boolean_attrs):
1448                    if (v is None and
1449                        k not in ("port", "comment", "commenturl")):
1450                        _debug("   missing value for %s attribute" % k)
1451                        bad_cookie = True
1452                        break
1453                    standard[k] = v
1454                else:
1455                    rest[k] = v
1456
1457            if bad_cookie:
1458                continue
1459
1460            cookie_tuples.append((name, value, standard, rest))
1461
1462        return cookie_tuples
1463
1464    def _cookie_from_cookie_tuple(self, tup, request):
1465        # standard is dict of standard cookie-attributes, rest is dict of the
1466        # rest of them
1467        name, value, standard, rest = tup
1468
1469        domain = standard.get("domain", Absent)
1470        path = standard.get("path", Absent)
1471        port = standard.get("port", Absent)
1472        expires = standard.get("expires", Absent)
1473
1474        # set the easy defaults
1475        version = standard.get("version", None)
1476        if version is not None:
1477            try:
1478                version = int(version)
1479            except ValueError:
1480                return None  # invalid version, ignore cookie
1481        secure = standard.get("secure", False)
1482        # (discard is also set if expires is Absent)
1483        discard = standard.get("discard", False)
1484        comment = standard.get("comment", None)
1485        comment_url = standard.get("commenturl", None)
1486
1487        # set default path
1488        if path is not Absent and path != "":
1489            path_specified = True
1490            path = escape_path(path)
1491        else:
1492            path_specified = False
1493            path = request_path(request)
1494            i = path.rfind("/")
1495            if i != -1:
1496                if version == 0:
1497                    # Netscape spec parts company from reality here
1498                    path = path[:i]
1499                else:
1500                    path = path[:i+1]
1501            if len(path) == 0: path = "/"
1502
1503        # set default domain
1504        domain_specified = domain is not Absent
1505        # but first we have to remember whether it starts with a dot
1506        domain_initial_dot = False
1507        if domain_specified:
1508            domain_initial_dot = bool(domain.startswith("."))
1509        if domain is Absent:
1510            req_host, erhn = eff_request_host(request)
1511            domain = erhn
1512        elif not domain.startswith("."):
1513            domain = "."+domain
1514
1515        # set default port
1516        port_specified = False
1517        if port is not Absent:
1518            if port is None:
1519                # Port attr present, but has no value: default to request port.
1520                # Cookie should then only be sent back on that port.
1521                port = request_port(request)
1522            else:
1523                port_specified = True
1524                port = re.sub(r"\s+", "", port)
1525        else:
1526            # No port attr present.  Cookie can be sent back on any port.
1527            port = None
1528
1529        # set default expires and discard
1530        if expires is Absent:
1531            expires = None
1532            discard = True
1533        elif expires <= self._now:
1534            # Expiry date in past is request to delete cookie.  This can't be
1535            # in DefaultCookiePolicy, because can't delete cookies there.
1536            try:
1537                self.clear(domain, path, name)
1538            except KeyError:
1539                pass
1540            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1541                   domain, path, name)
1542            return None
1543
1544        return Cookie(version,
1545                      name, value,
1546                      port, port_specified,
1547                      domain, domain_specified, domain_initial_dot,
1548                      path, path_specified,
1549                      secure,
1550                      expires,
1551                      discard,
1552                      comment,
1553                      comment_url,
1554                      rest)
1555
1556    def _cookies_from_attrs_set(self, attrs_set, request):
1557        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1558
1559        cookies = []
1560        for tup in cookie_tuples:
1561            cookie = self._cookie_from_cookie_tuple(tup, request)
1562            if cookie: cookies.append(cookie)
1563        return cookies
1564
1565    def _process_rfc2109_cookies(self, cookies):
1566        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1567        if rfc2109_as_ns is None:
1568            rfc2109_as_ns = not self._policy.rfc2965
1569        for cookie in cookies:
1570            if cookie.version == 1:
1571                cookie.rfc2109 = True
1572                if rfc2109_as_ns:
1573                    # treat 2109 cookies as Netscape cookies rather than
1574                    # as RFC2965 cookies
1575                    cookie.version = 0
1576
1577    def make_cookies(self, response, request):
1578        """Return sequence of Cookie objects extracted from response object."""
1579        # get cookie-attributes for RFC 2965 and Netscape protocols
1580        headers = response.info()
1581        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1582        ns_hdrs = headers.get_all("Set-Cookie", [])
1583
1584        rfc2965 = self._policy.rfc2965
1585        netscape = self._policy.netscape
1586
1587        if ((not rfc2965_hdrs and not ns_hdrs) or
1588            (not ns_hdrs and not rfc2965) or
1589            (not rfc2965_hdrs and not netscape) or
1590            (not netscape and not rfc2965)):
1591            return []  # no relevant cookie headers: quick exit
1592
1593        try:
1594            cookies = self._cookies_from_attrs_set(
1595                split_header_words(rfc2965_hdrs), request)
1596        except Exception:
1597            _warn_unhandled_exception()
1598            cookies = []
1599
1600        if ns_hdrs and netscape:
1601            try:
1602                # RFC 2109 and Netscape cookies
1603                ns_cookies = self._cookies_from_attrs_set(
1604                    parse_ns_headers(ns_hdrs), request)
1605            except Exception:
1606                _warn_unhandled_exception()
1607                ns_cookies = []
1608            self._process_rfc2109_cookies(ns_cookies)
1609
1610            # Look for Netscape cookies (from Set-Cookie headers) that match
1611            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1612            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1613            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1614            # bundled in with the Netscape cookies for this purpose, which is
1615            # reasonable behaviour.
1616            if rfc2965:
1617                lookup = {}
1618                for cookie in cookies:
1619                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1620
1621                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1622                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1623                    return key not in lookup
1624                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1625
1626            if ns_cookies:
1627                cookies.extend(ns_cookies)
1628
1629        return cookies
1630
1631    def set_cookie_if_ok(self, cookie, request):
1632        """Set a cookie if policy says it's OK to do so."""
1633        self._cookies_lock.acquire()
1634        try:
1635            self._policy._now = self._now = int(time.time())
1636
1637            if self._policy.set_ok(cookie, request):
1638                self.set_cookie(cookie)
1639
1640
1641        finally:
1642            self._cookies_lock.release()
1643
1644    def set_cookie(self, cookie):
1645        """Set a cookie, without checking whether or not it should be set."""
1646        c = self._cookies
1647        self._cookies_lock.acquire()
1648        try:
1649            if cookie.domain not in c: c[cookie.domain] = {}
1650            c2 = c[cookie.domain]
1651            if cookie.path not in c2: c2[cookie.path] = {}
1652            c3 = c2[cookie.path]
1653            c3[cookie.name] = cookie
1654        finally:
1655            self._cookies_lock.release()
1656
1657    def extract_cookies(self, response, request):
1658        """Extract cookies from response, where allowable given the request."""
1659        _debug("extract_cookies: %s", response.info())
1660        self._cookies_lock.acquire()
1661        try:
1662            self._policy._now = self._now = int(time.time())
1663
1664            for cookie in self.make_cookies(response, request):
1665                if self._policy.set_ok(cookie, request):
1666                    _debug(" setting cookie: %s", cookie)
1667                    self.set_cookie(cookie)
1668        finally:
1669            self._cookies_lock.release()
1670
1671    def clear(self, domain=None, path=None, name=None):
1672        """Clear some cookies.
1673
1674        Invoking this method without arguments will clear all cookies.  If
1675        given a single argument, only cookies belonging to that domain will be
1676        removed.  If given two arguments, cookies belonging to the specified
1677        path within that domain are removed.  If given three arguments, then
1678        the cookie with the specified name, path and domain is removed.
1679
1680        Raises KeyError if no matching cookie exists.
1681
1682        """
1683        if name is not None:
1684            if (domain is None) or (path is None):
1685                raise ValueError(
1686                    "domain and path must be given to remove a cookie by name")
1687            del self._cookies[domain][path][name]
1688        elif path is not None:
1689            if domain is None:
1690                raise ValueError(
1691                    "domain must be given to remove cookies by path")
1692            del self._cookies[domain][path]
1693        elif domain is not None:
1694            del self._cookies[domain]
1695        else:
1696            self._cookies = {}
1697
1698    def clear_session_cookies(self):
1699        """Discard all session cookies.
1700
1701        Note that the .save() method won't save session cookies anyway, unless
1702        you ask otherwise by passing a true ignore_discard argument.
1703
1704        """
1705        self._cookies_lock.acquire()
1706        try:
1707            for cookie in self:
1708                if cookie.discard:
1709                    self.clear(cookie.domain, cookie.path, cookie.name)
1710        finally:
1711            self._cookies_lock.release()
1712
1713    def clear_expired_cookies(self):
1714        """Discard all expired cookies.
1715
1716        You probably don't need to call this method: expired cookies are never
1717        sent back to the server (provided you're using DefaultCookiePolicy),
1718        this method is called by CookieJar itself every so often, and the
1719        .save() method won't save expired cookies anyway (unless you ask
1720        otherwise by passing a true ignore_expires argument).
1721
1722        """
1723        self._cookies_lock.acquire()
1724        try:
1725            now = time.time()
1726            for cookie in self:
1727                if cookie.is_expired(now):
1728                    self.clear(cookie.domain, cookie.path, cookie.name)
1729        finally:
1730            self._cookies_lock.release()
1731
1732    def __iter__(self):
1733        return deepvalues(self._cookies)
1734
1735    def __len__(self):
1736        """Return number of contained cookies."""
1737        i = 0
1738        for cookie in self: i = i + 1
1739        return i
1740
1741    @as_native_str()
1742    def __repr__(self):
1743        r = []
1744        for cookie in self: r.append(repr(cookie))
1745        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1746
1747    def __str__(self):
1748        r = []
1749        for cookie in self: r.append(str(cookie))
1750        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1751
1752
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Raised when a cookies file does not have the expected format."""
1755
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not define a file format itself: save() raises
    NotImplementedError, and load() delegates the parsing to a
    _really_load(f, filename, ignore_discard, ignore_expires) method that
    subclasses are expected to provide.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file name used by load() and revert() when they are
         called without one; must support concatenation with a string
        delayload: stored on the instance as a bool; not otherwise used by
         this class
        policy: passed through to CookieJar.__init__

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                # Duck-type check only; the result is discarded.
                filename+""
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt and
                # SystemExit are not turned into ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented here; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is opened, handed to self._really_load() together
        with the two ignore_* flags, and always closed afterwards.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise restored if loading fails with any other
        exception, which is then re-raised unchanged.

        Raises ValueError if no filename is given and self.filename is unset.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Roll back on *any* failure, not only IOError/LoadError, so
                # the jar is never left emptied or half-loaded.
                self._cookies = old_state
                raise
1815
1816
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The format is extended a bit compared to plain LWP: flags such as
    path_spec, port_spec and domain_dot are written out as well.

    The first pair is the cookie's own name/value, followed by path and
    domain, the optional standard attributes, any extra attributes from
    cookie._rest in sorted key order, and finally the version.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]

    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # Value-less flags, emitted only when set.
    if cookie.path_specified:
        pairs.append(("path_spec", None))
    if cookie.port_specified:
        pairs.append(("port_spec", None))
    if cookie.domain_initial_dot:
        pairs.append(("domain_dot", None))
    if cookie.secure:
        pairs.append(("secure", None))

    if cookie.expires:
        expiry_text = time2isoz(float(cookie.expires))
        pairs.append(("expires", expiry_text))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    for key in sorted(cookie._rest.keys()):
        pairs.append((key, str(cookie._rest[key])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1844
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard: when false, cookies whose discard flag is set are
         left out
        ignore_expires: when false, cookies that are expired at the time of
         the call are left out

        The result ends with a newline when at least one cookie is written,
        and is the empty string otherwise.

        """
        now = time.time()
        headers = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            headers.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # The trailing "" makes join() terminate the last header with "\n".
        return "\n".join(headers+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename as a "#LWP-Cookies-2.0" file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  ignore_discard and ignore_expires are passed on to
        as_lwp_str().  An existing file is overwritten.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse the open file f and store the cookies it describes.

        f: file object positioned at the start of the file
        filename: only used in error messages
        ignore_discard / ignore_expires: when false, cookies marked discard /
         already expired are skipped instead of stored

        The first line must match self.magic_re (not defined in this class;
        presumably inherited from a base class), otherwise LoadError is
        raised.  After that, only lines starting with "Set-Cookie3:" are
        parsed; everything else is ignored.

        Raises LoadError if a line cannot be parsed.  IOError raised while
        reading propagates unchanged.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the generic handler below can always report it,
        # even when the very first readline() is what fails.
        line = None
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    # The first pair is the cookie itself; the remaining
                    # pairs are its attributes.
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # A flag written without a value means "true".
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # No usable expiry date: treat as a session cookie.
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1967
1968
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Searched for in the first line of a file before it is parsed.
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Written verbatim at the top of every saved file.
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse the open cookies.txt file f and store its cookies.

        f: file object positioned at the start of the file
        filename: only used in error messages
        ignore_discard / ignore_expires: when false, cookies marked discard /
         already expired are skipped instead of stored

        The first line must match magic_re, otherwise LoadError is raised.
        Every other line that is not blank and does not start with "#" or
        "$" must consist of exactly seven tab-separated fields: domain,
        domain_specified, path, secure, expires, name, value.

        Raises LoadError if a line cannot be parsed, including when the
        domain_specified flag disagrees with whether the domain starts with
        a dot.  IOError raised while reading propagates unchanged.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            # Redundant when called through load(), which closes f as well.
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound up front so the generic handler below can always report it,
        # even when the very first readline() is what fails.
        line = None
        try:
            for line in iter(f.readline, ""):
                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped.startswith(("#", "$")) or stripped == "":
                    continue

                (domain, domain_specified, path, secure, expires, name,
                 value) = line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                if domain_specified != initial_dot:
                    # Previously an assert, which python -O strips; an
                    # explicit raise keeps the check in every mode.  It is
                    # converted to LoadError by the handler below.
                    raise ValueError(
                        "domain_specified flag does not match domain %r" %
                        domain)

                discard = False
                if expires == "":
                    # No expiry date: treat as a session cookie.
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename in the Netscape cookies.txt format.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  When ignore_discard / ignore_expires are false, cookies
        marked discard / already expired are not written.  An existing file
        is overwritten.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                if cookie.domain.startswith("."):
                    initial_dot = "TRUE"
                else:
                    initial_dot = "FALSE"
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
2111