1r"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
# Public names exported by a star-import of this module.
__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
31import copy
32import datetime
33import re
34import time
35import urllib.parse, urllib.request
36import threading as _threading
37import http.client  # only for the default HTTP port
38from calendar import timegm
39
debug = False   # set to True to enable debugging via the logging module
logger = None   # looked up lazily by _debug() the first time it is needed

def _debug(*args):
    """Forward *args* to the module logger's debug() method.

    Nothing happens (and None is returned) while the module-level ``debug``
    flag is false.  The "http.cookiejar" logger is fetched on first use and
    cached in the module-level ``logger`` variable; the logging module is
    only imported at that point.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
51
52
# The default HTTP port, kept as a string.
DEFAULT_HTTP_PORT = "%d" % http.client.HTTP_PORT
# Error text for operations that need a filename when none is available.
MISSING_FILENAME_TEXT = (
    "a filename was not supplied "
    "(nor was the CookieJar instance initialised with one)"
)
56
def _warn_unhandled_exception():
    """Issue a warning carrying the traceback of the current exception.

    There are a few catch-all except: statements in this module, for
    catching input that's bad in unexpected ways.  This helper is meant to
    be called from such a handler so that the swallowed exception is at
    least reported.  The warning is attributed to the caller
    (``stacklevel=2``).
    """
    import traceback, warnings
    report = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % report, stacklevel=2)
66
67
68# Date/time conversion
69# -----------------------------------------------------------------------------
70
EPOCH_YEAR = 1970
def _timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or None.

    Only the first six items of *tt* (year, month, day, hour, minute,
    second) are range-checked; calendar.timegm() does the conversion.
    None is returned when any field is out of range.  Hour 24 and seconds
    up to 61 are accepted.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    date_ok = year >= EPOCH_YEAR and 1 <= month <= 12 and 1 <= mday <= 31
    clock_ok = 0 <= hour <= 24 and 0 <= minute <= 59 and 0 <= sec <= 61
    if date_ok and clock_ok:
        return timegm(tt)
    return None
79
# English day/month abbreviations used when formatting and parsing dates.
# DAYS is indexed by datetime.weekday() (Monday == 0); MONTHS by month - 1.
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month names for case-insensitive lookup.  Built with a
# comprehension so that no loop variable is left behind in the module
# namespace.
MONTHS_LOWER = [month.lower() for month in MONTHS]
85
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated and emit DeprecationWarning
    # on recent Python versions.  The broken-down fields are identical.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
104
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Use timezone-aware constructors: datetime.utcnow() and
    # datetime.utcfromtimestamp() are deprecated and emit DeprecationWarning
    # on recent Python versions.  The broken-down fields are identical.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
123
124
# Zone names treated as zero offset (only membership is used).
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

# Optional sign, one or two hour digits, optional colon, optional minutes.
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by *tz*, or None.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800",
    "+0100" or "+01:30" are converted to seconds.  Anything else is
    unrecognised and yields None.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
141
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert already-split date/time fields to seconds since the epoch.

    All arguments are strings as captured by the date regular expressions;
    hr, min, sec and tz may be None.  mon may be an English month
    abbreviation (any case) or a month number.  A missing tz means UTC.

    Returns None if the year exceeds datetime.MAXYEAR, the month is not
    recognised, a field is out of range, or the timezone is unknown.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE accepts fractional seconds ("29.5"); int() alone would
    # raise ValueError on those, so go through float and truncate.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year within
        # 50 years of the current one
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
197
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, yr, hr, minute, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month ("Foo"
            # matches); an unknown name is an unrecognised date, not an error.
            return None
        # All fields are pure digit strings here, so the result is an
        # integer, as documented.
        return _timegm((int(yr), mon, int(day),
                        int(hr), int(minute), int(sec)))

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
279
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is skipped.  None is returned when the text does not
    match ISO_DATE_RE.

    """
    found = ISO_DATE_RE.search(text.lstrip())
    if found is None:
        return None  # bad format

    # XXX the regexp captures an eighth group (the minutes part of a
    #   numeric timezone) which is ignored here: is this the right thing
    #   to do?
    yr, mon, day, hr, minute, sec, tz = found.groups()[:7]
    return _str2time(day, mon, yr, hr, minute, sec, tz)
326
327
328# Header parsing
329# -----------------------------------------------------------------------------
330
def unmatched(match):
    """Return the searched string with the matched span cut out.

    *match* is an re.Match object; the text before and after the whole
    match (group 0) is joined and returned.
    """
    begin, stop = match.span()
    subject = match.string
    return subject[:begin] + subject[stop:]
335
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be an iterable of strings, not a single string.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    parsed = []
    for header in header_values:
        remaining = header
        current = []
        while remaining:
            token = HEADER_TOKEN_RE.search(remaining)
            if token is not None:
                name = token.group(1)
                remaining = unmatched(token)
                quoted = HEADER_QUOTED_VALUE_RE.search(remaining)
                if quoted is not None:
                    # quoted value: drop the quotes, undo backslash escapes
                    remaining = unmatched(quoted)
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(remaining)
                    if plain is not None:
                        # unquoted value
                        remaining = unmatched(plain)
                        value = plain.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                current.append((name, value))
                continue

            stripped = remaining.lstrip()
            if stripped.startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                remaining = stripped[1:]
                if current:
                    parsed.append(current)
                current = []
                continue

            # skip junk
            cleaned, substitutions = re.subn(r"^[=\s;]*", "", remaining)
            assert substitutions > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (header, remaining, current))
            remaining = cleaned
        if current:
            parsed.append(current)
    return parsed
424
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Pairs within one inner list are joined with "; ", the inner lists
    with ", ".  A value of None produces a bare key.  Attribute values are
    quoted if needed, i.e. unless they consist of word characters only.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    rendered = []
    for pairs in lists:
        parts = []
        for key, value in pairs:
            if value is None:
                parts.append(key)
                continue
            if re.search(r"^\w+$", value) is None:
                # escape " and \ before wrapping the value in quotes
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            parts.append("%s=%s" % (key, value))
        if parts:
            rendered.append("; ".join(parts))
    return ", ".join(rendered)
450
def strip_quotes(text):
    """Remove one leading and one trailing double quote from text, if present.

    The two ends are handled independently, so an unbalanced quote is
    removed as well.
    """
    first = 1 if text.startswith('"') else 0
    text = text[first:]
    if text.endswith('"'):
        return text[:-1]
    return text
457
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    Each header is split on ";" into name[=value] parameters.  The result is
    a list with one list of (name, value) pairs per header that yielded any
    pairs; value is None when the parameter had no "=".  The first parameter
    is the cookie itself and is kept verbatim; for later parameters, known
    cookie-attribute names are lowercased, "expires" is converted with
    http2time (None if invalid) and "version" has its quotes stripped.  A
    ("version", "0") pair is appended when the header carried no version.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    parsed = []
    for header in ns_headers:
        attrs = []
        saw_version = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, chunk in enumerate(header.split(';')):
            name, equals, raw = chunk.strip().partition('=')
            name = name.strip()

            if not name:
                if index == 0:
                    # a nameless cookie: give up on this header
                    break
                continue

            # allow for a distinction between present and empty and missing
            # altogether
            value = raw.strip() if equals else None

            if index != 0:
                lowered = name.lower()
                if lowered in known_attrs:
                    name = lowered

                if name == "version":
                    # This is an RFC 2109 cookie.
                    if value is not None:
                        value = strip_quotes(value)
                    saw_version = True
                elif name == "expires" and value is not None:
                    # convert expires date to seconds since epoch
                    value = http2time(strip_quotes(value))  # None if invalid
            attrs.append((name, value))

        if attrs:
            if not saw_version:
                attrs.append(("version", "0"))
            parsed.append(attrs)

    return parsed
524
525
# Matches text that ends in a dot followed by digits (IPv4-address-like).
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        return False
    if text == "":
        return False
    # a name may neither start nor end with a dot
    return not (text.startswith(".") or text.endswith("."))
541
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    # A must have the form NB, i.e. *end* with B (B = .B').  Merely
    # containing B somewhere is not enough: "x.y.com.evil.org" must not
    # domain-match ".y.com".  Since A != B here, a suffix match also
    # guarantees that N is non-empty.
    if not B.startswith(".") or not A.endswith(B):
        return False
    return is_HDN(A) and is_HDN(B[1:])
580
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Only text that looks like an IPv4
    address (per IPV4_RE) is rejected.

    """
    return not IPV4_RE.search(text)
590
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If either side is not a (liberal) host domain name,
    only exact equality matches.  Otherwise a B with a leading dot matches
    any A that ends with it, and a B without one must equal A.

    """
    A = A.lower()
    B = B.lower()
    if not liberal_is_HDN(A) or not liberal_is_HDN(B):
        # IP addresses only match when equal
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
610
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is the network-location part of the request's full URL, or the
    request's "Host" header when the URL has none.  A trailing ":port" is
    removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    if netloc != "":
        host = netloc
    else:
        host = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
627
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no
    dot and does not look like an IPv4 address; otherwise the two are equal.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
638
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path is taken from the request's full URL and passed through
    escape_path(); a leading "/" is added if it is missing.
    """
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    escaped = escape_path(raw_path)
    if escaped.startswith("/"):
        return escaped
    # fix bad RFC 2396 absoluteURI
    return "/" + escaped
648
def request_port(request):
    """Return the port of request.host as a string.

    Everything after the first ":" in request.host is taken as the port.
    DEFAULT_HTTP_PORT is returned when there is no ":"; None is returned
    (after a debug message) when the port is not numeric.
    """
    _, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
662
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape captured by match with uppercase hex digits."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
682
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: the part before it is A, the rest is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
717
def is_third_party(request):
    """Return True if request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    The origin request-host is read from request.origin_req_host.

    """
    origin_reach = reach(request.origin_req_host)
    return not domain_match(request_host(request), origin_reach)
733
734
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    # Attributes shown by __repr__, in constructor-argument order.
    _REPR_ATTRS = ("version", "name", "value",
                   "port", "port_specified",
                   "domain", "domain_specified", "domain_initial_dot",
                   "path", "path_specified",
                   "secure", "expires", "discard", "comment", "comment_url",
                   )

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version is converted with int() and expires with int(float()) unless
        they are None; domain is lowercased; rest (the mapping of
        nonstandard cookie-attributes) is shallow-copied.  ValueError is
        raised if port is None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the nonstandard cookie-attribute name is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the nonstandard cookie-attribute name, or default."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the nonstandard cookie-attribute name to value."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= now.

        now defaults to the current time; a cookie whose expires is None
        never expires.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        args = ["%s=%s" % (attr, repr(getattr(self, attr)))
                for attr in self._REPR_ATTRS]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
831
832
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customized policy.

    set_ok and return_ok must be provided by subclasses; the two
    *_return_ok pre-checks accept everything by default.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
864
865
class DefaultCookiePolicy(CookiePolicy):
    """Implements the standard rules for accepting and returning cookies.

    The policy is tuned through the constructor switches, each of which is
    stored as a public attribute of the same name.  The Domain* class
    constants are bit flags intended to be OR-ed together and supplied as
    strict_ns_domain.

    """

    # Individual strictness flags for Netscape-cookie domain checking.
    DomainStrictNoDots = 1
    DomainStrictNonDomain = 2
    DomainRFC2965Match = 4

    # Ready-made combinations of the flags above.
    DomainLiberal = 0
    DomainStrict = DomainStrictNoDots | DomainStrictNonDomain

    def __init__(self,
                 blocked_domains=None,
                 allowed_domains=None,
                 netscape=True,
                 rfc2965=False,
                 rfc2109_as_netscape=None,
                 hide_cookie2=False,
                 strict_domain=False,
                 strict_rfc2965_unverifiable=True,
                 strict_ns_unverifiable=False,
                 strict_ns_domain=DomainLiberal,
                 strict_ns_set_initial_dollar=False,
                 strict_ns_set_path=False,
                 ):
        """Constructor arguments should be passed as keyword arguments only."""
        self.netscape = netscape
        self.rfc2965 = rfc2965
        self.rfc2109_as_netscape = rfc2109_as_netscape
        self.hide_cookie2 = hide_cookie2
        self.strict_domain = strict_domain
        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
        self.strict_ns_unverifiable = strict_ns_unverifiable
        self.strict_ns_domain = strict_ns_domain
        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
        self.strict_ns_set_path = strict_ns_set_path

        # No block-list means "block nothing": store an empty tuple.
        self._blocked_domains = (
            () if blocked_domains is None else tuple(blocked_domains))
        # No allow-list means "allow everything": store None, not ().
        self._allowed_domains = (
            None if allowed_domains is None else tuple(allowed_domains))

    def blocked_domains(self):
        """Return the sequence of blocked domains (as a tuple)."""
        return self._blocked_domains

    def set_blocked_domains(self, blocked_domains):
        """Set the sequence of blocked domains."""
        self._blocked_domains = tuple(blocked_domains)

    def is_blocked(self, domain):
        """Return True if domain matches any entry of the block-list."""
        return any(user_domain_match(domain, entry)
                   for entry in self._blocked_domains)

    def allowed_domains(self):
        """Return None, or the sequence of allowed domains (as a tuple)."""
        return self._allowed_domains

    def set_allowed_domains(self, allowed_domains):
        """Set the sequence of allowed domains, or None."""
        self._allowed_domains = (
            None if allowed_domains is None else tuple(allowed_domains))

    def is_not_allowed(self, domain):
        """Return True if an allow-list is set and domain matches none of it.

        When no allow-list is configured (None) every domain is permitted,
        so the result is False.

        """
        allow_list = self._allowed_domains
        if allow_list is None:
            return False
        return not any(user_domain_match(domain, entry)
                       for entry in allow_list)

    def set_ok(self, cookie, request):
        """
        If you override .set_ok(), be sure to call this method.  If it returns
        false, so should your subclass (assuming your subclass wants to be more
        strict about which cookies to accept).

        The individual checks are the set_ok_<name>() methods, looked up by
        name and run in order; the first failing check ends the evaluation.

        """
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        assert cookie.name is not None

        checks = ("version", "verifiability", "name", "path", "domain", "port")
        return all(getattr(self, "set_ok_" + check)(cookie, request)
                   for check in checks)

    def set_ok_version(self, cookie, request):
        """Reject cookies lacking a version or using a disabled protocol."""
        version = cookie.version
        if version is None:
            # Version is always set to 0 by parse_ns_headers if it's a Netscape
            # cookie, so this must be an invalid RFC 2965 cookie.
            _debug("   Set-Cookie2 without version attribute (%s=%s)",
                   cookie.name, cookie.value)
            return False
        if version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        if version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def set_ok_verifiability(self, cookie, request):
        """Reject third-party cookies set during unverifiable transactions,
        when the strict_*_unverifiable switch for the cookie's protocol is on.
        """
        if not (request.unverifiable and is_third_party(request)):
            return True
        if cookie.version > 0 and self.strict_rfc2965_unverifiable:
            _debug("   third-party RFC 2965 cookie during "
                   "unverifiable transaction")
            return False
        if cookie.version == 0 and self.strict_ns_unverifiable:
            _debug("   third-party Netscape cookie during "
                   "unverifiable transaction")
            return False
        return True

    def set_ok_name(self, cookie, request):
        """Optionally reject Netscape cookies whose name starts with '$'."""
        # Try and stop servers setting V0 cookies designed to hack other
        # servers that know both V0 and V1 protocols.
        dollar_names_banned = (cookie.version == 0 and
                               self.strict_ns_set_initial_dollar)
        if dollar_names_banned and cookie.name.startswith("$"):
            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
            return False
        return True

    def set_ok_path(self, cookie, request):
        """Check an explicitly given path against the request path.

        The check applies to RFC 2965 cookies always, and to Netscape
        cookies only when strict_ns_set_path is on.

        """
        if not cookie.path_specified:
            return True
        req_path = request_path(request)
        must_path_match = (cookie.version > 0 or
                           (cookie.version == 0 and self.strict_ns_set_path))
        if must_path_match and not self.path_return_ok(cookie.path, request):
            _debug("   path attribute %s is not a prefix of request "
                   "path %s", cookie.path, req_path)
            return False
        return True

    def set_ok_domain(self, cookie, request):
        """Check the cookie domain against the block/allow lists and, when
        the domain was given explicitly, against the request host."""
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if not cookie.domain_specified:
            # Domain was not given by the server: no further checks apply.
            return True

        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        if self.strict_domain and domain.count(".") >= 2:
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            last_dot = domain.rfind(".")
            prev_dot = domain.rfind(".", 0, last_dot)
            if prev_dot == 0:  # domain like .foo.bar
                tld = domain[last_dot+1:]
                sld = domain[prev_dot+1:last_dot]
                generic_slds = (
                    "co", "ac", "com", "edu", "org", "net",
                    "gov", "mil", "int", "aero", "biz", "cat", "coop",
                    "info", "jobs", "mobi", "museum", "name", "pro",
                    "travel", "eu")
                if sld.lower() in generic_slds and len(tld) == 2:
                    # domain like .co.uk
                    _debug("   country-code second level domain %s", domain)
                    return False

        undotted_domain = domain[1:] if domain.startswith(".") else domain
        if "." not in undotted_domain and domain != ".local":
            _debug("   non-local domain %s contains no embedded dot",
                   domain)
            return False

        if cookie.version == 0:
            # Accept if the host, or the host with a dot put in front of it
            # (unless it already has one), ends with the cookie domain.
            host_matches = (erhn.endswith(domain) or
                            erhn.startswith(".") or
                            ("."+erhn).endswith(domain))
            if not host_matches:
                _debug("   effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False

        if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug("   effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False

        if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
            # What is left of the host once the cookie domain is cut off the
            # end must not itself contain a dot (IPv4 hosts are exempt).
            host_prefix = req_host[:-len(domain)]
            if "." in host_prefix and not IPV4_RE.search(req_host):
                _debug("   host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False
        return True

    def set_ok_port(self, cookie, request):
        """Check an explicit port list: every entry must be numeric and one
        of them must equal the request port ("80" if that is unknown)."""
        if not cookie.port_specified:
            return True
        req_port = request_port(request)
        req_port = "80" if req_port is None else str(req_port)
        for candidate in cookie.port.split(","):
            try:
                int(candidate)
            except ValueError:
                _debug("   bad port %s (not numeric)", candidate)
                return False
            if candidate == req_port:
                return True
        _debug("   request port (%s) not found in %s",
               req_port, cookie.port)
        return False

    def return_ok(self, cookie, request):
        """
        If you override .return_ok(), be sure to call this method.  If it
        returns false, so should your subclass (assuming your subclass wants to
        be more strict about which cookies to return).

        The individual checks are the return_ok_<name>() methods, looked up
        by name and run in order; the first failing check ends the
        evaluation.

        """
        # Path has already been checked by .path_return_ok(), and domain
        # blocking done by .domain_return_ok().
        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

        checks = ("version", "verifiability", "secure", "expires", "port",
                  "domain")
        return all(getattr(self, "return_ok_" + check)(cookie, request)
                   for check in checks)

    def return_ok_version(self, cookie, request):
        """Refuse to return cookies whose protocol is switched off."""
        version = cookie.version
        if version > 0 and not self.rfc2965:
            _debug("   RFC 2965 cookies are switched off")
            return False
        if version == 0 and not self.netscape:
            _debug("   Netscape cookies are switched off")
            return False
        return True

    def return_ok_verifiability(self, cookie, request):
        """Refuse to return third-party cookies during unverifiable
        transactions, when the strict_*_unverifiable switch for the cookie's
        protocol is on."""
        if not (request.unverifiable and is_third_party(request)):
            return True
        if cookie.version > 0 and self.strict_rfc2965_unverifiable:
            _debug("   third-party RFC 2965 cookie during unverifiable "
                   "transaction")
            return False
        if cookie.version == 0 and self.strict_ns_unverifiable:
            _debug("   third-party Netscape cookie during unverifiable "
                   "transaction")
            return False
        return True

    def return_ok_secure(self, cookie, request):
        """Return secure cookies only with requests of type "https"."""
        if not cookie.secure or request.type == "https":
            return True
        _debug("   secure cookie with non-secure request")
        return False

    def return_ok_expires(self, cookie, request):
        """Refuse to return a cookie that is expired as of self._now."""
        expired = cookie.is_expired(self._now)
        if expired:
            _debug("   cookie expired")
        return not expired

    def return_ok_port(self, cookie, request):
        """If the cookie has a port list, require the request port (taken
        as "80" when unknown) to be one of its entries."""
        if not cookie.port:
            return True
        req_port = request_port(request)
        if req_port is None:
            req_port = "80"
        if req_port in cookie.port.split(","):
            return True
        _debug("   request port %s does not match cookie port %s",
               req_port, cookie.port)
        return False

    def return_ok_domain(self, cookie, request):
        """Check that the request host is covered by the cookie's domain."""
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain

        # Same domain, but guaranteed to carry a leading dot when non-empty.
        needs_dot = domain and not domain.startswith(".")
        dotdomain = "." + domain if needs_dot else domain

        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
        if (cookie.version == 0 and
                (self.strict_ns_domain & self.DomainStrictNonDomain) and
                not cookie.domain_specified and domain != erhn):
            _debug("   cookie with unspecified domain does not string-compare "
                   "equal to request domain")
            return False

        if cookie.version > 0 and not domain_match(erhn, domain):
            _debug("   effective request-host name %s does not domain-match "
                   "RFC 2965 cookie domain %s", erhn, domain)
            return False
        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
            _debug("   request-host %s does not match Netscape cookie domain "
                   "%s", req_host, domain)
            return False
        return True

    def domain_return_ok(self, domain, request):
        """Liberal pre-check of a cookie domain against the request host.

        This is here as an optimization to avoid having to load lots of MSIE
        cookie files unless necessary.  The user block-list and allow-list
        are applied here as well.

        """
        req_host, erhn = eff_request_host(request)
        # Compare dotted forms so that matches fall on label boundaries.
        if not req_host.startswith("."):
            req_host = "." + req_host
        if not erhn.startswith("."):
            erhn = "." + erhn
        needs_dot = domain and not domain.startswith(".")
        dotdomain = "." + domain if needs_dot else domain
        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
            return False

        if self.is_blocked(domain):
            _debug("   domain %s is in user block-list", domain)
            return False
        if self.is_not_allowed(domain):
            _debug("   domain %s is not in user allow-list", domain)
            return False

        return True

    def path_return_ok(self, path, request):
        """Return True if the request path equals the cookie path or lies
        beneath it on a "/" boundary."""
        _debug("- checking cookie path=%s", path)
        req_path = request_path(request)
        prefix_len = len(path)
        if req_path == path:
            return True
        if req_path.startswith(path):
            # A bare string prefix is not enough: the prefix must end at a
            # path separator.
            if (path.endswith("/") or
                    req_path[prefix_len:prefix_len+1] == "/"):
                return True

        _debug("  %s does not path-match %s", req_path, path)
        return False
1215
def vals_sorted_by_key(adict):
    """Return an iterator over the values of adict, ordered by sorted key."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1219
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested mapping
    and descended into; every other value is yielded as a leaf.

    """
    for value in vals_sorted_by_key(mapping):
        if hasattr(value, "items"):
            yield from deepvalues(value)
        else:
            yield value
1234
1235
1236# Used as second parameter to dict.get() method, to distinguish absent
1237# dict key from one with a None value.
class Absent:
    """Marker used as the dict.get() default to detect a missing key."""
1239
1240class CookieJar:
1241    """Collection of HTTP cookies.
1242
1243    You may not need to know about this class: try
1244    urllib.request.build_opener(HTTPCookieProcessor).open(url).
1245    """
1246
    # Matches any single non-word character; _cookie_attrs() uses it to
    # decide whether a cookie value needs escaping.
    non_word_re = re.compile(r"\W")
    # Captures a double quote or a backslash, so that _cookie_attrs() can
    # put a backslash in front of it.
    quote_re = re.compile(r"([\"\\])")
    # An optional leading dot followed by a run of non-dot characters.
    strict_domain_re = re.compile(r"\.?[^.]*")
    # A run of non-dot characters.
    domain_re = re.compile(r"[^.]*")
    # One or more dots at the start of the string.
    dots_re = re.compile(r"^\.+")

    # A leading "#LWP-Cookies-<digits>.<digits>" marker; the version number
    # is captured.  re.ASCII restricts \d to ASCII digits.
    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
1254
1255    def __init__(self, policy=None):
1256        if policy is None:
1257            policy = DefaultCookiePolicy()
1258        self._policy = policy
1259
1260        self._cookies_lock = _threading.RLock()
1261        self._cookies = {}
1262
1263    def set_policy(self, policy):
1264        self._policy = policy
1265
1266    def _cookies_for_domain(self, domain, request):
1267        cookies = []
1268        if not self._policy.domain_return_ok(domain, request):
1269            return []
1270        _debug("Checking %s for cookies to return", domain)
1271        cookies_by_path = self._cookies[domain]
1272        for path in cookies_by_path.keys():
1273            if not self._policy.path_return_ok(path, request):
1274                continue
1275            cookies_by_name = cookies_by_path[path]
1276            for cookie in cookies_by_name.values():
1277                if not self._policy.return_ok(cookie, request):
1278                    _debug("   not returning cookie")
1279                    continue
1280                _debug("   it's a match")
1281                cookies.append(cookie)
1282        return cookies
1283
1284    def _cookies_for_request(self, request):
1285        """Return a list of cookies to be returned to server."""
1286        cookies = []
1287        for domain in self._cookies.keys():
1288            cookies.extend(self._cookies_for_domain(domain, request))
1289        return cookies
1290
1291    def _cookie_attrs(self, cookies):
1292        """Return a list of cookie-attributes to be returned to server.
1293
1294        like ['foo="bar"; $Path="/"', ...]
1295
1296        The $Version attribute is also added when appropriate (currently only
1297        once per request).
1298
1299        """
1300        # add cookies in order of most specific (ie. longest) path first
1301        cookies.sort(key=lambda a: len(a.path), reverse=True)
1302
1303        version_set = False
1304
1305        attrs = []
1306        for cookie in cookies:
1307            # set version of Cookie header
1308            # XXX
1309            # What should it be if multiple matching Set-Cookie headers have
1310            #  different versions themselves?
1311            # Answer: there is no answer; was supposed to be settled by
1312            #  RFC 2965 errata, but that may never appear...
1313            version = cookie.version
1314            if not version_set:
1315                version_set = True
1316                if version > 0:
1317                    attrs.append("$Version=%s" % version)
1318
1319            # quote cookie value if necessary
1320            # (not for Netscape protocol, which already has any quotes
1321            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1322            if ((cookie.value is not None) and
1323                self.non_word_re.search(cookie.value) and version > 0):
1324                value = self.quote_re.sub(r"\\\1", cookie.value)
1325            else:
1326                value = cookie.value
1327
1328            # add cookie-attributes to be returned in Cookie header
1329            if cookie.value is None:
1330                attrs.append(cookie.name)
1331            else:
1332                attrs.append("%s=%s" % (cookie.name, value))
1333            if version > 0:
1334                if cookie.path_specified:
1335                    attrs.append('$Path="%s"' % cookie.path)
1336                if cookie.domain.startswith("."):
1337                    domain = cookie.domain
1338                    if (not cookie.domain_initial_dot and
1339                        domain.startswith(".")):
1340                        domain = domain[1:]
1341                    attrs.append('$Domain="%s"' % domain)
1342                if cookie.port is not None:
1343                    p = "$Port"
1344                    if cookie.port_specified:
1345                        p = p + ('="%s"' % cookie.port)
1346                    attrs.append(p)
1347
1348        return attrs
1349
1350    def add_cookie_header(self, request):
1351        """Add correct Cookie: header to request (urllib.request.Request object).
1352
1353        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1354
1355        """
1356        _debug("add_cookie_header")
1357        self._cookies_lock.acquire()
1358        try:
1359
1360            self._policy._now = self._now = int(time.time())
1361
1362            cookies = self._cookies_for_request(request)
1363
1364            attrs = self._cookie_attrs(cookies)
1365            if attrs:
1366                if not request.has_header("Cookie"):
1367                    request.add_unredirected_header(
1368                        "Cookie", "; ".join(attrs))
1369
1370            # if necessary, advertise that we know RFC 2965
1371            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1372                not request.has_header("Cookie2")):
1373                for cookie in cookies:
1374                    if cookie.version != 1:
1375                        request.add_unredirected_header("Cookie2", '$Version="1"')
1376                        break
1377
1378        finally:
1379            self._cookies_lock.release()
1380
1381        self.clear_expired_cookies()
1382
1383    def _normalized_cookie_tuples(self, attrs_set):
1384        """Return list of tuples containing normalised cookie information.
1385
1386        attrs_set is the list of lists of key,value pairs extracted from
1387        the Set-Cookie or Set-Cookie2 headers.
1388
1389        Tuples are name, value, standard, rest, where name and value are the
1390        cookie name and value, standard is a dictionary containing the standard
1391        cookie-attributes (discard, secure, version, expires or max-age,
1392        domain, path and port) and rest is a dictionary containing the rest of
1393        the cookie-attributes.
1394
1395        """
1396        cookie_tuples = []
1397
1398        boolean_attrs = "discard", "secure"
1399        value_attrs = ("version",
1400                       "expires", "max-age",
1401                       "domain", "path", "port",
1402                       "comment", "commenturl")
1403
1404        for cookie_attrs in attrs_set:
1405            name, value = cookie_attrs[0]
1406
1407            # Build dictionary of standard cookie-attributes (standard) and
1408            # dictionary of other cookie-attributes (rest).
1409
1410            # Note: expiry time is normalised to seconds since epoch.  V0
1411            # cookies should have the Expires cookie-attribute, and V1 cookies
1412            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1413            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1414            # accept either (but prefer Max-Age).
1415            max_age_set = False
1416
1417            bad_cookie = False
1418
1419            standard = {}
1420            rest = {}
1421            for k, v in cookie_attrs[1:]:
1422                lc = k.lower()
1423                # don't lose case distinction for unknown fields
1424                if lc in value_attrs or lc in boolean_attrs:
1425                    k = lc
1426                if k in boolean_attrs and v is None:
1427                    # boolean cookie-attribute is present, but has no value
1428                    # (like "discard", rather than "port=80")
1429                    v = True
1430                if k in standard:
1431                    # only first value is significant
1432                    continue
1433                if k == "domain":
1434                    if v is None:
1435                        _debug("   missing value for domain attribute")
1436                        bad_cookie = True
1437                        break
1438                    # RFC 2965 section 3.3.3
1439                    v = v.lower()
1440                if k == "expires":
1441                    if max_age_set:
1442                        # Prefer max-age to expires (like Mozilla)
1443                        continue
1444                    if v is None:
1445                        _debug("   missing or invalid value for expires "
1446                              "attribute: treating as session cookie")
1447                        continue
1448                if k == "max-age":
1449                    max_age_set = True
1450                    try:
1451                        v = int(v)
1452                    except ValueError:
1453                        _debug("   missing or invalid (non-numeric) value for "
1454                              "max-age attribute")
1455                        bad_cookie = True
1456                        break
1457                    # convert RFC 2965 Max-Age to seconds since epoch
1458                    # XXX Strictly you're supposed to follow RFC 2616
1459                    #   age-calculation rules.  Remember that zero Max-Age
1460                    #   is a request to discard (old and new) cookie, though.
1461                    k = "expires"
1462                    v = self._now + v
1463                if (k in value_attrs) or (k in boolean_attrs):
1464                    if (v is None and
1465                        k not in ("port", "comment", "commenturl")):
1466                        _debug("   missing value for %s attribute" % k)
1467                        bad_cookie = True
1468                        break
1469                    standard[k] = v
1470                else:
1471                    rest[k] = v
1472
1473            if bad_cookie:
1474                continue
1475
1476            cookie_tuples.append((name, value, standard, rest))
1477
1478        return cookie_tuples
1479
1480    def _cookie_from_cookie_tuple(self, tup, request):
1481        # standard is dict of standard cookie-attributes, rest is dict of the
1482        # rest of them
1483        name, value, standard, rest = tup
1484
1485        domain = standard.get("domain", Absent)
1486        path = standard.get("path", Absent)
1487        port = standard.get("port", Absent)
1488        expires = standard.get("expires", Absent)
1489
1490        # set the easy defaults
1491        version = standard.get("version", None)
1492        if version is not None:
1493            try:
1494                version = int(version)
1495            except ValueError:
1496                return None  # invalid version, ignore cookie
1497        secure = standard.get("secure", False)
1498        # (discard is also set if expires is Absent)
1499        discard = standard.get("discard", False)
1500        comment = standard.get("comment", None)
1501        comment_url = standard.get("commenturl", None)
1502
1503        # set default path
1504        if path is not Absent and path != "":
1505            path_specified = True
1506            path = escape_path(path)
1507        else:
1508            path_specified = False
1509            path = request_path(request)
1510            i = path.rfind("/")
1511            if i != -1:
1512                if version == 0:
1513                    # Netscape spec parts company from reality here
1514                    path = path[:i]
1515                else:
1516                    path = path[:i+1]
1517            if len(path) == 0: path = "/"
1518
1519        # set default domain
1520        domain_specified = domain is not Absent
1521        # but first we have to remember whether it starts with a dot
1522        domain_initial_dot = False
1523        if domain_specified:
1524            domain_initial_dot = bool(domain.startswith("."))
1525        if domain is Absent:
1526            req_host, erhn = eff_request_host(request)
1527            domain = erhn
1528        elif not domain.startswith("."):
1529            domain = "."+domain
1530
1531        # set default port
1532        port_specified = False
1533        if port is not Absent:
1534            if port is None:
1535                # Port attr present, but has no value: default to request port.
1536                # Cookie should then only be sent back on that port.
1537                port = request_port(request)
1538            else:
1539                port_specified = True
1540                port = re.sub(r"\s+", "", port)
1541        else:
1542            # No port attr present.  Cookie can be sent back on any port.
1543            port = None
1544
1545        # set default expires and discard
1546        if expires is Absent:
1547            expires = None
1548            discard = True
1549        elif expires <= self._now:
1550            # Expiry date in past is request to delete cookie.  This can't be
1551            # in DefaultCookiePolicy, because can't delete cookies there.
1552            try:
1553                self.clear(domain, path, name)
1554            except KeyError:
1555                pass
1556            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
1557                   domain, path, name)
1558            return None
1559
1560        return Cookie(version,
1561                      name, value,
1562                      port, port_specified,
1563                      domain, domain_specified, domain_initial_dot,
1564                      path, path_specified,
1565                      secure,
1566                      expires,
1567                      discard,
1568                      comment,
1569                      comment_url,
1570                      rest)
1571
1572    def _cookies_from_attrs_set(self, attrs_set, request):
1573        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1574
1575        cookies = []
1576        for tup in cookie_tuples:
1577            cookie = self._cookie_from_cookie_tuple(tup, request)
1578            if cookie: cookies.append(cookie)
1579        return cookies
1580
1581    def _process_rfc2109_cookies(self, cookies):
1582        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1583        if rfc2109_as_ns is None:
1584            rfc2109_as_ns = not self._policy.rfc2965
1585        for cookie in cookies:
1586            if cookie.version == 1:
1587                cookie.rfc2109 = True
1588                if rfc2109_as_ns:
1589                    # treat 2109 cookies as Netscape cookies rather than
1590                    # as RFC2965 cookies
1591                    cookie.version = 0
1592
1593    def make_cookies(self, response, request):
1594        """Return sequence of Cookie objects extracted from response object."""
1595        # get cookie-attributes for RFC 2965 and Netscape protocols
1596        headers = response.info()
1597        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1598        ns_hdrs = headers.get_all("Set-Cookie", [])
1599        self._policy._now = self._now = int(time.time())
1600
1601        rfc2965 = self._policy.rfc2965
1602        netscape = self._policy.netscape
1603
1604        if ((not rfc2965_hdrs and not ns_hdrs) or
1605            (not ns_hdrs and not rfc2965) or
1606            (not rfc2965_hdrs and not netscape) or
1607            (not netscape and not rfc2965)):
1608            return []  # no relevant cookie headers: quick exit
1609
1610        try:
1611            cookies = self._cookies_from_attrs_set(
1612                split_header_words(rfc2965_hdrs), request)
1613        except Exception:
1614            _warn_unhandled_exception()
1615            cookies = []
1616
1617        if ns_hdrs and netscape:
1618            try:
1619                # RFC 2109 and Netscape cookies
1620                ns_cookies = self._cookies_from_attrs_set(
1621                    parse_ns_headers(ns_hdrs), request)
1622            except Exception:
1623                _warn_unhandled_exception()
1624                ns_cookies = []
1625            self._process_rfc2109_cookies(ns_cookies)
1626
1627            # Look for Netscape cookies (from Set-Cookie headers) that match
1628            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1629            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1630            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1631            # bundled in with the Netscape cookies for this purpose, which is
1632            # reasonable behaviour.
1633            if rfc2965:
1634                lookup = {}
1635                for cookie in cookies:
1636                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1637
1638                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1639                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1640                    return key not in lookup
1641                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1642
1643            if ns_cookies:
1644                cookies.extend(ns_cookies)
1645
1646        return cookies
1647
1648    def set_cookie_if_ok(self, cookie, request):
1649        """Set a cookie if policy says it's OK to do so."""
1650        self._cookies_lock.acquire()
1651        try:
1652            self._policy._now = self._now = int(time.time())
1653
1654            if self._policy.set_ok(cookie, request):
1655                self.set_cookie(cookie)
1656
1657
1658        finally:
1659            self._cookies_lock.release()
1660
1661    def set_cookie(self, cookie):
1662        """Set a cookie, without checking whether or not it should be set."""
1663        c = self._cookies
1664        self._cookies_lock.acquire()
1665        try:
1666            if cookie.domain not in c: c[cookie.domain] = {}
1667            c2 = c[cookie.domain]
1668            if cookie.path not in c2: c2[cookie.path] = {}
1669            c3 = c2[cookie.path]
1670            c3[cookie.name] = cookie
1671        finally:
1672            self._cookies_lock.release()
1673
1674    def extract_cookies(self, response, request):
1675        """Extract cookies from response, where allowable given the request."""
1676        _debug("extract_cookies: %s", response.info())
1677        self._cookies_lock.acquire()
1678        try:
1679            for cookie in self.make_cookies(response, request):
1680                if self._policy.set_ok(cookie, request):
1681                    _debug(" setting cookie: %s", cookie)
1682                    self.set_cookie(cookie)
1683        finally:
1684            self._cookies_lock.release()
1685
1686    def clear(self, domain=None, path=None, name=None):
1687        """Clear some cookies.
1688
1689        Invoking this method without arguments will clear all cookies.  If
1690        given a single argument, only cookies belonging to that domain will be
1691        removed.  If given two arguments, cookies belonging to the specified
1692        path within that domain are removed.  If given three arguments, then
1693        the cookie with the specified name, path and domain is removed.
1694
1695        Raises KeyError if no matching cookie exists.
1696
1697        """
1698        if name is not None:
1699            if (domain is None) or (path is None):
1700                raise ValueError(
1701                    "domain and path must be given to remove a cookie by name")
1702            del self._cookies[domain][path][name]
1703        elif path is not None:
1704            if domain is None:
1705                raise ValueError(
1706                    "domain must be given to remove cookies by path")
1707            del self._cookies[domain][path]
1708        elif domain is not None:
1709            del self._cookies[domain]
1710        else:
1711            self._cookies = {}
1712
1713    def clear_session_cookies(self):
1714        """Discard all session cookies.
1715
1716        Note that the .save() method won't save session cookies anyway, unless
1717        you ask otherwise by passing a true ignore_discard argument.
1718
1719        """
1720        self._cookies_lock.acquire()
1721        try:
1722            for cookie in self:
1723                if cookie.discard:
1724                    self.clear(cookie.domain, cookie.path, cookie.name)
1725        finally:
1726            self._cookies_lock.release()
1727
1728    def clear_expired_cookies(self):
1729        """Discard all expired cookies.
1730
1731        You probably don't need to call this method: expired cookies are never
1732        sent back to the server (provided you're using DefaultCookiePolicy),
1733        this method is called by CookieJar itself every so often, and the
1734        .save() method won't save expired cookies anyway (unless you ask
1735        otherwise by passing a true ignore_expires argument).
1736
1737        """
1738        self._cookies_lock.acquire()
1739        try:
1740            now = time.time()
1741            for cookie in self:
1742                if cookie.is_expired(now):
1743                    self.clear(cookie.domain, cookie.path, cookie.name)
1744        finally:
1745            self._cookies_lock.release()
1746
1747    def __iter__(self):
1748        return deepvalues(self._cookies)
1749
1750    def __len__(self):
1751        """Return number of contained cookies."""
1752        i = 0
1753        for cookie in self: i = i + 1
1754        return i
1755
1756    def __repr__(self):
1757        r = []
1758        for cookie in self: r.append(repr(cookie))
1759        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1760
1761    def __str__(self):
1762        r = []
1763        for cookie in self: r.append(str(cookie))
1764        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1765
1766
# derives from OSError for backwards-compatibility with Python 2.4.0
class LoadError(OSError):
    """Raised when a cookies file is not in the format a jar expects."""
1769
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not define a file format itself: save() raises
    NotImplementedError and load() hands the open file to
    self._really_load(), which is not defined here.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file name used by load(), revert() and save();
          must be string-like if given, otherwise ValueError is raised.
        delayload: stored as a bool on the instance.
        policy: passed on to CookieJar.__init__().

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a str
            # is accepted.  Only ordinary errors are translated, so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The file is opened in text mode and handed to
        self._really_load() together with the ignore_* flags.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or OSError) if reversion is not successful; the
        object's state will not be altered if this happens.  The previous
        cookies are likewise put back if loading fails with any other
        exception, which is then re-raised unchanged.

        Raises ValueError if no filename is available.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Restore on *any* failure (not only OSError): e.g. a
                # decoding error while reading the file must not leave the
                # jar emptied or half-loaded.
                self._cookies = old_state
                raise
1826
1827
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The result is a single header-words string built by join_header_words():
    the cookie's name/value pair first, then path and domain, then the
    optional port, the flag words path_spec, port_spec, domain_dot, secure
    and discard (emitted without a value when true), expires (formatted by
    time2isoz), comment, commenturl, the entries of cookie._rest in sorted
    key order, and finally version.

    """
    h = [(cookie.name, cookie.value),
         ("path", cookie.path),
         ("domain", cookie.domain)]
    if cookie.port is not None: h.append(("port", cookie.port))
    if cookie.path_specified: h.append(("path_spec", None))
    if cookie.port_specified: h.append(("port_spec", None))
    if cookie.domain_initial_dot: h.append(("domain_dot", None))
    if cookie.secure: h.append(("secure", None))
    # Test against None, not truthiness: an expiry of 0 (the epoch) is a real
    # timestamp and must be written out, otherwise the cookie would be saved
    # without an expiry at all.
    if cookie.expires is not None:
        h.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard: h.append(("discard", None))
    if cookie.comment: h.append(("comment", cookie.comment))
    if cookie.comment_url: h.append(("commenturl", cookie.comment_url))

    # non-standard attributes, in a stable order
    for k in sorted(cookie._rest):
        h.append((k, str(cookie._rest[k])))

    h.append(("version", str(cookie.version)))

    return join_header_words([h])
1855
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        Cookies whose discard attribute is true are left out unless
        ignore_discard is true; cookies that report themselves expired are
        left out unless ignore_expires is true.  Every header line, including
        the last one, is terminated by a newline.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # the trailing empty item yields the final newline
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default: self.filename) in LWP format.

        Raises ValueError if no filename is available.  The ignore_* flags
        are passed on to as_lwp_str().

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3" lines from the open text file f into the jar.

        filename is only used in error messages.  The first line must match
        self.magic_re (not defined in this class; presumably inherited),
        otherwise LoadError is raised.  Lines that do not start with
        "Set-Cookie3:" are skipped.  OSError propagates unchanged; any other
        failure is reported through _warn_unhandled_exception() and turned
        into LoadError.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error message below can always be formatted,
        # even when the very first read inside the try block fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    # flag attributes default to False when absent
                    standard = dict.fromkeys(boolean_attrs, False)
                    rest = {}
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # a bare flag word (no value) means "true"
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # no (usable) expiry: treat as a session cookie
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1974
1975
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read cookies.txt-format lines from the open text file f.

        filename is only used in error messages.  The first line must match
        magic_re, otherwise LoadError is raised.  Blank lines and lines
        starting with "#" or "$" are skipped; every other line must hold
        seven tab-separated fields.  OSError propagates unchanged; any other
        failure is reported through _warn_unhandled_exception() and turned
        into LoadError.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound up front so the error message below can always be formatted,
        # even when the very first read inside the try block fails.
        line = ""
        try:
            for line in iter(f.readline, ""):
                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped.startswith(("#", "$")) or stripped == "":
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # NOTE: this consistency check is an assert, so it is skipped
                # when Python runs with -O; when active, a mismatch surfaces
                # as LoadError via the handler below.
                assert domain_specified == initial_dot

                discard = False
                if expires == "":
                    # no expiry recorded: treat as a session cookie
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the jar to filename (default: self.filename) as cookies.txt.

        Raises ValueError if no filename is available.  Cookies whose discard
        attribute is true are left out unless ignore_discard is true; cookies
        that report themselves expired are left out unless ignore_expires is
        true.  Each cookie becomes one line of seven tab-separated fields.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = ("TRUE" if cookie.domain.startswith(".")
                               else "FALSE")
                expires = (str(cookie.expires)
                           if cookie.expires is not None else "")
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
2114