1r"""HTTP cookie handling for web clients.
2
3This module has (now fairly distant) origins in Gisle Aas' Perl module
4HTTP::Cookies, from the libwww-perl library.
5
6Docstrings, comments and debug strings in this code refer to the
7attributes of the HTTP cookie system as cookie-attributes, to distinguish
8them clearly from Python attributes.
9
10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11distributed with the Python standard library, but are available from
12http://wwwsearch.sf.net/):
13
14                        CookieJar____
15                        /     \      \
16            FileCookieJar      \      \
17             /    |   \         \      \
18 MozillaCookieJar | LWPCookieJar \      \
19                  |               |      \
20                  |   ---MSIEBase |       \
21                  |  /      |     |        \
22                  | /   MSIEDBCookieJar BSDDBCookieJar
23                  |/
24               MSIECookieJar
25
26"""
27
28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29           'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
30
31import os
32import copy
33import datetime
34import re
35import time
36import urllib.parse, urllib.request
37import threading as _threading
38import http.client  # only for the default HTTP port
39from calendar import timegm
40
debug = False   # set to True to enable debugging via the logging module
logger = None   # filled in lazily by _debug() the first time it is needed


def _debug(*args):
    """Pass *args* to the module logger's ``debug()`` when debugging is on.

    While the module-level ``debug`` flag is false this is a no-op that
    returns None.  Otherwise the "http.cookiejar" logger is fetched on first
    use, cached in the module-level ``logger`` name, and the result of its
    ``debug(*args)`` call is returned.
    """
    global logger
    if debug:
        if not logger:
            # Deferred import: logging is only needed once debugging is on.
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
52
53
# The standard HTTP port as a string; request_port() returns this value when
# the request's host carries no explicit ":port" suffix.
DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT)
# Message text describing a missing filename (no argument given and none set
# on the CookieJar instance).
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
57
def _warn_unhandled_exception():
    """Emit a warning containing the traceback of the active exception.

    Meant to be called from the module's catch-all ``except`` clauses, which
    exist to survive input that is bad in unexpected ways: anything caught
    there is reported as a probable bug instead of vanishing silently.  The
    warning is attributed to the caller (``stacklevel=2``).
    """
    import traceback
    import warnings

    # format_exc() yields exactly the text print_exc() would write.
    details = traceback.format_exc()
    warnings.warn("http.cookiejar bug!\n%s" % details, stacklevel=2)
67
68
69# Date/time conversion
70# -----------------------------------------------------------------------------
71
EPOCH_YEAR = 1970
def _timegm(tt):
    """Convert the UTC time tuple *tt* to seconds since the epoch.

    Only the first six fields (year, month, day, hour, minute, second) are
    range-checked.  Returns None when the year precedes EPOCH_YEAR or any
    field lies outside its accepted range; hour 24 and seconds up to 61 are
    tolerated.
    """
    year, month, mday, hour, minute, second = tt[:6]
    fields_ok = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= second <= 61
    )
    return timegm(tt) if fields_ok else None
80
# Abbreviated English day names, indexed by datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Abbreviated English month names; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copy of MONTHS for case-insensitive month-name lookups.  A
# comprehension keeps the loop variable out of the module namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
86
def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    # Timezone-aware constructors replace the deprecated naive
    # datetime.utcnow() / datetime.utcfromtimestamp(); the formatted fields
    # are the same UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
105
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    # Timezone-aware constructors replace the deprecated naive
    # datetime.utcnow() / datetime.utcfromtimestamp(); the formatted fields
    # are the same UTC values.
    utc = datetime.timezone.utc
    if t is None:
        dt = datetime.datetime.now(tz=utc)
    else:
        dt = datetime.datetime.fromtimestamp(t, tz=utc)
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
124
125
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by the string *tz*.

    Names listed in UTC_ZONES map to 0.  Numeric zones of the form
    ``[+-]H[H][:][MM]`` (for example "+0100", "-08:00", "5") are converted to
    seconds east of UTC.  Anything else yields None.  The lookup is
    case-sensitive.
    """
    if tz in UTC_ZONES:
        return 0
    parsed = TIMEZONE_RE.search(tz)
    if parsed is None:
        return None
    sign, hours, minutes = parsed.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds += 60 * int(minutes)
    return -seconds if sign == '-' else seconds
142
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert textual date/time fields to seconds since the epoch.

    The arguments are the string groups captured by the date regexes; *hr*,
    *min*, *sec* and *tz* may be None.  *mon* may be an English month
    abbreviation (any case) or a month number.  *sec* may carry a fractional
    part, which is truncated.  A year below 1000 is treated as an abbreviated
    year and expanded to the century that puts it closest to the current
    year.  A missing *tz* means UTC.

    Returns None when the year exceeds datetime.MAXYEAR, the month is not
    recognised, a field is out of range (see _timegm), or the timezone string
    is not understood.
    """
    yr = int(yr)
    if yr > datetime.MAXYEAR:
        return None

    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if not 1 <= imon <= 12:
            return None
        mon = imon

    # make sure clock elements are defined
    day = int(day)
    hr = 0 if hr is None else int(hr)
    min = 0 if min is None else int(min)
    # ISO_DATE_RE captures seconds such as "29.5"; a bare int() would raise
    # ValueError on those, so go through float() and drop the fraction.
    sec = 0 if sec is None else int(float(sec))

    if yr < 1000:
        # find "obvious" year: start in the current century, then move one
        # century either way if that lands more than 50 years from now
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0:
                yr = yr + 100
            else:
                yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec))
    if t is None:
        return None

    # adjust time using timezone string, to get absolute time since epoch
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
198
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    (?:
       ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone
       \s*
    )?
    (?:
       \(\w+\)         # ASCII representation of timezone in parens.
       \s*
    )?$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, yr, hr, mins, sec = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month ("Foo"
            # passes); an unknown name is a bad date, not a crash.
            return None
        # All fields are plain digit strings here, so int() keeps the
        # documented integer return type.
        return _timegm((int(yr), mon, int(day), int(hr), int(mins), int(sec)))

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
280
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   (?:
      ([-+]?\d\d?:?(:?\d\d)?
       |Z|z)             # timezone  (Z is "zero meridian", i.e. GMT)
      \s*
   )?$""", re.X | re.ASCII)
def iso2time(text):
    """
    Parse an ISO 8601 style date string; otherwise behaves like http2time.

    Accepted layouts include:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Leading whitespace is ignored.  None is returned when the text does not
    match ISO_DATE_RE.

    """
    parsed = ISO_DATE_RE.search(text.lstrip())
    if parsed is None:
        return None  # bad format

    # The last group is the inner minutes part of the timezone, which is
    # already contained in the tz group and is therefore discarded.
    yr, mon, day, hr, min, sec, tz, _ = parsed.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
327
328
329# Header parsing
330# -----------------------------------------------------------------------------
331
def unmatched(match):
    """Return the subject string of *match* with the matched span removed."""
    begin, finish = match.span()
    subject = match.string
    return subject[:begin] + subject[finish:]
336
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Split header values into lists of (key, value) pairs.

    *header_values* is an iterable of header value strings (not a single
    string).  Each string is scanned left to right; "," starts a new list of
    pairs, ";" and whitespace separate pairs, and "=" joins a key to its
    value.  Values may be bare tokens or double-quoted strings, in which
    backslash escapes are resolved.  Several input strings are treated as if
    they had been joined with ",".

    The grammar handled is, loosely (BNF as in the HTTP/1.1 specification,
    with a relaxed notion of token):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    The result holds one list per header; a lone token (no "=value") gets the
    value None.  Syntactically incorrect headers will not necessarily be
    parsed as you would want.

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token is None:
                stripped = text.lstrip()
                if stripped.startswith(","):
                    # concatenated headers, as per RFC 2616 section 4.2
                    text = stripped[1:]
                    if pairs:
                        result.append(pairs)
                    pairs = []
                else:
                    # skip junk
                    non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                    assert nr_junk_chars > 0, (
                        "split_header_words bug: '%s', '%s', %s" %
                        (orig_text, text, pairs))
                    text = non_junk
                continue

            # Consume the token, then look for an optional "=value" after it.
            name = token.group(1)
            text = text[:token.start()] + text[token.end():]
            quoted = HEADER_QUOTED_VALUE_RE.search(text)
            if quoted:
                # quoted value: drop the quotes and resolve backslash escapes
                value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                text = text[:quoted.start()] + text[quoted.end():]
            else:
                plain = HEADER_VALUE_RE.search(text)
                if plain:
                    # unquoted value
                    value = plain.group(1).rstrip()
                    text = text[:plain.start()] + text[plain.end():]
                else:
                    # no value, a lone token
                    value = None
            pairs.append((name, value))
        if pairs:
            result.append(pairs)
    return result
425
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Build a single header value from lists of (key, value) pairs.

    This is (almost) the inverse of split_header_words.  Pairs within one
    list are joined with "; " and the lists themselves with ", ".  A value of
    None renders the bare key; any value that is not made up purely of word
    characters is wrapped in double quotes, with embedded quotes and
    backslashes escaped.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]])
    'text/plain; charset="iso-8859-1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]])
    'text/plain, charset="iso-8859-1"'

    """
    headers = []
    for pairs in lists:
        rendered = []
        for key, value in pairs:
            if value is None:
                rendered.append(key)
                continue
            if not re.search(r"^\w+$", value):
                # escape " and \, then quote the whole value
                value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
            rendered.append("%s=%s" % (key, value))
        if rendered:
            headers.append("; ".join(rendered))
    return ", ".join(headers)
451
def strip_quotes(text):
    """Return *text* without one leading and one trailing double quote.

    Each end is handled independently, so an unbalanced quote is removed
    too, and a lone '"' becomes the empty string.
    """
    without_lead = text[1:] if text.startswith('"') else text
    return without_lead[:-1] if without_lead.endswith('"') else without_lead
458
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    Each header string is split on ";" into "key=value" parameters and
    returned as a list of (key, value) pairs, one list per header.  This
    ad-hoc approach is needed because the old Netscape Set-Cookie format may
    contain an unquoted "," in the expires field, which split_header_words
    would treat as a header separator.  The same parser is currently used for
    RFC 2109 cookies as well.

    Details of the result:

    - The first parameter (the cookie's own name/value) is kept verbatim.  If
      its name is empty the whole header is dropped.
    - Later parameters with an empty name are skipped.
    - Names of known cookie-attributes are lower-cased; other names keep
      their case.
    - A value is None when the parameter had no "=", and "" when it had an
      "=" with nothing after it.
    - "version" values have surrounding quotes stripped; "expires" values are
      converted to seconds since the epoch (None if unparseable).
    - ("version", "0") is appended when no version attribute was present.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False

        # XXX: The following does not strictly adhere to RFCs in that empty
        # names and values are legal (the former will only appear once and will
        # be overwritten if multiple occurrences are present). This is
        # mostly to deal with backwards compatibility.
        for index, chunk in enumerate(ns_header.split(';')):
            key, sep, val = chunk.strip().partition('=')
            key = key.strip()

            if not key:
                if index == 0:
                    # nameless cookie: ignore the entire header
                    break
                continue

            # distinguish "present but empty" ("") from "missing" (None)
            val = val.strip() if sep else None

            if index:
                # Only cookie-attributes (not the leading name=value pair)
                # are normalised.
                lowered = key.lower()
                if lowered in known_attrs:
                    key = lowered

                if key == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                    if val is not None:
                        val = strip_quotes(val)
                elif key == "expires" and val is not None:
                    # convert expires date to seconds since epoch
                    val = http2time(strip_quotes(val))  # None if invalid
            pairs.append((key, val))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
525
526
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text):
        # ends in ".<digits>": treated as an IP address, not a name
        return False
    if text == "":
        return False
    if text[0] == "." or text[-1] == ".":
        return False
    return True

def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    if not B.startswith("."):
        return False
    # "A has the form NB" means B must be a proper *suffix* of A.  Merely
    # finding B somewhere inside A is not enough: "www.acme.com.evil.org"
    # must not domain-match ".acme.com".  N is non-empty because A != B.
    if not A.endswith(B):
        return False
    return is_HDN(B[1:])
581
def liberal_is_HDN(text):
    """Return True if text is sort-of-like a host domain name.

    Used when accepting/blocking domains.  The only thing rejected is text
    that IPV4_RE recognises as ending like an IP address.

    """
    looks_like_ip = IPV4_RE.search(text)
    return not looks_like_ip
591
def user_domain_match(A, B):
    """Return True if domain A matches the user-supplied domain pattern B.

    For blocking/accepting domains.  A and B may be host domain names or IP
    addresses; the comparison is case-insensitive.  If either one is not a
    (liberal) host domain name they must be identical.  Otherwise a B that
    starts with "." matches every A ending in B, and a B without a leading
    dot matches only an identical A.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # IP addresses only ever match exactly
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
611
cut_port_re = re.compile(r":\d+$", re.ASCII)
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    The host is taken from the network-location part of the request's full
    URL, falling back to the request's "Host" header when the URL has none.
    A trailing ":port" is removed.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urllib.parse.urlparse(request.get_full_url()).netloc
    host = netloc if netloc != "" else request.get_header("Host", "")
    # remove port, if present
    return cut_port_re.sub("", host, 1).lower()
628
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no dot
    and does not look like an IP address; otherwise the two are the same.

    """
    req_host = request_host(request)
    is_bare_name = "." not in req_host and not IPV4_RE.search(req_host)
    erhn = req_host + ".local" if is_bare_name else req_host
    return req_host, erhn
639
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path(), and
    a leading "/" is supplied if the result lacks one.
    """
    raw_path = urllib.parse.urlsplit(request.get_full_url()).path
    escaped = escape_path(raw_path)
    # fix bad RFC 2396 absoluteURI
    return escaped if escaped.startswith("/") else "/" + escaped
649
def request_port(request):
    """Return the port of *request* as a string.

    The port is whatever follows the first ":" in ``request.host``.  Without
    a ":" the default HTTP port is returned.  A port that is not numeric is
    reported through _debug() and None is returned.
    """
    _hostname, colon, port = request.host.partition(':')
    if not colon:
        return DEFAULT_HTTP_PORT
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
663
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape captured by *match* with upper-case hex digits."""
    return "%" + match.group(1).upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = urllib.parse.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
683
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split H at its first dot: everything after it is B.
    _a, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
718
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.origin_req_host)
    return not domain_match(req_host, origin_reach)
734
735
class Cookie:
    """HTTP Cookie.

    Represents both Netscape and RFC 2965 cookies.

    The class is deliberately minimal: it is little more than a bag of
    attributes, and nothing stops you from building instances that violate
    the cookie standards.  Cookie objects are normally produced by
    CookieJar.make_cookies, which handles parsing, defaulting and
    normalisation into the representation used here.  Whether a cookie is
    accepted from, or returned to, a server is decided by a CookiePolicy.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than "Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie's attributes.

        *version* is converted to int and *expires* to whole seconds (int),
        unless they are None.  *domain* is lower-cased.  *rest* is a mapping
        of non-standard cookie-attributes; a shallow copy is kept.

        Raises ValueError if *port* is None while *port_specified* is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(float(expires))
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        # identity
        self.version = version
        self.name = name
        self.value = value

        # scope
        self.port = port
        self.port_specified = port_specified
        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified

        # lifetime and miscellany
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard cookie-attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return non-standard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set non-standard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie's expiry time is at or before *now*.

        *now* defaults to the current time.  A cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return self.expires is not None and self.expires <= now

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        public_fields = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        args = ["%s=%r" % (field, getattr(self, field))
                for field in public_fields]
        # The constructor calls this argument "rest"; it is stored as _rest.
        args.append("rest=%r" % (self._rest,))
        args.append("rfc2109=%r" % (self.rfc2109,))
        return "%s(%s)" % (self.__class__.__name__, ", ".join(args))
832
833
class CookiePolicy:
    """Interface deciding which cookies are accepted and which are returned.

    A policy may also modify cookies, though this is probably a bad idea.

    set_ok() and return_ok() must be supplied by subclasses.  The subclass
    DefaultCookiePolicy defines the standard rules for Netscape and RFC 2965
    cookies -- override that if you want a customized policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server."""
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation always returns True.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation always returns True.
        """
        return True
865
866
867class DefaultCookiePolicy(CookiePolicy):
868    """Implements the standard rules for accepting and returning cookies."""
869
870    DomainStrictNoDots = 1
871    DomainStrictNonDomain = 2
872    DomainRFC2965Match = 4
873
874    DomainLiberal = 0
875    DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
876
877    def __init__(self,
878                 blocked_domains=None, allowed_domains=None,
879                 netscape=True, rfc2965=False,
880                 rfc2109_as_netscape=None,
881                 hide_cookie2=False,
882                 strict_domain=False,
883                 strict_rfc2965_unverifiable=True,
884                 strict_ns_unverifiable=False,
885                 strict_ns_domain=DomainLiberal,
886                 strict_ns_set_initial_dollar=False,
887                 strict_ns_set_path=False,
888                 secure_protocols=("https", "wss")
889                 ):
890        """Constructor arguments should be passed as keyword arguments only."""
891        self.netscape = netscape
892        self.rfc2965 = rfc2965
893        self.rfc2109_as_netscape = rfc2109_as_netscape
894        self.hide_cookie2 = hide_cookie2
895        self.strict_domain = strict_domain
896        self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
897        self.strict_ns_unverifiable = strict_ns_unverifiable
898        self.strict_ns_domain = strict_ns_domain
899        self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
900        self.strict_ns_set_path = strict_ns_set_path
901        self.secure_protocols = secure_protocols
902
903        if blocked_domains is not None:
904            self._blocked_domains = tuple(blocked_domains)
905        else:
906            self._blocked_domains = ()
907
908        if allowed_domains is not None:
909            allowed_domains = tuple(allowed_domains)
910        self._allowed_domains = allowed_domains
911
912    def blocked_domains(self):
913        """Return the sequence of blocked domains (as a tuple)."""
914        return self._blocked_domains
915    def set_blocked_domains(self, blocked_domains):
916        """Set the sequence of blocked domains."""
917        self._blocked_domains = tuple(blocked_domains)
918
919    def is_blocked(self, domain):
920        for blocked_domain in self._blocked_domains:
921            if user_domain_match(domain, blocked_domain):
922                return True
923        return False
924
925    def allowed_domains(self):
926        """Return None, or the sequence of allowed domains (as a tuple)."""
927        return self._allowed_domains
928    def set_allowed_domains(self, allowed_domains):
929        """Set the sequence of allowed domains, or None."""
930        if allowed_domains is not None:
931            allowed_domains = tuple(allowed_domains)
932        self._allowed_domains = allowed_domains
933
934    def is_not_allowed(self, domain):
935        if self._allowed_domains is None:
936            return False
937        for allowed_domain in self._allowed_domains:
938            if user_domain_match(domain, allowed_domain):
939                return False
940        return True
941
942    def set_ok(self, cookie, request):
943        """
944        If you override .set_ok(), be sure to call this method.  If it returns
945        false, so should your subclass (assuming your subclass wants to be more
946        strict about which cookies to accept).
947
948        """
949        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
950
951        assert cookie.name is not None
952
953        for n in "version", "verifiability", "name", "path", "domain", "port":
954            fn_name = "set_ok_"+n
955            fn = getattr(self, fn_name)
956            if not fn(cookie, request):
957                return False
958
959        return True
960
961    def set_ok_version(self, cookie, request):
962        if cookie.version is None:
963            # Version is always set to 0 by parse_ns_headers if it's a Netscape
964            # cookie, so this must be an invalid RFC 2965 cookie.
965            _debug("   Set-Cookie2 without version attribute (%s=%s)",
966                   cookie.name, cookie.value)
967            return False
968        if cookie.version > 0 and not self.rfc2965:
969            _debug("   RFC 2965 cookies are switched off")
970            return False
971        elif cookie.version == 0 and not self.netscape:
972            _debug("   Netscape cookies are switched off")
973            return False
974        return True
975
976    def set_ok_verifiability(self, cookie, request):
977        if request.unverifiable and is_third_party(request):
978            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
979                _debug("   third-party RFC 2965 cookie during "
980                             "unverifiable transaction")
981                return False
982            elif cookie.version == 0 and self.strict_ns_unverifiable:
983                _debug("   third-party Netscape cookie during "
984                             "unverifiable transaction")
985                return False
986        return True
987
988    def set_ok_name(self, cookie, request):
989        # Try and stop servers setting V0 cookies designed to hack other
990        # servers that know both V0 and V1 protocols.
991        if (cookie.version == 0 and self.strict_ns_set_initial_dollar and
992            cookie.name.startswith("$")):
993            _debug("   illegal name (starts with '$'): '%s'", cookie.name)
994            return False
995        return True
996
997    def set_ok_path(self, cookie, request):
998        if cookie.path_specified:
999            req_path = request_path(request)
1000            if ((cookie.version > 0 or
1001                 (cookie.version == 0 and self.strict_ns_set_path)) and
1002                not self.path_return_ok(cookie.path, request)):
1003                _debug("   path attribute %s is not a prefix of request "
1004                       "path %s", cookie.path, req_path)
1005                return False
1006        return True
1007
    def set_ok_domain(self, cookie, request):
        """Return True if the cookie's domain is acceptable for this request.

        The user block-list and allow-list are always consulted.  The
        remaining checks run only when the cookie carried an explicit domain
        (cookie.domain_specified); which of them apply depends on the cookie
        version and on the strict_domain / strict_ns_domain settings.
        """
        # User-configured block-list and allow-list come first.
        if self.is_blocked(cookie.domain):
            _debug("   domain %s is in user block-list", cookie.domain)
            return False
        if self.is_not_allowed(cookie.domain):
            _debug("   domain %s is not in user allow-list", cookie.domain)
            return False
        if cookie.domain_specified:
            req_host, erhn = eff_request_host(request)
            domain = cookie.domain
            if self.strict_domain and (domain.count(".") >= 2):
                # XXX This should probably be compared with the Konqueror
                # (kcookiejar.cpp) and Mozilla implementations, but it's a
                # losing battle.
                # i is the last dot, j the dot before it; j == 0 means the
                # domain is a leading dot followed by exactly two labels.
                i = domain.rfind(".")
                j = domain.rfind(".", 0, i)
                if j == 0:  # domain like .foo.bar
                    tld = domain[i+1:]
                    sld = domain[j+1:i]
                    if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                       "gov", "mil", "int", "aero", "biz", "cat", "coop",
                       "info", "jobs", "mobi", "museum", "name", "pro",
                       "travel", "eu") and len(tld) == 2:
                        # domain like .co.uk
                        _debug("   country-code second level domain %s", domain)
                        return False
            # Ignoring a leading dot, the domain must contain a dot, unless
            # it is exactly ".local".
            if domain.startswith("."):
                undotted_domain = domain[1:]
            else:
                undotted_domain = domain
            embedded_dots = (undotted_domain.find(".") >= 0)
            if not embedded_dots and domain != ".local":
                _debug("   non-local domain %s contains no embedded dot",
                       domain)
                return False
            # Netscape cookies: the effective request-host (with a dot
            # prepended if needed) must end with the cookie domain.
            if cookie.version == 0:
                if (not erhn.endswith(domain) and
                    (not erhn.startswith(".") and
                     not ("."+erhn).endswith(domain))):
                    _debug("   effective request-host %s (even with added "
                           "initial dot) does not end with %s",
                           erhn, domain)
                    return False
            # RFC 2965 cookies, or Netscape cookies when DomainRFC2965Match is
            # set in strict_ns_domain: require a domain-match.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainRFC2965Match)):
                if not domain_match(erhn, domain):
                    _debug("   effective request-host %s does not domain-match "
                           "%s", erhn, domain)
                    return False
            # RFC 2965 cookies, or Netscape cookies when DomainStrictNoDots is
            # set: the part of the request host preceding the cookie domain
            # must not contain a dot, unless the host matches IPV4_RE.
            if (cookie.version > 0 or
                (self.strict_ns_domain & self.DomainStrictNoDots)):
                host_prefix = req_host[:-len(domain)]
                if (host_prefix.find(".") >= 0 and
                    not IPV4_RE.search(req_host)):
                    _debug("   host prefix %s for domain %s contains a dot",
                           host_prefix, domain)
                    return False
        return True
1066
1067    def set_ok_port(self, cookie, request):
1068        if cookie.port_specified:
1069            req_port = request_port(request)
1070            if req_port is None:
1071                req_port = "80"
1072            else:
1073                req_port = str(req_port)
1074            for p in cookie.port.split(","):
1075                try:
1076                    int(p)
1077                except ValueError:
1078                    _debug("   bad port %s (not numeric)", p)
1079                    return False
1080                if p == req_port:
1081                    break
1082            else:
1083                _debug("   request port (%s) not found in %s",
1084                       req_port, cookie.port)
1085                return False
1086        return True
1087
1088    def return_ok(self, cookie, request):
1089        """
1090        If you override .return_ok(), be sure to call this method.  If it
1091        returns false, so should your subclass (assuming your subclass wants to
1092        be more strict about which cookies to return).
1093
1094        """
1095        # Path has already been checked by .path_return_ok(), and domain
1096        # blocking done by .domain_return_ok().
1097        _debug(" - checking cookie %s=%s", cookie.name, cookie.value)
1098
1099        for n in "version", "verifiability", "secure", "expires", "port", "domain":
1100            fn_name = "return_ok_"+n
1101            fn = getattr(self, fn_name)
1102            if not fn(cookie, request):
1103                return False
1104        return True
1105
1106    def return_ok_version(self, cookie, request):
1107        if cookie.version > 0 and not self.rfc2965:
1108            _debug("   RFC 2965 cookies are switched off")
1109            return False
1110        elif cookie.version == 0 and not self.netscape:
1111            _debug("   Netscape cookies are switched off")
1112            return False
1113        return True
1114
1115    def return_ok_verifiability(self, cookie, request):
1116        if request.unverifiable and is_third_party(request):
1117            if cookie.version > 0 and self.strict_rfc2965_unverifiable:
1118                _debug("   third-party RFC 2965 cookie during unverifiable "
1119                       "transaction")
1120                return False
1121            elif cookie.version == 0 and self.strict_ns_unverifiable:
1122                _debug("   third-party Netscape cookie during unverifiable "
1123                       "transaction")
1124                return False
1125        return True
1126
1127    def return_ok_secure(self, cookie, request):
1128        if cookie.secure and request.type not in self.secure_protocols:
1129            _debug("   secure cookie with non-secure request")
1130            return False
1131        return True
1132
1133    def return_ok_expires(self, cookie, request):
1134        if cookie.is_expired(self._now):
1135            _debug("   cookie expired")
1136            return False
1137        return True
1138
1139    def return_ok_port(self, cookie, request):
1140        if cookie.port:
1141            req_port = request_port(request)
1142            if req_port is None:
1143                req_port = "80"
1144            for p in cookie.port.split(","):
1145                if p == req_port:
1146                    break
1147            else:
1148                _debug("   request port %s does not match cookie port %s",
1149                       req_port, cookie.port)
1150                return False
1151        return True
1152
1153    def return_ok_domain(self, cookie, request):
1154        req_host, erhn = eff_request_host(request)
1155        domain = cookie.domain
1156
1157        if domain and not domain.startswith("."):
1158            dotdomain = "." + domain
1159        else:
1160            dotdomain = domain
1161
1162        # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
1163        if (cookie.version == 0 and
1164            (self.strict_ns_domain & self.DomainStrictNonDomain) and
1165            not cookie.domain_specified and domain != erhn):
1166            _debug("   cookie with unspecified domain does not string-compare "
1167                   "equal to request domain")
1168            return False
1169
1170        if cookie.version > 0 and not domain_match(erhn, domain):
1171            _debug("   effective request-host name %s does not domain-match "
1172                   "RFC 2965 cookie domain %s", erhn, domain)
1173            return False
1174        if cookie.version == 0 and not ("."+erhn).endswith(dotdomain):
1175            _debug("   request-host %s does not match Netscape cookie domain "
1176                   "%s", req_host, domain)
1177            return False
1178        return True
1179
1180    def domain_return_ok(self, domain, request):
1181        # Liberal check of.  This is here as an optimization to avoid
1182        # having to load lots of MSIE cookie files unless necessary.
1183        req_host, erhn = eff_request_host(request)
1184        if not req_host.startswith("."):
1185            req_host = "."+req_host
1186        if not erhn.startswith("."):
1187            erhn = "."+erhn
1188        if domain and not domain.startswith("."):
1189            dotdomain = "." + domain
1190        else:
1191            dotdomain = domain
1192        if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)):
1193            #_debug("   request domain %s does not match cookie domain %s",
1194            #       req_host, domain)
1195            return False
1196
1197        if self.is_blocked(domain):
1198            _debug("   domain %s is in user block-list", domain)
1199            return False
1200        if self.is_not_allowed(domain):
1201            _debug("   domain %s is not in user allow-list", domain)
1202            return False
1203
1204        return True
1205
1206    def path_return_ok(self, path, request):
1207        _debug("- checking cookie path=%s", path)
1208        req_path = request_path(request)
1209        pathlen = len(path)
1210        if req_path == path:
1211            return True
1212        elif (req_path.startswith(path) and
1213              (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
1214            return True
1215
1216        _debug("  %s does not path-match %s", req_path, path)
1217        return False
1218
def vals_sorted_by_key(adict):
    """Return an iterator over adict's values, ordered by sorted key."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1222
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key.

    Any value that has an ``items`` attribute is treated as a nested
    mapping and descended into; every other value is yielded as a leaf.
    """
    for value in vals_sorted_by_key(mapping):
        if hasattr(value, "items"):
            yield from deepvalues(value)
        else:
            yield value
1237
1238
# Sentinel passed as the default to dict.get(), so that a missing key can
# be told apart from a key whose value is None.
class Absent:
    pass
1242
1243class CookieJar:
1244    """Collection of HTTP cookies.
1245
1246    You may not need to know about this class: try
1247    urllib.request.build_opener(HTTPCookieProcessor).open(url).
1248    """
1249
1250    non_word_re = re.compile(r"\W")
1251    quote_re = re.compile(r"([\"\\])")
1252    strict_domain_re = re.compile(r"\.?[^.]*")
1253    domain_re = re.compile(r"[^.]*")
1254    dots_re = re.compile(r"^\.+")
1255
1256    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
1257
1258    def __init__(self, policy=None):
1259        if policy is None:
1260            policy = DefaultCookiePolicy()
1261        self._policy = policy
1262
1263        self._cookies_lock = _threading.RLock()
1264        self._cookies = {}
1265
1266    def set_policy(self, policy):
1267        self._policy = policy
1268
1269    def _cookies_for_domain(self, domain, request):
1270        cookies = []
1271        if not self._policy.domain_return_ok(domain, request):
1272            return []
1273        _debug("Checking %s for cookies to return", domain)
1274        cookies_by_path = self._cookies[domain]
1275        for path in cookies_by_path.keys():
1276            if not self._policy.path_return_ok(path, request):
1277                continue
1278            cookies_by_name = cookies_by_path[path]
1279            for cookie in cookies_by_name.values():
1280                if not self._policy.return_ok(cookie, request):
1281                    _debug("   not returning cookie")
1282                    continue
1283                _debug("   it's a match")
1284                cookies.append(cookie)
1285        return cookies
1286
1287    def _cookies_for_request(self, request):
1288        """Return a list of cookies to be returned to server."""
1289        cookies = []
1290        for domain in self._cookies.keys():
1291            cookies.extend(self._cookies_for_domain(domain, request))
1292        return cookies
1293
1294    def _cookie_attrs(self, cookies):
1295        """Return a list of cookie-attributes to be returned to server.
1296
1297        like ['foo="bar"; $Path="/"', ...]
1298
1299        The $Version attribute is also added when appropriate (currently only
1300        once per request).
1301
1302        """
1303        # add cookies in order of most specific (ie. longest) path first
1304        cookies.sort(key=lambda a: len(a.path), reverse=True)
1305
1306        version_set = False
1307
1308        attrs = []
1309        for cookie in cookies:
1310            # set version of Cookie header
1311            # XXX
1312            # What should it be if multiple matching Set-Cookie headers have
1313            #  different versions themselves?
1314            # Answer: there is no answer; was supposed to be settled by
1315            #  RFC 2965 errata, but that may never appear...
1316            version = cookie.version
1317            if not version_set:
1318                version_set = True
1319                if version > 0:
1320                    attrs.append("$Version=%s" % version)
1321
1322            # quote cookie value if necessary
1323            # (not for Netscape protocol, which already has any quotes
1324            #  intact, due to the poorly-specified Netscape Cookie: syntax)
1325            if ((cookie.value is not None) and
1326                self.non_word_re.search(cookie.value) and version > 0):
1327                value = self.quote_re.sub(r"\\\1", cookie.value)
1328            else:
1329                value = cookie.value
1330
1331            # add cookie-attributes to be returned in Cookie header
1332            if cookie.value is None:
1333                attrs.append(cookie.name)
1334            else:
1335                attrs.append("%s=%s" % (cookie.name, value))
1336            if version > 0:
1337                if cookie.path_specified:
1338                    attrs.append('$Path="%s"' % cookie.path)
1339                if cookie.domain.startswith("."):
1340                    domain = cookie.domain
1341                    if (not cookie.domain_initial_dot and
1342                        domain.startswith(".")):
1343                        domain = domain[1:]
1344                    attrs.append('$Domain="%s"' % domain)
1345                if cookie.port is not None:
1346                    p = "$Port"
1347                    if cookie.port_specified:
1348                        p = p + ('="%s"' % cookie.port)
1349                    attrs.append(p)
1350
1351        return attrs
1352
1353    def add_cookie_header(self, request):
1354        """Add correct Cookie: header to request (urllib.request.Request object).
1355
1356        The Cookie2 header is also added unless policy.hide_cookie2 is true.
1357
1358        """
1359        _debug("add_cookie_header")
1360        self._cookies_lock.acquire()
1361        try:
1362
1363            self._policy._now = self._now = int(time.time())
1364
1365            cookies = self._cookies_for_request(request)
1366
1367            attrs = self._cookie_attrs(cookies)
1368            if attrs:
1369                if not request.has_header("Cookie"):
1370                    request.add_unredirected_header(
1371                        "Cookie", "; ".join(attrs))
1372
1373            # if necessary, advertise that we know RFC 2965
1374            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
1375                not request.has_header("Cookie2")):
1376                for cookie in cookies:
1377                    if cookie.version != 1:
1378                        request.add_unredirected_header("Cookie2", '$Version="1"')
1379                        break
1380
1381        finally:
1382            self._cookies_lock.release()
1383
1384        self.clear_expired_cookies()
1385
1386    def _normalized_cookie_tuples(self, attrs_set):
1387        """Return list of tuples containing normalised cookie information.
1388
1389        attrs_set is the list of lists of key,value pairs extracted from
1390        the Set-Cookie or Set-Cookie2 headers.
1391
1392        Tuples are name, value, standard, rest, where name and value are the
1393        cookie name and value, standard is a dictionary containing the standard
1394        cookie-attributes (discard, secure, version, expires or max-age,
1395        domain, path and port) and rest is a dictionary containing the rest of
1396        the cookie-attributes.
1397
1398        """
1399        cookie_tuples = []
1400
1401        boolean_attrs = "discard", "secure"
1402        value_attrs = ("version",
1403                       "expires", "max-age",
1404                       "domain", "path", "port",
1405                       "comment", "commenturl")
1406
1407        for cookie_attrs in attrs_set:
1408            name, value = cookie_attrs[0]
1409
1410            # Build dictionary of standard cookie-attributes (standard) and
1411            # dictionary of other cookie-attributes (rest).
1412
1413            # Note: expiry time is normalised to seconds since epoch.  V0
1414            # cookies should have the Expires cookie-attribute, and V1 cookies
1415            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
1416            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
1417            # accept either (but prefer Max-Age).
1418            max_age_set = False
1419
1420            bad_cookie = False
1421
1422            standard = {}
1423            rest = {}
1424            for k, v in cookie_attrs[1:]:
1425                lc = k.lower()
1426                # don't lose case distinction for unknown fields
1427                if lc in value_attrs or lc in boolean_attrs:
1428                    k = lc
1429                if k in boolean_attrs and v is None:
1430                    # boolean cookie-attribute is present, but has no value
1431                    # (like "discard", rather than "port=80")
1432                    v = True
1433                if k in standard:
1434                    # only first value is significant
1435                    continue
1436                if k == "domain":
1437                    if v is None:
1438                        _debug("   missing value for domain attribute")
1439                        bad_cookie = True
1440                        break
1441                    # RFC 2965 section 3.3.3
1442                    v = v.lower()
1443                if k == "expires":
1444                    if max_age_set:
1445                        # Prefer max-age to expires (like Mozilla)
1446                        continue
1447                    if v is None:
1448                        _debug("   missing or invalid value for expires "
1449                              "attribute: treating as session cookie")
1450                        continue
1451                if k == "max-age":
1452                    max_age_set = True
1453                    try:
1454                        v = int(v)
1455                    except ValueError:
1456                        _debug("   missing or invalid (non-numeric) value for "
1457                              "max-age attribute")
1458                        bad_cookie = True
1459                        break
1460                    # convert RFC 2965 Max-Age to seconds since epoch
1461                    # XXX Strictly you're supposed to follow RFC 2616
1462                    #   age-calculation rules.  Remember that zero Max-Age
1463                    #   is a request to discard (old and new) cookie, though.
1464                    k = "expires"
1465                    v = self._now + v
1466                if (k in value_attrs) or (k in boolean_attrs):
1467                    if (v is None and
1468                        k not in ("port", "comment", "commenturl")):
1469                        _debug("   missing value for %s attribute" % k)
1470                        bad_cookie = True
1471                        break
1472                    standard[k] = v
1473                else:
1474                    rest[k] = v
1475
1476            if bad_cookie:
1477                continue
1478
1479            cookie_tuples.append((name, value, standard, rest))
1480
1481        return cookie_tuples
1482
    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised cookie tuple, or return None.

        tup is a (name, value, standard, rest) tuple.  Attributes missing
        from standard are defaulted from the request (path, domain, port).

        None is returned when the version attribute is not an integer, and
        when the expiry time is not later than self._now; in the latter case
        any stored cookie with the same domain, path and name is also
        removed from the jar.
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        # Absent marks "attribute not present", as distinct from an attribute
        # that is present with a None value.
        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            # Default: the request path cut at its last "/" (the slash itself
            # is dropped for version 0 cookies and kept otherwise).
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            # explicitly specified domains are always stored with a leading dot
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                # strip all whitespace from the port list
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                # nothing stored under that key: nothing to delete
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)
1574
1575    def _cookies_from_attrs_set(self, attrs_set, request):
1576        cookie_tuples = self._normalized_cookie_tuples(attrs_set)
1577
1578        cookies = []
1579        for tup in cookie_tuples:
1580            cookie = self._cookie_from_cookie_tuple(tup, request)
1581            if cookie: cookies.append(cookie)
1582        return cookies
1583
1584    def _process_rfc2109_cookies(self, cookies):
1585        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
1586        if rfc2109_as_ns is None:
1587            rfc2109_as_ns = not self._policy.rfc2965
1588        for cookie in cookies:
1589            if cookie.version == 1:
1590                cookie.rfc2109 = True
1591                if rfc2109_as_ns:
1592                    # treat 2109 cookies as Netscape cookies rather than
1593                    # as RFC2965 cookies
1594                    cookie.version = 0
1595
1596    def make_cookies(self, response, request):
1597        """Return sequence of Cookie objects extracted from response object."""
1598        # get cookie-attributes for RFC 2965 and Netscape protocols
1599        headers = response.info()
1600        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
1601        ns_hdrs = headers.get_all("Set-Cookie", [])
1602        self._policy._now = self._now = int(time.time())
1603
1604        rfc2965 = self._policy.rfc2965
1605        netscape = self._policy.netscape
1606
1607        if ((not rfc2965_hdrs and not ns_hdrs) or
1608            (not ns_hdrs and not rfc2965) or
1609            (not rfc2965_hdrs and not netscape) or
1610            (not netscape and not rfc2965)):
1611            return []  # no relevant cookie headers: quick exit
1612
1613        try:
1614            cookies = self._cookies_from_attrs_set(
1615                split_header_words(rfc2965_hdrs), request)
1616        except Exception:
1617            _warn_unhandled_exception()
1618            cookies = []
1619
1620        if ns_hdrs and netscape:
1621            try:
1622                # RFC 2109 and Netscape cookies
1623                ns_cookies = self._cookies_from_attrs_set(
1624                    parse_ns_headers(ns_hdrs), request)
1625            except Exception:
1626                _warn_unhandled_exception()
1627                ns_cookies = []
1628            self._process_rfc2109_cookies(ns_cookies)
1629
1630            # Look for Netscape cookies (from Set-Cookie headers) that match
1631            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
1632            # For each match, keep the RFC 2965 cookie and ignore the Netscape
1633            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
1634            # bundled in with the Netscape cookies for this purpose, which is
1635            # reasonable behaviour.
1636            if rfc2965:
1637                lookup = {}
1638                for cookie in cookies:
1639                    lookup[(cookie.domain, cookie.path, cookie.name)] = None
1640
1641                def no_matching_rfc2965(ns_cookie, lookup=lookup):
1642                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
1643                    return key not in lookup
1644                ns_cookies = filter(no_matching_rfc2965, ns_cookies)
1645
1646            if ns_cookies:
1647                cookies.extend(ns_cookies)
1648
1649        return cookies
1650
1651    def set_cookie_if_ok(self, cookie, request):
1652        """Set a cookie if policy says it's OK to do so."""
1653        self._cookies_lock.acquire()
1654        try:
1655            self._policy._now = self._now = int(time.time())
1656
1657            if self._policy.set_ok(cookie, request):
1658                self.set_cookie(cookie)
1659
1660
1661        finally:
1662            self._cookies_lock.release()
1663
1664    def set_cookie(self, cookie):
1665        """Set a cookie, without checking whether or not it should be set."""
1666        c = self._cookies
1667        self._cookies_lock.acquire()
1668        try:
1669            if cookie.domain not in c: c[cookie.domain] = {}
1670            c2 = c[cookie.domain]
1671            if cookie.path not in c2: c2[cookie.path] = {}
1672            c3 = c2[cookie.path]
1673            c3[cookie.name] = cookie
1674        finally:
1675            self._cookies_lock.release()
1676
1677    def extract_cookies(self, response, request):
1678        """Extract cookies from response, where allowable given the request."""
1679        _debug("extract_cookies: %s", response.info())
1680        self._cookies_lock.acquire()
1681        try:
1682            for cookie in self.make_cookies(response, request):
1683                if self._policy.set_ok(cookie, request):
1684                    _debug(" setting cookie: %s", cookie)
1685                    self.set_cookie(cookie)
1686        finally:
1687            self._cookies_lock.release()
1688
1689    def clear(self, domain=None, path=None, name=None):
1690        """Clear some cookies.
1691
1692        Invoking this method without arguments will clear all cookies.  If
1693        given a single argument, only cookies belonging to that domain will be
1694        removed.  If given two arguments, cookies belonging to the specified
1695        path within that domain are removed.  If given three arguments, then
1696        the cookie with the specified name, path and domain is removed.
1697
1698        Raises KeyError if no matching cookie exists.
1699
1700        """
1701        if name is not None:
1702            if (domain is None) or (path is None):
1703                raise ValueError(
1704                    "domain and path must be given to remove a cookie by name")
1705            del self._cookies[domain][path][name]
1706        elif path is not None:
1707            if domain is None:
1708                raise ValueError(
1709                    "domain must be given to remove cookies by path")
1710            del self._cookies[domain][path]
1711        elif domain is not None:
1712            del self._cookies[domain]
1713        else:
1714            self._cookies = {}
1715
1716    def clear_session_cookies(self):
1717        """Discard all session cookies.
1718
1719        Note that the .save() method won't save session cookies anyway, unless
1720        you ask otherwise by passing a true ignore_discard argument.
1721
1722        """
1723        self._cookies_lock.acquire()
1724        try:
1725            for cookie in self:
1726                if cookie.discard:
1727                    self.clear(cookie.domain, cookie.path, cookie.name)
1728        finally:
1729            self._cookies_lock.release()
1730
1731    def clear_expired_cookies(self):
1732        """Discard all expired cookies.
1733
1734        You probably don't need to call this method: expired cookies are never
1735        sent back to the server (provided you're using DefaultCookiePolicy),
1736        this method is called by CookieJar itself every so often, and the
1737        .save() method won't save expired cookies anyway (unless you ask
1738        otherwise by passing a true ignore_expires argument).
1739
1740        """
1741        self._cookies_lock.acquire()
1742        try:
1743            now = time.time()
1744            for cookie in self:
1745                if cookie.is_expired(now):
1746                    self.clear(cookie.domain, cookie.path, cookie.name)
1747        finally:
1748            self._cookies_lock.release()
1749
1750    def __iter__(self):
1751        return deepvalues(self._cookies)
1752
1753    def __len__(self):
1754        """Return number of contained cookies."""
1755        i = 0
1756        for cookie in self: i = i + 1
1757        return i
1758
1759    def __repr__(self):
1760        r = []
1761        for cookie in self: r.append(repr(cookie))
1762        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1763
1764    def __str__(self):
1765        r = []
1766        for cookie in self: r.append(str(cookie))
1767        return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r))
1768
1769
# An OSError subclass, kept that way for backwards-compatibility with
# Python 2.4.0.
class LoadError(OSError):
    """Raised when a cookies file cannot be loaded (wrong or invalid format)."""
1772
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not implement a file format itself: save() raises
    NotImplementedError, and load() hands the opened file to
    self._really_load(), which is not defined here.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        filename: default file for load() and revert() when they are called
            without one; a path-like object is converted with os.fspath()
        delayload: stored, coerced to bool, as self.delayload
        policy: passed on to CookieJar.__init__()

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            filename = os.fspath(filename)
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented by this class; always raises NotImplementedError.

        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  The file is opened in text mode and passed, together
        with the ignore_discard and ignore_expires flags, to
        self._really_load().

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises ValueError if no filename is available.  Any exception raised
        while loading (LoadError, OSError or anything else) propagates to the
        caller; the object's cookies are put back as they were before the
        call if this happens.

        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        with self._cookies_lock:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except BaseException:
                # Roll back on *any* failure, not just OSError: e.g. a
                # decoding error while reading the file would otherwise leave
                # the jar emptied or half-loaded.
                self._cookies = old_state
                raise
1826
1827
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    The cookie's name/value pair comes first, followed by path and domain,
    then the optional attributes that are set, then any non-standard
    attributes from cookie._rest (sorted by key), and finally the version.
    Flag attributes (path_spec, port_spec, domain_dot, secure, discard) are
    written without a value.  The result is assembled by join_header_words().

    """
    h = [(cookie.name, cookie.value),
         ("path", cookie.path),
         ("domain", cookie.domain)]
    if cookie.port is not None: h.append(("port", cookie.port))
    if cookie.path_specified: h.append(("path_spec", None))
    if cookie.port_specified: h.append(("port_spec", None))
    if cookie.domain_initial_dot: h.append(("domain_dot", None))
    if cookie.secure: h.append(("secure", None))
    # Compare against None rather than testing truthiness: an expiry time of
    # 0 (the epoch) is a real expiry time and must not be silently dropped.
    if cookie.expires is not None:
        h.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard: h.append(("discard", None))
    if cookie.comment: h.append(("comment", cookie.comment))
    if cookie.comment_url: h.append(("commenturl", cookie.comment_url))

    # sorted so that the output does not depend on dict ordering
    for k in sorted(cookie._rest):
        h.append((k, str(cookie._rest[k])))

    h.append(("version", str(cookie.version)))

    return join_header_words([h])
1855
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        When ignore_discard is false, cookies whose discard attribute is true
        are left out; when ignore_expires is false, cookies that are expired
        at the time of the call are left out.  The string ends with a newline
        if at least one cookie is written, and is empty otherwise.

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to filename in "Set-Cookie3" format.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  ignore_discard and ignore_expires are passed on to
        as_lwp_str().  An existing file is overwritten; a newly created file
        is given owner-only permissions (0o600).

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        # Cookies routinely carry session credentials, so do not create the
        # file with the default (umask-dependent) permissions.
        def owner_only_opener(path, flags):
            return os.open(path, flags, 0o600)

        with open(filename, "w", opener=owner_only_opener) as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read "Set-Cookie3" lines from file object f into this jar.

        f is read with readline(); filename is only used in error messages.
        The first line must match self.magic_re (not defined in this class --
        presumably inherited from a base class), otherwise LoadError is
        raised.  Lines that do not start with "Set-Cookie3:" are skipped.
        Cookies that are discardable or expired are skipped unless
        ignore_discard / ignore_expires is true.

        OSError propagates unchanged; any other exception raised while
        parsing is reported via _warn_unhandled_exception() and converted to
        LoadError.  Cookies set before the failing line stay in the jar.

        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound before the try block so that the error handler below can
        # always report it, even if the very first readline() call fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            # a flag written without a value means "true"
                            if v is None: v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # no usable expiry time: treat as a session cookie
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)
        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1974
1975
class MozillaCookieJar(FileCookieJar):
    """

    WARNING: you may want to backup your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # pattern the first line of a file must contain to be accepted by load
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # text written at the top of every saved file
    header = """\
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file!  Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read cookies.txt lines from file object f into this jar.

        f is read with readline(); filename is only used in error messages.
        The first line must match magic_re, otherwise LoadError is raised.
        Blank lines and lines starting with "#" or "$" are skipped; every
        other line must hold seven tab-separated fields: domain,
        domain_specified, path, secure, expires, name, value.  Cookies that
        are discardable or expired are skipped unless ignore_discard /
        ignore_expires is true.

        OSError propagates unchanged; any other exception raised while
        parsing is reported via _warn_unhandled_exception() and converted to
        LoadError.  Cookies set before the failing line stay in the jar.

        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Bound before the try block so that the error handler below can
        # always report it, even if the very first readline() call fails.
        line = ""
        try:
            while True:
                line = f.readline()
                if line == "": break

                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"): line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                if (line.strip().startswith(("#", "$")) or
                    line.strip() == ""):
                    continue

                domain, domain_specified, path, secure, expires, name, value = \
                        line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                if domain_specified != initial_dot:
                    # Raised explicitly rather than asserted, so that the
                    # check is not stripped when running under -O.  Like any
                    # other parse failure it ends up as a LoadError below.
                    raise ValueError(
                        "domain_specified flag does not agree with the "
                        "domain's leading dot")

                discard = False
                if expires == "":
                    # no expiry time: a session cookie
                    expires = None
                    discard = True

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except OSError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to filename in cookies.txt format.

        filename defaults to self.filename; ValueError is raised if neither
        is available.  Cookies that are discardable or expired are left out
        unless ignore_discard / ignore_expires is true.  An existing file is
        overwritten; a newly created file is given owner-only permissions
        (0o600).

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        # Cookies routinely carry session credentials, so do not create the
        # file with the default (umask-dependent) permissions.
        def owner_only_opener(path, flags):
            return os.open(path, flags, 0o600)

        with open(filename, "w", opener=owner_only_opener) as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = ("TRUE" if cookie.domain.startswith(".")
                               else "FALSE")
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    "\t".join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value])+
                    "\n")
2114