1r"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not 11distributed with the Python standard library, but are available from 12http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] 30 31import os 32import copy 33import datetime 34import re 35import time 36import urllib.parse, urllib.request 37import threading as _threading 38import http.client # only for the default HTTP port 39from calendar import timegm 40 41debug = False # set to True to enable debugging via the logging module 42logger = None 43 44def _debug(*args): 45 if not debug: 46 return 47 global logger 48 if not logger: 49 import logging 50 logger = logging.getLogger("http.cookiejar") 51 return logger.debug(*args) 52 53 54DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT) 55MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 56 "instance initialised with one)") 57 58def _warn_unhandled_exception(): 59 # There are a few catch-all except: statements in this module, for 60 # catching input that's bad in unexpected ways. Warn if any 61 # exceptions are caught there. 62 import io, warnings, traceback 63 f = io.StringIO() 64 traceback.print_exc(None, f) 65 msg = f.getvalue() 66 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) 67 68 69# Date/time conversion 70# ----------------------------------------------------------------------------- 71 72EPOCH_YEAR = 1970 73def _timegm(tt): 74 year, month, mday, hour, min, sec = tt[:6] 75 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 76 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 77 return timegm(tt) 78 else: 79 return None 80 81DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 82MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 83 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 84MONTHS_LOWER = [] 85for month in MONTHS: MONTHS_LOWER.append(month.lower()) 86 87def time2isoz(t=None): 88 """Return a string representing time in seconds since epoch, t. 89 90 If the function is called without an argument, it will use the current 91 time. 92 93 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 94 representing Universal Time (UTC, aka GMT). An example of this format is: 95 96 1994-11-24 08:49:37Z 97 98 """ 99 if t is None: 100 dt = datetime.datetime.utcnow() 101 else: 102 dt = datetime.datetime.utcfromtimestamp(t) 103 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 104 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) 105 106def time2netscape(t=None): 107 """Return a string representing time in seconds since epoch, t. 108 109 If the function is called without an argument, it will use the current 110 time. 111 112 The format of the returned string is like this: 113 114 Wed, DD-Mon-YYYY HH:MM:SS GMT 115 116 """ 117 if t is None: 118 dt = datetime.datetime.utcnow() 119 else: 120 dt = datetime.datetime.utcfromtimestamp(t) 121 return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( 122 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], 123 dt.year, dt.hour, dt.minute, dt.second) 124 125 126UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 127 128TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) 129def offset_from_tz_string(tz): 130 offset = None 131 if tz in UTC_ZONES: 132 offset = 0 133 else: 134 m = TIMEZONE_RE.search(tz) 135 if m: 136 offset = 3600 * int(m.group(2)) 137 if m.group(3): 138 offset = offset + 60 * int(m.group(3)) 139 if m.group(1) == '-': 140 offset = -offset 141 return offset 142 143def _str2time(day, mon, yr, hr, min, sec, tz): 144 yr = int(yr) 145 if yr > datetime.MAXYEAR: 146 return None 147 148 # translate month name to number 149 # month numbers start with 1 (January) 150 try: 151 mon = MONTHS_LOWER.index(mon.lower())+1 152 except ValueError: 153 # maybe it's already a number 154 try: 155 imon = int(mon) 156 except ValueError: 157 return None 158 if 1 <= imon <= 12: 159 mon = imon 160 else: 161 return None 162 163 # make sure clock elements are defined 164 if hr is None: hr = 0 165 if min is None: min = 0 166 if sec is None: sec = 0 167 168 day = int(day) 169 hr = int(hr) 170 min = int(min) 171 sec = int(sec) 172 173 if yr < 1000: 174 # find "obvious" year 175 cur_yr = time.localtime(time.time())[0] 176 m = cur_yr % 100 177 tmp = yr 178 yr = yr + cur_yr - m 179 m = m - tmp 180 if abs(m) > 50: 181 if m > 0: yr = yr + 100 182 else: yr = yr - 100 183 184 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 185 t = _timegm((yr, mon, day, hr, min, sec, tz)) 186 187 if t is not None: 188 # adjust time using timezone string, to get absolute time since epoch 189 if tz is None: 190 tz = "UTC" 191 tz = tz.upper() 192 offset = offset_from_tz_string(tz) 193 if offset is None: 194 return None 195 t = t - offset 196 197 return t 198 199STRICT_DATE_RE = re.compile( 200 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " 201 r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) 202WEEKDAY_RE = re.compile( 203 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) 204LOOSE_HTTP_DATE_RE = re.compile( 205 r"""^ 206 (\d\d?) # day 207 (?:\s+|[-\/]) 208 (\w+) # month 209 (?:\s+|[-\/]) 210 (\d+) # year 211 (?: 212 (?:\s+|:) # separator before clock 213 (\d\d?):(\d\d) # hour:min 214 (?::(\d\d))? # optional seconds 215 )? # optional clock 216 \s* 217 (?: 218 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone 219 \s* 220 )? 221 (?: 222 \(\w+\) # ASCII representation of timezone in parens. 223 \s* 224 )?$""", re.X | re.ASCII) 225def http2time(text): 226 """Returns time in seconds since epoch of time represented by a string. 227 228 Return value is an integer. 229 230 None is returned if the format of str is unrecognized, the time is outside 231 the representable range, or the timezone string is not recognized. If the 232 string contains no timezone, UTC is assumed. 233 234 The timezone in the string may be numerical (like "-0800" or "+0100") or a 235 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the 236 timezone strings equivalent to UTC (zero offset) are known to the function. 237 238 The function loosely parses the following formats: 239 240 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format 241 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format 242 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format 243 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) 244 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) 245 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) 246 247 The parser ignores leading and trailing whitespace. The time may be 248 absent. 249 250 If the year is given with only 2 digits, the function will select the 251 century that makes the year closest to the current date. 252 253 """ 254 # fast exit for strictly conforming string 255 m = STRICT_DATE_RE.search(text) 256 if m: 257 g = m.groups() 258 mon = MONTHS_LOWER.index(g[1].lower()) + 1 259 tt = (int(g[2]), mon, int(g[0]), 260 int(g[3]), int(g[4]), float(g[5])) 261 return _timegm(tt) 262 263 # No, we need some messy parsing... 264 265 # clean up 266 text = text.lstrip() 267 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday 268 269 # tz is time zone specifier string 270 day, mon, yr, hr, min, sec, tz = [None]*7 271 272 # loose regexp parse 273 m = LOOSE_HTTP_DATE_RE.search(text) 274 if m is not None: 275 day, mon, yr, hr, min, sec, tz = m.groups() 276 else: 277 return None # bad format 278 279 return _str2time(day, mon, yr, hr, min, sec, tz) 280 281ISO_DATE_RE = re.compile( 282 r"""^ 283 (\d{4}) # year 284 [-\/]? 285 (\d\d?) # numerical month 286 [-\/]? 287 (\d\d?) # day 288 (?: 289 (?:\s+|[-:Tt]) # separator before clock 290 (\d\d?):?(\d\d) # hour:min 291 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) 292 )? # optional clock 293 \s* 294 (?: 295 ([-+]?\d\d?:?(:?\d\d)? 296 |Z|z) # timezone (Z is "zero meridian", i.e. GMT) 297 \s* 298 )?$""", re.X | re. ASCII) 299def iso2time(text): 300 """ 301 As for http2time, but parses the ISO 8601 formats: 302 303 1994-02-03 14:15:29 -0100 -- ISO 8601 format 304 1994-02-03 14:15:29 -- zone is optional 305 1994-02-03 -- only date 306 1994-02-03T14:15:29 -- Use T as separator 307 19940203T141529Z -- ISO 8601 compact format 308 19940203 -- only date 309 310 """ 311 # clean up 312 text = text.lstrip() 313 314 # tz is time zone specifier string 315 day, mon, yr, hr, min, sec, tz = [None]*7 316 317 # loose regexp parse 318 m = ISO_DATE_RE.search(text) 319 if m is not None: 320 # XXX there's an extra bit of the timezone I'm ignoring here: is 321 # this the right thing to do? 322 yr, mon, day, hr, min, sec, tz, _ = m.groups() 323 else: 324 return None # bad format 325 326 return _str2time(day, mon, yr, hr, min, sec, tz) 327 328 329# Header parsing 330# ----------------------------------------------------------------------------- 331 332def unmatched(match): 333 """Return unmatched part of re.Match object.""" 334 start, end = match.span(0) 335 return match.string[:start]+match.string[end:] 336 337HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") 338HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") 339HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") 340HEADER_ESCAPE_RE = re.compile(r"\\(.)") 341def split_header_words(header_values): 342 r"""Parse header values into a list of lists containing key,value pairs. 343 344 The function knows how to deal with ",", ";" and "=" as well as quoted 345 values after "=". A list of space separated tokens are parsed as if they 346 were separated by ";". 347 348 If the header_values passed as argument contains multiple values, then they 349 are treated as if they were a single value separated by comma ",". 350 351 This means that this function is useful for parsing header fields that 352 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax 353 the requirement for tokens). 354 355 headers = #header 356 header = (token | parameter) *( [";"] (token | parameter)) 357 358 token = 1*<any CHAR except CTLs or separators> 359 separators = "(" | ")" | "<" | ">" | "@" 360 | "," | ";" | ":" | "\" | <"> 361 | "/" | "[" | "]" | "?" | "=" 362 | "{" | "}" | SP | HT 363 364 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) 365 qdtext = <any TEXT except <">> 366 quoted-pair = "\" CHAR 367 368 parameter = attribute "=" value 369 attribute = token 370 value = token | quoted-string 371 372 Each header is represented by a list of key/value pairs. The value for a 373 simple token (not part of a parameter) is None. Syntactically incorrect 374 headers will not necessarily be parsed as you would want. 375 376 This is easier to describe with some examples: 377 378 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) 379 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] 380 >>> split_header_words(['text/html; charset="iso-8859-1"']) 381 [[('text/html', None), ('charset', 'iso-8859-1')]] 382 >>> split_header_words([r'Basic realm="\"foo\bar\""']) 383 [[('Basic', None), ('realm', '"foobar"')]] 384 385 """ 386 assert not isinstance(header_values, str) 387 result = [] 388 for text in header_values: 389 orig_text = text 390 pairs = [] 391 while text: 392 m = HEADER_TOKEN_RE.search(text) 393 if m: 394 text = unmatched(m) 395 name = m.group(1) 396 m = HEADER_QUOTED_VALUE_RE.search(text) 397 if m: # quoted value 398 text = unmatched(m) 399 value = m.group(1) 400 value = HEADER_ESCAPE_RE.sub(r"\1", value) 401 else: 402 m = HEADER_VALUE_RE.search(text) 403 if m: # unquoted value 404 text = unmatched(m) 405 value = m.group(1) 406 value = value.rstrip() 407 else: 408 # no value, a lone token 409 value = None 410 pairs.append((name, value)) 411 elif text.lstrip().startswith(","): 412 # concatenated headers, as per RFC 2616 section 4.2 413 text = text.lstrip()[1:] 414 if pairs: result.append(pairs) 415 pairs = [] 416 else: 417 # skip junk 418 non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text) 419 assert nr_junk_chars > 0, ( 420 "split_header_words bug: '%s', '%s', %s" % 421 (orig_text, text, pairs)) 422 text = non_junk 423 if pairs: result.append(pairs) 424 return result 425 426HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") 427def join_header_words(lists): 428 """Do the inverse (almost) of the conversion done by split_header_words. 429 430 Takes a list of lists of (key, value) pairs and produces a single header 431 value. Attribute values are quoted if needed. 432 433 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]]) 434 'text/plain; charset="iso-8859-1"' 435 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]]) 436 'text/plain, charset="iso-8859-1"' 437 438 """ 439 headers = [] 440 for pairs in lists: 441 attr = [] 442 for k, v in pairs: 443 if v is not None: 444 if not re.search(r"^\w+$", v): 445 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ 446 v = '"%s"' % v 447 k = "%s=%s" % (k, v) 448 attr.append(k) 449 if attr: headers.append("; ".join(attr)) 450 return ", ".join(headers) 451 452def strip_quotes(text): 453 if text.startswith('"'): 454 text = text[1:] 455 if text.endswith('"'): 456 text = text[:-1] 457 return text 458 459def parse_ns_headers(ns_headers): 460 """Ad-hoc parser for Netscape protocol cookie-attributes. 461 462 The old Netscape cookie format for Set-Cookie can for instance contain 463 an unquoted "," in the expires field, so we have to use this ad-hoc 464 parser instead of split_header_words. 465 466 XXX This may not make the best possible effort to parse all the crap 467 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient 468 parser is probably better, so could do worse than following that if 469 this ever gives any trouble. 470 471 Currently, this is also used for parsing RFC 2109 cookies. 472 473 """ 474 known_attrs = ("expires", "domain", "path", "secure", 475 # RFC 2109 attrs (may turn up in Netscape cookies, too) 476 "version", "port", "max-age") 477 478 result = [] 479 for ns_header in ns_headers: 480 pairs = [] 481 version_set = False 482 483 # XXX: The following does not strictly adhere to RFCs in that empty 484 # names and values are legal (the former will only appear once and will 485 # be overwritten if multiple occurrences are present). This is 486 # mostly to deal with backwards compatibility. 487 for ii, param in enumerate(ns_header.split(';')): 488 param = param.strip() 489 490 key, sep, val = param.partition('=') 491 key = key.strip() 492 493 if not key: 494 if ii == 0: 495 break 496 else: 497 continue 498 499 # allow for a distinction between present and empty and missing 500 # altogether 501 val = val.strip() if sep else None 502 503 if ii != 0: 504 lc = key.lower() 505 if lc in known_attrs: 506 key = lc 507 508 if key == "version": 509 # This is an RFC 2109 cookie. 510 if val is not None: 511 val = strip_quotes(val) 512 version_set = True 513 elif key == "expires": 514 # convert expires date to seconds since epoch 515 if val is not None: 516 val = http2time(strip_quotes(val)) # None if invalid 517 pairs.append((key, val)) 518 519 if pairs: 520 if not version_set: 521 pairs.append(("version", "0")) 522 result.append(pairs) 523 524 return result 525 526 527IPV4_RE = re.compile(r"\.\d+$", re.ASCII) 528def is_HDN(text): 529 """Return True if text is a host domain name.""" 530 # XXX 531 # This may well be wrong. Which RFC is HDN defined in, if any (for 532 # the purposes of RFC 2965)? 533 # For the current implementation, what about IPv6? Remember to look 534 # at other uses of IPV4_RE also, if change this. 535 if IPV4_RE.search(text): 536 return False 537 if text == "": 538 return False 539 if text[0] == "." or text[-1] == ".": 540 return False 541 return True 542 543def domain_match(A, B): 544 """Return True if domain A domain-matches domain B, according to RFC 2965. 545 546 A and B may be host domain names or IP addresses. 547 548 RFC 2965, section 1: 549 550 Host names can be specified either as an IP address or a HDN string. 551 Sometimes we compare one host name with another. (Such comparisons SHALL 552 be case-insensitive.) Host A's name domain-matches host B's if 553 554 * their host name strings string-compare equal; or 555 556 * A is a HDN string and has the form NB, where N is a non-empty 557 name string, B has the form .B', and B' is a HDN string. (So, 558 x.y.com domain-matches .Y.com but not Y.com.) 559 560 Note that domain-match is not a commutative operation: a.b.c.com 561 domain-matches .c.com, but not the reverse. 562 563 """ 564 # Note that, if A or B are IP addresses, the only relevant part of the 565 # definition of the domain-match algorithm is the direct string-compare. 566 A = A.lower() 567 B = B.lower() 568 if A == B: 569 return True 570 if not is_HDN(A): 571 return False 572 i = A.rfind(B) 573 if i == -1 or i == 0: 574 # A does not have form NB, or N is the empty string 575 return False 576 if not B.startswith("."): 577 return False 578 if not is_HDN(B[1:]): 579 return False 580 return True 581 582def liberal_is_HDN(text): 583 """Return True if text is a sort-of-like a host domain name. 584 585 For accepting/blocking domains. 586 587 """ 588 if IPV4_RE.search(text): 589 return False 590 return True 591 592def user_domain_match(A, B): 593 """For blocking/accepting domains. 594 595 A and B may be host domain names or IP addresses. 596 597 """ 598 A = A.lower() 599 B = B.lower() 600 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): 601 if A == B: 602 # equal IP addresses 603 return True 604 return False 605 initial_dot = B.startswith(".") 606 if initial_dot and A.endswith(B): 607 return True 608 if not initial_dot and A == B: 609 return True 610 return False 611 612cut_port_re = re.compile(r":\d+$", re.ASCII) 613def request_host(request): 614 """Return request-host, as defined by RFC 2965. 615 616 Variation from RFC: returned value is lowercased, for convenient 617 comparison. 618 619 """ 620 url = request.get_full_url() 621 host = urllib.parse.urlparse(url)[1] 622 if host == "": 623 host = request.get_header("Host", "") 624 625 # remove port, if present 626 host = cut_port_re.sub("", host, 1) 627 return host.lower() 628 629def eff_request_host(request): 630 """Return a tuple (request-host, effective request-host name). 631 632 As defined by RFC 2965, except both are lowercased. 633 634 """ 635 erhn = req_host = request_host(request) 636 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): 637 erhn = req_host + ".local" 638 return req_host, erhn 639 640def request_path(request): 641 """Path component of request-URI, as defined by RFC 2965.""" 642 url = request.get_full_url() 643 parts = urllib.parse.urlsplit(url) 644 path = escape_path(parts.path) 645 if not path.startswith("/"): 646 # fix bad RFC 2396 absoluteURI 647 path = "/" + path 648 return path 649 650def request_port(request): 651 host = request.host 652 i = host.find(':') 653 if i >= 0: 654 port = host[i+1:] 655 try: 656 int(port) 657 except ValueError: 658 _debug("nonnumeric port: '%s'", port) 659 return None 660 else: 661 port = DEFAULT_HTTP_PORT 662 return port 663 664# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't 665# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 666HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 667ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 668def uppercase_escaped_char(match): 669 return "%%%s" % match.group(1).upper() 670def escape_path(path): 671 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 672 # There's no knowing what character encoding was used to create URLs 673 # containing %-escapes, but since we have to pick one to escape invalid 674 # path characters, we pick UTF-8, as recommended in the HTML 4.0 675 # specification: 676 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 677 # And here, kind of: draft-fielding-uri-rfc2396bis-03 678 # (And in draft IRI specification: draft-duerst-iri-05) 679 # (And here, for new URI schemes: RFC 2718) 680 path = urllib.parse.quote(path, HTTP_PATH_SAFE) 681 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 682 return path 683 684def reach(h): 685 """Return reach of host h, as defined by RFC 2965, section 1. 686 687 The reach R of a host name H is defined as follows: 688 689 * If 690 691 - H is the host domain name of a host; and, 692 693 - H has the form A.B; and 694 695 - A has no embedded (that is, interior) dots; and 696 697 - B has at least one embedded dot, or B is the string "local". 698 then the reach of H is .B. 699 700 * Otherwise, the reach of H is H. 701 702 >>> reach("www.acme.com") 703 '.acme.com' 704 >>> reach("acme.com") 705 'acme.com' 706 >>> reach("acme.local") 707 '.local' 708 709 """ 710 i = h.find(".") 711 if i >= 0: 712 #a = h[:i] # this line is only here to show what a is 713 b = h[i+1:] 714 i = b.find(".") 715 if is_HDN(h) and (i >= 0 or b == "local"): 716 return "."+b 717 return h 718 719def is_third_party(request): 720 """ 721 722 RFC 2965, section 3.3.6: 723 724 An unverifiable transaction is to a third-party host if its request- 725 host U does not domain-match the reach R of the request-host O in the 726 origin transaction. 727 728 """ 729 req_host = request_host(request) 730 if not domain_match(req_host, reach(request.origin_req_host)): 731 return True 732 else: 733 return False 734 735 736class Cookie: 737 """HTTP Cookie. 738 739 This class represents both Netscape and RFC 2965 cookies. 740 741 This is deliberately a very simple class. It just holds attributes. It's 742 possible to construct Cookie instances that don't comply with the cookie 743 standards. CookieJar.make_cookies is the factory function for Cookie 744 objects -- it deals with cookie parsing, supplying defaults, and 745 normalising to the representation used in this class. CookiePolicy is 746 responsible for checking them to see whether they should be accepted from 747 and returned to the server. 748 749 Note that the port may be present in the headers, but unspecified ("Port" 750 rather than"Port=80", for example); if this is the case, port is None. 751 752 """ 753 754 def __init__(self, version, name, value, 755 port, port_specified, 756 domain, domain_specified, domain_initial_dot, 757 path, path_specified, 758 secure, 759 expires, 760 discard, 761 comment, 762 comment_url, 763 rest, 764 rfc2109=False, 765 ): 766 767 if version is not None: version = int(version) 768 if expires is not None: expires = int(float(expires)) 769 if port is None and port_specified is True: 770 raise ValueError("if port is None, port_specified must be false") 771 772 self.version = version 773 self.name = name 774 self.value = value 775 self.port = port 776 self.port_specified = port_specified 777 # normalise case, as per RFC 2965 section 3.3.3 778 self.domain = domain.lower() 779 self.domain_specified = domain_specified 780 # Sigh. We need to know whether the domain given in the 781 # cookie-attribute had an initial dot, in order to follow RFC 2965 782 # (as clarified in draft errata). Needed for the returned $Domain 783 # value. 784 self.domain_initial_dot = domain_initial_dot 785 self.path = path 786 self.path_specified = path_specified 787 self.secure = secure 788 self.expires = expires 789 self.discard = discard 790 self.comment = comment 791 self.comment_url = comment_url 792 self.rfc2109 = rfc2109 793 794 self._rest = copy.copy(rest) 795 796 def has_nonstandard_attr(self, name): 797 return name in self._rest 798 def get_nonstandard_attr(self, name, default=None): 799 return self._rest.get(name, default) 800 def set_nonstandard_attr(self, name, value): 801 self._rest[name] = value 802 803 def is_expired(self, now=None): 804 if now is None: now = time.time() 805 if (self.expires is not None) and (self.expires <= now): 806 return True 807 return False 808 809 def __str__(self): 810 if self.port is None: p = "" 811 else: p = ":"+self.port 812 limit = self.domain + p + self.path 813 if self.value is not None: 814 namevalue = "%s=%s" % (self.name, self.value) 815 else: 816 namevalue = self.name 817 return "<Cookie %s for %s>" % (namevalue, limit) 818 819 def __repr__(self): 820 args = [] 821 for name in ("version", "name", "value", 822 "port", "port_specified", 823 "domain", "domain_specified", "domain_initial_dot", 824 "path", "path_specified", 825 "secure", "expires", "discard", "comment", "comment_url", 826 ): 827 attr = getattr(self, name) 828 args.append("%s=%s" % (name, repr(attr))) 829 args.append("rest=%s" % repr(self._rest)) 830 args.append("rfc2109=%s" % repr(self.rfc2109)) 831 return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) 832 833 834class CookiePolicy: 835 """Defines which cookies get accepted from and returned to server. 836 837 May also modify cookies, though this is probably a bad idea. 838 839 The subclass DefaultCookiePolicy defines the standard rules for Netscape 840 and RFC 2965 cookies -- override that if you want a customized policy. 841 842 """ 843 def set_ok(self, cookie, request): 844 """Return true if (and only if) cookie should be accepted from server. 845 846 Currently, pre-expired cookies never get this far -- the CookieJar 847 class deletes such cookies itself. 848 849 """ 850 raise NotImplementedError() 851 852 def return_ok(self, cookie, request): 853 """Return true if (and only if) cookie should be returned to server.""" 854 raise NotImplementedError() 855 856 def domain_return_ok(self, domain, request): 857 """Return false if cookies should not be returned, given cookie domain. 858 """ 859 return True 860 861 def path_return_ok(self, path, request): 862 """Return false if cookies should not be returned, given cookie path. 863 """ 864 return True 865 866 867class DefaultCookiePolicy(CookiePolicy): 868 """Implements the standard rules for accepting and returning cookies.""" 869 870 DomainStrictNoDots = 1 871 DomainStrictNonDomain = 2 872 DomainRFC2965Match = 4 873 874 DomainLiberal = 0 875 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 876 877 def __init__(self, 878 blocked_domains=None, allowed_domains=None, 879 netscape=True, rfc2965=False, 880 rfc2109_as_netscape=None, 881 hide_cookie2=False, 882 strict_domain=False, 883 strict_rfc2965_unverifiable=True, 884 strict_ns_unverifiable=False, 885 strict_ns_domain=DomainLiberal, 886 strict_ns_set_initial_dollar=False, 887 strict_ns_set_path=False, 888 secure_protocols=("https", "wss") 889 ): 890 """Constructor arguments should be passed as keyword arguments only.""" 891 self.netscape = netscape 892 self.rfc2965 = rfc2965 893 self.rfc2109_as_netscape = rfc2109_as_netscape 894 self.hide_cookie2 = hide_cookie2 895 self.strict_domain = strict_domain 896 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 897 self.strict_ns_unverifiable = strict_ns_unverifiable 898 self.strict_ns_domain = strict_ns_domain 899 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 900 self.strict_ns_set_path = strict_ns_set_path 901 self.secure_protocols = secure_protocols 902 903 if blocked_domains is not None: 904 self._blocked_domains = tuple(blocked_domains) 905 else: 906 self._blocked_domains = () 907 908 if allowed_domains is not None: 909 allowed_domains = tuple(allowed_domains) 910 self._allowed_domains = allowed_domains 911 912 def blocked_domains(self): 913 """Return the sequence of blocked domains (as a tuple).""" 914 return self._blocked_domains 915 def set_blocked_domains(self, blocked_domains): 916 """Set the sequence of blocked domains.""" 917 self._blocked_domains = tuple(blocked_domains) 918 919 def is_blocked(self, domain): 920 for blocked_domain in self._blocked_domains: 921 if user_domain_match(domain, blocked_domain): 922 return True 923 return False 924 925 def allowed_domains(self): 926 """Return None, or the sequence of allowed domains (as a tuple).""" 927 return self._allowed_domains 928 def set_allowed_domains(self, allowed_domains): 929 """Set the sequence of allowed domains, or None.""" 930 if allowed_domains is not None: 931 allowed_domains = tuple(allowed_domains) 932 self._allowed_domains = allowed_domains 933 934 def is_not_allowed(self, domain): 935 if self._allowed_domains is None: 936 return False 937 for allowed_domain in self._allowed_domains: 938 if user_domain_match(domain, allowed_domain): 939 return False 940 return True 941 942 def set_ok(self, cookie, request): 943 """ 944 If you override .set_ok(), be sure to call this method. If it returns 945 false, so should your subclass (assuming your subclass wants to be more 946 strict about which cookies to accept). 947 948 """ 949 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 950 951 assert cookie.name is not None 952 953 for n in "version", "verifiability", "name", "path", "domain", "port": 954 fn_name = "set_ok_"+n 955 fn = getattr(self, fn_name) 956 if not fn(cookie, request): 957 return False 958 959 return True 960 961 def set_ok_version(self, cookie, request): 962 if cookie.version is None: 963 # Version is always set to 0 by parse_ns_headers if it's a Netscape 964 # cookie, so this must be an invalid RFC 2965 cookie. 965 _debug(" Set-Cookie2 without version attribute (%s=%s)", 966 cookie.name, cookie.value) 967 return False 968 if cookie.version > 0 and not self.rfc2965: 969 _debug(" RFC 2965 cookies are switched off") 970 return False 971 elif cookie.version == 0 and not self.netscape: 972 _debug(" Netscape cookies are switched off") 973 return False 974 return True 975 976 def set_ok_verifiability(self, cookie, request): 977 if request.unverifiable and is_third_party(request): 978 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 979 _debug(" third-party RFC 2965 cookie during " 980 "unverifiable transaction") 981 return False 982 elif cookie.version == 0 and self.strict_ns_unverifiable: 983 _debug(" third-party Netscape cookie during " 984 "unverifiable transaction") 985 return False 986 return True 987 988 def set_ok_name(self, cookie, request): 989 # Try and stop servers setting V0 cookies designed to hack other 990 # servers that know both V0 and V1 protocols. 991 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 992 cookie.name.startswith("$")): 993 _debug(" illegal name (starts with '$'): '%s'", cookie.name) 994 return False 995 return True 996 997 def set_ok_path(self, cookie, request): 998 if cookie.path_specified: 999 req_path = request_path(request) 1000 if ((cookie.version > 0 or 1001 (cookie.version == 0 and self.strict_ns_set_path)) and 1002 not self.path_return_ok(cookie.path, request)): 1003 _debug(" path attribute %s is not a prefix of request " 1004 "path %s", cookie.path, req_path) 1005 return False 1006 return True 1007 1008 def set_ok_domain(self, cookie, request): 1009 if self.is_blocked(cookie.domain): 1010 _debug(" domain %s is in user block-list", cookie.domain) 1011 return False 1012 if self.is_not_allowed(cookie.domain): 1013 _debug(" domain %s is not in user allow-list", cookie.domain) 1014 return False 1015 if cookie.domain_specified: 1016 req_host, erhn = eff_request_host(request) 1017 domain = cookie.domain 1018 if self.strict_domain and (domain.count(".") >= 2): 1019 # XXX This should probably be compared with the Konqueror 1020 # (kcookiejar.cpp) and Mozilla implementations, but it's a 1021 # losing battle. 1022 i = domain.rfind(".") 1023 j = domain.rfind(".", 0, i) 1024 if j == 0: # domain like .foo.bar 1025 tld = domain[i+1:] 1026 sld = domain[j+1:i] 1027 if sld.lower() in ("co", "ac", "com", "edu", "org", "net", 1028 "gov", "mil", "int", "aero", "biz", "cat", "coop", 1029 "info", "jobs", "mobi", "museum", "name", "pro", 1030 "travel", "eu") and len(tld) == 2: 1031 # domain like .co.uk 1032 _debug(" country-code second level domain %s", domain) 1033 return False 1034 if domain.startswith("."): 1035 undotted_domain = domain[1:] 1036 else: 1037 undotted_domain = domain 1038 embedded_dots = (undotted_domain.find(".") >= 0) 1039 if not embedded_dots and domain != ".local": 1040 _debug(" non-local domain %s contains no embedded dot", 1041 domain) 1042 return False 1043 if cookie.version == 0: 1044 if (not erhn.endswith(domain) and 1045 (not erhn.startswith(".") and 1046 not ("."+erhn).endswith(domain))): 1047 _debug(" effective request-host %s (even with added " 1048 "initial dot) does not end with %s", 1049 erhn, domain) 1050 return False 1051 if (cookie.version > 0 or 1052 (self.strict_ns_domain & self.DomainRFC2965Match)): 1053 if not domain_match(erhn, domain): 1054 _debug(" effective request-host %s does not domain-match " 1055 "%s", erhn, domain) 1056 return False 1057 if (cookie.version > 0 or 1058 (self.strict_ns_domain & self.DomainStrictNoDots)): 1059 host_prefix = req_host[:-len(domain)] 1060 if (host_prefix.find(".") >= 0 and 1061 not IPV4_RE.search(req_host)): 1062 _debug(" host prefix %s for domain %s contains a dot", 1063 host_prefix, domain) 1064 return False 1065 return True 1066 1067 def set_ok_port(self, cookie, request): 1068 if cookie.port_specified: 1069 req_port = request_port(request) 1070 if req_port is None: 1071 req_port = "80" 1072 else: 1073 req_port = str(req_port) 1074 for p in cookie.port.split(","): 1075 try: 1076 int(p) 1077 except ValueError: 1078 _debug(" bad port %s (not numeric)", p) 1079 return False 1080 if p == req_port: 1081 break 1082 else: 1083 _debug(" request port (%s) not found in %s", 1084 req_port, cookie.port) 1085 return False 1086 return True 1087 1088 def return_ok(self, cookie, request): 1089 """ 1090 If you override .return_ok(), be sure to call this method. If it 1091 returns false, so should your subclass (assuming your subclass wants to 1092 be more strict about which cookies to return). 1093 1094 """ 1095 # Path has already been checked by .path_return_ok(), and domain 1096 # blocking done by .domain_return_ok(). 1097 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 1098 1099 for n in "version", "verifiability", "secure", "expires", "port", "domain": 1100 fn_name = "return_ok_"+n 1101 fn = getattr(self, fn_name) 1102 if not fn(cookie, request): 1103 return False 1104 return True 1105 1106 def return_ok_version(self, cookie, request): 1107 if cookie.version > 0 and not self.rfc2965: 1108 _debug(" RFC 2965 cookies are switched off") 1109 return False 1110 elif cookie.version == 0 and not self.netscape: 1111 _debug(" Netscape cookies are switched off") 1112 return False 1113 return True 1114 1115 def return_ok_verifiability(self, cookie, request): 1116 if request.unverifiable and is_third_party(request): 1117 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 1118 _debug(" third-party RFC 2965 cookie during unverifiable " 1119 "transaction") 1120 return False 1121 elif cookie.version == 0 and self.strict_ns_unverifiable: 1122 _debug(" third-party Netscape cookie during unverifiable " 1123 "transaction") 1124 return False 1125 return True 1126 1127 def return_ok_secure(self, cookie, request): 1128 if cookie.secure and request.type not in self.secure_protocols: 1129 _debug(" secure cookie with non-secure request") 1130 return False 1131 return True 1132 1133 def return_ok_expires(self, cookie, request): 1134 if cookie.is_expired(self._now): 1135 _debug(" cookie expired") 1136 return False 1137 return True 1138 1139 def return_ok_port(self, cookie, request): 1140 if cookie.port: 1141 req_port = request_port(request) 1142 if req_port is None: 1143 req_port = "80" 1144 for p in cookie.port.split(","): 1145 if p == req_port: 1146 break 1147 else: 1148 _debug(" request port %s does not match cookie port %s", 1149 req_port, cookie.port) 1150 return False 1151 return True 1152 1153 def return_ok_domain(self, cookie, request): 1154 req_host, erhn = eff_request_host(request) 1155 domain = cookie.domain 1156 1157 if domain and not domain.startswith("."): 1158 dotdomain = "." + domain 1159 else: 1160 dotdomain = domain 1161 1162 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't 1163 if (cookie.version == 0 and 1164 (self.strict_ns_domain & self.DomainStrictNonDomain) and 1165 not cookie.domain_specified and domain != erhn): 1166 _debug(" cookie with unspecified domain does not string-compare " 1167 "equal to request domain") 1168 return False 1169 1170 if cookie.version > 0 and not domain_match(erhn, domain): 1171 _debug(" effective request-host name %s does not domain-match " 1172 "RFC 2965 cookie domain %s", erhn, domain) 1173 return False 1174 if cookie.version == 0 and not ("."+erhn).endswith(dotdomain): 1175 _debug(" request-host %s does not match Netscape cookie domain " 1176 "%s", req_host, domain) 1177 return False 1178 return True 1179 1180 def domain_return_ok(self, domain, request): 1181 # Liberal check of. This is here as an optimization to avoid 1182 # having to load lots of MSIE cookie files unless necessary. 1183 req_host, erhn = eff_request_host(request) 1184 if not req_host.startswith("."): 1185 req_host = "."+req_host 1186 if not erhn.startswith("."): 1187 erhn = "."+erhn 1188 if domain and not domain.startswith("."): 1189 dotdomain = "." + domain 1190 else: 1191 dotdomain = domain 1192 if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)): 1193 #_debug(" request domain %s does not match cookie domain %s", 1194 # req_host, domain) 1195 return False 1196 1197 if self.is_blocked(domain): 1198 _debug(" domain %s is in user block-list", domain) 1199 return False 1200 if self.is_not_allowed(domain): 1201 _debug(" domain %s is not in user allow-list", domain) 1202 return False 1203 1204 return True 1205 1206 def path_return_ok(self, path, request): 1207 _debug("- checking cookie path=%s", path) 1208 req_path = request_path(request) 1209 pathlen = len(path) 1210 if req_path == path: 1211 return True 1212 elif (req_path.startswith(path) and 1213 (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")): 1214 return True 1215 1216 _debug(" %s does not path-match %s", req_path, path) 1217 return False 1218 1219def vals_sorted_by_key(adict): 1220 keys = sorted(adict.keys()) 1221 return map(adict.get, keys) 1222 1223def deepvalues(mapping): 1224 """Iterates over nested mapping, depth-first, in sorted order by key.""" 1225 values = vals_sorted_by_key(mapping) 1226 for obj in values: 1227 mapping = False 1228 try: 1229 obj.items 1230 except AttributeError: 1231 pass 1232 else: 1233 mapping = True 1234 yield from deepvalues(obj) 1235 if not mapping: 1236 yield obj 1237 1238 1239# Used as second parameter to dict.get() method, to distinguish absent 1240# dict key from one with a None value. 1241class Absent: pass 1242 1243class CookieJar: 1244 """Collection of HTTP cookies. 1245 1246 You may not need to know about this class: try 1247 urllib.request.build_opener(HTTPCookieProcessor).open(url). 1248 """ 1249 1250 non_word_re = re.compile(r"\W") 1251 quote_re = re.compile(r"([\"\\])") 1252 strict_domain_re = re.compile(r"\.?[^.]*") 1253 domain_re = re.compile(r"[^.]*") 1254 dots_re = re.compile(r"^\.+") 1255 1256 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) 1257 1258 def __init__(self, policy=None): 1259 if policy is None: 1260 policy = DefaultCookiePolicy() 1261 self._policy = policy 1262 1263 self._cookies_lock = _threading.RLock() 1264 self._cookies = {} 1265 1266 def set_policy(self, policy): 1267 self._policy = policy 1268 1269 def _cookies_for_domain(self, domain, request): 1270 cookies = [] 1271 if not self._policy.domain_return_ok(domain, request): 1272 return [] 1273 _debug("Checking %s for cookies to return", domain) 1274 cookies_by_path = self._cookies[domain] 1275 for path in cookies_by_path.keys(): 1276 if not self._policy.path_return_ok(path, request): 1277 continue 1278 cookies_by_name = cookies_by_path[path] 1279 for cookie in cookies_by_name.values(): 1280 if not self._policy.return_ok(cookie, request): 1281 _debug(" not returning cookie") 1282 continue 1283 _debug(" it's a match") 1284 cookies.append(cookie) 1285 return cookies 1286 1287 def _cookies_for_request(self, request): 1288 """Return a list of cookies to be returned to server.""" 1289 cookies = [] 1290 for domain in self._cookies.keys(): 1291 cookies.extend(self._cookies_for_domain(domain, request)) 1292 return cookies 1293 1294 def _cookie_attrs(self, cookies): 1295 """Return a list of cookie-attributes to be returned to server. 1296 1297 like ['foo="bar"; $Path="/"', ...] 1298 1299 The $Version attribute is also added when appropriate (currently only 1300 once per request). 1301 1302 """ 1303 # add cookies in order of most specific (ie. longest) path first 1304 cookies.sort(key=lambda a: len(a.path), reverse=True) 1305 1306 version_set = False 1307 1308 attrs = [] 1309 for cookie in cookies: 1310 # set version of Cookie header 1311 # XXX 1312 # What should it be if multiple matching Set-Cookie headers have 1313 # different versions themselves? 1314 # Answer: there is no answer; was supposed to be settled by 1315 # RFC 2965 errata, but that may never appear... 1316 version = cookie.version 1317 if not version_set: 1318 version_set = True 1319 if version > 0: 1320 attrs.append("$Version=%s" % version) 1321 1322 # quote cookie value if necessary 1323 # (not for Netscape protocol, which already has any quotes 1324 # intact, due to the poorly-specified Netscape Cookie: syntax) 1325 if ((cookie.value is not None) and 1326 self.non_word_re.search(cookie.value) and version > 0): 1327 value = self.quote_re.sub(r"\\\1", cookie.value) 1328 else: 1329 value = cookie.value 1330 1331 # add cookie-attributes to be returned in Cookie header 1332 if cookie.value is None: 1333 attrs.append(cookie.name) 1334 else: 1335 attrs.append("%s=%s" % (cookie.name, value)) 1336 if version > 0: 1337 if cookie.path_specified: 1338 attrs.append('$Path="%s"' % cookie.path) 1339 if cookie.domain.startswith("."): 1340 domain = cookie.domain 1341 if (not cookie.domain_initial_dot and 1342 domain.startswith(".")): 1343 domain = domain[1:] 1344 attrs.append('$Domain="%s"' % domain) 1345 if cookie.port is not None: 1346 p = "$Port" 1347 if cookie.port_specified: 1348 p = p + ('="%s"' % cookie.port) 1349 attrs.append(p) 1350 1351 return attrs 1352 1353 def add_cookie_header(self, request): 1354 """Add correct Cookie: header to request (urllib.request.Request object). 1355 1356 The Cookie2 header is also added unless policy.hide_cookie2 is true. 1357 1358 """ 1359 _debug("add_cookie_header") 1360 self._cookies_lock.acquire() 1361 try: 1362 1363 self._policy._now = self._now = int(time.time()) 1364 1365 cookies = self._cookies_for_request(request) 1366 1367 attrs = self._cookie_attrs(cookies) 1368 if attrs: 1369 if not request.has_header("Cookie"): 1370 request.add_unredirected_header( 1371 "Cookie", "; ".join(attrs)) 1372 1373 # if necessary, advertise that we know RFC 2965 1374 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1375 not request.has_header("Cookie2")): 1376 for cookie in cookies: 1377 if cookie.version != 1: 1378 request.add_unredirected_header("Cookie2", '$Version="1"') 1379 break 1380 1381 finally: 1382 self._cookies_lock.release() 1383 1384 self.clear_expired_cookies() 1385 1386 def _normalized_cookie_tuples(self, attrs_set): 1387 """Return list of tuples containing normalised cookie information. 1388 1389 attrs_set is the list of lists of key,value pairs extracted from 1390 the Set-Cookie or Set-Cookie2 headers. 1391 1392 Tuples are name, value, standard, rest, where name and value are the 1393 cookie name and value, standard is a dictionary containing the standard 1394 cookie-attributes (discard, secure, version, expires or max-age, 1395 domain, path and port) and rest is a dictionary containing the rest of 1396 the cookie-attributes. 1397 1398 """ 1399 cookie_tuples = [] 1400 1401 boolean_attrs = "discard", "secure" 1402 value_attrs = ("version", 1403 "expires", "max-age", 1404 "domain", "path", "port", 1405 "comment", "commenturl") 1406 1407 for cookie_attrs in attrs_set: 1408 name, value = cookie_attrs[0] 1409 1410 # Build dictionary of standard cookie-attributes (standard) and 1411 # dictionary of other cookie-attributes (rest). 1412 1413 # Note: expiry time is normalised to seconds since epoch. V0 1414 # cookies should have the Expires cookie-attribute, and V1 cookies 1415 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1416 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1417 # accept either (but prefer Max-Age). 1418 max_age_set = False 1419 1420 bad_cookie = False 1421 1422 standard = {} 1423 rest = {} 1424 for k, v in cookie_attrs[1:]: 1425 lc = k.lower() 1426 # don't lose case distinction for unknown fields 1427 if lc in value_attrs or lc in boolean_attrs: 1428 k = lc 1429 if k in boolean_attrs and v is None: 1430 # boolean cookie-attribute is present, but has no value 1431 # (like "discard", rather than "port=80") 1432 v = True 1433 if k in standard: 1434 # only first value is significant 1435 continue 1436 if k == "domain": 1437 if v is None: 1438 _debug(" missing value for domain attribute") 1439 bad_cookie = True 1440 break 1441 # RFC 2965 section 3.3.3 1442 v = v.lower() 1443 if k == "expires": 1444 if max_age_set: 1445 # Prefer max-age to expires (like Mozilla) 1446 continue 1447 if v is None: 1448 _debug(" missing or invalid value for expires " 1449 "attribute: treating as session cookie") 1450 continue 1451 if k == "max-age": 1452 max_age_set = True 1453 try: 1454 v = int(v) 1455 except ValueError: 1456 _debug(" missing or invalid (non-numeric) value for " 1457 "max-age attribute") 1458 bad_cookie = True 1459 break 1460 # convert RFC 2965 Max-Age to seconds since epoch 1461 # XXX Strictly you're supposed to follow RFC 2616 1462 # age-calculation rules. Remember that zero Max-Age 1463 # is a request to discard (old and new) cookie, though. 1464 k = "expires" 1465 v = self._now + v 1466 if (k in value_attrs) or (k in boolean_attrs): 1467 if (v is None and 1468 k not in ("port", "comment", "commenturl")): 1469 _debug(" missing value for %s attribute" % k) 1470 bad_cookie = True 1471 break 1472 standard[k] = v 1473 else: 1474 rest[k] = v 1475 1476 if bad_cookie: 1477 continue 1478 1479 cookie_tuples.append((name, value, standard, rest)) 1480 1481 return cookie_tuples 1482 1483 def _cookie_from_cookie_tuple(self, tup, request): 1484 # standard is dict of standard cookie-attributes, rest is dict of the 1485 # rest of them 1486 name, value, standard, rest = tup 1487 1488 domain = standard.get("domain", Absent) 1489 path = standard.get("path", Absent) 1490 port = standard.get("port", Absent) 1491 expires = standard.get("expires", Absent) 1492 1493 # set the easy defaults 1494 version = standard.get("version", None) 1495 if version is not None: 1496 try: 1497 version = int(version) 1498 except ValueError: 1499 return None # invalid version, ignore cookie 1500 secure = standard.get("secure", False) 1501 # (discard is also set if expires is Absent) 1502 discard = standard.get("discard", False) 1503 comment = standard.get("comment", None) 1504 comment_url = standard.get("commenturl", None) 1505 1506 # set default path 1507 if path is not Absent and path != "": 1508 path_specified = True 1509 path = escape_path(path) 1510 else: 1511 path_specified = False 1512 path = request_path(request) 1513 i = path.rfind("/") 1514 if i != -1: 1515 if version == 0: 1516 # Netscape spec parts company from reality here 1517 path = path[:i] 1518 else: 1519 path = path[:i+1] 1520 if len(path) == 0: path = "/" 1521 1522 # set default domain 1523 domain_specified = domain is not Absent 1524 # but first we have to remember whether it starts with a dot 1525 domain_initial_dot = False 1526 if domain_specified: 1527 domain_initial_dot = bool(domain.startswith(".")) 1528 if domain is Absent: 1529 req_host, erhn = eff_request_host(request) 1530 domain = erhn 1531 elif not domain.startswith("."): 1532 domain = "."+domain 1533 1534 # set default port 1535 port_specified = False 1536 if port is not Absent: 1537 if port is None: 1538 # Port attr present, but has no value: default to request port. 1539 # Cookie should then only be sent back on that port. 1540 port = request_port(request) 1541 else: 1542 port_specified = True 1543 port = re.sub(r"\s+", "", port) 1544 else: 1545 # No port attr present. Cookie can be sent back on any port. 1546 port = None 1547 1548 # set default expires and discard 1549 if expires is Absent: 1550 expires = None 1551 discard = True 1552 elif expires <= self._now: 1553 # Expiry date in past is request to delete cookie. This can't be 1554 # in DefaultCookiePolicy, because can't delete cookies there. 1555 try: 1556 self.clear(domain, path, name) 1557 except KeyError: 1558 pass 1559 _debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1560 domain, path, name) 1561 return None 1562 1563 return Cookie(version, 1564 name, value, 1565 port, port_specified, 1566 domain, domain_specified, domain_initial_dot, 1567 path, path_specified, 1568 secure, 1569 expires, 1570 discard, 1571 comment, 1572 comment_url, 1573 rest) 1574 1575 def _cookies_from_attrs_set(self, attrs_set, request): 1576 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1577 1578 cookies = [] 1579 for tup in cookie_tuples: 1580 cookie = self._cookie_from_cookie_tuple(tup, request) 1581 if cookie: cookies.append(cookie) 1582 return cookies 1583 1584 def _process_rfc2109_cookies(self, cookies): 1585 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) 1586 if rfc2109_as_ns is None: 1587 rfc2109_as_ns = not self._policy.rfc2965 1588 for cookie in cookies: 1589 if cookie.version == 1: 1590 cookie.rfc2109 = True 1591 if rfc2109_as_ns: 1592 # treat 2109 cookies as Netscape cookies rather than 1593 # as RFC2965 cookies 1594 cookie.version = 0 1595 1596 def make_cookies(self, response, request): 1597 """Return sequence of Cookie objects extracted from response object.""" 1598 # get cookie-attributes for RFC 2965 and Netscape protocols 1599 headers = response.info() 1600 rfc2965_hdrs = headers.get_all("Set-Cookie2", []) 1601 ns_hdrs = headers.get_all("Set-Cookie", []) 1602 self._policy._now = self._now = int(time.time()) 1603 1604 rfc2965 = self._policy.rfc2965 1605 netscape = self._policy.netscape 1606 1607 if ((not rfc2965_hdrs and not ns_hdrs) or 1608 (not ns_hdrs and not rfc2965) or 1609 (not rfc2965_hdrs and not netscape) or 1610 (not netscape and not rfc2965)): 1611 return [] # no relevant cookie headers: quick exit 1612 1613 try: 1614 cookies = self._cookies_from_attrs_set( 1615 split_header_words(rfc2965_hdrs), request) 1616 except Exception: 1617 _warn_unhandled_exception() 1618 cookies = [] 1619 1620 if ns_hdrs and netscape: 1621 try: 1622 # RFC 2109 and Netscape cookies 1623 ns_cookies = self._cookies_from_attrs_set( 1624 parse_ns_headers(ns_hdrs), request) 1625 except Exception: 1626 _warn_unhandled_exception() 1627 ns_cookies = [] 1628 self._process_rfc2109_cookies(ns_cookies) 1629 1630 # Look for Netscape cookies (from Set-Cookie headers) that match 1631 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1632 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1633 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1634 # bundled in with the Netscape cookies for this purpose, which is 1635 # reasonable behaviour. 1636 if rfc2965: 1637 lookup = {} 1638 for cookie in cookies: 1639 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1640 1641 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1642 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1643 return key not in lookup 1644 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1645 1646 if ns_cookies: 1647 cookies.extend(ns_cookies) 1648 1649 return cookies 1650 1651 def set_cookie_if_ok(self, cookie, request): 1652 """Set a cookie if policy says it's OK to do so.""" 1653 self._cookies_lock.acquire() 1654 try: 1655 self._policy._now = self._now = int(time.time()) 1656 1657 if self._policy.set_ok(cookie, request): 1658 self.set_cookie(cookie) 1659 1660 1661 finally: 1662 self._cookies_lock.release() 1663 1664 def set_cookie(self, cookie): 1665 """Set a cookie, without checking whether or not it should be set.""" 1666 c = self._cookies 1667 self._cookies_lock.acquire() 1668 try: 1669 if cookie.domain not in c: c[cookie.domain] = {} 1670 c2 = c[cookie.domain] 1671 if cookie.path not in c2: c2[cookie.path] = {} 1672 c3 = c2[cookie.path] 1673 c3[cookie.name] = cookie 1674 finally: 1675 self._cookies_lock.release() 1676 1677 def extract_cookies(self, response, request): 1678 """Extract cookies from response, where allowable given the request.""" 1679 _debug("extract_cookies: %s", response.info()) 1680 self._cookies_lock.acquire() 1681 try: 1682 for cookie in self.make_cookies(response, request): 1683 if self._policy.set_ok(cookie, request): 1684 _debug(" setting cookie: %s", cookie) 1685 self.set_cookie(cookie) 1686 finally: 1687 self._cookies_lock.release() 1688 1689 def clear(self, domain=None, path=None, name=None): 1690 """Clear some cookies. 1691 1692 Invoking this method without arguments will clear all cookies. If 1693 given a single argument, only cookies belonging to that domain will be 1694 removed. If given two arguments, cookies belonging to the specified 1695 path within that domain are removed. If given three arguments, then 1696 the cookie with the specified name, path and domain is removed. 1697 1698 Raises KeyError if no matching cookie exists. 1699 1700 """ 1701 if name is not None: 1702 if (domain is None) or (path is None): 1703 raise ValueError( 1704 "domain and path must be given to remove a cookie by name") 1705 del self._cookies[domain][path][name] 1706 elif path is not None: 1707 if domain is None: 1708 raise ValueError( 1709 "domain must be given to remove cookies by path") 1710 del self._cookies[domain][path] 1711 elif domain is not None: 1712 del self._cookies[domain] 1713 else: 1714 self._cookies = {} 1715 1716 def clear_session_cookies(self): 1717 """Discard all session cookies. 1718 1719 Note that the .save() method won't save session cookies anyway, unless 1720 you ask otherwise by passing a true ignore_discard argument. 1721 1722 """ 1723 self._cookies_lock.acquire() 1724 try: 1725 for cookie in self: 1726 if cookie.discard: 1727 self.clear(cookie.domain, cookie.path, cookie.name) 1728 finally: 1729 self._cookies_lock.release() 1730 1731 def clear_expired_cookies(self): 1732 """Discard all expired cookies. 1733 1734 You probably don't need to call this method: expired cookies are never 1735 sent back to the server (provided you're using DefaultCookiePolicy), 1736 this method is called by CookieJar itself every so often, and the 1737 .save() method won't save expired cookies anyway (unless you ask 1738 otherwise by passing a true ignore_expires argument). 1739 1740 """ 1741 self._cookies_lock.acquire() 1742 try: 1743 now = time.time() 1744 for cookie in self: 1745 if cookie.is_expired(now): 1746 self.clear(cookie.domain, cookie.path, cookie.name) 1747 finally: 1748 self._cookies_lock.release() 1749 1750 def __iter__(self): 1751 return deepvalues(self._cookies) 1752 1753 def __len__(self): 1754 """Return number of contained cookies.""" 1755 i = 0 1756 for cookie in self: i = i + 1 1757 return i 1758 1759 def __repr__(self): 1760 r = [] 1761 for cookie in self: r.append(repr(cookie)) 1762 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1763 1764 def __str__(self): 1765 r = [] 1766 for cookie in self: r.append(str(cookie)) 1767 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1768 1769 1770# derives from OSError for backwards-compatibility with Python 2.4.0 1771class LoadError(OSError): pass 1772 1773class FileCookieJar(CookieJar): 1774 """CookieJar that can be loaded from and saved to a file.""" 1775 1776 def __init__(self, filename=None, delayload=False, policy=None): 1777 """ 1778 Cookies are NOT loaded from the named file until either the .load() or 1779 .revert() method is called. 1780 1781 """ 1782 CookieJar.__init__(self, policy) 1783 if filename is not None: 1784 filename = os.fspath(filename) 1785 self.filename = filename 1786 self.delayload = bool(delayload) 1787 1788 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1789 """Save cookies to a file.""" 1790 raise NotImplementedError() 1791 1792 def load(self, filename=None, ignore_discard=False, ignore_expires=False): 1793 """Load cookies from a file.""" 1794 if filename is None: 1795 if self.filename is not None: filename = self.filename 1796 else: raise ValueError(MISSING_FILENAME_TEXT) 1797 1798 with open(filename) as f: 1799 self._really_load(f, filename, ignore_discard, ignore_expires) 1800 1801 def revert(self, filename=None, 1802 ignore_discard=False, ignore_expires=False): 1803 """Clear all cookies and reload cookies from a saved file. 1804 1805 Raises LoadError (or OSError) if reversion is not successful; the 1806 object's state will not be altered if this happens. 1807 1808 """ 1809 if filename is None: 1810 if self.filename is not None: filename = self.filename 1811 else: raise ValueError(MISSING_FILENAME_TEXT) 1812 1813 self._cookies_lock.acquire() 1814 try: 1815 1816 old_state = copy.deepcopy(self._cookies) 1817 self._cookies = {} 1818 try: 1819 self.load(filename, ignore_discard, ignore_expires) 1820 except OSError: 1821 self._cookies = old_state 1822 raise 1823 1824 finally: 1825 self._cookies_lock.release() 1826 1827 1828def lwp_cookie_str(cookie): 1829 """Return string representation of Cookie in the LWP cookie file format. 1830 1831 Actually, the format is extended a bit -- see module docstring. 1832 1833 """ 1834 h = [(cookie.name, cookie.value), 1835 ("path", cookie.path), 1836 ("domain", cookie.domain)] 1837 if cookie.port is not None: h.append(("port", cookie.port)) 1838 if cookie.path_specified: h.append(("path_spec", None)) 1839 if cookie.port_specified: h.append(("port_spec", None)) 1840 if cookie.domain_initial_dot: h.append(("domain_dot", None)) 1841 if cookie.secure: h.append(("secure", None)) 1842 if cookie.expires: h.append(("expires", 1843 time2isoz(float(cookie.expires)))) 1844 if cookie.discard: h.append(("discard", None)) 1845 if cookie.comment: h.append(("comment", cookie.comment)) 1846 if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) 1847 1848 keys = sorted(cookie._rest.keys()) 1849 for k in keys: 1850 h.append((k, str(cookie._rest[k]))) 1851 1852 h.append(("version", str(cookie.version))) 1853 1854 return join_header_words([h]) 1855 1856class LWPCookieJar(FileCookieJar): 1857 """ 1858 The LWPCookieJar saves a sequence of "Set-Cookie3" lines. 1859 "Set-Cookie3" is the format used by the libwww-perl library, not known 1860 to be compatible with any browser, but which is easy to read and 1861 doesn't lose information about RFC 2965 cookies. 1862 1863 Additional methods 1864 1865 as_lwp_str(ignore_discard=True, ignore_expired=True) 1866 1867 """ 1868 1869 def as_lwp_str(self, ignore_discard=True, ignore_expires=True): 1870 """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers. 1871 1872 ignore_discard and ignore_expires: see docstring for FileCookieJar.save 1873 1874 """ 1875 now = time.time() 1876 r = [] 1877 for cookie in self: 1878 if not ignore_discard and cookie.discard: 1879 continue 1880 if not ignore_expires and cookie.is_expired(now): 1881 continue 1882 r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) 1883 return "\n".join(r+[""]) 1884 1885 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1886 if filename is None: 1887 if self.filename is not None: filename = self.filename 1888 else: raise ValueError(MISSING_FILENAME_TEXT) 1889 1890 with open(filename, "w") as f: 1891 # There really isn't an LWP Cookies 2.0 format, but this indicates 1892 # that there is extra information in here (domain_dot and 1893 # port_spec) while still being compatible with libwww-perl, I hope. 1894 f.write("#LWP-Cookies-2.0\n") 1895 f.write(self.as_lwp_str(ignore_discard, ignore_expires)) 1896 1897 def _really_load(self, f, filename, ignore_discard, ignore_expires): 1898 magic = f.readline() 1899 if not self.magic_re.search(magic): 1900 msg = ("%r does not look like a Set-Cookie3 (LWP) format " 1901 "file" % filename) 1902 raise LoadError(msg) 1903 1904 now = time.time() 1905 1906 header = "Set-Cookie3:" 1907 boolean_attrs = ("port_spec", "path_spec", "domain_dot", 1908 "secure", "discard") 1909 value_attrs = ("version", 1910 "port", "path", "domain", 1911 "expires", 1912 "comment", "commenturl") 1913 1914 try: 1915 while 1: 1916 line = f.readline() 1917 if line == "": break 1918 if not line.startswith(header): 1919 continue 1920 line = line[len(header):].strip() 1921 1922 for data in split_header_words([line]): 1923 name, value = data[0] 1924 standard = {} 1925 rest = {} 1926 for k in boolean_attrs: 1927 standard[k] = False 1928 for k, v in data[1:]: 1929 if k is not None: 1930 lc = k.lower() 1931 else: 1932 lc = None 1933 # don't lose case distinction for unknown fields 1934 if (lc in value_attrs) or (lc in boolean_attrs): 1935 k = lc 1936 if k in boolean_attrs: 1937 if v is None: v = True 1938 standard[k] = v 1939 elif k in value_attrs: 1940 standard[k] = v 1941 else: 1942 rest[k] = v 1943 1944 h = standard.get 1945 expires = h("expires") 1946 discard = h("discard") 1947 if expires is not None: 1948 expires = iso2time(expires) 1949 if expires is None: 1950 discard = True 1951 domain = h("domain") 1952 domain_specified = domain.startswith(".") 1953 c = Cookie(h("version"), name, value, 1954 h("port"), h("port_spec"), 1955 domain, domain_specified, h("domain_dot"), 1956 h("path"), h("path_spec"), 1957 h("secure"), 1958 expires, 1959 discard, 1960 h("comment"), 1961 h("commenturl"), 1962 rest) 1963 if not ignore_discard and c.discard: 1964 continue 1965 if not ignore_expires and c.is_expired(now): 1966 continue 1967 self.set_cookie(c) 1968 except OSError: 1969 raise 1970 except Exception: 1971 _warn_unhandled_exception() 1972 raise LoadError("invalid Set-Cookie3 format file %r: %r" % 1973 (filename, line)) 1974 1975 1976class MozillaCookieJar(FileCookieJar): 1977 """ 1978 1979 WARNING: you may want to backup your browser's cookies file if you use 1980 this class to save cookies. I *think* it works, but there have been 1981 bugs in the past! 1982 1983 This class differs from CookieJar only in the format it uses to save and 1984 load cookies to and from a file. This class uses the Mozilla/Netscape 1985 `cookies.txt' format. lynx uses this file format, too. 1986 1987 Don't expect cookies saved while the browser is running to be noticed by 1988 the browser (in fact, Mozilla on unix will overwrite your saved cookies if 1989 you change them on disk while it's running; on Windows, you probably can't 1990 save at all while the browser is running). 1991 1992 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to 1993 Netscape cookies on saving. 1994 1995 In particular, the cookie version and port number information is lost, 1996 together with information about whether or not Path, Port and Discard were 1997 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the 1998 domain as set in the HTTP header started with a dot (yes, I'm aware some 1999 domains in Netscape files start with a dot and some don't -- trust me, you 2000 really don't want to know any more about this). 2001 2002 Note that though Mozilla and Netscape use the same format, they use 2003 slightly different headers. The class saves cookies using the Netscape 2004 header by default (Mozilla can cope with that). 2005 2006 """ 2007 magic_re = re.compile("#( Netscape)? HTTP Cookie File") 2008 header = """\ 2009# Netscape HTTP Cookie File 2010# http://curl.haxx.se/rfc/cookie_spec.html 2011# This is a generated file! Do not edit. 2012 2013""" 2014 2015 def _really_load(self, f, filename, ignore_discard, ignore_expires): 2016 now = time.time() 2017 2018 magic = f.readline() 2019 if not self.magic_re.search(magic): 2020 raise LoadError( 2021 "%r does not look like a Netscape format cookies file" % 2022 filename) 2023 2024 try: 2025 while 1: 2026 line = f.readline() 2027 if line == "": break 2028 2029 # last field may be absent, so keep any trailing tab 2030 if line.endswith("\n"): line = line[:-1] 2031 2032 # skip comments and blank lines XXX what is $ for? 2033 if (line.strip().startswith(("#", "$")) or 2034 line.strip() == ""): 2035 continue 2036 2037 domain, domain_specified, path, secure, expires, name, value = \ 2038 line.split("\t") 2039 secure = (secure == "TRUE") 2040 domain_specified = (domain_specified == "TRUE") 2041 if name == "": 2042 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2043 # with no name, whereas http.cookiejar regards it as a 2044 # cookie with no value. 2045 name = value 2046 value = None 2047 2048 initial_dot = domain.startswith(".") 2049 assert domain_specified == initial_dot 2050 2051 discard = False 2052 if expires == "": 2053 expires = None 2054 discard = True 2055 2056 # assume path_specified is false 2057 c = Cookie(0, name, value, 2058 None, False, 2059 domain, domain_specified, initial_dot, 2060 path, False, 2061 secure, 2062 expires, 2063 discard, 2064 None, 2065 None, 2066 {}) 2067 if not ignore_discard and c.discard: 2068 continue 2069 if not ignore_expires and c.is_expired(now): 2070 continue 2071 self.set_cookie(c) 2072 2073 except OSError: 2074 raise 2075 except Exception: 2076 _warn_unhandled_exception() 2077 raise LoadError("invalid Netscape format cookies file %r: %r" % 2078 (filename, line)) 2079 2080 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 2081 if filename is None: 2082 if self.filename is not None: filename = self.filename 2083 else: raise ValueError(MISSING_FILENAME_TEXT) 2084 2085 with open(filename, "w") as f: 2086 f.write(self.header) 2087 now = time.time() 2088 for cookie in self: 2089 if not ignore_discard and cookie.discard: 2090 continue 2091 if not ignore_expires and cookie.is_expired(now): 2092 continue 2093 if cookie.secure: secure = "TRUE" 2094 else: secure = "FALSE" 2095 if cookie.domain.startswith("."): initial_dot = "TRUE" 2096 else: initial_dot = "FALSE" 2097 if cookie.expires is not None: 2098 expires = str(cookie.expires) 2099 else: 2100 expires = "" 2101 if cookie.value is None: 2102 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2103 # with no name, whereas http.cookiejar regards it as a 2104 # cookie with no value. 2105 name = "" 2106 value = cookie.name 2107 else: 2108 name = cookie.name 2109 value = cookie.value 2110 f.write( 2111 "\t".join([cookie.domain, initial_dot, cookie.path, 2112 secure, expires, name, value])+ 2113 "\n") 2114