1r"""HTTP cookie handling for web clients. 2 3This module has (now fairly distant) origins in Gisle Aas' Perl module 4HTTP::Cookies, from the libwww-perl library. 5 6Docstrings, comments and debug strings in this code refer to the 7attributes of the HTTP cookie system as cookie-attributes, to distinguish 8them clearly from Python attributes. 9 10Class diagram (note that BSDDBCookieJar and the MSIE* classes are not 11distributed with the Python standard library, but are available from 12http://wwwsearch.sf.net/): 13 14 CookieJar____ 15 / \ \ 16 FileCookieJar \ \ 17 / | \ \ \ 18 MozillaCookieJar | LWPCookieJar \ \ 19 | | \ 20 | ---MSIEBase | \ 21 | / | | \ 22 | / MSIEDBCookieJar BSDDBCookieJar 23 |/ 24 MSIECookieJar 25 26""" 27 28__all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', 29 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] 30 31import copy 32import datetime 33import re 34import time 35import urllib.parse, urllib.request 36import threading as _threading 37import http.client # only for the default HTTP port 38from calendar import timegm 39 40debug = False # set to True to enable debugging via the logging module 41logger = None 42 43def _debug(*args): 44 if not debug: 45 return 46 global logger 47 if not logger: 48 import logging 49 logger = logging.getLogger("http.cookiejar") 50 return logger.debug(*args) 51 52 53DEFAULT_HTTP_PORT = str(http.client.HTTP_PORT) 54MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " 55 "instance initialised with one)") 56 57def _warn_unhandled_exception(): 58 # There are a few catch-all except: statements in this module, for 59 # catching input that's bad in unexpected ways. Warn if any 60 # exceptions are caught there. 61 import io, warnings, traceback 62 f = io.StringIO() 63 traceback.print_exc(None, f) 64 msg = f.getvalue() 65 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) 66 67 68# Date/time conversion 69# ----------------------------------------------------------------------------- 70 71EPOCH_YEAR = 1970 72def _timegm(tt): 73 year, month, mday, hour, min, sec = tt[:6] 74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and 75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): 76 return timegm(tt) 77 else: 78 return None 79 80DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] 81MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", 82 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] 83MONTHS_LOWER = [] 84for month in MONTHS: MONTHS_LOWER.append(month.lower()) 85 86def time2isoz(t=None): 87 """Return a string representing time in seconds since epoch, t. 88 89 If the function is called without an argument, it will use the current 90 time. 91 92 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", 93 representing Universal Time (UTC, aka GMT). An example of this format is: 94 95 1994-11-24 08:49:37Z 96 97 """ 98 if t is None: 99 dt = datetime.datetime.utcnow() 100 else: 101 dt = datetime.datetime.utcfromtimestamp(t) 102 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( 103 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) 104 105def time2netscape(t=None): 106 """Return a string representing time in seconds since epoch, t. 107 108 If the function is called without an argument, it will use the current 109 time. 110 111 The format of the returned string is like this: 112 113 Wed, DD-Mon-YYYY HH:MM:SS GMT 114 115 """ 116 if t is None: 117 dt = datetime.datetime.utcnow() 118 else: 119 dt = datetime.datetime.utcfromtimestamp(t) 120 return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % ( 121 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], 122 dt.year, dt.hour, dt.minute, dt.second) 123 124 125UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} 126 127TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) 128def offset_from_tz_string(tz): 129 offset = None 130 if tz in UTC_ZONES: 131 offset = 0 132 else: 133 m = TIMEZONE_RE.search(tz) 134 if m: 135 offset = 3600 * int(m.group(2)) 136 if m.group(3): 137 offset = offset + 60 * int(m.group(3)) 138 if m.group(1) == '-': 139 offset = -offset 140 return offset 141 142def _str2time(day, mon, yr, hr, min, sec, tz): 143 yr = int(yr) 144 if yr > datetime.MAXYEAR: 145 return None 146 147 # translate month name to number 148 # month numbers start with 1 (January) 149 try: 150 mon = MONTHS_LOWER.index(mon.lower())+1 151 except ValueError: 152 # maybe it's already a number 153 try: 154 imon = int(mon) 155 except ValueError: 156 return None 157 if 1 <= imon <= 12: 158 mon = imon 159 else: 160 return None 161 162 # make sure clock elements are defined 163 if hr is None: hr = 0 164 if min is None: min = 0 165 if sec is None: sec = 0 166 167 day = int(day) 168 hr = int(hr) 169 min = int(min) 170 sec = int(sec) 171 172 if yr < 1000: 173 # find "obvious" year 174 cur_yr = time.localtime(time.time())[0] 175 m = cur_yr % 100 176 tmp = yr 177 yr = yr + cur_yr - m 178 m = m - tmp 179 if abs(m) > 50: 180 if m > 0: yr = yr + 100 181 else: yr = yr - 100 182 183 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) 184 t = _timegm((yr, mon, day, hr, min, sec, tz)) 185 186 if t is not None: 187 # adjust time using timezone string, to get absolute time since epoch 188 if tz is None: 189 tz = "UTC" 190 tz = tz.upper() 191 offset = offset_from_tz_string(tz) 192 if offset is None: 193 return None 194 t = t - offset 195 196 return t 197 198STRICT_DATE_RE = re.compile( 199 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " 200 r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) 201WEEKDAY_RE = re.compile( 202 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) 203LOOSE_HTTP_DATE_RE = re.compile( 204 r"""^ 205 (\d\d?) # day 206 (?:\s+|[-\/]) 207 (\w+) # month 208 (?:\s+|[-\/]) 209 (\d+) # year 210 (?: 211 (?:\s+|:) # separator before clock 212 (\d\d?):(\d\d) # hour:min 213 (?::(\d\d))? # optional seconds 214 )? # optional clock 215 \s* 216 (?: 217 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+) # timezone 218 \s* 219 )? 220 (?: 221 \(\w+\) # ASCII representation of timezone in parens. 222 \s* 223 )?$""", re.X | re.ASCII) 224def http2time(text): 225 """Returns time in seconds since epoch of time represented by a string. 226 227 Return value is an integer. 228 229 None is returned if the format of str is unrecognized, the time is outside 230 the representable range, or the timezone string is not recognized. If the 231 string contains no timezone, UTC is assumed. 232 233 The timezone in the string may be numerical (like "-0800" or "+0100") or a 234 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the 235 timezone strings equivalent to UTC (zero offset) are known to the function. 236 237 The function loosely parses the following formats: 238 239 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format 240 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format 241 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format 242 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) 243 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) 244 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) 245 246 The parser ignores leading and trailing whitespace. The time may be 247 absent. 248 249 If the year is given with only 2 digits, the function will select the 250 century that makes the year closest to the current date. 251 252 """ 253 # fast exit for strictly conforming string 254 m = STRICT_DATE_RE.search(text) 255 if m: 256 g = m.groups() 257 mon = MONTHS_LOWER.index(g[1].lower()) + 1 258 tt = (int(g[2]), mon, int(g[0]), 259 int(g[3]), int(g[4]), float(g[5])) 260 return _timegm(tt) 261 262 # No, we need some messy parsing... 263 264 # clean up 265 text = text.lstrip() 266 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday 267 268 # tz is time zone specifier string 269 day, mon, yr, hr, min, sec, tz = [None]*7 270 271 # loose regexp parse 272 m = LOOSE_HTTP_DATE_RE.search(text) 273 if m is not None: 274 day, mon, yr, hr, min, sec, tz = m.groups() 275 else: 276 return None # bad format 277 278 return _str2time(day, mon, yr, hr, min, sec, tz) 279 280ISO_DATE_RE = re.compile( 281 r"""^ 282 (\d{4}) # year 283 [-\/]? 284 (\d\d?) # numerical month 285 [-\/]? 286 (\d\d?) # day 287 (?: 288 (?:\s+|[-:Tt]) # separator before clock 289 (\d\d?):?(\d\d) # hour:min 290 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) 291 )? # optional clock 292 \s* 293 (?: 294 ([-+]?\d\d?:?(:?\d\d)? 295 |Z|z) # timezone (Z is "zero meridian", i.e. GMT) 296 \s* 297 )?$""", re.X | re. ASCII) 298def iso2time(text): 299 """ 300 As for http2time, but parses the ISO 8601 formats: 301 302 1994-02-03 14:15:29 -0100 -- ISO 8601 format 303 1994-02-03 14:15:29 -- zone is optional 304 1994-02-03 -- only date 305 1994-02-03T14:15:29 -- Use T as separator 306 19940203T141529Z -- ISO 8601 compact format 307 19940203 -- only date 308 309 """ 310 # clean up 311 text = text.lstrip() 312 313 # tz is time zone specifier string 314 day, mon, yr, hr, min, sec, tz = [None]*7 315 316 # loose regexp parse 317 m = ISO_DATE_RE.search(text) 318 if m is not None: 319 # XXX there's an extra bit of the timezone I'm ignoring here: is 320 # this the right thing to do? 321 yr, mon, day, hr, min, sec, tz, _ = m.groups() 322 else: 323 return None # bad format 324 325 return _str2time(day, mon, yr, hr, min, sec, tz) 326 327 328# Header parsing 329# ----------------------------------------------------------------------------- 330 331def unmatched(match): 332 """Return unmatched part of re.Match object.""" 333 start, end = match.span(0) 334 return match.string[:start]+match.string[end:] 335 336HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") 337HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") 338HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") 339HEADER_ESCAPE_RE = re.compile(r"\\(.)") 340def split_header_words(header_values): 341 r"""Parse header values into a list of lists containing key,value pairs. 342 343 The function knows how to deal with ",", ";" and "=" as well as quoted 344 values after "=". A list of space separated tokens are parsed as if they 345 were separated by ";". 346 347 If the header_values passed as argument contains multiple values, then they 348 are treated as if they were a single value separated by comma ",". 349 350 This means that this function is useful for parsing header fields that 351 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax 352 the requirement for tokens). 353 354 headers = #header 355 header = (token | parameter) *( [";"] (token | parameter)) 356 357 token = 1*<any CHAR except CTLs or separators> 358 separators = "(" | ")" | "<" | ">" | "@" 359 | "," | ";" | ":" | "\" | <"> 360 | "/" | "[" | "]" | "?" | "=" 361 | "{" | "}" | SP | HT 362 363 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) 364 qdtext = <any TEXT except <">> 365 quoted-pair = "\" CHAR 366 367 parameter = attribute "=" value 368 attribute = token 369 value = token | quoted-string 370 371 Each header is represented by a list of key/value pairs. The value for a 372 simple token (not part of a parameter) is None. Syntactically incorrect 373 headers will not necessarily be parsed as you would want. 374 375 This is easier to describe with some examples: 376 377 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) 378 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] 379 >>> split_header_words(['text/html; charset="iso-8859-1"']) 380 [[('text/html', None), ('charset', 'iso-8859-1')]] 381 >>> split_header_words([r'Basic realm="\"foo\bar\""']) 382 [[('Basic', None), ('realm', '"foobar"')]] 383 384 """ 385 assert not isinstance(header_values, str) 386 result = [] 387 for text in header_values: 388 orig_text = text 389 pairs = [] 390 while text: 391 m = HEADER_TOKEN_RE.search(text) 392 if m: 393 text = unmatched(m) 394 name = m.group(1) 395 m = HEADER_QUOTED_VALUE_RE.search(text) 396 if m: # quoted value 397 text = unmatched(m) 398 value = m.group(1) 399 value = HEADER_ESCAPE_RE.sub(r"\1", value) 400 else: 401 m = HEADER_VALUE_RE.search(text) 402 if m: # unquoted value 403 text = unmatched(m) 404 value = m.group(1) 405 value = value.rstrip() 406 else: 407 # no value, a lone token 408 value = None 409 pairs.append((name, value)) 410 elif text.lstrip().startswith(","): 411 # concatenated headers, as per RFC 2616 section 4.2 412 text = text.lstrip()[1:] 413 if pairs: result.append(pairs) 414 pairs = [] 415 else: 416 # skip junk 417 non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text) 418 assert nr_junk_chars > 0, ( 419 "split_header_words bug: '%s', '%s', %s" % 420 (orig_text, text, pairs)) 421 text = non_junk 422 if pairs: result.append(pairs) 423 return result 424 425HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") 426def join_header_words(lists): 427 """Do the inverse (almost) of the conversion done by split_header_words. 428 429 Takes a list of lists of (key, value) pairs and produces a single header 430 value. Attribute values are quoted if needed. 431 432 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859-1")]]) 433 'text/plain; charset="iso-8859-1"' 434 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859-1")]]) 435 'text/plain, charset="iso-8859-1"' 436 437 """ 438 headers = [] 439 for pairs in lists: 440 attr = [] 441 for k, v in pairs: 442 if v is not None: 443 if not re.search(r"^\w+$", v): 444 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ 445 v = '"%s"' % v 446 k = "%s=%s" % (k, v) 447 attr.append(k) 448 if attr: headers.append("; ".join(attr)) 449 return ", ".join(headers) 450 451def strip_quotes(text): 452 if text.startswith('"'): 453 text = text[1:] 454 if text.endswith('"'): 455 text = text[:-1] 456 return text 457 458def parse_ns_headers(ns_headers): 459 """Ad-hoc parser for Netscape protocol cookie-attributes. 460 461 The old Netscape cookie format for Set-Cookie can for instance contain 462 an unquoted "," in the expires field, so we have to use this ad-hoc 463 parser instead of split_header_words. 464 465 XXX This may not make the best possible effort to parse all the crap 466 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient 467 parser is probably better, so could do worse than following that if 468 this ever gives any trouble. 469 470 Currently, this is also used for parsing RFC 2109 cookies. 471 472 """ 473 known_attrs = ("expires", "domain", "path", "secure", 474 # RFC 2109 attrs (may turn up in Netscape cookies, too) 475 "version", "port", "max-age") 476 477 result = [] 478 for ns_header in ns_headers: 479 pairs = [] 480 version_set = False 481 482 # XXX: The following does not strictly adhere to RFCs in that empty 483 # names and values are legal (the former will only appear once and will 484 # be overwritten if multiple occurrences are present). This is 485 # mostly to deal with backwards compatibility. 486 for ii, param in enumerate(ns_header.split(';')): 487 param = param.strip() 488 489 key, sep, val = param.partition('=') 490 key = key.strip() 491 492 if not key: 493 if ii == 0: 494 break 495 else: 496 continue 497 498 # allow for a distinction between present and empty and missing 499 # altogether 500 val = val.strip() if sep else None 501 502 if ii != 0: 503 lc = key.lower() 504 if lc in known_attrs: 505 key = lc 506 507 if key == "version": 508 # This is an RFC 2109 cookie. 509 if val is not None: 510 val = strip_quotes(val) 511 version_set = True 512 elif key == "expires": 513 # convert expires date to seconds since epoch 514 if val is not None: 515 val = http2time(strip_quotes(val)) # None if invalid 516 pairs.append((key, val)) 517 518 if pairs: 519 if not version_set: 520 pairs.append(("version", "0")) 521 result.append(pairs) 522 523 return result 524 525 526IPV4_RE = re.compile(r"\.\d+$", re.ASCII) 527def is_HDN(text): 528 """Return True if text is a host domain name.""" 529 # XXX 530 # This may well be wrong. Which RFC is HDN defined in, if any (for 531 # the purposes of RFC 2965)? 532 # For the current implementation, what about IPv6? Remember to look 533 # at other uses of IPV4_RE also, if change this. 534 if IPV4_RE.search(text): 535 return False 536 if text == "": 537 return False 538 if text[0] == "." or text[-1] == ".": 539 return False 540 return True 541 542def domain_match(A, B): 543 """Return True if domain A domain-matches domain B, according to RFC 2965. 544 545 A and B may be host domain names or IP addresses. 546 547 RFC 2965, section 1: 548 549 Host names can be specified either as an IP address or a HDN string. 550 Sometimes we compare one host name with another. (Such comparisons SHALL 551 be case-insensitive.) Host A's name domain-matches host B's if 552 553 * their host name strings string-compare equal; or 554 555 * A is a HDN string and has the form NB, where N is a non-empty 556 name string, B has the form .B', and B' is a HDN string. (So, 557 x.y.com domain-matches .Y.com but not Y.com.) 558 559 Note that domain-match is not a commutative operation: a.b.c.com 560 domain-matches .c.com, but not the reverse. 561 562 """ 563 # Note that, if A or B are IP addresses, the only relevant part of the 564 # definition of the domain-match algorithm is the direct string-compare. 565 A = A.lower() 566 B = B.lower() 567 if A == B: 568 return True 569 if not is_HDN(A): 570 return False 571 i = A.rfind(B) 572 if i == -1 or i == 0: 573 # A does not have form NB, or N is the empty string 574 return False 575 if not B.startswith("."): 576 return False 577 if not is_HDN(B[1:]): 578 return False 579 return True 580 581def liberal_is_HDN(text): 582 """Return True if text is a sort-of-like a host domain name. 583 584 For accepting/blocking domains. 585 586 """ 587 if IPV4_RE.search(text): 588 return False 589 return True 590 591def user_domain_match(A, B): 592 """For blocking/accepting domains. 593 594 A and B may be host domain names or IP addresses. 595 596 """ 597 A = A.lower() 598 B = B.lower() 599 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): 600 if A == B: 601 # equal IP addresses 602 return True 603 return False 604 initial_dot = B.startswith(".") 605 if initial_dot and A.endswith(B): 606 return True 607 if not initial_dot and A == B: 608 return True 609 return False 610 611cut_port_re = re.compile(r":\d+$", re.ASCII) 612def request_host(request): 613 """Return request-host, as defined by RFC 2965. 614 615 Variation from RFC: returned value is lowercased, for convenient 616 comparison. 617 618 """ 619 url = request.get_full_url() 620 host = urllib.parse.urlparse(url)[1] 621 if host == "": 622 host = request.get_header("Host", "") 623 624 # remove port, if present 625 host = cut_port_re.sub("", host, 1) 626 return host.lower() 627 628def eff_request_host(request): 629 """Return a tuple (request-host, effective request-host name). 630 631 As defined by RFC 2965, except both are lowercased. 632 633 """ 634 erhn = req_host = request_host(request) 635 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): 636 erhn = req_host + ".local" 637 return req_host, erhn 638 639def request_path(request): 640 """Path component of request-URI, as defined by RFC 2965.""" 641 url = request.get_full_url() 642 parts = urllib.parse.urlsplit(url) 643 path = escape_path(parts.path) 644 if not path.startswith("/"): 645 # fix bad RFC 2396 absoluteURI 646 path = "/" + path 647 return path 648 649def request_port(request): 650 host = request.host 651 i = host.find(':') 652 if i >= 0: 653 port = host[i+1:] 654 try: 655 int(port) 656 except ValueError: 657 _debug("nonnumeric port: '%s'", port) 658 return None 659 else: 660 port = DEFAULT_HTTP_PORT 661 return port 662 663# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't 664# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). 665HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" 666ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") 667def uppercase_escaped_char(match): 668 return "%%%s" % match.group(1).upper() 669def escape_path(path): 670 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" 671 # There's no knowing what character encoding was used to create URLs 672 # containing %-escapes, but since we have to pick one to escape invalid 673 # path characters, we pick UTF-8, as recommended in the HTML 4.0 674 # specification: 675 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 676 # And here, kind of: draft-fielding-uri-rfc2396bis-03 677 # (And in draft IRI specification: draft-duerst-iri-05) 678 # (And here, for new URI schemes: RFC 2718) 679 path = urllib.parse.quote(path, HTTP_PATH_SAFE) 680 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) 681 return path 682 683def reach(h): 684 """Return reach of host h, as defined by RFC 2965, section 1. 685 686 The reach R of a host name H is defined as follows: 687 688 * If 689 690 - H is the host domain name of a host; and, 691 692 - H has the form A.B; and 693 694 - A has no embedded (that is, interior) dots; and 695 696 - B has at least one embedded dot, or B is the string "local". 697 then the reach of H is .B. 698 699 * Otherwise, the reach of H is H. 700 701 >>> reach("www.acme.com") 702 '.acme.com' 703 >>> reach("acme.com") 704 'acme.com' 705 >>> reach("acme.local") 706 '.local' 707 708 """ 709 i = h.find(".") 710 if i >= 0: 711 #a = h[:i] # this line is only here to show what a is 712 b = h[i+1:] 713 i = b.find(".") 714 if is_HDN(h) and (i >= 0 or b == "local"): 715 return "."+b 716 return h 717 718def is_third_party(request): 719 """ 720 721 RFC 2965, section 3.3.6: 722 723 An unverifiable transaction is to a third-party host if its request- 724 host U does not domain-match the reach R of the request-host O in the 725 origin transaction. 726 727 """ 728 req_host = request_host(request) 729 if not domain_match(req_host, reach(request.origin_req_host)): 730 return True 731 else: 732 return False 733 734 735class Cookie: 736 """HTTP Cookie. 737 738 This class represents both Netscape and RFC 2965 cookies. 739 740 This is deliberately a very simple class. It just holds attributes. It's 741 possible to construct Cookie instances that don't comply with the cookie 742 standards. CookieJar.make_cookies is the factory function for Cookie 743 objects -- it deals with cookie parsing, supplying defaults, and 744 normalising to the representation used in this class. CookiePolicy is 745 responsible for checking them to see whether they should be accepted from 746 and returned to the server. 747 748 Note that the port may be present in the headers, but unspecified ("Port" 749 rather than"Port=80", for example); if this is the case, port is None. 750 751 """ 752 753 def __init__(self, version, name, value, 754 port, port_specified, 755 domain, domain_specified, domain_initial_dot, 756 path, path_specified, 757 secure, 758 expires, 759 discard, 760 comment, 761 comment_url, 762 rest, 763 rfc2109=False, 764 ): 765 766 if version is not None: version = int(version) 767 if expires is not None: expires = int(float(expires)) 768 if port is None and port_specified is True: 769 raise ValueError("if port is None, port_specified must be false") 770 771 self.version = version 772 self.name = name 773 self.value = value 774 self.port = port 775 self.port_specified = port_specified 776 # normalise case, as per RFC 2965 section 3.3.3 777 self.domain = domain.lower() 778 self.domain_specified = domain_specified 779 # Sigh. We need to know whether the domain given in the 780 # cookie-attribute had an initial dot, in order to follow RFC 2965 781 # (as clarified in draft errata). Needed for the returned $Domain 782 # value. 783 self.domain_initial_dot = domain_initial_dot 784 self.path = path 785 self.path_specified = path_specified 786 self.secure = secure 787 self.expires = expires 788 self.discard = discard 789 self.comment = comment 790 self.comment_url = comment_url 791 self.rfc2109 = rfc2109 792 793 self._rest = copy.copy(rest) 794 795 def has_nonstandard_attr(self, name): 796 return name in self._rest 797 def get_nonstandard_attr(self, name, default=None): 798 return self._rest.get(name, default) 799 def set_nonstandard_attr(self, name, value): 800 self._rest[name] = value 801 802 def is_expired(self, now=None): 803 if now is None: now = time.time() 804 if (self.expires is not None) and (self.expires <= now): 805 return True 806 return False 807 808 def __str__(self): 809 if self.port is None: p = "" 810 else: p = ":"+self.port 811 limit = self.domain + p + self.path 812 if self.value is not None: 813 namevalue = "%s=%s" % (self.name, self.value) 814 else: 815 namevalue = self.name 816 return "<Cookie %s for %s>" % (namevalue, limit) 817 818 def __repr__(self): 819 args = [] 820 for name in ("version", "name", "value", 821 "port", "port_specified", 822 "domain", "domain_specified", "domain_initial_dot", 823 "path", "path_specified", 824 "secure", "expires", "discard", "comment", "comment_url", 825 ): 826 attr = getattr(self, name) 827 args.append("%s=%s" % (name, repr(attr))) 828 args.append("rest=%s" % repr(self._rest)) 829 args.append("rfc2109=%s" % repr(self.rfc2109)) 830 return "%s(%s)" % (self.__class__.__name__, ", ".join(args)) 831 832 833class CookiePolicy: 834 """Defines which cookies get accepted from and returned to server. 835 836 May also modify cookies, though this is probably a bad idea. 837 838 The subclass DefaultCookiePolicy defines the standard rules for Netscape 839 and RFC 2965 cookies -- override that if you want a customized policy. 840 841 """ 842 def set_ok(self, cookie, request): 843 """Return true if (and only if) cookie should be accepted from server. 844 845 Currently, pre-expired cookies never get this far -- the CookieJar 846 class deletes such cookies itself. 847 848 """ 849 raise NotImplementedError() 850 851 def return_ok(self, cookie, request): 852 """Return true if (and only if) cookie should be returned to server.""" 853 raise NotImplementedError() 854 855 def domain_return_ok(self, domain, request): 856 """Return false if cookies should not be returned, given cookie domain. 857 """ 858 return True 859 860 def path_return_ok(self, path, request): 861 """Return false if cookies should not be returned, given cookie path. 862 """ 863 return True 864 865 866class DefaultCookiePolicy(CookiePolicy): 867 """Implements the standard rules for accepting and returning cookies.""" 868 869 DomainStrictNoDots = 1 870 DomainStrictNonDomain = 2 871 DomainRFC2965Match = 4 872 873 DomainLiberal = 0 874 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain 875 876 def __init__(self, 877 blocked_domains=None, allowed_domains=None, 878 netscape=True, rfc2965=False, 879 rfc2109_as_netscape=None, 880 hide_cookie2=False, 881 strict_domain=False, 882 strict_rfc2965_unverifiable=True, 883 strict_ns_unverifiable=False, 884 strict_ns_domain=DomainLiberal, 885 strict_ns_set_initial_dollar=False, 886 strict_ns_set_path=False, 887 ): 888 """Constructor arguments should be passed as keyword arguments only.""" 889 self.netscape = netscape 890 self.rfc2965 = rfc2965 891 self.rfc2109_as_netscape = rfc2109_as_netscape 892 self.hide_cookie2 = hide_cookie2 893 self.strict_domain = strict_domain 894 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable 895 self.strict_ns_unverifiable = strict_ns_unverifiable 896 self.strict_ns_domain = strict_ns_domain 897 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar 898 self.strict_ns_set_path = strict_ns_set_path 899 900 if blocked_domains is not None: 901 self._blocked_domains = tuple(blocked_domains) 902 else: 903 self._blocked_domains = () 904 905 if allowed_domains is not None: 906 allowed_domains = tuple(allowed_domains) 907 self._allowed_domains = allowed_domains 908 909 def blocked_domains(self): 910 """Return the sequence of blocked domains (as a tuple).""" 911 return self._blocked_domains 912 def set_blocked_domains(self, blocked_domains): 913 """Set the sequence of blocked domains.""" 914 self._blocked_domains = tuple(blocked_domains) 915 916 def is_blocked(self, domain): 917 for blocked_domain in self._blocked_domains: 918 if user_domain_match(domain, blocked_domain): 919 return True 920 return False 921 922 def allowed_domains(self): 923 """Return None, or the sequence of allowed domains (as a tuple).""" 924 return self._allowed_domains 925 def set_allowed_domains(self, allowed_domains): 926 """Set the sequence of allowed domains, or None.""" 927 if allowed_domains is not None: 928 allowed_domains = tuple(allowed_domains) 929 self._allowed_domains = allowed_domains 930 931 def is_not_allowed(self, domain): 932 if self._allowed_domains is None: 933 return False 934 for allowed_domain in self._allowed_domains: 935 if user_domain_match(domain, allowed_domain): 936 return False 937 return True 938 939 def set_ok(self, cookie, request): 940 """ 941 If you override .set_ok(), be sure to call this method. If it returns 942 false, so should your subclass (assuming your subclass wants to be more 943 strict about which cookies to accept). 944 945 """ 946 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 947 948 assert cookie.name is not None 949 950 for n in "version", "verifiability", "name", "path", "domain", "port": 951 fn_name = "set_ok_"+n 952 fn = getattr(self, fn_name) 953 if not fn(cookie, request): 954 return False 955 956 return True 957 958 def set_ok_version(self, cookie, request): 959 if cookie.version is None: 960 # Version is always set to 0 by parse_ns_headers if it's a Netscape 961 # cookie, so this must be an invalid RFC 2965 cookie. 962 _debug(" Set-Cookie2 without version attribute (%s=%s)", 963 cookie.name, cookie.value) 964 return False 965 if cookie.version > 0 and not self.rfc2965: 966 _debug(" RFC 2965 cookies are switched off") 967 return False 968 elif cookie.version == 0 and not self.netscape: 969 _debug(" Netscape cookies are switched off") 970 return False 971 return True 972 973 def set_ok_verifiability(self, cookie, request): 974 if request.unverifiable and is_third_party(request): 975 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 976 _debug(" third-party RFC 2965 cookie during " 977 "unverifiable transaction") 978 return False 979 elif cookie.version == 0 and self.strict_ns_unverifiable: 980 _debug(" third-party Netscape cookie during " 981 "unverifiable transaction") 982 return False 983 return True 984 985 def set_ok_name(self, cookie, request): 986 # Try and stop servers setting V0 cookies designed to hack other 987 # servers that know both V0 and V1 protocols. 988 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and 989 cookie.name.startswith("$")): 990 _debug(" illegal name (starts with '$'): '%s'", cookie.name) 991 return False 992 return True 993 994 def set_ok_path(self, cookie, request): 995 if cookie.path_specified: 996 req_path = request_path(request) 997 if ((cookie.version > 0 or 998 (cookie.version == 0 and self.strict_ns_set_path)) and 999 not self.path_return_ok(cookie.path, request)): 1000 _debug(" path attribute %s is not a prefix of request " 1001 "path %s", cookie.path, req_path) 1002 return False 1003 return True 1004 1005 def set_ok_domain(self, cookie, request): 1006 if self.is_blocked(cookie.domain): 1007 _debug(" domain %s is in user block-list", cookie.domain) 1008 return False 1009 if self.is_not_allowed(cookie.domain): 1010 _debug(" domain %s is not in user allow-list", cookie.domain) 1011 return False 1012 if cookie.domain_specified: 1013 req_host, erhn = eff_request_host(request) 1014 domain = cookie.domain 1015 if self.strict_domain and (domain.count(".") >= 2): 1016 # XXX This should probably be compared with the Konqueror 1017 # (kcookiejar.cpp) and Mozilla implementations, but it's a 1018 # losing battle. 1019 i = domain.rfind(".") 1020 j = domain.rfind(".", 0, i) 1021 if j == 0: # domain like .foo.bar 1022 tld = domain[i+1:] 1023 sld = domain[j+1:i] 1024 if sld.lower() in ("co", "ac", "com", "edu", "org", "net", 1025 "gov", "mil", "int", "aero", "biz", "cat", "coop", 1026 "info", "jobs", "mobi", "museum", "name", "pro", 1027 "travel", "eu") and len(tld) == 2: 1028 # domain like .co.uk 1029 _debug(" country-code second level domain %s", domain) 1030 return False 1031 if domain.startswith("."): 1032 undotted_domain = domain[1:] 1033 else: 1034 undotted_domain = domain 1035 embedded_dots = (undotted_domain.find(".") >= 0) 1036 if not embedded_dots and domain != ".local": 1037 _debug(" non-local domain %s contains no embedded dot", 1038 domain) 1039 return False 1040 if cookie.version == 0: 1041 if (not erhn.endswith(domain) and 1042 (not erhn.startswith(".") and 1043 not ("."+erhn).endswith(domain))): 1044 _debug(" effective request-host %s (even with added " 1045 "initial dot) does not end with %s", 1046 erhn, domain) 1047 return False 1048 if (cookie.version > 0 or 1049 (self.strict_ns_domain & self.DomainRFC2965Match)): 1050 if not domain_match(erhn, domain): 1051 _debug(" effective request-host %s does not domain-match " 1052 "%s", erhn, domain) 1053 return False 1054 if (cookie.version > 0 or 1055 (self.strict_ns_domain & self.DomainStrictNoDots)): 1056 host_prefix = req_host[:-len(domain)] 1057 if (host_prefix.find(".") >= 0 and 1058 not IPV4_RE.search(req_host)): 1059 _debug(" host prefix %s for domain %s contains a dot", 1060 host_prefix, domain) 1061 return False 1062 return True 1063 1064 def set_ok_port(self, cookie, request): 1065 if cookie.port_specified: 1066 req_port = request_port(request) 1067 if req_port is None: 1068 req_port = "80" 1069 else: 1070 req_port = str(req_port) 1071 for p in cookie.port.split(","): 1072 try: 1073 int(p) 1074 except ValueError: 1075 _debug(" bad port %s (not numeric)", p) 1076 return False 1077 if p == req_port: 1078 break 1079 else: 1080 _debug(" request port (%s) not found in %s", 1081 req_port, cookie.port) 1082 return False 1083 return True 1084 1085 def return_ok(self, cookie, request): 1086 """ 1087 If you override .return_ok(), be sure to call this method. If it 1088 returns false, so should your subclass (assuming your subclass wants to 1089 be more strict about which cookies to return). 1090 1091 """ 1092 # Path has already been checked by .path_return_ok(), and domain 1093 # blocking done by .domain_return_ok(). 1094 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) 1095 1096 for n in "version", "verifiability", "secure", "expires", "port", "domain": 1097 fn_name = "return_ok_"+n 1098 fn = getattr(self, fn_name) 1099 if not fn(cookie, request): 1100 return False 1101 return True 1102 1103 def return_ok_version(self, cookie, request): 1104 if cookie.version > 0 and not self.rfc2965: 1105 _debug(" RFC 2965 cookies are switched off") 1106 return False 1107 elif cookie.version == 0 and not self.netscape: 1108 _debug(" Netscape cookies are switched off") 1109 return False 1110 return True 1111 1112 def return_ok_verifiability(self, cookie, request): 1113 if request.unverifiable and is_third_party(request): 1114 if cookie.version > 0 and self.strict_rfc2965_unverifiable: 1115 _debug(" third-party RFC 2965 cookie during unverifiable " 1116 "transaction") 1117 return False 1118 elif cookie.version == 0 and self.strict_ns_unverifiable: 1119 _debug(" third-party Netscape cookie during unverifiable " 1120 "transaction") 1121 return False 1122 return True 1123 1124 def return_ok_secure(self, cookie, request): 1125 if cookie.secure and request.type != "https": 1126 _debug(" secure cookie with non-secure request") 1127 return False 1128 return True 1129 1130 def return_ok_expires(self, cookie, request): 1131 if cookie.is_expired(self._now): 1132 _debug(" cookie expired") 1133 return False 1134 return True 1135 1136 def return_ok_port(self, cookie, request): 1137 if cookie.port: 1138 req_port = request_port(request) 1139 if req_port is None: 1140 req_port = "80" 1141 for p in cookie.port.split(","): 1142 if p == req_port: 1143 break 1144 else: 1145 _debug(" request port %s does not match cookie port %s", 1146 req_port, cookie.port) 1147 return False 1148 return True 1149 1150 def return_ok_domain(self, cookie, request): 1151 req_host, erhn = eff_request_host(request) 1152 domain = cookie.domain 1153 1154 if domain and not domain.startswith("."): 1155 dotdomain = "." + domain 1156 else: 1157 dotdomain = domain 1158 1159 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't 1160 if (cookie.version == 0 and 1161 (self.strict_ns_domain & self.DomainStrictNonDomain) and 1162 not cookie.domain_specified and domain != erhn): 1163 _debug(" cookie with unspecified domain does not string-compare " 1164 "equal to request domain") 1165 return False 1166 1167 if cookie.version > 0 and not domain_match(erhn, domain): 1168 _debug(" effective request-host name %s does not domain-match " 1169 "RFC 2965 cookie domain %s", erhn, domain) 1170 return False 1171 if cookie.version == 0 and not ("."+erhn).endswith(dotdomain): 1172 _debug(" request-host %s does not match Netscape cookie domain " 1173 "%s", req_host, domain) 1174 return False 1175 return True 1176 1177 def domain_return_ok(self, domain, request): 1178 # Liberal check of. This is here as an optimization to avoid 1179 # having to load lots of MSIE cookie files unless necessary. 1180 req_host, erhn = eff_request_host(request) 1181 if not req_host.startswith("."): 1182 req_host = "."+req_host 1183 if not erhn.startswith("."): 1184 erhn = "."+erhn 1185 if domain and not domain.startswith("."): 1186 dotdomain = "." + domain 1187 else: 1188 dotdomain = domain 1189 if not (req_host.endswith(dotdomain) or erhn.endswith(dotdomain)): 1190 #_debug(" request domain %s does not match cookie domain %s", 1191 # req_host, domain) 1192 return False 1193 1194 if self.is_blocked(domain): 1195 _debug(" domain %s is in user block-list", domain) 1196 return False 1197 if self.is_not_allowed(domain): 1198 _debug(" domain %s is not in user allow-list", domain) 1199 return False 1200 1201 return True 1202 1203 def path_return_ok(self, path, request): 1204 _debug("- checking cookie path=%s", path) 1205 req_path = request_path(request) 1206 pathlen = len(path) 1207 if req_path == path: 1208 return True 1209 elif (req_path.startswith(path) and 1210 (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")): 1211 return True 1212 1213 _debug(" %s does not path-match %s", req_path, path) 1214 return False 1215 1216def vals_sorted_by_key(adict): 1217 keys = sorted(adict.keys()) 1218 return map(adict.get, keys) 1219 1220def deepvalues(mapping): 1221 """Iterates over nested mapping, depth-first, in sorted order by key.""" 1222 values = vals_sorted_by_key(mapping) 1223 for obj in values: 1224 mapping = False 1225 try: 1226 obj.items 1227 except AttributeError: 1228 pass 1229 else: 1230 mapping = True 1231 yield from deepvalues(obj) 1232 if not mapping: 1233 yield obj 1234 1235 1236# Used as second parameter to dict.get() method, to distinguish absent 1237# dict key from one with a None value. 1238class Absent: pass 1239 1240class CookieJar: 1241 """Collection of HTTP cookies. 1242 1243 You may not need to know about this class: try 1244 urllib.request.build_opener(HTTPCookieProcessor).open(url). 1245 """ 1246 1247 non_word_re = re.compile(r"\W") 1248 quote_re = re.compile(r"([\"\\])") 1249 strict_domain_re = re.compile(r"\.?[^.]*") 1250 domain_re = re.compile(r"[^.]*") 1251 dots_re = re.compile(r"^\.+") 1252 1253 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) 1254 1255 def __init__(self, policy=None): 1256 if policy is None: 1257 policy = DefaultCookiePolicy() 1258 self._policy = policy 1259 1260 self._cookies_lock = _threading.RLock() 1261 self._cookies = {} 1262 1263 def set_policy(self, policy): 1264 self._policy = policy 1265 1266 def _cookies_for_domain(self, domain, request): 1267 cookies = [] 1268 if not self._policy.domain_return_ok(domain, request): 1269 return [] 1270 _debug("Checking %s for cookies to return", domain) 1271 cookies_by_path = self._cookies[domain] 1272 for path in cookies_by_path.keys(): 1273 if not self._policy.path_return_ok(path, request): 1274 continue 1275 cookies_by_name = cookies_by_path[path] 1276 for cookie in cookies_by_name.values(): 1277 if not self._policy.return_ok(cookie, request): 1278 _debug(" not returning cookie") 1279 continue 1280 _debug(" it's a match") 1281 cookies.append(cookie) 1282 return cookies 1283 1284 def _cookies_for_request(self, request): 1285 """Return a list of cookies to be returned to server.""" 1286 cookies = [] 1287 for domain in self._cookies.keys(): 1288 cookies.extend(self._cookies_for_domain(domain, request)) 1289 return cookies 1290 1291 def _cookie_attrs(self, cookies): 1292 """Return a list of cookie-attributes to be returned to server. 1293 1294 like ['foo="bar"; $Path="/"', ...] 1295 1296 The $Version attribute is also added when appropriate (currently only 1297 once per request). 1298 1299 """ 1300 # add cookies in order of most specific (ie. longest) path first 1301 cookies.sort(key=lambda a: len(a.path), reverse=True) 1302 1303 version_set = False 1304 1305 attrs = [] 1306 for cookie in cookies: 1307 # set version of Cookie header 1308 # XXX 1309 # What should it be if multiple matching Set-Cookie headers have 1310 # different versions themselves? 1311 # Answer: there is no answer; was supposed to be settled by 1312 # RFC 2965 errata, but that may never appear... 1313 version = cookie.version 1314 if not version_set: 1315 version_set = True 1316 if version > 0: 1317 attrs.append("$Version=%s" % version) 1318 1319 # quote cookie value if necessary 1320 # (not for Netscape protocol, which already has any quotes 1321 # intact, due to the poorly-specified Netscape Cookie: syntax) 1322 if ((cookie.value is not None) and 1323 self.non_word_re.search(cookie.value) and version > 0): 1324 value = self.quote_re.sub(r"\\\1", cookie.value) 1325 else: 1326 value = cookie.value 1327 1328 # add cookie-attributes to be returned in Cookie header 1329 if cookie.value is None: 1330 attrs.append(cookie.name) 1331 else: 1332 attrs.append("%s=%s" % (cookie.name, value)) 1333 if version > 0: 1334 if cookie.path_specified: 1335 attrs.append('$Path="%s"' % cookie.path) 1336 if cookie.domain.startswith("."): 1337 domain = cookie.domain 1338 if (not cookie.domain_initial_dot and 1339 domain.startswith(".")): 1340 domain = domain[1:] 1341 attrs.append('$Domain="%s"' % domain) 1342 if cookie.port is not None: 1343 p = "$Port" 1344 if cookie.port_specified: 1345 p = p + ('="%s"' % cookie.port) 1346 attrs.append(p) 1347 1348 return attrs 1349 1350 def add_cookie_header(self, request): 1351 """Add correct Cookie: header to request (urllib.request.Request object). 1352 1353 The Cookie2 header is also added unless policy.hide_cookie2 is true. 1354 1355 """ 1356 _debug("add_cookie_header") 1357 self._cookies_lock.acquire() 1358 try: 1359 1360 self._policy._now = self._now = int(time.time()) 1361 1362 cookies = self._cookies_for_request(request) 1363 1364 attrs = self._cookie_attrs(cookies) 1365 if attrs: 1366 if not request.has_header("Cookie"): 1367 request.add_unredirected_header( 1368 "Cookie", "; ".join(attrs)) 1369 1370 # if necessary, advertise that we know RFC 2965 1371 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and 1372 not request.has_header("Cookie2")): 1373 for cookie in cookies: 1374 if cookie.version != 1: 1375 request.add_unredirected_header("Cookie2", '$Version="1"') 1376 break 1377 1378 finally: 1379 self._cookies_lock.release() 1380 1381 self.clear_expired_cookies() 1382 1383 def _normalized_cookie_tuples(self, attrs_set): 1384 """Return list of tuples containing normalised cookie information. 1385 1386 attrs_set is the list of lists of key,value pairs extracted from 1387 the Set-Cookie or Set-Cookie2 headers. 1388 1389 Tuples are name, value, standard, rest, where name and value are the 1390 cookie name and value, standard is a dictionary containing the standard 1391 cookie-attributes (discard, secure, version, expires or max-age, 1392 domain, path and port) and rest is a dictionary containing the rest of 1393 the cookie-attributes. 1394 1395 """ 1396 cookie_tuples = [] 1397 1398 boolean_attrs = "discard", "secure" 1399 value_attrs = ("version", 1400 "expires", "max-age", 1401 "domain", "path", "port", 1402 "comment", "commenturl") 1403 1404 for cookie_attrs in attrs_set: 1405 name, value = cookie_attrs[0] 1406 1407 # Build dictionary of standard cookie-attributes (standard) and 1408 # dictionary of other cookie-attributes (rest). 1409 1410 # Note: expiry time is normalised to seconds since epoch. V0 1411 # cookies should have the Expires cookie-attribute, and V1 cookies 1412 # should have Max-Age, but since V1 includes RFC 2109 cookies (and 1413 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we 1414 # accept either (but prefer Max-Age). 1415 max_age_set = False 1416 1417 bad_cookie = False 1418 1419 standard = {} 1420 rest = {} 1421 for k, v in cookie_attrs[1:]: 1422 lc = k.lower() 1423 # don't lose case distinction for unknown fields 1424 if lc in value_attrs or lc in boolean_attrs: 1425 k = lc 1426 if k in boolean_attrs and v is None: 1427 # boolean cookie-attribute is present, but has no value 1428 # (like "discard", rather than "port=80") 1429 v = True 1430 if k in standard: 1431 # only first value is significant 1432 continue 1433 if k == "domain": 1434 if v is None: 1435 _debug(" missing value for domain attribute") 1436 bad_cookie = True 1437 break 1438 # RFC 2965 section 3.3.3 1439 v = v.lower() 1440 if k == "expires": 1441 if max_age_set: 1442 # Prefer max-age to expires (like Mozilla) 1443 continue 1444 if v is None: 1445 _debug(" missing or invalid value for expires " 1446 "attribute: treating as session cookie") 1447 continue 1448 if k == "max-age": 1449 max_age_set = True 1450 try: 1451 v = int(v) 1452 except ValueError: 1453 _debug(" missing or invalid (non-numeric) value for " 1454 "max-age attribute") 1455 bad_cookie = True 1456 break 1457 # convert RFC 2965 Max-Age to seconds since epoch 1458 # XXX Strictly you're supposed to follow RFC 2616 1459 # age-calculation rules. Remember that zero Max-Age 1460 # is a request to discard (old and new) cookie, though. 1461 k = "expires" 1462 v = self._now + v 1463 if (k in value_attrs) or (k in boolean_attrs): 1464 if (v is None and 1465 k not in ("port", "comment", "commenturl")): 1466 _debug(" missing value for %s attribute" % k) 1467 bad_cookie = True 1468 break 1469 standard[k] = v 1470 else: 1471 rest[k] = v 1472 1473 if bad_cookie: 1474 continue 1475 1476 cookie_tuples.append((name, value, standard, rest)) 1477 1478 return cookie_tuples 1479 1480 def _cookie_from_cookie_tuple(self, tup, request): 1481 # standard is dict of standard cookie-attributes, rest is dict of the 1482 # rest of them 1483 name, value, standard, rest = tup 1484 1485 domain = standard.get("domain", Absent) 1486 path = standard.get("path", Absent) 1487 port = standard.get("port", Absent) 1488 expires = standard.get("expires", Absent) 1489 1490 # set the easy defaults 1491 version = standard.get("version", None) 1492 if version is not None: 1493 try: 1494 version = int(version) 1495 except ValueError: 1496 return None # invalid version, ignore cookie 1497 secure = standard.get("secure", False) 1498 # (discard is also set if expires is Absent) 1499 discard = standard.get("discard", False) 1500 comment = standard.get("comment", None) 1501 comment_url = standard.get("commenturl", None) 1502 1503 # set default path 1504 if path is not Absent and path != "": 1505 path_specified = True 1506 path = escape_path(path) 1507 else: 1508 path_specified = False 1509 path = request_path(request) 1510 i = path.rfind("/") 1511 if i != -1: 1512 if version == 0: 1513 # Netscape spec parts company from reality here 1514 path = path[:i] 1515 else: 1516 path = path[:i+1] 1517 if len(path) == 0: path = "/" 1518 1519 # set default domain 1520 domain_specified = domain is not Absent 1521 # but first we have to remember whether it starts with a dot 1522 domain_initial_dot = False 1523 if domain_specified: 1524 domain_initial_dot = bool(domain.startswith(".")) 1525 if domain is Absent: 1526 req_host, erhn = eff_request_host(request) 1527 domain = erhn 1528 elif not domain.startswith("."): 1529 domain = "."+domain 1530 1531 # set default port 1532 port_specified = False 1533 if port is not Absent: 1534 if port is None: 1535 # Port attr present, but has no value: default to request port. 1536 # Cookie should then only be sent back on that port. 1537 port = request_port(request) 1538 else: 1539 port_specified = True 1540 port = re.sub(r"\s+", "", port) 1541 else: 1542 # No port attr present. Cookie can be sent back on any port. 1543 port = None 1544 1545 # set default expires and discard 1546 if expires is Absent: 1547 expires = None 1548 discard = True 1549 elif expires <= self._now: 1550 # Expiry date in past is request to delete cookie. This can't be 1551 # in DefaultCookiePolicy, because can't delete cookies there. 1552 try: 1553 self.clear(domain, path, name) 1554 except KeyError: 1555 pass 1556 _debug("Expiring cookie, domain='%s', path='%s', name='%s'", 1557 domain, path, name) 1558 return None 1559 1560 return Cookie(version, 1561 name, value, 1562 port, port_specified, 1563 domain, domain_specified, domain_initial_dot, 1564 path, path_specified, 1565 secure, 1566 expires, 1567 discard, 1568 comment, 1569 comment_url, 1570 rest) 1571 1572 def _cookies_from_attrs_set(self, attrs_set, request): 1573 cookie_tuples = self._normalized_cookie_tuples(attrs_set) 1574 1575 cookies = [] 1576 for tup in cookie_tuples: 1577 cookie = self._cookie_from_cookie_tuple(tup, request) 1578 if cookie: cookies.append(cookie) 1579 return cookies 1580 1581 def _process_rfc2109_cookies(self, cookies): 1582 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) 1583 if rfc2109_as_ns is None: 1584 rfc2109_as_ns = not self._policy.rfc2965 1585 for cookie in cookies: 1586 if cookie.version == 1: 1587 cookie.rfc2109 = True 1588 if rfc2109_as_ns: 1589 # treat 2109 cookies as Netscape cookies rather than 1590 # as RFC2965 cookies 1591 cookie.version = 0 1592 1593 def make_cookies(self, response, request): 1594 """Return sequence of Cookie objects extracted from response object.""" 1595 # get cookie-attributes for RFC 2965 and Netscape protocols 1596 headers = response.info() 1597 rfc2965_hdrs = headers.get_all("Set-Cookie2", []) 1598 ns_hdrs = headers.get_all("Set-Cookie", []) 1599 self._policy._now = self._now = int(time.time()) 1600 1601 rfc2965 = self._policy.rfc2965 1602 netscape = self._policy.netscape 1603 1604 if ((not rfc2965_hdrs and not ns_hdrs) or 1605 (not ns_hdrs and not rfc2965) or 1606 (not rfc2965_hdrs and not netscape) or 1607 (not netscape and not rfc2965)): 1608 return [] # no relevant cookie headers: quick exit 1609 1610 try: 1611 cookies = self._cookies_from_attrs_set( 1612 split_header_words(rfc2965_hdrs), request) 1613 except Exception: 1614 _warn_unhandled_exception() 1615 cookies = [] 1616 1617 if ns_hdrs and netscape: 1618 try: 1619 # RFC 2109 and Netscape cookies 1620 ns_cookies = self._cookies_from_attrs_set( 1621 parse_ns_headers(ns_hdrs), request) 1622 except Exception: 1623 _warn_unhandled_exception() 1624 ns_cookies = [] 1625 self._process_rfc2109_cookies(ns_cookies) 1626 1627 # Look for Netscape cookies (from Set-Cookie headers) that match 1628 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). 1629 # For each match, keep the RFC 2965 cookie and ignore the Netscape 1630 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are 1631 # bundled in with the Netscape cookies for this purpose, which is 1632 # reasonable behaviour. 1633 if rfc2965: 1634 lookup = {} 1635 for cookie in cookies: 1636 lookup[(cookie.domain, cookie.path, cookie.name)] = None 1637 1638 def no_matching_rfc2965(ns_cookie, lookup=lookup): 1639 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name 1640 return key not in lookup 1641 ns_cookies = filter(no_matching_rfc2965, ns_cookies) 1642 1643 if ns_cookies: 1644 cookies.extend(ns_cookies) 1645 1646 return cookies 1647 1648 def set_cookie_if_ok(self, cookie, request): 1649 """Set a cookie if policy says it's OK to do so.""" 1650 self._cookies_lock.acquire() 1651 try: 1652 self._policy._now = self._now = int(time.time()) 1653 1654 if self._policy.set_ok(cookie, request): 1655 self.set_cookie(cookie) 1656 1657 1658 finally: 1659 self._cookies_lock.release() 1660 1661 def set_cookie(self, cookie): 1662 """Set a cookie, without checking whether or not it should be set.""" 1663 c = self._cookies 1664 self._cookies_lock.acquire() 1665 try: 1666 if cookie.domain not in c: c[cookie.domain] = {} 1667 c2 = c[cookie.domain] 1668 if cookie.path not in c2: c2[cookie.path] = {} 1669 c3 = c2[cookie.path] 1670 c3[cookie.name] = cookie 1671 finally: 1672 self._cookies_lock.release() 1673 1674 def extract_cookies(self, response, request): 1675 """Extract cookies from response, where allowable given the request.""" 1676 _debug("extract_cookies: %s", response.info()) 1677 self._cookies_lock.acquire() 1678 try: 1679 for cookie in self.make_cookies(response, request): 1680 if self._policy.set_ok(cookie, request): 1681 _debug(" setting cookie: %s", cookie) 1682 self.set_cookie(cookie) 1683 finally: 1684 self._cookies_lock.release() 1685 1686 def clear(self, domain=None, path=None, name=None): 1687 """Clear some cookies. 1688 1689 Invoking this method without arguments will clear all cookies. If 1690 given a single argument, only cookies belonging to that domain will be 1691 removed. If given two arguments, cookies belonging to the specified 1692 path within that domain are removed. If given three arguments, then 1693 the cookie with the specified name, path and domain is removed. 1694 1695 Raises KeyError if no matching cookie exists. 1696 1697 """ 1698 if name is not None: 1699 if (domain is None) or (path is None): 1700 raise ValueError( 1701 "domain and path must be given to remove a cookie by name") 1702 del self._cookies[domain][path][name] 1703 elif path is not None: 1704 if domain is None: 1705 raise ValueError( 1706 "domain must be given to remove cookies by path") 1707 del self._cookies[domain][path] 1708 elif domain is not None: 1709 del self._cookies[domain] 1710 else: 1711 self._cookies = {} 1712 1713 def clear_session_cookies(self): 1714 """Discard all session cookies. 1715 1716 Note that the .save() method won't save session cookies anyway, unless 1717 you ask otherwise by passing a true ignore_discard argument. 1718 1719 """ 1720 self._cookies_lock.acquire() 1721 try: 1722 for cookie in self: 1723 if cookie.discard: 1724 self.clear(cookie.domain, cookie.path, cookie.name) 1725 finally: 1726 self._cookies_lock.release() 1727 1728 def clear_expired_cookies(self): 1729 """Discard all expired cookies. 1730 1731 You probably don't need to call this method: expired cookies are never 1732 sent back to the server (provided you're using DefaultCookiePolicy), 1733 this method is called by CookieJar itself every so often, and the 1734 .save() method won't save expired cookies anyway (unless you ask 1735 otherwise by passing a true ignore_expires argument). 1736 1737 """ 1738 self._cookies_lock.acquire() 1739 try: 1740 now = time.time() 1741 for cookie in self: 1742 if cookie.is_expired(now): 1743 self.clear(cookie.domain, cookie.path, cookie.name) 1744 finally: 1745 self._cookies_lock.release() 1746 1747 def __iter__(self): 1748 return deepvalues(self._cookies) 1749 1750 def __len__(self): 1751 """Return number of contained cookies.""" 1752 i = 0 1753 for cookie in self: i = i + 1 1754 return i 1755 1756 def __repr__(self): 1757 r = [] 1758 for cookie in self: r.append(repr(cookie)) 1759 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1760 1761 def __str__(self): 1762 r = [] 1763 for cookie in self: r.append(str(cookie)) 1764 return "<%s[%s]>" % (self.__class__.__name__, ", ".join(r)) 1765 1766 1767# derives from OSError for backwards-compatibility with Python 2.4.0 1768class LoadError(OSError): pass 1769 1770class FileCookieJar(CookieJar): 1771 """CookieJar that can be loaded from and saved to a file.""" 1772 1773 def __init__(self, filename=None, delayload=False, policy=None): 1774 """ 1775 Cookies are NOT loaded from the named file until either the .load() or 1776 .revert() method is called. 1777 1778 """ 1779 CookieJar.__init__(self, policy) 1780 if filename is not None: 1781 try: 1782 filename+"" 1783 except: 1784 raise ValueError("filename must be string-like") 1785 self.filename = filename 1786 self.delayload = bool(delayload) 1787 1788 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1789 """Save cookies to a file.""" 1790 raise NotImplementedError() 1791 1792 def load(self, filename=None, ignore_discard=False, ignore_expires=False): 1793 """Load cookies from a file.""" 1794 if filename is None: 1795 if self.filename is not None: filename = self.filename 1796 else: raise ValueError(MISSING_FILENAME_TEXT) 1797 1798 with open(filename) as f: 1799 self._really_load(f, filename, ignore_discard, ignore_expires) 1800 1801 def revert(self, filename=None, 1802 ignore_discard=False, ignore_expires=False): 1803 """Clear all cookies and reload cookies from a saved file. 1804 1805 Raises LoadError (or OSError) if reversion is not successful; the 1806 object's state will not be altered if this happens. 1807 1808 """ 1809 if filename is None: 1810 if self.filename is not None: filename = self.filename 1811 else: raise ValueError(MISSING_FILENAME_TEXT) 1812 1813 self._cookies_lock.acquire() 1814 try: 1815 1816 old_state = copy.deepcopy(self._cookies) 1817 self._cookies = {} 1818 try: 1819 self.load(filename, ignore_discard, ignore_expires) 1820 except OSError: 1821 self._cookies = old_state 1822 raise 1823 1824 finally: 1825 self._cookies_lock.release() 1826 1827 1828def lwp_cookie_str(cookie): 1829 """Return string representation of Cookie in the LWP cookie file format. 1830 1831 Actually, the format is extended a bit -- see module docstring. 1832 1833 """ 1834 h = [(cookie.name, cookie.value), 1835 ("path", cookie.path), 1836 ("domain", cookie.domain)] 1837 if cookie.port is not None: h.append(("port", cookie.port)) 1838 if cookie.path_specified: h.append(("path_spec", None)) 1839 if cookie.port_specified: h.append(("port_spec", None)) 1840 if cookie.domain_initial_dot: h.append(("domain_dot", None)) 1841 if cookie.secure: h.append(("secure", None)) 1842 if cookie.expires: h.append(("expires", 1843 time2isoz(float(cookie.expires)))) 1844 if cookie.discard: h.append(("discard", None)) 1845 if cookie.comment: h.append(("comment", cookie.comment)) 1846 if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) 1847 1848 keys = sorted(cookie._rest.keys()) 1849 for k in keys: 1850 h.append((k, str(cookie._rest[k]))) 1851 1852 h.append(("version", str(cookie.version))) 1853 1854 return join_header_words([h]) 1855 1856class LWPCookieJar(FileCookieJar): 1857 """ 1858 The LWPCookieJar saves a sequence of "Set-Cookie3" lines. 1859 "Set-Cookie3" is the format used by the libwww-perl library, not known 1860 to be compatible with any browser, but which is easy to read and 1861 doesn't lose information about RFC 2965 cookies. 1862 1863 Additional methods 1864 1865 as_lwp_str(ignore_discard=True, ignore_expired=True) 1866 1867 """ 1868 1869 def as_lwp_str(self, ignore_discard=True, ignore_expires=True): 1870 """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers. 1871 1872 ignore_discard and ignore_expires: see docstring for FileCookieJar.save 1873 1874 """ 1875 now = time.time() 1876 r = [] 1877 for cookie in self: 1878 if not ignore_discard and cookie.discard: 1879 continue 1880 if not ignore_expires and cookie.is_expired(now): 1881 continue 1882 r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) 1883 return "\n".join(r+[""]) 1884 1885 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 1886 if filename is None: 1887 if self.filename is not None: filename = self.filename 1888 else: raise ValueError(MISSING_FILENAME_TEXT) 1889 1890 with open(filename, "w") as f: 1891 # There really isn't an LWP Cookies 2.0 format, but this indicates 1892 # that there is extra information in here (domain_dot and 1893 # port_spec) while still being compatible with libwww-perl, I hope. 1894 f.write("#LWP-Cookies-2.0\n") 1895 f.write(self.as_lwp_str(ignore_discard, ignore_expires)) 1896 1897 def _really_load(self, f, filename, ignore_discard, ignore_expires): 1898 magic = f.readline() 1899 if not self.magic_re.search(magic): 1900 msg = ("%r does not look like a Set-Cookie3 (LWP) format " 1901 "file" % filename) 1902 raise LoadError(msg) 1903 1904 now = time.time() 1905 1906 header = "Set-Cookie3:" 1907 boolean_attrs = ("port_spec", "path_spec", "domain_dot", 1908 "secure", "discard") 1909 value_attrs = ("version", 1910 "port", "path", "domain", 1911 "expires", 1912 "comment", "commenturl") 1913 1914 try: 1915 while 1: 1916 line = f.readline() 1917 if line == "": break 1918 if not line.startswith(header): 1919 continue 1920 line = line[len(header):].strip() 1921 1922 for data in split_header_words([line]): 1923 name, value = data[0] 1924 standard = {} 1925 rest = {} 1926 for k in boolean_attrs: 1927 standard[k] = False 1928 for k, v in data[1:]: 1929 if k is not None: 1930 lc = k.lower() 1931 else: 1932 lc = None 1933 # don't lose case distinction for unknown fields 1934 if (lc in value_attrs) or (lc in boolean_attrs): 1935 k = lc 1936 if k in boolean_attrs: 1937 if v is None: v = True 1938 standard[k] = v 1939 elif k in value_attrs: 1940 standard[k] = v 1941 else: 1942 rest[k] = v 1943 1944 h = standard.get 1945 expires = h("expires") 1946 discard = h("discard") 1947 if expires is not None: 1948 expires = iso2time(expires) 1949 if expires is None: 1950 discard = True 1951 domain = h("domain") 1952 domain_specified = domain.startswith(".") 1953 c = Cookie(h("version"), name, value, 1954 h("port"), h("port_spec"), 1955 domain, domain_specified, h("domain_dot"), 1956 h("path"), h("path_spec"), 1957 h("secure"), 1958 expires, 1959 discard, 1960 h("comment"), 1961 h("commenturl"), 1962 rest) 1963 if not ignore_discard and c.discard: 1964 continue 1965 if not ignore_expires and c.is_expired(now): 1966 continue 1967 self.set_cookie(c) 1968 except OSError: 1969 raise 1970 except Exception: 1971 _warn_unhandled_exception() 1972 raise LoadError("invalid Set-Cookie3 format file %r: %r" % 1973 (filename, line)) 1974 1975 1976class MozillaCookieJar(FileCookieJar): 1977 """ 1978 1979 WARNING: you may want to backup your browser's cookies file if you use 1980 this class to save cookies. I *think* it works, but there have been 1981 bugs in the past! 1982 1983 This class differs from CookieJar only in the format it uses to save and 1984 load cookies to and from a file. This class uses the Mozilla/Netscape 1985 `cookies.txt' format. lynx uses this file format, too. 1986 1987 Don't expect cookies saved while the browser is running to be noticed by 1988 the browser (in fact, Mozilla on unix will overwrite your saved cookies if 1989 you change them on disk while it's running; on Windows, you probably can't 1990 save at all while the browser is running). 1991 1992 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to 1993 Netscape cookies on saving. 1994 1995 In particular, the cookie version and port number information is lost, 1996 together with information about whether or not Path, Port and Discard were 1997 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the 1998 domain as set in the HTTP header started with a dot (yes, I'm aware some 1999 domains in Netscape files start with a dot and some don't -- trust me, you 2000 really don't want to know any more about this). 2001 2002 Note that though Mozilla and Netscape use the same format, they use 2003 slightly different headers. The class saves cookies using the Netscape 2004 header by default (Mozilla can cope with that). 2005 2006 """ 2007 magic_re = re.compile("#( Netscape)? HTTP Cookie File") 2008 header = """\ 2009# Netscape HTTP Cookie File 2010# http://curl.haxx.se/rfc/cookie_spec.html 2011# This is a generated file! Do not edit. 2012 2013""" 2014 2015 def _really_load(self, f, filename, ignore_discard, ignore_expires): 2016 now = time.time() 2017 2018 magic = f.readline() 2019 if not self.magic_re.search(magic): 2020 raise LoadError( 2021 "%r does not look like a Netscape format cookies file" % 2022 filename) 2023 2024 try: 2025 while 1: 2026 line = f.readline() 2027 if line == "": break 2028 2029 # last field may be absent, so keep any trailing tab 2030 if line.endswith("\n"): line = line[:-1] 2031 2032 # skip comments and blank lines XXX what is $ for? 2033 if (line.strip().startswith(("#", "$")) or 2034 line.strip() == ""): 2035 continue 2036 2037 domain, domain_specified, path, secure, expires, name, value = \ 2038 line.split("\t") 2039 secure = (secure == "TRUE") 2040 domain_specified = (domain_specified == "TRUE") 2041 if name == "": 2042 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2043 # with no name, whereas http.cookiejar regards it as a 2044 # cookie with no value. 2045 name = value 2046 value = None 2047 2048 initial_dot = domain.startswith(".") 2049 assert domain_specified == initial_dot 2050 2051 discard = False 2052 if expires == "": 2053 expires = None 2054 discard = True 2055 2056 # assume path_specified is false 2057 c = Cookie(0, name, value, 2058 None, False, 2059 domain, domain_specified, initial_dot, 2060 path, False, 2061 secure, 2062 expires, 2063 discard, 2064 None, 2065 None, 2066 {}) 2067 if not ignore_discard and c.discard: 2068 continue 2069 if not ignore_expires and c.is_expired(now): 2070 continue 2071 self.set_cookie(c) 2072 2073 except OSError: 2074 raise 2075 except Exception: 2076 _warn_unhandled_exception() 2077 raise LoadError("invalid Netscape format cookies file %r: %r" % 2078 (filename, line)) 2079 2080 def save(self, filename=None, ignore_discard=False, ignore_expires=False): 2081 if filename is None: 2082 if self.filename is not None: filename = self.filename 2083 else: raise ValueError(MISSING_FILENAME_TEXT) 2084 2085 with open(filename, "w") as f: 2086 f.write(self.header) 2087 now = time.time() 2088 for cookie in self: 2089 if not ignore_discard and cookie.discard: 2090 continue 2091 if not ignore_expires and cookie.is_expired(now): 2092 continue 2093 if cookie.secure: secure = "TRUE" 2094 else: secure = "FALSE" 2095 if cookie.domain.startswith("."): initial_dot = "TRUE" 2096 else: initial_dot = "FALSE" 2097 if cookie.expires is not None: 2098 expires = str(cookie.expires) 2099 else: 2100 expires = "" 2101 if cookie.value is None: 2102 # cookies.txt regards 'Set-Cookie: foo' as a cookie 2103 # with no name, whereas http.cookiejar regards it as a 2104 # cookie with no value. 2105 name = "" 2106 value = cookie.name 2107 else: 2108 name = cookie.name 2109 value = cookie.value 2110 f.write( 2111 "\t".join([cookie.domain, initial_dot, cookie.path, 2112 secure, expires, name, value])+ 2113 "\n") 2114