1"""PyPI and direct package downloading""" 2import sys 3import os 4import re 5import shutil 6import socket 7import base64 8import hashlib 9import itertools 10import warnings 11from functools import wraps 12 13from setuptools.extern import six 14from setuptools.extern.six.moves import urllib, http_client, configparser, map 15 16import setuptools 17from pkg_resources import ( 18 CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST, 19 Environment, find_distributions, safe_name, safe_version, 20 to_filename, Requirement, DEVELOP_DIST, EGG_DIST, 21) 22from setuptools import ssl_support 23from distutils import log 24from distutils.errors import DistutilsError 25from fnmatch import translate 26from setuptools.py27compat import get_all_headers 27from setuptools.py33compat import unescape 28from setuptools.wheel import Wheel 29 30__metaclass__ = type 31 32EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$') 33HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I) 34PYPI_MD5 = re.compile( 35 r'<a href="([^"#]+)">([^<]+)</a>\n\s+\(<a (?:title="MD5 hash"\n\s+)' 36 r'href="[^?]+\?:action=show_md5&digest=([0-9a-f]{32})">md5</a>\)' 37) 38URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match 39EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split() 40 41__all__ = [ 42 'PackageIndex', 'distros_for_url', 'parse_bdist_wininst', 43 'interpret_distro_name', 44] 45 46_SOCKET_TIMEOUT = 15 47 48_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}" 49user_agent = _tmpl.format( 50 py_major='{}.{}'.format(*sys.version_info), setuptools=setuptools) 51 52 53def parse_requirement_arg(spec): 54 try: 55 return Requirement.parse(spec) 56 except ValueError as e: 57 raise DistutilsError( 58 "Not a URL, existing file, or requirement spec: %r" % (spec,) 59 ) from e 60 61 62def parse_bdist_wininst(name): 63 """Return (base,pyversion) or (None,None) for possible .exe name""" 64 65 lower = name.lower() 66 base, py_ver, plat = None, None, None 67 68 if 
lower.endswith('.exe'): 69 if lower.endswith('.win32.exe'): 70 base = name[:-10] 71 plat = 'win32' 72 elif lower.startswith('.win32-py', -16): 73 py_ver = name[-7:-4] 74 base = name[:-16] 75 plat = 'win32' 76 elif lower.endswith('.win-amd64.exe'): 77 base = name[:-14] 78 plat = 'win-amd64' 79 elif lower.startswith('.win-amd64-py', -20): 80 py_ver = name[-7:-4] 81 base = name[:-20] 82 plat = 'win-amd64' 83 return base, py_ver, plat 84 85 86def egg_info_for_url(url): 87 parts = urllib.parse.urlparse(url) 88 scheme, server, path, parameters, query, fragment = parts 89 base = urllib.parse.unquote(path.split('/')[-1]) 90 if server == 'sourceforge.net' and base == 'download': # XXX Yuck 91 base = urllib.parse.unquote(path.split('/')[-2]) 92 if '#' in base: 93 base, fragment = base.split('#', 1) 94 return base, fragment 95 96 97def distros_for_url(url, metadata=None): 98 """Yield egg or source distribution objects that might be found at a URL""" 99 base, fragment = egg_info_for_url(url) 100 for dist in distros_for_location(url, base, metadata): 101 yield dist 102 if fragment: 103 match = EGG_FRAGMENT.match(fragment) 104 if match: 105 for dist in interpret_distro_name( 106 url, match.group(1), metadata, precedence=CHECKOUT_DIST 107 ): 108 yield dist 109 110 111def distros_for_location(location, basename, metadata=None): 112 """Yield egg or source distribution objects based on basename""" 113 if basename.endswith('.egg.zip'): 114 basename = basename[:-4] # strip the .zip 115 if basename.endswith('.egg') and '-' in basename: 116 # only one, unambiguous interpretation 117 return [Distribution.from_location(location, basename, metadata)] 118 if basename.endswith('.whl') and '-' in basename: 119 wheel = Wheel(basename) 120 if not wheel.is_compatible(): 121 return [] 122 return [Distribution( 123 location=location, 124 project_name=wheel.project_name, 125 version=wheel.version, 126 # Increase priority over eggs. 
127 precedence=EGG_DIST + 1, 128 )] 129 if basename.endswith('.exe'): 130 win_base, py_ver, platform = parse_bdist_wininst(basename) 131 if win_base is not None: 132 return interpret_distro_name( 133 location, win_base, metadata, py_ver, BINARY_DIST, platform 134 ) 135 # Try source distro extensions (.zip, .tgz, etc.) 136 # 137 for ext in EXTENSIONS: 138 if basename.endswith(ext): 139 basename = basename[:-len(ext)] 140 return interpret_distro_name(location, basename, metadata) 141 return [] # no extension matched 142 143 144def distros_for_filename(filename, metadata=None): 145 """Yield possible egg or source distribution objects based on a filename""" 146 return distros_for_location( 147 normalize_path(filename), os.path.basename(filename), metadata 148 ) 149 150 151def interpret_distro_name( 152 location, basename, metadata, py_version=None, precedence=SOURCE_DIST, 153 platform=None 154): 155 """Generate alternative interpretations of a source distro name 156 157 Note: if `location` is a filesystem filename, you should call 158 ``pkg_resources.normalize_path()`` on it before passing it to this 159 routine! 160 """ 161 # Generate alternative interpretations of a source distro name 162 # Because some packages are ambiguous as to name/versions split 163 # e.g. "adns-python-1.1.0", "egenix-mx-commercial", etc. 164 # So, we generate each possible interepretation (e.g. "adns, python-1.1.0" 165 # "adns-python, 1.1.0", and "adns-python-1.1.0, no version"). In practice, 166 # the spurious interpretations should be ignored, because in the event 167 # there's also an "adns" package, the spurious "python-1.1.0" version will 168 # compare lower than any numeric version number, and is therefore unlikely 169 # to match a request for it. It's still a potential problem, though, and 170 # in the long run PyPI and the distutils should go for "safe" names and 171 # versions in distribution archive names (sdist and bdist). 
172 173 parts = basename.split('-') 174 if not py_version and any(re.match(r'py\d\.\d$', p) for p in parts[2:]): 175 # it is a bdist_dumb, not an sdist -- bail out 176 return 177 178 for p in range(1, len(parts) + 1): 179 yield Distribution( 180 location, metadata, '-'.join(parts[:p]), '-'.join(parts[p:]), 181 py_version=py_version, precedence=precedence, 182 platform=platform 183 ) 184 185 186# From Python 2.7 docs 187def unique_everseen(iterable, key=None): 188 "List unique elements, preserving order. Remember all elements ever seen." 189 # unique_everseen('AAAABBBCCDAABBB') --> A B C D 190 # unique_everseen('ABBCcAD', str.lower) --> A B C D 191 seen = set() 192 seen_add = seen.add 193 if key is None: 194 for element in six.moves.filterfalse(seen.__contains__, iterable): 195 seen_add(element) 196 yield element 197 else: 198 for element in iterable: 199 k = key(element) 200 if k not in seen: 201 seen_add(k) 202 yield element 203 204 205def unique_values(func): 206 """ 207 Wrap a function returning an iterable such that the resulting iterable 208 only ever yields unique items. 
209 """ 210 211 @wraps(func) 212 def wrapper(*args, **kwargs): 213 return unique_everseen(func(*args, **kwargs)) 214 215 return wrapper 216 217 218REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I) 219# this line is here to fix emacs' cruddy broken syntax highlighting 220 221 222@unique_values 223def find_external_links(url, page): 224 """Find rel="homepage" and rel="download" links in `page`, yielding URLs""" 225 226 for match in REL.finditer(page): 227 tag, rel = match.groups() 228 rels = set(map(str.strip, rel.lower().split(','))) 229 if 'homepage' in rels or 'download' in rels: 230 for match in HREF.finditer(tag): 231 yield urllib.parse.urljoin(url, htmldecode(match.group(1))) 232 233 for tag in ("<th>Home Page", "<th>Download URL"): 234 pos = page.find(tag) 235 if pos != -1: 236 match = HREF.search(page, pos) 237 if match: 238 yield urllib.parse.urljoin(url, htmldecode(match.group(1))) 239 240 241class ContentChecker: 242 """ 243 A null content checker that defines the interface for checking content 244 """ 245 246 def feed(self, block): 247 """ 248 Feed a block of data to the hash. 249 """ 250 return 251 252 def is_valid(self): 253 """ 254 Check the hash. Return False if validation fails. 255 """ 256 return True 257 258 def report(self, reporter, template): 259 """ 260 Call reporter with information about the checker (hash name) 261 substituted into the template. 
262 """ 263 return 264 265 266class HashChecker(ContentChecker): 267 pattern = re.compile( 268 r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)=' 269 r'(?P<expected>[a-f0-9]+)' 270 ) 271 272 def __init__(self, hash_name, expected): 273 self.hash_name = hash_name 274 self.hash = hashlib.new(hash_name) 275 self.expected = expected 276 277 @classmethod 278 def from_url(cls, url): 279 "Construct a (possibly null) ContentChecker from a URL" 280 fragment = urllib.parse.urlparse(url)[-1] 281 if not fragment: 282 return ContentChecker() 283 match = cls.pattern.search(fragment) 284 if not match: 285 return ContentChecker() 286 return cls(**match.groupdict()) 287 288 def feed(self, block): 289 self.hash.update(block) 290 291 def is_valid(self): 292 return self.hash.hexdigest() == self.expected 293 294 def report(self, reporter, template): 295 msg = template % self.hash_name 296 return reporter(msg) 297 298 299class PackageIndex(Environment): 300 """A distribution index that scans web pages for download URLs""" 301 302 def __init__( 303 self, index_url="https://pypi.org/simple/", hosts=('*',), 304 ca_bundle=None, verify_ssl=True, *args, **kw 305 ): 306 Environment.__init__(self, *args, **kw) 307 self.index_url = index_url + "/" [:not index_url.endswith('/')] 308 self.scanned_urls = {} 309 self.fetched_urls = {} 310 self.package_pages = {} 311 self.allows = re.compile('|'.join(map(translate, hosts))).match 312 self.to_scan = [] 313 use_ssl = ( 314 verify_ssl 315 and ssl_support.is_available 316 and (ca_bundle or ssl_support.find_ca_bundle()) 317 ) 318 if use_ssl: 319 self.opener = ssl_support.opener_for(ca_bundle) 320 else: 321 self.opener = urllib.request.urlopen 322 323 def process_url(self, url, retrieve=False): 324 """Evaluate a URL as a possible download, and maybe retrieve it""" 325 if url in self.scanned_urls and not retrieve: 326 return 327 self.scanned_urls[url] = True 328 if not URL_SCHEME(url): 329 self.process_filename(url) 330 return 331 else: 332 dists = 
list(distros_for_url(url)) 333 if dists: 334 if not self.url_ok(url): 335 return 336 self.debug("Found link: %s", url) 337 338 if dists or not retrieve or url in self.fetched_urls: 339 list(map(self.add, dists)) 340 return # don't need the actual page 341 342 if not self.url_ok(url): 343 self.fetched_urls[url] = True 344 return 345 346 self.info("Reading %s", url) 347 self.fetched_urls[url] = True # prevent multiple fetch attempts 348 tmpl = "Download error on %s: %%s -- Some packages may not be found!" 349 f = self.open_url(url, tmpl % url) 350 if f is None: 351 return 352 if isinstance(f, urllib.error.HTTPError) and f.code == 401: 353 self.info("Authentication error: %s" % f.msg) 354 self.fetched_urls[f.url] = True 355 if 'html' not in f.headers.get('content-type', '').lower(): 356 f.close() # not html, we can't process it 357 return 358 359 base = f.url # handle redirects 360 page = f.read() 361 if not isinstance(page, str): 362 # In Python 3 and got bytes but want str. 363 if isinstance(f, urllib.error.HTTPError): 364 # Errors have no charset, assume latin1: 365 charset = 'latin-1' 366 else: 367 charset = f.headers.get_param('charset') or 'latin-1' 368 page = page.decode(charset, "ignore") 369 f.close() 370 for match in HREF.finditer(page): 371 link = urllib.parse.urljoin(base, htmldecode(match.group(1))) 372 self.process_url(link) 373 if url.startswith(self.index_url) and getattr(f, 'code', None) != 404: 374 page = self.process_index(url, page) 375 376 def process_filename(self, fn, nested=False): 377 # process filenames or directories 378 if not os.path.exists(fn): 379 self.warn("Not found: %s", fn) 380 return 381 382 if os.path.isdir(fn) and not nested: 383 path = os.path.realpath(fn) 384 for item in os.listdir(path): 385 self.process_filename(os.path.join(path, item), True) 386 387 dists = distros_for_filename(fn) 388 if dists: 389 self.debug("Found: %s", fn) 390 list(map(self.add, dists)) 391 392 def url_ok(self, url, fatal=False): 393 s = URL_SCHEME(url) 
394 is_file = s and s.group(1).lower() == 'file' 395 if is_file or self.allows(urllib.parse.urlparse(url)[1]): 396 return True 397 msg = ( 398 "\nNote: Bypassing %s (disallowed host; see " 399 "http://bit.ly/2hrImnY for details).\n") 400 if fatal: 401 raise DistutilsError(msg % url) 402 else: 403 self.warn(msg, url) 404 405 def scan_egg_links(self, search_path): 406 dirs = filter(os.path.isdir, search_path) 407 egg_links = ( 408 (path, entry) 409 for path in dirs 410 for entry in os.listdir(path) 411 if entry.endswith('.egg-link') 412 ) 413 list(itertools.starmap(self.scan_egg_link, egg_links)) 414 415 def scan_egg_link(self, path, entry): 416 with open(os.path.join(path, entry)) as raw_lines: 417 # filter non-empty lines 418 lines = list(filter(None, map(str.strip, raw_lines))) 419 420 if len(lines) != 2: 421 # format is not recognized; punt 422 return 423 424 egg_path, setup_path = lines 425 426 for dist in find_distributions(os.path.join(path, egg_path)): 427 dist.location = os.path.join(path, *lines) 428 dist.precedence = SOURCE_DIST 429 self.add(dist) 430 431 def process_index(self, url, page): 432 """Process the contents of a PyPI page""" 433 434 def scan(link): 435 # Process a URL to see if it's for a package page 436 if link.startswith(self.index_url): 437 parts = list(map( 438 urllib.parse.unquote, link[len(self.index_url):].split('/') 439 )) 440 if len(parts) == 2 and '#' not in parts[1]: 441 # it's a package page, sanitize and index it 442 pkg = safe_name(parts[0]) 443 ver = safe_version(parts[1]) 444 self.package_pages.setdefault(pkg.lower(), {})[link] = True 445 return to_filename(pkg), to_filename(ver) 446 return None, None 447 448 # process an index page into the package-page index 449 for match in HREF.finditer(page): 450 try: 451 scan(urllib.parse.urljoin(url, htmldecode(match.group(1)))) 452 except ValueError: 453 pass 454 455 pkg, ver = scan(url) # ensure this page is in the page index 456 if pkg: 457 # process individual package page 458 for 
new_url in find_external_links(url, page): 459 # Process the found URL 460 base, frag = egg_info_for_url(new_url) 461 if base.endswith('.py') and not frag: 462 if ver: 463 new_url += '#egg=%s-%s' % (pkg, ver) 464 else: 465 self.need_version_info(url) 466 self.scan_url(new_url) 467 468 return PYPI_MD5.sub( 469 lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page 470 ) 471 else: 472 return "" # no sense double-scanning non-package pages 473 474 def need_version_info(self, url): 475 self.scan_all( 476 "Page at %s links to .py file(s) without version info; an index " 477 "scan is required.", url 478 ) 479 480 def scan_all(self, msg=None, *args): 481 if self.index_url not in self.fetched_urls: 482 if msg: 483 self.warn(msg, *args) 484 self.info( 485 "Scanning index of all packages (this may take a while)" 486 ) 487 self.scan_url(self.index_url) 488 489 def find_packages(self, requirement): 490 self.scan_url(self.index_url + requirement.unsafe_name + '/') 491 492 if not self.package_pages.get(requirement.key): 493 # Fall back to safe version of the name 494 self.scan_url(self.index_url + requirement.project_name + '/') 495 496 if not self.package_pages.get(requirement.key): 497 # We couldn't find the target package, so search the index page too 498 self.not_found_in_index(requirement) 499 500 for url in list(self.package_pages.get(requirement.key, ())): 501 # scan each page that might be related to the desired package 502 self.scan_url(url) 503 504 def obtain(self, requirement, installer=None): 505 self.prescan() 506 self.find_packages(requirement) 507 for dist in self[requirement.key]: 508 if dist in requirement: 509 return dist 510 self.debug("%s does not match %s", requirement, dist) 511 return super(PackageIndex, self).obtain(requirement, installer) 512 513 def check_hash(self, checker, filename, tfp): 514 """ 515 checker is a ContentChecker 516 """ 517 checker.report( 518 self.debug, 519 "Validating %%s checksum for %s" % filename) 520 if not 
checker.is_valid(): 521 tfp.close() 522 os.unlink(filename) 523 raise DistutilsError( 524 "%s validation failed for %s; " 525 "possible download problem?" 526 % (checker.hash.name, os.path.basename(filename)) 527 ) 528 529 def add_find_links(self, urls): 530 """Add `urls` to the list that will be prescanned for searches""" 531 for url in urls: 532 if ( 533 self.to_scan is None # if we have already "gone online" 534 or not URL_SCHEME(url) # or it's a local file/directory 535 or url.startswith('file:') 536 or list(distros_for_url(url)) # or a direct package link 537 ): 538 # then go ahead and process it now 539 self.scan_url(url) 540 else: 541 # otherwise, defer retrieval till later 542 self.to_scan.append(url) 543 544 def prescan(self): 545 """Scan urls scheduled for prescanning (e.g. --find-links)""" 546 if self.to_scan: 547 list(map(self.scan_url, self.to_scan)) 548 self.to_scan = None # from now on, go ahead and process immediately 549 550 def not_found_in_index(self, requirement): 551 if self[requirement.key]: # we've seen at least one distro 552 meth, msg = self.info, "Couldn't retrieve index page for %r" 553 else: # no distros seen for this name, might be misspelled 554 meth, msg = ( 555 self.warn, 556 "Couldn't find index page for %r (maybe misspelled?)") 557 meth(msg, requirement.unsafe_name) 558 self.scan_all() 559 560 def download(self, spec, tmpdir): 561 """Locate and/or download `spec` to `tmpdir`, returning a local path 562 563 `spec` may be a ``Requirement`` object, or a string containing a URL, 564 an existing local filename, or a project/version requirement spec 565 (i.e. the string form of a ``Requirement`` object). If it is the URL 566 of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one 567 that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is 568 automatically created alongside the downloaded file. 
569 570 If `spec` is a ``Requirement`` object or a string containing a 571 project/version requirement spec, this method returns the location of 572 a matching distribution (possibly after downloading it to `tmpdir`). 573 If `spec` is a locally existing file or directory name, it is simply 574 returned unchanged. If `spec` is a URL, it is downloaded to a subpath 575 of `tmpdir`, and the local filename is returned. Various errors may be 576 raised if a problem occurs during downloading. 577 """ 578 if not isinstance(spec, Requirement): 579 scheme = URL_SCHEME(spec) 580 if scheme: 581 # It's a url, download it to tmpdir 582 found = self._download_url(scheme.group(1), spec, tmpdir) 583 base, fragment = egg_info_for_url(spec) 584 if base.endswith('.py'): 585 found = self.gen_setup(found, fragment, tmpdir) 586 return found 587 elif os.path.exists(spec): 588 # Existing file or directory, just return it 589 return spec 590 else: 591 spec = parse_requirement_arg(spec) 592 return getattr(self.fetch_distribution(spec, tmpdir), 'location', None) 593 594 def fetch_distribution( 595 self, requirement, tmpdir, force_scan=False, source=False, 596 develop_ok=False, local_index=None): 597 """Obtain a distribution suitable for fulfilling `requirement` 598 599 `requirement` must be a ``pkg_resources.Requirement`` instance. 600 If necessary, or if the `force_scan` flag is set, the requirement is 601 searched for in the (online) package index as well as the locally 602 installed packages. If a distribution matching `requirement` is found, 603 the returned distribution's ``location`` is the value you would have 604 gotten from calling the ``download()`` method with the matching 605 distribution's URL or filename. If no matching distribution is found, 606 ``None`` is returned. 607 608 If the `source` flag is set, only source distributions and source 609 checkout links will be considered. 
Unless the `develop_ok` flag is 610 set, development and system eggs (i.e., those using the ``.egg-info`` 611 format) will be ignored. 612 """ 613 # process a Requirement 614 self.info("Searching for %s", requirement) 615 skipped = {} 616 dist = None 617 618 def find(req, env=None): 619 if env is None: 620 env = self 621 # Find a matching distribution; may be called more than once 622 623 for dist in env[req.key]: 624 625 if dist.precedence == DEVELOP_DIST and not develop_ok: 626 if dist not in skipped: 627 self.warn( 628 "Skipping development or system egg: %s", dist, 629 ) 630 skipped[dist] = 1 631 continue 632 633 test = ( 634 dist in req 635 and (dist.precedence <= SOURCE_DIST or not source) 636 ) 637 if test: 638 loc = self.download(dist.location, tmpdir) 639 dist.download_location = loc 640 if os.path.exists(dist.download_location): 641 return dist 642 643 if force_scan: 644 self.prescan() 645 self.find_packages(requirement) 646 dist = find(requirement) 647 648 if not dist and local_index is not None: 649 dist = find(requirement, local_index) 650 651 if dist is None: 652 if self.to_scan is not None: 653 self.prescan() 654 dist = find(requirement) 655 656 if dist is None and not force_scan: 657 self.find_packages(requirement) 658 dist = find(requirement) 659 660 if dist is None: 661 self.warn( 662 "No local packages or working download links found for %s%s", 663 (source and "a source distribution of " or ""), 664 requirement, 665 ) 666 else: 667 self.info("Best match: %s", dist) 668 return dist.clone(location=dist.download_location) 669 670 def fetch(self, requirement, tmpdir, force_scan=False, source=False): 671 """Obtain a file suitable for fulfilling `requirement` 672 673 DEPRECATED; use the ``fetch_distribution()`` method now instead. For 674 backward compatibility, this routine is identical but returns the 675 ``location`` of the downloaded distribution instead of a distribution 676 object. 
677 """ 678 dist = self.fetch_distribution(requirement, tmpdir, force_scan, source) 679 if dist is not None: 680 return dist.location 681 return None 682 683 def gen_setup(self, filename, fragment, tmpdir): 684 match = EGG_FRAGMENT.match(fragment) 685 dists = match and [ 686 d for d in 687 interpret_distro_name(filename, match.group(1), None) if d.version 688 ] or [] 689 690 if len(dists) == 1: # unambiguous ``#egg`` fragment 691 basename = os.path.basename(filename) 692 693 # Make sure the file has been downloaded to the temp dir. 694 if os.path.dirname(filename) != tmpdir: 695 dst = os.path.join(tmpdir, basename) 696 from setuptools.command.easy_install import samefile 697 if not samefile(filename, dst): 698 shutil.copy2(filename, dst) 699 filename = dst 700 701 with open(os.path.join(tmpdir, 'setup.py'), 'w') as file: 702 file.write( 703 "from setuptools import setup\n" 704 "setup(name=%r, version=%r, py_modules=[%r])\n" 705 % ( 706 dists[0].project_name, dists[0].version, 707 os.path.splitext(basename)[0] 708 ) 709 ) 710 return filename 711 712 elif match: 713 raise DistutilsError( 714 "Can't unambiguously interpret project/version identifier %r; " 715 "any dashes in the name or version should be escaped using " 716 "underscores. %r" % (fragment, dists) 717 ) 718 else: 719 raise DistutilsError( 720 "Can't process plain .py files without an '#egg=name-version'" 721 " suffix to enable automatic setup script generation." 
722 ) 723 724 dl_blocksize = 8192 725 726 def _download_to(self, url, filename): 727 self.info("Downloading %s", url) 728 # Download the file 729 fp = None 730 try: 731 checker = HashChecker.from_url(url) 732 fp = self.open_url(url) 733 if isinstance(fp, urllib.error.HTTPError): 734 raise DistutilsError( 735 "Can't download %s: %s %s" % (url, fp.code, fp.msg) 736 ) 737 headers = fp.info() 738 blocknum = 0 739 bs = self.dl_blocksize 740 size = -1 741 if "content-length" in headers: 742 # Some servers return multiple Content-Length headers :( 743 sizes = get_all_headers(headers, 'Content-Length') 744 size = max(map(int, sizes)) 745 self.reporthook(url, filename, blocknum, bs, size) 746 with open(filename, 'wb') as tfp: 747 while True: 748 block = fp.read(bs) 749 if block: 750 checker.feed(block) 751 tfp.write(block) 752 blocknum += 1 753 self.reporthook(url, filename, blocknum, bs, size) 754 else: 755 break 756 self.check_hash(checker, filename, tfp) 757 return headers 758 finally: 759 if fp: 760 fp.close() 761 762 def reporthook(self, url, filename, blocknum, blksize, size): 763 pass # no-op 764 765 def open_url(self, url, warning=None): 766 if url.startswith('file:'): 767 return local_open(url) 768 try: 769 return open_with_auth(url, self.opener) 770 except (ValueError, http_client.InvalidURL) as v: 771 msg = ' '.join([str(arg) for arg in v.args]) 772 if warning: 773 self.warn(warning, msg) 774 else: 775 raise DistutilsError('%s %s' % (url, msg)) from v 776 except urllib.error.HTTPError as v: 777 return v 778 except urllib.error.URLError as v: 779 if warning: 780 self.warn(warning, v.reason) 781 else: 782 raise DistutilsError("Download error for %s: %s" 783 % (url, v.reason)) from v 784 except http_client.BadStatusLine as v: 785 if warning: 786 self.warn(warning, v.line) 787 else: 788 raise DistutilsError( 789 '%s returned a bad status line. 
The server might be ' 790 'down, %s' % 791 (url, v.line) 792 ) from v 793 except (http_client.HTTPException, socket.error) as v: 794 if warning: 795 self.warn(warning, v) 796 else: 797 raise DistutilsError("Download error for %s: %s" 798 % (url, v)) from v 799 800 def _download_url(self, scheme, url, tmpdir): 801 # Determine download filename 802 # 803 name, fragment = egg_info_for_url(url) 804 if name: 805 while '..' in name: 806 name = name.replace('..', '.').replace('\\', '_') 807 else: 808 name = "__downloaded__" # default if URL has no path contents 809 810 if name.endswith('.egg.zip'): 811 name = name[:-4] # strip the extra .zip before download 812 813 filename = os.path.join(tmpdir, name) 814 815 # Download the file 816 # 817 if scheme == 'svn' or scheme.startswith('svn+'): 818 return self._download_svn(url, filename) 819 elif scheme == 'git' or scheme.startswith('git+'): 820 return self._download_git(url, filename) 821 elif scheme.startswith('hg+'): 822 return self._download_hg(url, filename) 823 elif scheme == 'file': 824 return urllib.request.url2pathname(urllib.parse.urlparse(url)[2]) 825 else: 826 self.url_ok(url, True) # raises error if not allowed 827 return self._attempt_download(url, filename) 828 829 def scan_url(self, url): 830 self.process_url(url, True) 831 832 def _attempt_download(self, url, filename): 833 headers = self._download_to(url, filename) 834 if 'html' in headers.get('content-type', '').lower(): 835 return self._download_html(url, headers, filename) 836 else: 837 return filename 838 839 def _download_html(self, url, headers, filename): 840 file = open(filename) 841 for line in file: 842 if line.strip(): 843 # Check for a subversion index page 844 if re.search(r'<title>([^- ]+ - )?Revision \d+:', line): 845 # it's a subversion index page: 846 file.close() 847 os.unlink(filename) 848 return self._download_svn(url, filename) 849 break # not an index page 850 file.close() 851 os.unlink(filename) 852 raise DistutilsError("Unexpected HTML 
page found at " + url) 853 854 def _download_svn(self, url, filename): 855 warnings.warn("SVN download support is deprecated", UserWarning) 856 url = url.split('#', 1)[0] # remove any fragment for svn's sake 857 creds = '' 858 if url.lower().startswith('svn:') and '@' in url: 859 scheme, netloc, path, p, q, f = urllib.parse.urlparse(url) 860 if not netloc and path.startswith('//') and '/' in path[2:]: 861 netloc, path = path[2:].split('/', 1) 862 auth, host = _splituser(netloc) 863 if auth: 864 if ':' in auth: 865 user, pw = auth.split(':', 1) 866 creds = " --username=%s --password=%s" % (user, pw) 867 else: 868 creds = " --username=" + auth 869 netloc = host 870 parts = scheme, netloc, url, p, q, f 871 url = urllib.parse.urlunparse(parts) 872 self.info("Doing subversion checkout from %s to %s", url, filename) 873 os.system("svn checkout%s -q %s %s" % (creds, url, filename)) 874 return filename 875 876 @staticmethod 877 def _vcs_split_rev_from_url(url, pop_prefix=False): 878 scheme, netloc, path, query, frag = urllib.parse.urlsplit(url) 879 880 scheme = scheme.split('+', 1)[-1] 881 882 # Some fragment identification fails 883 path = path.split('#', 1)[0] 884 885 rev = None 886 if '@' in path: 887 path, rev = path.rsplit('@', 1) 888 889 # Also, discard fragment 890 url = urllib.parse.urlunsplit((scheme, netloc, path, query, '')) 891 892 return url, rev 893 894 def _download_git(self, url, filename): 895 filename = filename.split('#', 1)[0] 896 url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True) 897 898 self.info("Doing git clone from %s to %s", url, filename) 899 os.system("git clone --quiet %s %s" % (url, filename)) 900 901 if rev is not None: 902 self.info("Checking out %s", rev) 903 os.system("git -C %s checkout --quiet %s" % ( 904 filename, 905 rev, 906 )) 907 908 return filename 909 910 def _download_hg(self, url, filename): 911 filename = filename.split('#', 1)[0] 912 url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True) 913 914 
self.info("Doing hg clone from %s to %s", url, filename) 915 os.system("hg clone --quiet %s %s" % (url, filename)) 916 917 if rev is not None: 918 self.info("Updating to %s", rev) 919 os.system("hg --cwd %s up -C -r %s -q" % ( 920 filename, 921 rev, 922 )) 923 924 return filename 925 926 def debug(self, msg, *args): 927 log.debug(msg, *args) 928 929 def info(self, msg, *args): 930 log.info(msg, *args) 931 932 def warn(self, msg, *args): 933 log.warn(msg, *args) 934 935 936# This pattern matches a character entity reference (a decimal numeric 937# references, a hexadecimal numeric reference, or a named reference). 938entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub 939 940 941def decode_entity(match): 942 what = match.group(0) 943 return unescape(what) 944 945 946def htmldecode(text): 947 """ 948 Decode HTML entities in the given text. 949 950 >>> htmldecode( 951 ... 'https://../package_name-0.1.2.tar.gz' 952 ... '?tokena=A&tokenb=B">package_name-0.1.2.tar.gz') 953 'https://../package_name-0.1.2.tar.gz?tokena=A&tokenb=B">package_name-0.1.2.tar.gz' 954 """ 955 return entity_sub(decode_entity, text) 956 957 958def socket_timeout(timeout=15): 959 def _socket_timeout(func): 960 def _socket_timeout(*args, **kwargs): 961 old_timeout = socket.getdefaulttimeout() 962 socket.setdefaulttimeout(timeout) 963 try: 964 return func(*args, **kwargs) 965 finally: 966 socket.setdefaulttimeout(old_timeout) 967 968 return _socket_timeout 969 970 return _socket_timeout 971 972 973def _encode_auth(auth): 974 """ 975 A function compatible with Python 2.3-3.3 that will encode 976 auth from a URL suitable for an HTTP header. 977 >>> str(_encode_auth('username%3Apassword')) 978 'dXNlcm5hbWU6cGFzc3dvcmQ=' 979 980 Long auth strings should not cause a newline to be inserted. 
981 >>> long_auth = 'username:' + 'password'*10 982 >>> chr(10) in str(_encode_auth(long_auth)) 983 False 984 """ 985 auth_s = urllib.parse.unquote(auth) 986 # convert to bytes 987 auth_bytes = auth_s.encode() 988 encoded_bytes = base64.b64encode(auth_bytes) 989 # convert back to a string 990 encoded = encoded_bytes.decode() 991 # strip the trailing carriage return 992 return encoded.replace('\n', '') 993 994 995class Credential: 996 """ 997 A username/password pair. Use like a namedtuple. 998 """ 999 1000 def __init__(self, username, password): 1001 self.username = username 1002 self.password = password 1003 1004 def __iter__(self): 1005 yield self.username 1006 yield self.password 1007 1008 def __str__(self): 1009 return '%(username)s:%(password)s' % vars(self) 1010 1011 1012class PyPIConfig(configparser.RawConfigParser): 1013 def __init__(self): 1014 """ 1015 Load from ~/.pypirc 1016 """ 1017 defaults = dict.fromkeys(['username', 'password', 'repository'], '') 1018 configparser.RawConfigParser.__init__(self, defaults) 1019 1020 rc = os.path.join(os.path.expanduser('~'), '.pypirc') 1021 if os.path.exists(rc): 1022 self.read(rc) 1023 1024 @property 1025 def creds_by_repository(self): 1026 sections_with_repositories = [ 1027 section for section in self.sections() 1028 if self.get(section, 'repository').strip() 1029 ] 1030 1031 return dict(map(self._get_repo_cred, sections_with_repositories)) 1032 1033 def _get_repo_cred(self, section): 1034 repo = self.get(section, 'repository').strip() 1035 return repo, Credential( 1036 self.get(section, 'username').strip(), 1037 self.get(section, 'password').strip(), 1038 ) 1039 1040 def find_credential(self, url): 1041 """ 1042 If the URL indicated appears to be a repository defined in this 1043 config, return the credential for that repository. 
1044 """ 1045 for repository, cred in self.creds_by_repository.items(): 1046 if url.startswith(repository): 1047 return cred 1048 1049 1050def open_with_auth(url, opener=urllib.request.urlopen): 1051 """Open a urllib2 request, handling HTTP authentication""" 1052 1053 parsed = urllib.parse.urlparse(url) 1054 scheme, netloc, path, params, query, frag = parsed 1055 1056 # Double scheme does not raise on macOS as revealed by a 1057 # failing test. We would expect "nonnumeric port". Refs #20. 1058 if netloc.endswith(':'): 1059 raise http_client.InvalidURL("nonnumeric port: ''") 1060 1061 if scheme in ('http', 'https'): 1062 auth, address = _splituser(netloc) 1063 else: 1064 auth = None 1065 1066 if not auth: 1067 cred = PyPIConfig().find_credential(url) 1068 if cred: 1069 auth = str(cred) 1070 info = cred.username, url 1071 log.info('Authenticating as %s for %s (from .pypirc)', *info) 1072 1073 if auth: 1074 auth = "Basic " + _encode_auth(auth) 1075 parts = scheme, address, path, params, query, frag 1076 new_url = urllib.parse.urlunparse(parts) 1077 request = urllib.request.Request(new_url) 1078 request.add_header("Authorization", auth) 1079 else: 1080 request = urllib.request.Request(url) 1081 1082 request.add_header('User-Agent', user_agent) 1083 fp = opener(request) 1084 1085 if auth: 1086 # Put authentication info back into request URL if same host, 1087 # so that links found on the page will work 1088 s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url) 1089 if s2 == scheme and h2 == address: 1090 parts = s2, netloc, path2, param2, query2, frag2 1091 fp.url = urllib.parse.urlunparse(parts) 1092 1093 return fp 1094 1095 1096# copy of urllib.parse._splituser from Python 3.8 1097def _splituser(host): 1098 """splituser('user[:passwd]@host[:port]') 1099 --> 'user[:passwd]', 'host[:port]'.""" 1100 user, delim, host = host.rpartition('@') 1101 return (user if delim else None), host 1102 1103 1104# adding a timeout to avoid freezing package_index 
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)


def fix_sf_url(url):
    """Return ``url`` untouched; retained for backward compatibility."""
    return url


def local_open(url):
    """Read a local path, with special support for directories

    A regular file is opened through ``urllib.request.urlopen``.  For a
    directory (URL path ending in ``/``) the body is the content of its
    ``index.html`` when one is listed, otherwise a generated page of links
    to the directory entries.  Any other path gives a 404.  Apart from the
    regular-file case the result is an ``HTTPError`` object carrying the
    status, message, headers and body.
    """
    # Index 2 of the parse result is the path component.
    url_path = urllib.parse.urlparse(url)[2]
    filename = urllib.request.url2pathname(url_path)

    if os.path.isfile(filename):
        return urllib.request.urlopen(url)

    if url_path.endswith('/') and os.path.isdir(filename):
        entries = os.listdir(filename)
        if 'index.html' in entries:
            # An explicit index page wins over the generated listing.
            with open(os.path.join(filename, 'index.html'), 'r') as fp:
                body = fp.read()
        else:
            links = []
            for entry in entries:
                # Sub-directories are linked with a trailing slash.
                if os.path.isdir(os.path.join(filename, entry)):
                    entry += '/'
                links.append('<a href="{name}">{name}</a>'.format(name=entry))
            page = (
                "<html><head><title>{url}</title>"
                "</head><body>{files}</body></html>")
            body = page.format(url=url, files='\n'.join(links))
        status, message = 200, "OK"
    else:
        status, message, body = 404, "Path not found", "Not found"

    headers = {'content-type': 'text/html'}
    body_stream = six.StringIO(body)
    return urllib.error.HTTPError(url, status, message, headers, body_stream)