1# -*- coding: utf-8 -*- 2# 3# Copyright (C) 2012-2015 Vinay Sajip. 4# Licensed to the Python Software Foundation under a contributor agreement. 5# See LICENSE.txt and CONTRIBUTORS.txt. 6# 7 8import gzip 9from io import BytesIO 10import json 11import logging 12import os 13import posixpath 14import re 15try: 16 import threading 17except ImportError: # pragma: no cover 18 import dummy_threading as threading 19import zlib 20 21from . import DistlibException 22from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url, 23 queue, quote, unescape, string_types, build_opener, 24 HTTPRedirectHandler as BaseRedirectHandler, text_type, 25 Request, HTTPError, URLError) 26from .database import Distribution, DistributionPath, make_dist 27from .metadata import Metadata, MetadataInvalidError 28from .util import (cached_property, parse_credentials, ensure_slash, 29 split_filename, get_project_data, parse_requirement, 30 parse_name_and_version, ServerProxy, normalize_name) 31from .version import get_scheme, UnsupportedVersionError 32from .wheel import Wheel, is_compatible 33 34logger = logging.getLogger(__name__) 35 36HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)') 37CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I) 38HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml') 39DEFAULT_INDEX = 'https://pypi.org/pypi' 40 41def get_all_distribution_names(url=None): 42 """ 43 Return all distribution names known by an index. 44 :param url: The URL of the index. 45 :return: A list of all known distribution names. 46 """ 47 if url is None: 48 url = DEFAULT_INDEX 49 client = ServerProxy(url, timeout=3.0) 50 try: 51 return client.list_packages() 52 finally: 53 client('close')() 54 55class RedirectHandler(BaseRedirectHandler): 56 """ 57 A class to work around a bug in some Python 3.2.x releases. 58 """ 59 # There's a bug in the base version for some 3.2.x 60 # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header 61 # returns e.g. /abc, it bails because it says the scheme '' 62 # is bogus, when actually it should use the request's 63 # URL for the scheme. See Python issue #13696. 64 def http_error_302(self, req, fp, code, msg, headers): 65 # Some servers (incorrectly) return multiple Location headers 66 # (so probably same goes for URI). Use first header. 67 newurl = None 68 for key in ('location', 'uri'): 69 if key in headers: 70 newurl = headers[key] 71 break 72 if newurl is None: # pragma: no cover 73 return 74 urlparts = urlparse(newurl) 75 if urlparts.scheme == '': 76 newurl = urljoin(req.get_full_url(), newurl) 77 if hasattr(headers, 'replace_header'): 78 headers.replace_header(key, newurl) 79 else: 80 headers[key] = newurl 81 return BaseRedirectHandler.http_error_302(self, req, fp, code, msg, 82 headers) 83 84 http_error_301 = http_error_303 = http_error_307 = http_error_302 85 86class Locator(object): 87 """ 88 A base class for locators - things that locate distributions. 89 """ 90 source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz') 91 binary_extensions = ('.egg', '.exe', '.whl') 92 excluded_extensions = ('.pdf',) 93 94 # A list of tags indicating which wheels you want to match. The default 95 # value of None matches against the tags compatible with the running 96 # Python. If you want to match other values, set wheel_tags on a locator 97 # instance to a list of tuples (pyver, abi, arch) which you want to match. 98 wheel_tags = None 99 100 downloadable_extensions = source_extensions + ('.whl',) 101 102 def __init__(self, scheme='default'): 103 """ 104 Initialise an instance. 105 :param scheme: Because locators look for most recent versions, they 106 need to know the version scheme to use. This specifies 107 the current PEP-recommended scheme - use ``'legacy'`` 108 if you need to support existing distributions on PyPI. 109 """ 110 self._cache = {} 111 self.scheme = scheme 112 # Because of bugs in some of the handlers on some of the platforms, 113 # we use our own opener rather than just using urlopen. 114 self.opener = build_opener(RedirectHandler()) 115 # If get_project() is called from locate(), the matcher instance 116 # is set from the requirement passed to locate(). See issue #18 for 117 # why this can be useful to know. 118 self.matcher = None 119 self.errors = queue.Queue() 120 121 def get_errors(self): 122 """ 123 Return any errors which have occurred. 124 """ 125 result = [] 126 while not self.errors.empty(): # pragma: no cover 127 try: 128 e = self.errors.get(False) 129 result.append(e) 130 except self.errors.Empty: 131 continue 132 self.errors.task_done() 133 return result 134 135 def clear_errors(self): 136 """ 137 Clear any errors which may have been logged. 138 """ 139 # Just get the errors and throw them away 140 self.get_errors() 141 142 def clear_cache(self): 143 self._cache.clear() 144 145 def _get_scheme(self): 146 return self._scheme 147 148 def _set_scheme(self, value): 149 self._scheme = value 150 151 scheme = property(_get_scheme, _set_scheme) 152 153 def _get_project(self, name): 154 """ 155 For a given project, get a dictionary mapping available versions to Distribution 156 instances. 157 158 This should be implemented in subclasses. 159 160 If called from a locate() request, self.matcher will be set to a 161 matcher for the requirement to satisfy, otherwise it will be None. 162 """ 163 raise NotImplementedError('Please implement in the subclass') 164 165 def get_distribution_names(self): 166 """ 167 Return all the distribution names known to this locator. 168 """ 169 raise NotImplementedError('Please implement in the subclass') 170 171 def get_project(self, name): 172 """ 173 For a given project, get a dictionary mapping available versions to Distribution 174 instances. 175 176 This calls _get_project to do all the work, and just implements a caching layer on top. 177 """ 178 if self._cache is None: # pragma: no cover 179 result = self._get_project(name) 180 elif name in self._cache: 181 result = self._cache[name] 182 else: 183 self.clear_errors() 184 result = self._get_project(name) 185 self._cache[name] = result 186 return result 187 188 def score_url(self, url): 189 """ 190 Give an url a score which can be used to choose preferred URLs 191 for a given project release. 192 """ 193 t = urlparse(url) 194 basename = posixpath.basename(t.path) 195 compatible = True 196 is_wheel = basename.endswith('.whl') 197 is_downloadable = basename.endswith(self.downloadable_extensions) 198 if is_wheel: 199 compatible = is_compatible(Wheel(basename), self.wheel_tags) 200 return (t.scheme == 'https', 'pypi.org' in t.netloc, 201 is_downloadable, is_wheel, compatible, basename) 202 203 def prefer_url(self, url1, url2): 204 """ 205 Choose one of two URLs where both are candidates for distribution 206 archives for the same version of a distribution (for example, 207 .tar.gz vs. zip). 208 209 The current implementation favours https:// URLs over http://, archives 210 from PyPI over those from other locations, wheel compatibility (if a 211 wheel) and then the archive name. 212 """ 213 result = url2 214 if url1: 215 s1 = self.score_url(url1) 216 s2 = self.score_url(url2) 217 if s1 > s2: 218 result = url1 219 if result != url2: 220 logger.debug('Not replacing %r with %r', url1, url2) 221 else: 222 logger.debug('Replacing %r with %r', url1, url2) 223 return result 224 225 def split_filename(self, filename, project_name): 226 """ 227 Attempt to split a filename in project name, version and Python version. 228 """ 229 return split_filename(filename, project_name) 230 231 def convert_url_to_download_info(self, url, project_name): 232 """ 233 See if a URL is a candidate for a download URL for a project (the URL 234 has typically been scraped from an HTML page). 235 236 If it is, a dictionary is returned with keys "name", "version", 237 "filename" and "url"; otherwise, None is returned. 238 """ 239 def same_project(name1, name2): 240 return normalize_name(name1) == normalize_name(name2) 241 242 result = None 243 scheme, netloc, path, params, query, frag = urlparse(url) 244 if frag.lower().startswith('egg='): # pragma: no cover 245 logger.debug('%s: version hint in fragment: %r', 246 project_name, frag) 247 m = HASHER_HASH.match(frag) 248 if m: 249 algo, digest = m.groups() 250 else: 251 algo, digest = None, None 252 origpath = path 253 if path and path[-1] == '/': # pragma: no cover 254 path = path[:-1] 255 if path.endswith('.whl'): 256 try: 257 wheel = Wheel(path) 258 if not is_compatible(wheel, self.wheel_tags): 259 logger.debug('Wheel not compatible: %s', path) 260 else: 261 if project_name is None: 262 include = True 263 else: 264 include = same_project(wheel.name, project_name) 265 if include: 266 result = { 267 'name': wheel.name, 268 'version': wheel.version, 269 'filename': wheel.filename, 270 'url': urlunparse((scheme, netloc, origpath, 271 params, query, '')), 272 'python-version': ', '.join( 273 ['.'.join(list(v[2:])) for v in wheel.pyver]), 274 } 275 except Exception as e: # pragma: no cover 276 logger.warning('invalid path for wheel: %s', path) 277 elif not path.endswith(self.downloadable_extensions): # pragma: no cover 278 logger.debug('Not downloadable: %s', path) 279 else: # downloadable extension 280 path = filename = posixpath.basename(path) 281 for ext in self.downloadable_extensions: 282 if path.endswith(ext): 283 path = path[:-len(ext)] 284 t = self.split_filename(path, project_name) 285 if not t: # pragma: no cover 286 logger.debug('No match for project/version: %s', path) 287 else: 288 name, version, pyver = t 289 if not project_name or same_project(project_name, name): 290 result = { 291 'name': name, 292 'version': version, 293 'filename': filename, 294 'url': urlunparse((scheme, netloc, origpath, 295 params, query, '')), 296 #'packagetype': 'sdist', 297 } 298 if pyver: # pragma: no cover 299 result['python-version'] = pyver 300 break 301 if result and algo: 302 result['%s_digest' % algo] = digest 303 return result 304 305 def _get_digest(self, info): 306 """ 307 Get a digest from a dictionary by looking at a "digests" dictionary 308 or keys of the form 'algo_digest'. 309 310 Returns a 2-tuple (algo, digest) if found, else None. Currently 311 looks only for SHA256, then MD5. 312 """ 313 result = None 314 if 'digests' in info: 315 digests = info['digests'] 316 for algo in ('sha256', 'md5'): 317 if algo in digests: 318 result = (algo, digests[algo]) 319 break 320 if not result: 321 for algo in ('sha256', 'md5'): 322 key = '%s_digest' % algo 323 if key in info: 324 result = (algo, info[key]) 325 break 326 return result 327 328 def _update_version_data(self, result, info): 329 """ 330 Update a result dictionary (the final result from _get_project) with a 331 dictionary for a specific version, which typically holds information 332 gleaned from a filename or URL for an archive for the distribution. 333 """ 334 name = info.pop('name') 335 version = info.pop('version') 336 if version in result: 337 dist = result[version] 338 md = dist.metadata 339 else: 340 dist = make_dist(name, version, scheme=self.scheme) 341 md = dist.metadata 342 dist.digest = digest = self._get_digest(info) 343 url = info['url'] 344 result['digests'][url] = digest 345 if md.source_url != info['url']: 346 md.source_url = self.prefer_url(md.source_url, url) 347 result['urls'].setdefault(version, set()).add(url) 348 dist.locator = self 349 result[version] = dist 350 351 def locate(self, requirement, prereleases=False): 352 """ 353 Find the most recent distribution which matches the given 354 requirement. 355 356 :param requirement: A requirement of the form 'foo (1.0)' or perhaps 357 'foo (>= 1.0, < 2.0, != 1.3)' 358 :param prereleases: If ``True``, allow pre-release versions 359 to be located. Otherwise, pre-release versions 360 are not returned. 361 :return: A :class:`Distribution` instance, or ``None`` if no such 362 distribution could be located. 363 """ 364 result = None 365 r = parse_requirement(requirement) 366 if r is None: # pragma: no cover 367 raise DistlibException('Not a valid requirement: %r' % requirement) 368 scheme = get_scheme(self.scheme) 369 self.matcher = matcher = scheme.matcher(r.requirement) 370 logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__) 371 versions = self.get_project(r.name) 372 if len(versions) > 2: # urls and digests keys are present 373 # sometimes, versions are invalid 374 slist = [] 375 vcls = matcher.version_class 376 for k in versions: 377 if k in ('urls', 'digests'): 378 continue 379 try: 380 if not matcher.match(k): 381 logger.debug('%s did not match %r', matcher, k) 382 else: 383 if prereleases or not vcls(k).is_prerelease: 384 slist.append(k) 385 else: 386 logger.debug('skipping pre-release ' 387 'version %s of %s', k, matcher.name) 388 except Exception: # pragma: no cover 389 logger.warning('error matching %s with %r', matcher, k) 390 pass # slist.append(k) 391 if len(slist) > 1: 392 slist = sorted(slist, key=scheme.key) 393 if slist: 394 logger.debug('sorted list: %s', slist) 395 version = slist[-1] 396 result = versions[version] 397 if result: 398 if r.extras: 399 result.extras = r.extras 400 result.download_urls = versions.get('urls', {}).get(version, set()) 401 d = {} 402 sd = versions.get('digests', {}) 403 for url in result.download_urls: 404 if url in sd: # pragma: no cover 405 d[url] = sd[url] 406 result.digests = d 407 self.matcher = None 408 return result 409 410 411class PyPIRPCLocator(Locator): 412 """ 413 This locator uses XML-RPC to locate distributions. It therefore 414 cannot be used with simple mirrors (that only mirror file content). 415 """ 416 def __init__(self, url, **kwargs): 417 """ 418 Initialise an instance. 419 420 :param url: The URL to use for XML-RPC. 421 :param kwargs: Passed to the superclass constructor. 422 """ 423 super(PyPIRPCLocator, self).__init__(**kwargs) 424 self.base_url = url 425 self.client = ServerProxy(url, timeout=3.0) 426 427 def get_distribution_names(self): 428 """ 429 Return all the distribution names known to this locator. 430 """ 431 return set(self.client.list_packages()) 432 433 def _get_project(self, name): 434 result = {'urls': {}, 'digests': {}} 435 versions = self.client.package_releases(name, True) 436 for v in versions: 437 urls = self.client.release_urls(name, v) 438 data = self.client.release_data(name, v) 439 metadata = Metadata(scheme=self.scheme) 440 metadata.name = data['name'] 441 metadata.version = data['version'] 442 metadata.license = data.get('license') 443 metadata.keywords = data.get('keywords', []) 444 metadata.summary = data.get('summary') 445 dist = Distribution(metadata) 446 if urls: 447 info = urls[0] 448 metadata.source_url = info['url'] 449 dist.digest = self._get_digest(info) 450 dist.locator = self 451 result[v] = dist 452 for info in urls: 453 url = info['url'] 454 digest = self._get_digest(info) 455 result['urls'].setdefault(v, set()).add(url) 456 result['digests'][url] = digest 457 return result 458 459class PyPIJSONLocator(Locator): 460 """ 461 This locator uses PyPI's JSON interface. It's very limited in functionality 462 and probably not worth using. 463 """ 464 def __init__(self, url, **kwargs): 465 super(PyPIJSONLocator, self).__init__(**kwargs) 466 self.base_url = ensure_slash(url) 467 468 def get_distribution_names(self): 469 """ 470 Return all the distribution names known to this locator. 471 """ 472 raise NotImplementedError('Not available from this locator') 473 474 def _get_project(self, name): 475 result = {'urls': {}, 'digests': {}} 476 url = urljoin(self.base_url, '%s/json' % quote(name)) 477 try: 478 resp = self.opener.open(url) 479 data = resp.read().decode() # for now 480 d = json.loads(data) 481 md = Metadata(scheme=self.scheme) 482 data = d['info'] 483 md.name = data['name'] 484 md.version = data['version'] 485 md.license = data.get('license') 486 md.keywords = data.get('keywords', []) 487 md.summary = data.get('summary') 488 dist = Distribution(md) 489 dist.locator = self 490 urls = d['urls'] 491 result[md.version] = dist 492 for info in d['urls']: 493 url = info['url'] 494 dist.download_urls.add(url) 495 dist.digests[url] = self._get_digest(info) 496 result['urls'].setdefault(md.version, set()).add(url) 497 result['digests'][url] = self._get_digest(info) 498 # Now get other releases 499 for version, infos in d['releases'].items(): 500 if version == md.version: 501 continue # already done 502 omd = Metadata(scheme=self.scheme) 503 omd.name = md.name 504 omd.version = version 505 odist = Distribution(omd) 506 odist.locator = self 507 result[version] = odist 508 for info in infos: 509 url = info['url'] 510 odist.download_urls.add(url) 511 odist.digests[url] = self._get_digest(info) 512 result['urls'].setdefault(version, set()).add(url) 513 result['digests'][url] = self._get_digest(info) 514# for info in urls: 515# md.source_url = info['url'] 516# dist.digest = self._get_digest(info) 517# dist.locator = self 518# for info in urls: 519# url = info['url'] 520# result['urls'].setdefault(md.version, set()).add(url) 521# result['digests'][url] = self._get_digest(info) 522 except Exception as e: 523 self.errors.put(text_type(e)) 524 logger.exception('JSON fetch failed: %s', e) 525 return result 526 527 528class Page(object): 529 """ 530 This class represents a scraped HTML page. 531 """ 532 # The following slightly hairy-looking regex just looks for the contents of 533 # an anchor link, which has an attribute "href" either immediately preceded 534 # or immediately followed by a "rel" attribute. The attribute values can be 535 # declared with double quotes, single quotes or no quotes - which leads to 536 # the length of the expression. 537 _href = re.compile(""" 538(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)? 539href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*)) 540(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))? 541""", re.I | re.S | re.X) 542 _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S) 543 544 def __init__(self, data, url): 545 """ 546 Initialise an instance with the Unicode page contents and the URL they 547 came from. 548 """ 549 self.data = data 550 self.base_url = self.url = url 551 m = self._base.search(self.data) 552 if m: 553 self.base_url = m.group(1) 554 555 _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I) 556 557 @cached_property 558 def links(self): 559 """ 560 Return the URLs of all the links on a page together with information 561 about their "rel" attribute, for determining which ones to treat as 562 downloads and which ones to queue for further scraping. 563 """ 564 def clean(url): 565 "Tidy up an URL." 566 scheme, netloc, path, params, query, frag = urlparse(url) 567 return urlunparse((scheme, netloc, quote(path), 568 params, query, frag)) 569 570 result = set() 571 for match in self._href.finditer(self.data): 572 d = match.groupdict('') 573 rel = (d['rel1'] or d['rel2'] or d['rel3'] or 574 d['rel4'] or d['rel5'] or d['rel6']) 575 url = d['url1'] or d['url2'] or d['url3'] 576 url = urljoin(self.base_url, url) 577 url = unescape(url) 578 url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url) 579 result.add((url, rel)) 580 # We sort the result, hoping to bring the most recent versions 581 # to the front 582 result = sorted(result, key=lambda t: t[0], reverse=True) 583 return result 584 585 586class SimpleScrapingLocator(Locator): 587 """ 588 A locator which scrapes HTML pages to locate downloads for a distribution. 589 This runs multiple threads to do the I/O; performance is at least as good 590 as pip's PackageFinder, which works in an analogous fashion. 591 """ 592 593 # These are used to deal with various Content-Encoding schemes. 594 decoders = { 595 'deflate': zlib.decompress, 596 'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(d)).read(), 597 'none': lambda b: b, 598 } 599 600 def __init__(self, url, timeout=None, num_workers=10, **kwargs): 601 """ 602 Initialise an instance. 603 :param url: The root URL to use for scraping. 604 :param timeout: The timeout, in seconds, to be applied to requests. 605 This defaults to ``None`` (no timeout specified). 606 :param num_workers: The number of worker threads you want to do I/O, 607 This defaults to 10. 608 :param kwargs: Passed to the superclass. 609 """ 610 super(SimpleScrapingLocator, self).__init__(**kwargs) 611 self.base_url = ensure_slash(url) 612 self.timeout = timeout 613 self._page_cache = {} 614 self._seen = set() 615 self._to_fetch = queue.Queue() 616 self._bad_hosts = set() 617 self.skip_externals = False 618 self.num_workers = num_workers 619 self._lock = threading.RLock() 620 # See issue #45: we need to be resilient when the locator is used 621 # in a thread, e.g. with concurrent.futures. We can't use self._lock 622 # as it is for coordinating our internal threads - the ones created 623 # in _prepare_threads. 624 self._gplock = threading.RLock() 625 self.platform_check = False # See issue #112 626 627 def _prepare_threads(self): 628 """ 629 Threads are created only when get_project is called, and terminate 630 before it returns. They are there primarily to parallelise I/O (i.e. 631 fetching web pages). 632 """ 633 self._threads = [] 634 for i in range(self.num_workers): 635 t = threading.Thread(target=self._fetch) 636 t.setDaemon(True) 637 t.start() 638 self._threads.append(t) 639 640 def _wait_threads(self): 641 """ 642 Tell all the threads to terminate (by sending a sentinel value) and 643 wait for them to do so. 644 """ 645 # Note that you need two loops, since you can't say which 646 # thread will get each sentinel 647 for t in self._threads: 648 self._to_fetch.put(None) # sentinel 649 for t in self._threads: 650 t.join() 651 self._threads = [] 652 653 def _get_project(self, name): 654 result = {'urls': {}, 'digests': {}} 655 with self._gplock: 656 self.result = result 657 self.project_name = name 658 url = urljoin(self.base_url, '%s/' % quote(name)) 659 self._seen.clear() 660 self._page_cache.clear() 661 self._prepare_threads() 662 try: 663 logger.debug('Queueing %s', url) 664 self._to_fetch.put(url) 665 self._to_fetch.join() 666 finally: 667 self._wait_threads() 668 del self.result 669 return result 670 671 platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|' 672 r'win(32|_amd64)|macosx_?\d+)\b', re.I) 673 674 def _is_platform_dependent(self, url): 675 """ 676 Does an URL refer to a platform-specific download? 677 """ 678 return self.platform_dependent.search(url) 679 680 def _process_download(self, url): 681 """ 682 See if an URL is a suitable download for a project. 683 684 If it is, register information in the result dictionary (for 685 _get_project) about the specific version it's for. 686 687 Note that the return value isn't actually used other than as a boolean 688 value. 689 """ 690 if self.platform_check and self._is_platform_dependent(url): 691 info = None 692 else: 693 info = self.convert_url_to_download_info(url, self.project_name) 694 logger.debug('process_download: %s -> %s', url, info) 695 if info: 696 with self._lock: # needed because self.result is shared 697 self._update_version_data(self.result, info) 698 return info 699 700 def _should_queue(self, link, referrer, rel): 701 """ 702 Determine whether a link URL from a referring page and with a 703 particular "rel" attribute should be queued for scraping. 704 """ 705 scheme, netloc, path, _, _, _ = urlparse(link) 706 if path.endswith(self.source_extensions + self.binary_extensions + 707 self.excluded_extensions): 708 result = False 709 elif self.skip_externals and not link.startswith(self.base_url): 710 result = False 711 elif not referrer.startswith(self.base_url): 712 result = False 713 elif rel not in ('homepage', 'download'): 714 result = False 715 elif scheme not in ('http', 'https', 'ftp'): 716 result = False 717 elif self._is_platform_dependent(link): 718 result = False 719 else: 720 host = netloc.split(':', 1)[0] 721 if host.lower() == 'localhost': 722 result = False 723 else: 724 result = True 725 logger.debug('should_queue: %s (%s) from %s -> %s', link, rel, 726 referrer, result) 727 return result 728 729 def _fetch(self): 730 """ 731 Get a URL to fetch from the work queue, get the HTML page, examine its 732 links for download candidates and candidates for further scraping. 733 734 This is a handy method to run in a thread. 735 """ 736 while True: 737 url = self._to_fetch.get() 738 try: 739 if url: 740 page = self.get_page(url) 741 if page is None: # e.g. after an error 742 continue 743 for link, rel in page.links: 744 if link not in self._seen: 745 try: 746 self._seen.add(link) 747 if (not self._process_download(link) and 748 self._should_queue(link, url, rel)): 749 logger.debug('Queueing %s from %s', link, url) 750 self._to_fetch.put(link) 751 except MetadataInvalidError: # e.g. invalid versions 752 pass 753 except Exception as e: # pragma: no cover 754 self.errors.put(text_type(e)) 755 finally: 756 # always do this, to avoid hangs :-) 757 self._to_fetch.task_done() 758 if not url: 759 #logger.debug('Sentinel seen, quitting.') 760 break 761 762 def get_page(self, url): 763 """ 764 Get the HTML for an URL, possibly from an in-memory cache. 765 766 XXX TODO Note: this cache is never actually cleared. It's assumed that 767 the data won't get stale over the lifetime of a locator instance (not 768 necessarily true for the default_locator). 769 """ 770 # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api 771 scheme, netloc, path, _, _, _ = urlparse(url) 772 if scheme == 'file' and os.path.isdir(url2pathname(path)): 773 url = urljoin(ensure_slash(url), 'index.html') 774 775 if url in self._page_cache: 776 result = self._page_cache[url] 777 logger.debug('Returning %s from cache: %s', url, result) 778 else: 779 host = netloc.split(':', 1)[0] 780 result = None 781 if host in self._bad_hosts: 782 logger.debug('Skipping %s due to bad host %s', url, host) 783 else: 784 req = Request(url, headers={'Accept-encoding': 'identity'}) 785 try: 786 logger.debug('Fetching %s', url) 787 resp = self.opener.open(req, timeout=self.timeout) 788 logger.debug('Fetched %s', url) 789 headers = resp.info() 790 content_type = headers.get('Content-Type', '') 791 if HTML_CONTENT_TYPE.match(content_type): 792 final_url = resp.geturl() 793 data = resp.read() 794 encoding = headers.get('Content-Encoding') 795 if encoding: 796 decoder = self.decoders[encoding] # fail if not found 797 data = decoder(data) 798 encoding = 'utf-8' 799 m = CHARSET.search(content_type) 800 if m: 801 encoding = m.group(1) 802 try: 803 data = data.decode(encoding) 804 except UnicodeError: # pragma: no cover 805 data = data.decode('latin-1') # fallback 806 result = Page(data, final_url) 807 self._page_cache[final_url] = result 808 except HTTPError as e: 809 if e.code != 404: 810 logger.exception('Fetch failed: %s: %s', url, e) 811 except URLError as e: # pragma: no cover 812 logger.exception('Fetch failed: %s: %s', url, e) 813 with self._lock: 814 self._bad_hosts.add(host) 815 except Exception as e: # pragma: no cover 816 logger.exception('Fetch failed: %s: %s', url, e) 817 finally: 818 self._page_cache[url] = result # even if None (failure) 819 return result 820 821 _distname_re = re.compile('<a href=[^>]*>([^<]+)<') 822 823 def get_distribution_names(self): 824 """ 825 Return all the distribution names known to this locator. 826 """ 827 result = set() 828 page = self.get_page(self.base_url) 829 if not page: 830 raise DistlibException('Unable to get %s' % self.base_url) 831 for match in self._distname_re.finditer(page.data): 832 result.add(match.group(1)) 833 return result 834 835class DirectoryLocator(Locator): 836 """ 837 This class locates distributions in a directory tree. 838 """ 839 840 def __init__(self, path, **kwargs): 841 """ 842 Initialise an instance. 843 :param path: The root of the directory tree to search. 844 :param kwargs: Passed to the superclass constructor, 845 except for: 846 * recursive - if True (the default), subdirectories are 847 recursed into. If False, only the top-level directory 848 is searched, 849 """ 850 self.recursive = kwargs.pop('recursive', True) 851 super(DirectoryLocator, self).__init__(**kwargs) 852 path = os.path.abspath(path) 853 if not os.path.isdir(path): # pragma: no cover 854 raise DistlibException('Not a directory: %r' % path) 855 self.base_dir = path 856 857 def should_include(self, filename, parent): 858 """ 859 Should a filename be considered as a candidate for a distribution 860 archive? As well as the filename, the directory which contains it 861 is provided, though not used by the current implementation. 862 """ 863 return filename.endswith(self.downloadable_extensions) 864 865 def _get_project(self, name): 866 result = {'urls': {}, 'digests': {}} 867 for root, dirs, files in os.walk(self.base_dir): 868 for fn in files: 869 if self.should_include(fn, root): 870 fn = os.path.join(root, fn) 871 url = urlunparse(('file', '', 872 pathname2url(os.path.abspath(fn)), 873 '', '', '')) 874 info = self.convert_url_to_download_info(url, name) 875 if info: 876 self._update_version_data(result, info) 877 if not self.recursive: 878 break 879 return result 880 881 def get_distribution_names(self): 882 """ 883 Return all the distribution names known to this locator. 884 """ 885 result = set() 886 for root, dirs, files in os.walk(self.base_dir): 887 for fn in files: 888 if self.should_include(fn, root): 889 fn = os.path.join(root, fn) 890 url = urlunparse(('file', '', 891 pathname2url(os.path.abspath(fn)), 892 '', '', '')) 893 info = self.convert_url_to_download_info(url, None) 894 if info: 895 result.add(info['name']) 896 if not self.recursive: 897 break 898 return result 899 900class JSONLocator(Locator): 901 """ 902 This locator uses special extended metadata (not available on PyPI) and is 903 the basis of performant dependency resolution in distlib. Other locators 904 require archive downloads before dependencies can be determined! As you 905 might imagine, that can be slow. 906 """ 907 def get_distribution_names(self): 908 """ 909 Return all the distribution names known to this locator. 910 """ 911 raise NotImplementedError('Not available from this locator') 912 913 def _get_project(self, name): 914 result = {'urls': {}, 'digests': {}} 915 data = get_project_data(name) 916 if data: 917 for info in data.get('files', []): 918 if info['ptype'] != 'sdist' or info['pyversion'] != 'source': 919 continue 920 # We don't store summary in project metadata as it makes 921 # the data bigger for no benefit during dependency 922 # resolution 923 dist = make_dist(data['name'], info['version'], 924 summary=data.get('summary', 925 'Placeholder for summary'), 926 scheme=self.scheme) 927 md = dist.metadata 928 md.source_url = info['url'] 929 # TODO SHA256 digest 930 if 'digest' in info and info['digest']: 931 dist.digest = ('md5', info['digest']) 932 md.dependencies = info.get('requirements', {}) 933 dist.exports = info.get('exports', {}) 934 result[dist.version] = dist 935 result['urls'].setdefault(dist.version, set()).add(info['url']) 936 return result 937 938class DistPathLocator(Locator): 939 """ 940 This locator finds installed distributions in a path. It can be useful for 941 adding to an :class:`AggregatingLocator`. 942 """ 943 def __init__(self, distpath, **kwargs): 944 """ 945 Initialise an instance. 946 947 :param distpath: A :class:`DistributionPath` instance to search. 948 """ 949 super(DistPathLocator, self).__init__(**kwargs) 950 assert isinstance(distpath, DistributionPath) 951 self.distpath = distpath 952 953 def _get_project(self, name): 954 dist = self.distpath.get_distribution(name) 955 if dist is None: 956 result = {'urls': {}, 'digests': {}} 957 else: 958 result = { 959 dist.version: dist, 960 'urls': {dist.version: set([dist.source_url])}, 961 'digests': {dist.version: set([None])} 962 } 963 return result 964 965 966class AggregatingLocator(Locator): 967 """ 968 This class allows you to chain and/or merge a list of locators. 969 """ 970 def __init__(self, *locators, **kwargs): 971 """ 972 Initialise an instance. 973 974 :param locators: The list of locators to search. 975 :param kwargs: Passed to the superclass constructor, 976 except for: 977 * merge - if False (the default), the first successful 978 search from any of the locators is returned. If True, 979 the results from all locators are merged (this can be 980 slow). 981 """ 982 self.merge = kwargs.pop('merge', False) 983 self.locators = locators 984 super(AggregatingLocator, self).__init__(**kwargs) 985 986 def clear_cache(self): 987 super(AggregatingLocator, self).clear_cache() 988 for locator in self.locators: 989 locator.clear_cache() 990 991 def _set_scheme(self, value): 992 self._scheme = value 993 for locator in self.locators: 994 locator.scheme = value 995 996 scheme = property(Locator.scheme.fget, _set_scheme) 997 998 def _get_project(self, name): 999 result = {} 1000 for locator in self.locators: 1001 d = locator.get_project(name) 1002 if d: 1003 if self.merge: 1004 files = result.get('urls', {}) 1005 digests = result.get('digests', {}) 1006 # next line could overwrite result['urls'], result['digests'] 1007 result.update(d) 1008 df = result.get('urls') 1009 if files and df: 1010 for k, v in files.items(): 1011 if k in df: 1012 df[k] |= v 1013 else: 1014 df[k] = v 1015 dd = result.get('digests') 1016 if digests and dd: 1017 dd.update(digests) 1018 else: 1019 # See issue #18. If any dists are found and we're looking 1020 # for specific constraints, we only return something if 1021 # a match is found. For example, if a DirectoryLocator 1022 # returns just foo (1.0) while we're looking for 1023 # foo (>= 2.0), we'll pretend there was nothing there so 1024 # that subsequent locators can be queried. Otherwise we 1025 # would just return foo (1.0) which would then lead to a 1026 # failure to find foo (>= 2.0), because other locators 1027 # weren't searched. Note that this only matters when 1028 # merge=False. 1029 if self.matcher is None: 1030 found = True 1031 else: 1032 found = False 1033 for k in d: 1034 if self.matcher.match(k): 1035 found = True 1036 break 1037 if found: 1038 result = d 1039 break 1040 return result 1041 1042 def get_distribution_names(self): 1043 """ 1044 Return all the distribution names known to this locator. 1045 """ 1046 result = set() 1047 for locator in self.locators: 1048 try: 1049 result |= locator.get_distribution_names() 1050 except NotImplementedError: 1051 pass 1052 return result 1053 1054 1055# We use a legacy scheme simply because most of the dists on PyPI use legacy 1056# versions which don't conform to PEP 426 / PEP 440. 1057default_locator = AggregatingLocator( 1058 JSONLocator(), 1059 SimpleScrapingLocator('https://pypi.org/simple/', 1060 timeout=3.0), 1061 scheme='legacy') 1062 1063locate = default_locator.locate 1064 1065NAME_VERSION_RE = re.compile(r'(?P<name>[\w-]+)\s*' 1066 r'\(\s*(==\s*)?(?P<ver>[^)]+)\)$') 1067 1068class DependencyFinder(object): 1069 """ 1070 Locate dependencies for distributions. 1071 """ 1072 1073 def __init__(self, locator=None): 1074 """ 1075 Initialise an instance, using the specified locator 1076 to locate distributions. 1077 """ 1078 self.locator = locator or default_locator 1079 self.scheme = get_scheme(self.locator.scheme) 1080 1081 def add_distribution(self, dist): 1082 """ 1083 Add a distribution to the finder. This will update internal information 1084 about who provides what. 1085 :param dist: The distribution to add. 1086 """ 1087 logger.debug('adding distribution %s', dist) 1088 name = dist.key 1089 self.dists_by_name[name] = dist 1090 self.dists[(name, dist.version)] = dist 1091 for p in dist.provides: 1092 name, version = parse_name_and_version(p) 1093 logger.debug('Add to provided: %s, %s, %s', name, version, dist) 1094 self.provided.setdefault(name, set()).add((version, dist)) 1095 1096 def remove_distribution(self, dist): 1097 """ 1098 Remove a distribution from the finder. This will update internal 1099 information about who provides what. 1100 :param dist: The distribution to remove. 1101 """ 1102 logger.debug('removing distribution %s', dist) 1103 name = dist.key 1104 del self.dists_by_name[name] 1105 del self.dists[(name, dist.version)] 1106 for p in dist.provides: 1107 name, version = parse_name_and_version(p) 1108 logger.debug('Remove from provided: %s, %s, %s', name, version, dist) 1109 s = self.provided[name] 1110 s.remove((version, dist)) 1111 if not s: 1112 del self.provided[name] 1113 1114 def get_matcher(self, reqt): 1115 """ 1116 Get a version matcher for a requirement. 1117 :param reqt: The requirement 1118 :type reqt: str 1119 :return: A version matcher (an instance of 1120 :class:`distlib.version.Matcher`). 1121 """ 1122 try: 1123 matcher = self.scheme.matcher(reqt) 1124 except UnsupportedVersionError: # pragma: no cover 1125 # XXX compat-mode if cannot read the version 1126 name = reqt.split()[0] 1127 matcher = self.scheme.matcher(name) 1128 return matcher 1129 1130 def find_providers(self, reqt): 1131 """ 1132 Find the distributions which can fulfill a requirement. 1133 1134 :param reqt: The requirement. 1135 :type reqt: str 1136 :return: A set of distribution which can fulfill the requirement. 1137 """ 1138 matcher = self.get_matcher(reqt) 1139 name = matcher.key # case-insensitive 1140 result = set() 1141 provided = self.provided 1142 if name in provided: 1143 for version, provider in provided[name]: 1144 try: 1145 match = matcher.match(version) 1146 except UnsupportedVersionError: 1147 match = False 1148 1149 if match: 1150 result.add(provider) 1151 break 1152 return result 1153 1154 def try_to_replace(self, provider, other, problems): 1155 """ 1156 Attempt to replace one provider with another. This is typically used 1157 when resolving dependencies from multiple sources, e.g. A requires 1158 (B >= 1.0) while C requires (B >= 1.1). 1159 1160 For successful replacement, ``provider`` must meet all the requirements 1161 which ``other`` fulfills. 1162 1163 :param provider: The provider we are trying to replace with. 1164 :param other: The provider we're trying to replace. 1165 :param problems: If False is returned, this will contain what 1166 problems prevented replacement. This is currently 1167 a tuple of the literal string 'cantreplace', 1168 ``provider``, ``other`` and the set of requirements 1169 that ``provider`` couldn't fulfill. 1170 :return: True if we can replace ``other`` with ``provider``, else 1171 False. 1172 """ 1173 rlist = self.reqts[other] 1174 unmatched = set() 1175 for s in rlist: 1176 matcher = self.get_matcher(s) 1177 if not matcher.match(provider.version): 1178 unmatched.add(s) 1179 if unmatched: 1180 # can't replace other with provider 1181 problems.add(('cantreplace', provider, other, 1182 frozenset(unmatched))) 1183 result = False 1184 else: 1185 # can replace other with provider 1186 self.remove_distribution(other) 1187 del self.reqts[other] 1188 for s in rlist: 1189 self.reqts.setdefault(provider, set()).add(s) 1190 self.add_distribution(provider) 1191 result = True 1192 return result 1193 1194 def find(self, requirement, meta_extras=None, prereleases=False): 1195 """ 1196 Find a distribution and all distributions it depends on. 1197 1198 :param requirement: The requirement specifying the distribution to 1199 find, or a Distribution instance. 1200 :param meta_extras: A list of meta extras such as :test:, :build: and 1201 so on. 1202 :param prereleases: If ``True``, allow pre-release versions to be 1203 returned - otherwise, don't return prereleases 1204 unless they're all that's available. 1205 1206 Return a set of :class:`Distribution` instances and a set of 1207 problems. 1208 1209 The distributions returned should be such that they have the 1210 :attr:`required` attribute set to ``True`` if they were 1211 from the ``requirement`` passed to ``find()``, and they have the 1212 :attr:`build_time_dependency` attribute set to ``True`` unless they 1213 are post-installation dependencies of the ``requirement``. 1214 1215 The problems should be a tuple consisting of the string 1216 ``'unsatisfied'`` and the requirement which couldn't be satisfied 1217 by any distribution known to the locator. 1218 """ 1219 1220 self.provided = {} 1221 self.dists = {} 1222 self.dists_by_name = {} 1223 self.reqts = {} 1224 1225 meta_extras = set(meta_extras or []) 1226 if ':*:' in meta_extras: 1227 meta_extras.remove(':*:') 1228 # :meta: and :run: are implicitly included 1229 meta_extras |= set([':test:', ':build:', ':dev:']) 1230 1231 if isinstance(requirement, Distribution): 1232 dist = odist = requirement 1233 logger.debug('passed %s as requirement', odist) 1234 else: 1235 dist = odist = self.locator.locate(requirement, 1236 prereleases=prereleases) 1237 if dist is None: 1238 raise DistlibException('Unable to locate %r' % requirement) 1239 logger.debug('located %s', odist) 1240 dist.requested = True 1241 problems = set() 1242 todo = set([dist]) 1243 install_dists = set([odist]) 1244 while todo: 1245 dist = todo.pop() 1246 name = dist.key # case-insensitive 1247 if name not in self.dists_by_name: 1248 self.add_distribution(dist) 1249 else: 1250 #import pdb; pdb.set_trace() 1251 other = self.dists_by_name[name] 1252 if other != dist: 1253 self.try_to_replace(dist, other, problems) 1254 1255 ireqts = dist.run_requires | dist.meta_requires 1256 sreqts = dist.build_requires 1257 ereqts = set() 1258 if meta_extras and dist in install_dists: 1259 for key in ('test', 'build', 'dev'): 1260 e = ':%s:' % key 1261 if e in meta_extras: 1262 ereqts |= getattr(dist, '%s_requires' % key) 1263 all_reqts = ireqts | sreqts | ereqts 1264 for r in all_reqts: 1265 providers = self.find_providers(r) 1266 if not providers: 1267 logger.debug('No providers found for %r', r) 1268 provider = self.locator.locate(r, prereleases=prereleases) 1269 # If no provider is found and we didn't consider 1270 # prereleases, consider them now. 1271 if provider is None and not prereleases: 1272 provider = self.locator.locate(r, prereleases=True) 1273 if provider is None: 1274 logger.debug('Cannot satisfy %r', r) 1275 problems.add(('unsatisfied', r)) 1276 else: 1277 n, v = provider.key, provider.version 1278 if (n, v) not in self.dists: 1279 todo.add(provider) 1280 providers.add(provider) 1281 if r in ireqts and dist in install_dists: 1282 install_dists.add(provider) 1283 logger.debug('Adding %s to install_dists', 1284 provider.name_and_version) 1285 for p in providers: 1286 name = p.key 1287 if name not in self.dists_by_name: 1288 self.reqts.setdefault(p, set()).add(r) 1289 else: 1290 other = self.dists_by_name[name] 1291 if other != p: 1292 # see if other can be replaced by p 1293 self.try_to_replace(p, other, problems) 1294 1295 dists = set(self.dists.values()) 1296 for dist in dists: 1297 dist.build_time_dependency = dist not in install_dists 1298 if dist.build_time_dependency: 1299 logger.debug('%s is a build-time dependency only.', 1300 dist.name_and_version) 1301 logger.debug('find done for %s', odist) 1302 return dists, problems 1303