# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2015 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#

import gzip
from io import BytesIO
import json
import logging
import os
import posixpath
import re
try:
    import threading
except ImportError:  # pragma: no cover
    import dummy_threading as threading
import zlib

from . import DistlibException
from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
                     queue, quote, unescape, build_opener,
                     HTTPRedirectHandler as BaseRedirectHandler, text_type,
                     Request, HTTPError, URLError)
from .database import Distribution, DistributionPath, make_dist
from .metadata import Metadata, MetadataInvalidError
from .util import (cached_property, ensure_slash, split_filename, get_project_data,
                   parse_requirement, parse_name_and_version, ServerProxy,
                   normalize_name)
from .version import get_scheme, UnsupportedVersionError
from .wheel import Wheel, is_compatible

logger = logging.getLogger(__name__)

HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
DEFAULT_INDEX = 'https://pypi.org/pypi'


def get_all_distribution_names(url=None):
    """
    Return all distribution names known by an index.
    :param url: The URL of the index.
    :return: A list of all known distribution names.
    """
    if url is None:
        url = DEFAULT_INDEX
    client = ServerProxy(url, timeout=3.0)
    try:
        return client.list_packages()
    finally:
        client('close')()
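
# Example (illustrative sketch only, not part of the module): listing the
# distribution names an index knows about via its XML-RPC interface. Whether
# the index still serves list_packages() is up to the index operator.
#
#     names = get_all_distribution_names()
#     print(len(names), 'projects known to the index')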


class RedirectHandler(BaseRedirectHandler):
    """
    A class to work around a bug in some Python 3.2.x releases.
    """
    # There's a bug in the base version for some 3.2.x
    # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
    # returns e.g. /abc, it bails because it says the scheme ''
    # is bogus, when actually it should use the request's
    # URL for the scheme. See Python issue #13696.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use first header.
        newurl = None
        for key in ('location', 'uri'):
            if key in headers:
                newurl = headers[key]
                break
        if newurl is None:  # pragma: no cover
            return
        urlparts = urlparse(newurl)
        if urlparts.scheme == '':
            newurl = urljoin(req.get_full_url(), newurl)
            if hasattr(headers, 'replace_header'):
                headers.replace_header(key, newurl)
            else:
                headers[key] = newurl
        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302


class Locator(object):
    """
    A base class for locators - things that locate distributions.
    """
    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
    binary_extensions = ('.egg', '.exe', '.whl')
    excluded_extensions = ('.pdf',)

    # A list of tags indicating which wheels you want to match. The default
    # value of None matches against the tags compatible with the running
    # Python. If you want to match other values, set wheel_tags on a locator
    # instance to a list of tuples (pyver, abi, arch) which you want to match.
    wheel_tags = None

    downloadable_extensions = source_extensions + ('.whl',)

    def __init__(self, scheme='default'):
        """
        Initialise an instance.
        :param scheme: Because locators look for most recent versions, they
                       need to know the version scheme to use. This specifies
                       the current PEP-recommended scheme - use ``'legacy'``
                       if you need to support existing distributions on PyPI.
        """
        self._cache = {}
        self.scheme = scheme
        # Because of bugs in some of the handlers on some of the platforms,
        # we use our own opener rather than just using urlopen.
        self.opener = build_opener(RedirectHandler())
        # If get_project() is called from locate(), the matcher instance
        # is set from the requirement passed to locate(). See issue #18 for
        # why this can be useful to know.
        self.matcher = None
        self.errors = queue.Queue()

    def get_errors(self):
        """
        Return any errors which have occurred.
        """
        result = []
        while not self.errors.empty():  # pragma: no cover
            try:
                e = self.errors.get(False)
                result.append(e)
            except queue.Empty:
                continue
            self.errors.task_done()
        return result

    def clear_errors(self):
        """
        Clear any errors which may have been logged.
        """
        # Just get the errors and throw them away
        self.get_errors()

    def clear_cache(self):
        self._cache.clear()

    def _get_scheme(self):
        return self._scheme

    def _set_scheme(self, value):
        self._scheme = value

    scheme = property(_get_scheme, _set_scheme)

    def _get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This should be implemented in subclasses.

        If called from a locate() request, self.matcher will be set to a
        matcher for the requirement to satisfy, otherwise it will be None.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This calls _get_project to do all the work, and just implements a caching layer on top.
        """
        if self._cache is None:  # pragma: no cover
            result = self._get_project(name)
        elif name in self._cache:
            result = self._cache[name]
        else:
            self.clear_errors()
            result = self._get_project(name)
            self._cache[name] = result
        return result
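
    # Example (sketch): the mapping returned by get_project() keys versions to
    # Distribution instances, plus two bookkeeping keys maintained by
    # _update_version_data(); the project name and versions below are
    # placeholders.
    #
    #     locator.get_project('foo')
    #     # {'1.0': <Distribution foo 1.0>,
    #     #  '1.1': <Distribution foo 1.1>,
    #     #  'urls': {'1.0': {...}, '1.1': {...}},
    #     #  'digests': {<url>: ('sha256', '...'), ...}}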

    def score_url(self, url):
        """
        Give a URL a score which can be used to choose preferred URLs
        for a given project release.
        """
        t = urlparse(url)
        basename = posixpath.basename(t.path)
        compatible = True
        is_wheel = basename.endswith('.whl')
        is_downloadable = basename.endswith(self.downloadable_extensions)
        if is_wheel:
            compatible = is_compatible(Wheel(basename), self.wheel_tags)
        return (t.scheme == 'https', 'pypi.org' in t.netloc,
                is_downloadable, is_wheel, compatible, basename)

    def prefer_url(self, url1, url2):
        """
        Choose one of two URLs where both are candidates for distribution
        archives for the same version of a distribution (for example,
        .tar.gz vs. .zip).

        The current implementation favours https:// URLs over http://, archives
        from PyPI over those from other locations, wheel compatibility (if a
        wheel) and then the archive name.
        """
        result = url2
        if url1:
            s1 = self.score_url(url1)
            s2 = self.score_url(url2)
            if s1 > s2:
                result = url1
            if result != url2:
                logger.debug('Not replacing %r with %r', url1, url2)
            else:
                logger.debug('Replacing %r with %r', url1, url2)
        return result

    def split_filename(self, filename, project_name):
        """
        Attempt to split a filename into project name, version and Python version.
        """
        return split_filename(filename, project_name)

    def convert_url_to_download_info(self, url, project_name):
        """
        See if a URL is a candidate for a download URL for a project (the URL
        has typically been scraped from an HTML page).

        If it is, a dictionary is returned with keys "name", "version",
        "filename" and "url"; otherwise, None is returned.
        """
        def same_project(name1, name2):
            return normalize_name(name1) == normalize_name(name2)

        result = None
        scheme, netloc, path, params, query, frag = urlparse(url)
        if frag.lower().startswith('egg='):  # pragma: no cover
            logger.debug('%s: version hint in fragment: %r',
                         project_name, frag)
        m = HASHER_HASH.match(frag)
        if m:
            algo, digest = m.groups()
        else:
            algo, digest = None, None
        origpath = path
        if path and path[-1] == '/':  # pragma: no cover
            path = path[:-1]
        if path.endswith('.whl'):
            try:
                wheel = Wheel(path)
                if not is_compatible(wheel, self.wheel_tags):
                    logger.debug('Wheel not compatible: %s', path)
                else:
                    if project_name is None:
                        include = True
                    else:
                        include = same_project(wheel.name, project_name)
                    if include:
                        result = {
                            'name': wheel.name,
                            'version': wheel.version,
                            'filename': wheel.filename,
                            'url': urlunparse((scheme, netloc, origpath,
                                               params, query, '')),
                            'python-version': ', '.join(
                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
                        }
            except Exception:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
            logger.debug('Not downloadable: %s', path)
        else:  # downloadable extension
            path = filename = posixpath.basename(path)
            for ext in self.downloadable_extensions:
                if path.endswith(ext):
                    path = path[:-len(ext)]
                    t = self.split_filename(path, project_name)
                    if not t:  # pragma: no cover
                        logger.debug('No match for project/version: %s', path)
                    else:
                        name, version, pyver = t
                        if not project_name or same_project(project_name, name):
                            result = {
                                'name': name,
                                'version': version,
                                'filename': filename,
                                'url': urlunparse((scheme, netloc, origpath,
                                                   params, query, '')),
                                #'packagetype': 'sdist',
                            }
                            if pyver:  # pragma: no cover
                                result['python-version'] = pyver
                    break
        if result and algo:
            result['%s_digest' % algo] = digest
        return result
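
    # Example (sketch): for a wheel URL, the returned dictionary is built from
    # the parsed filename; the URL below is a placeholder, not a real release.
    #
    #     locator.convert_url_to_download_info(
    #         'https://example.com/packages/foo-1.0-py2.py3-none-any.whl', 'foo')
    #     # {'name': 'foo', 'version': '1.0',
    #     #  'filename': 'foo-1.0-py2.py3-none-any.whl',
    #     #  'url': 'https://example.com/packages/foo-1.0-py2.py3-none-any.whl',
    #     #  'python-version': '2, 3'}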

    def _get_digest(self, info):
        """
        Get a digest from a dictionary by looking at a "digests" dictionary
        or keys of the form 'algo_digest'.

        Returns a 2-tuple (algo, digest) if found, else None. Currently
        looks only for SHA256, then MD5.
        """
        result = None
        if 'digests' in info:
            digests = info['digests']
            for algo in ('sha256', 'md5'):
                if algo in digests:
                    result = (algo, digests[algo])
                    break
        if not result:
            for algo in ('sha256', 'md5'):
                key = '%s_digest' % algo
                if key in info:
                    result = (algo, info[key])
                    break
        return result

    def _update_version_data(self, result, info):
        """
        Update a result dictionary (the final result from _get_project) with a
        dictionary for a specific version, which typically holds information
        gleaned from a filename or URL for an archive for the distribution.
        """
        name = info.pop('name')
        version = info.pop('version')
        if version in result:
            dist = result[version]
            md = dist.metadata
        else:
            dist = make_dist(name, version, scheme=self.scheme)
            md = dist.metadata
        dist.digest = digest = self._get_digest(info)
        url = info['url']
        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
        result['urls'].setdefault(version, set()).add(url)
        dist.locator = self
        result[version] = dist

    def locate(self, requirement, prereleases=False):
        """
        Find the most recent distribution which matches the given
        requirement.

        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
                            'foo (>= 1.0, < 2.0, != 1.3)'
        :param prereleases: If ``True``, allow pre-release versions
                            to be located. Otherwise, pre-release versions
                            are not returned.
        :return: A :class:`Distribution` instance, or ``None`` if no such
                 distribution could be located.
        """
        result = None
        r = parse_requirement(requirement)
        if r is None:  # pragma: no cover
            raise DistlibException('Not a valid requirement: %r' % requirement)
        scheme = get_scheme(self.scheme)
        self.matcher = matcher = scheme.matcher(r.requirement)
        logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
        versions = self.get_project(r.name)
        if len(versions) > 2:   # urls and digests keys are present
            # sometimes, versions are invalid
            slist = []
            vcls = matcher.version_class
            for k in versions:
                if k in ('urls', 'digests'):
                    continue
                try:
                    if not matcher.match(k):
                        pass  # logger.debug('%s did not match %r', matcher, k)
                    else:
                        if prereleases or not vcls(k).is_prerelease:
                            slist.append(k)
                        # else:
                        #     logger.debug('skipping pre-release '
                        #                  'version %s of %s', k, matcher.name)
                except Exception:  # pragma: no cover
                    logger.warning('error matching %s with %r', matcher, k)
                    pass  # slist.append(k)
            if len(slist) > 1:
                slist = sorted(slist, key=scheme.key)
            if slist:
                logger.debug('sorted list: %s', slist)
                version = slist[-1]
                result = versions[version]
        if result:
            if r.extras:
                result.extras = r.extras
            result.download_urls = versions.get('urls', {}).get(version, set())
            d = {}
            sd = versions.get('digests', {})
            for url in result.download_urls:
                if url in sd:  # pragma: no cover
                    d[url] = sd[url]
            result.digests = d
        self.matcher = None
        return result
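

# Example (illustrative sketch): ``locate()`` is the main query entry point on
# any concrete locator. The requirement string and wheel tags below are
# placeholders.
#
#     loc = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0)
#     loc.wheel_tags = [('cp39', 'cp39', 'manylinux2014_x86_64')]  # optional
#     dist = loc.locate('pip (>= 21.0)')
#     if dist is not None:
#         print(dist.name_and_version, dist.download_urls)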


class PyPIRPCLocator(Locator):
    """
    This locator uses XML-RPC to locate distributions. It therefore
    cannot be used with simple mirrors (that only mirror file content).
    """
    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The URL to use for XML-RPC.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIRPCLocator, self).__init__(**kwargs)
        self.base_url = url
        self.client = ServerProxy(url, timeout=3.0)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        return set(self.client.list_packages())

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        versions = self.client.package_releases(name, True)
        for v in versions:
            urls = self.client.release_urls(name, v)
            data = self.client.release_data(name, v)
            metadata = Metadata(scheme=self.scheme)
            metadata.name = data['name']
            metadata.version = data['version']
            metadata.license = data.get('license')
            metadata.keywords = data.get('keywords', [])
            metadata.summary = data.get('summary')
            dist = Distribution(metadata)
            if urls:
                info = urls[0]
                metadata.source_url = info['url']
                dist.digest = self._get_digest(info)
            dist.locator = self
            result[v] = dist
            for info in urls:
                url = info['url']
                digest = self._get_digest(info)
                result['urls'].setdefault(v, set()).add(url)
                result['digests'][url] = digest
        return result


class PyPIJSONLocator(Locator):
    """
    This locator uses PyPI's JSON interface. It's very limited in functionality
    and probably not worth using.
    """
    def __init__(self, url, **kwargs):
        super(PyPIJSONLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        url = urljoin(self.base_url, '%s/json' % quote(name))
        try:
            resp = self.opener.open(url)
            data = resp.read().decode()  # for now
            d = json.loads(data)
            md = Metadata(scheme=self.scheme)
            data = d['info']
            md.name = data['name']
            md.version = data['version']
            md.license = data.get('license')
            md.keywords = data.get('keywords', [])
            md.summary = data.get('summary')
            dist = Distribution(md)
            dist.locator = self
            urls = d['urls']
            result[md.version] = dist
            for info in d['urls']:
                url = info['url']
                dist.download_urls.add(url)
                dist.digests[url] = self._get_digest(info)
                result['urls'].setdefault(md.version, set()).add(url)
                result['digests'][url] = self._get_digest(info)
            # Now get other releases
            for version, infos in d['releases'].items():
                if version == md.version:
                    continue    # already done
                omd = Metadata(scheme=self.scheme)
                omd.name = md.name
                omd.version = version
                odist = Distribution(omd)
                odist.locator = self
                result[version] = odist
                for info in infos:
                    url = info['url']
                    odist.download_urls.add(url)
                    odist.digests[url] = self._get_digest(info)
                    result['urls'].setdefault(version, set()).add(url)
                    result['digests'][url] = self._get_digest(info)
#            for info in urls:
#                md.source_url = info['url']
#                dist.digest = self._get_digest(info)
#                dist.locator = self
#            for info in urls:
#                url = info['url']
#                result['urls'].setdefault(md.version, set()).add(url)
#                result['digests'][url] = self._get_digest(info)
        except Exception as e:
            self.errors.put(text_type(e))
            logger.exception('JSON fetch failed: %s', e)
        return result
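

# Example (illustrative sketch): both index-backed locators take the index
# root URL; whether the XML-RPC and JSON endpoints are still served is up to
# the index operator.
#
#     rpc_loc = PyPIRPCLocator('https://pypi.org/pypi')
#     json_loc = PyPIJSONLocator('https://pypi.org/pypi/')
#     info = json_loc.get_project('pip')   # versions plus 'urls'/'digests' keys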


class Page(object):
    """
    This class represents a scraped HTML page.
    """
    # The following slightly hairy-looking regex just looks for the contents of
    # an anchor link, which has an attribute "href" either immediately preceded
    # or immediately followed by a "rel" attribute. The attribute values can be
    # declared with double quotes, single quotes or no quotes - which leads to
    # the length of the expression.
    _href = re.compile("""
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
""", re.I | re.S | re.X)
    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)

    def __init__(self, data, url):
        """
        Initialise an instance with the Unicode page contents and the URL they
        came from.
        """
        self.data = data
        self.base_url = self.url = url
        m = self._base.search(self.data)
        if m:
            self.base_url = m.group(1)

    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    @cached_property
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.
        """
        def clean(url):
            "Tidy up a URL."
            scheme, netloc, path, params, query, frag = urlparse(url)
            return urlunparse((scheme, netloc, quote(path),
                               params, query, frag))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result
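

# Example (illustrative sketch): Page parses anchor tags out of raw HTML; the
# markup and URL below are made up.
#
#     page = Page('<a rel="download" href="pkg-1.0.tar.gz">pkg 1.0</a>',
#                 'https://example.com/simple/pkg/')
#     page.links   # [('https://example.com/simple/pkg/pkg-1.0.tar.gz', 'download')]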


class SimpleScrapingLocator(Locator):
    """
    A locator which scrapes HTML pages to locate downloads for a distribution.
    This runs multiple threads to do the I/O; performance is at least as good
    as pip's PackageFinder, which works in an analogous fashion.
    """

    # These are used to deal with various Content-Encoding schemes.
    decoders = {
        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
        'none': lambda b: b,
    }

    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
        """
        Initialise an instance.
        :param url: The root URL to use for scraping.
        :param timeout: The timeout, in seconds, to be applied to requests.
                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads you want to do I/O.
                            This defaults to 10.
        :param kwargs: Passed to the superclass.
        """
        super(SimpleScrapingLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)
        self.timeout = timeout
        self._page_cache = {}
        self._seen = set()
        self._to_fetch = queue.Queue()
        self._bad_hosts = set()
        self.skip_externals = False
        self.num_workers = num_workers
        self._lock = threading.RLock()
        # See issue #45: we need to be resilient when the locator is used
        # in a thread, e.g. with concurrent.futures. We can't use self._lock
        # as it is for coordinating our internal threads - the ones created
        # in _prepare_threads.
        self._gplock = threading.RLock()
        self.platform_check = False   # See issue #112

    def _prepare_threads(self):
        """
        Threads are created only when get_project is called, and terminate
        before it returns. They are there primarily to parallelise I/O (i.e.
        fetching web pages).
        """
        self._threads = []
        for i in range(self.num_workers):
            t = threading.Thread(target=self._fetch)
            t.daemon = True
            t.start()
            self._threads.append(t)

    def _wait_threads(self):
        """
        Tell all the threads to terminate (by sending a sentinel value) and
        wait for them to do so.
        """
        # Note that you need two loops, since you can't say which
        # thread will get each sentinel
        for t in self._threads:
            self._to_fetch.put(None)    # sentinel
        for t in self._threads:
            t.join()
        self._threads = []

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        with self._gplock:
            self.result = result
            self.project_name = name
            url = urljoin(self.base_url, '%s/' % quote(name))
            self._seen.clear()
            self._page_cache.clear()
            self._prepare_threads()
            try:
                logger.debug('Queueing %s', url)
                self._to_fetch.put(url)
                self._to_fetch.join()
            finally:
                self._wait_threads()
            del self.result
        return result

    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)

    def _is_platform_dependent(self, url):
        """
        Does a URL refer to a platform-specific download?
        """
        return self.platform_dependent.search(url)

    def _process_download(self, url):
        """
        See if a URL is a suitable download for a project.

        If it is, register information in the result dictionary (for
        _get_project) about the specific version it's for.

        Note that the return value isn't actually used other than as a boolean
        value.
        """
        if self.platform_check and self._is_platform_dependent(url):
            info = None
        else:
            info = self.convert_url_to_download_info(url, self.project_name)
        logger.debug('process_download: %s -> %s', url, info)
        if info:
            with self._lock:    # needed because self.result is shared
                self._update_version_data(self.result, info)
        return info

    def _should_queue(self, link, referrer, rel):
        """
        Determine whether a link URL from a referring page and with a
        particular "rel" attribute should be queued for scraping.
        """
        scheme, netloc, path, _, _, _ = urlparse(link)
        if path.endswith(self.source_extensions + self.binary_extensions +
                         self.excluded_extensions):
            result = False
        elif self.skip_externals and not link.startswith(self.base_url):
            result = False
        elif not referrer.startswith(self.base_url):
            result = False
        elif rel not in ('homepage', 'download'):
            result = False
        elif scheme not in ('http', 'https', 'ftp'):
            result = False
        elif self._is_platform_dependent(link):
            result = False
        else:
            host = netloc.split(':', 1)[0]
            if host.lower() == 'localhost':
                result = False
            else:
                result = True
        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
                     referrer, result)
        return result

    def _fetch(self):
        """
        Get a URL to fetch from the work queue, get the HTML page, examine its
        links for download candidates and candidates for further scraping.

        This is a handy method to run in a thread.
        """
        while True:
            url = self._to_fetch.get()
            try:
                if url:
                    page = self.get_page(url)
                    if page is None:    # e.g. after an error
                        continue
                    for link, rel in page.links:
                        if link not in self._seen:
                            try:
                                self._seen.add(link)
                                if (not self._process_download(link) and
                                        self._should_queue(link, url, rel)):
                                    logger.debug('Queueing %s from %s', link, url)
                                    self._to_fetch.put(link)
                            except MetadataInvalidError:    # e.g. invalid versions
                                pass
            except Exception as e:  # pragma: no cover
                self.errors.put(text_type(e))
            finally:
                # always do this, to avoid hangs :-)
                self._to_fetch.task_done()
            if not url:
                #logger.debug('Sentinel seen, quitting.')
                break

    def get_page(self, url):
        """
        Get the HTML for a URL, possibly from an in-memory cache.

        XXX TODO Note: this cache is never actually cleared. It's assumed that
        the data won't get stale over the lifetime of a locator instance (not
        necessarily true for the default_locator).
        """
        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
        scheme, netloc, path, _, _, _ = urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            url = urljoin(ensure_slash(url), 'index.html')

        if url in self._page_cache:
            result = self._page_cache[url]
            logger.debug('Returning %s from cache: %s', url, result)
        else:
            host = netloc.split(':', 1)[0]
            result = None
            if host in self._bad_hosts:
                logger.debug('Skipping %s due to bad host %s', url, host)
            else:
                req = Request(url, headers={'Accept-encoding': 'identity'})
                try:
                    logger.debug('Fetching %s', url)
                    resp = self.opener.open(req, timeout=self.timeout)
                    logger.debug('Fetched %s', url)
                    headers = resp.info()
                    content_type = headers.get('Content-Type', '')
                    if HTML_CONTENT_TYPE.match(content_type):
                        final_url = resp.geturl()
                        data = resp.read()
                        encoding = headers.get('Content-Encoding')
                        if encoding:
                            decoder = self.decoders[encoding]   # fail if not found
                            data = decoder(data)
                        encoding = 'utf-8'
                        m = CHARSET.search(content_type)
                        if m:
                            encoding = m.group(1)
                        try:
                            data = data.decode(encoding)
                        except UnicodeError:  # pragma: no cover
                            data = data.decode('latin-1')    # fallback
                        result = Page(data, final_url)
                        self._page_cache[final_url] = result
                except HTTPError as e:
                    if e.code != 404:
                        logger.exception('Fetch failed: %s: %s', url, e)
                except URLError as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                    with self._lock:
                        self._bad_hosts.add(host)
                except Exception as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                finally:
                    self._page_cache[url] = result   # even if None (failure)
        return result

    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        page = self.get_page(self.base_url)
        if not page:
            raise DistlibException('Unable to get %s' % self.base_url)
        for match in self._distname_re.finditer(page.data):
            result.add(match.group(1))
        return result
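

# Example (illustrative sketch): scraping a "simple" index. The URL below is
# the public PyPI simple index; any mirror laid out the same way would also
# work.
#
#     sloc = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0)
#     project = sloc.get_project('pip')     # version -> Distribution mapping
#     errors = sloc.get_errors()            # any fetch problems encountered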


class DirectoryLocator(Locator):
    """
    This class locates distributions in a directory tree.
    """

    def __init__(self, path, **kwargs):
        """
        Initialise an instance.
        :param path: The root of the directory tree to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * recursive - if True (the default), subdirectories are
                         recursed into. If False, only the top-level directory
                         is searched.
        """
        self.recursive = kwargs.pop('recursive', True)
        super(DirectoryLocator, self).__init__(**kwargs)
        path = os.path.abspath(path)
        if not os.path.isdir(path):  # pragma: no cover
            raise DistlibException('Not a directory: %r' % path)
        self.base_dir = path

    def should_include(self, filename, parent):
        """
        Should a filename be considered as a candidate for a distribution
        archive? As well as the filename, the directory which contains it
        is provided, though not used by the current implementation.
        """
        return filename.endswith(self.downloadable_extensions)

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, name)
                    if info:
                        self._update_version_data(result, info)
            if not self.recursive:
                break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, None)
                    if info:
                        result.add(info['name'])
            if not self.recursive:
                break
        return result


class JSONLocator(Locator):
    """
    This locator uses special extended metadata (not available on PyPI) and is
    the basis of performant dependency resolution in distlib. Other locators
    require archive downloads before dependencies can be determined! As you
    might imagine, that can be slow.
    """
    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        data = get_project_data(name)
        if data:
            for info in data.get('files', []):
                if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
                    continue
                # We don't store summary in project metadata as it makes
                # the data bigger for no benefit during dependency
                # resolution
                dist = make_dist(data['name'], info['version'],
                                 summary=data.get('summary',
                                                  'Placeholder for summary'),
                                 scheme=self.scheme)
                md = dist.metadata
                md.source_url = info['url']
                # TODO SHA256 digest
                if 'digest' in info and info['digest']:
                    dist.digest = ('md5', info['digest'])
                md.dependencies = info.get('requirements', {})
                dist.exports = info.get('exports', {})
                result[dist.version] = dist
                result['urls'].setdefault(dist.version, set()).add(info['url'])
        return result
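

# Example (illustrative sketch): locating archives in a local directory tree;
# the path and requirement below are placeholders.
#
#     dloc = DirectoryLocator('/path/to/wheelhouse', recursive=False)
#     dist = dloc.locate('mypackage (== 1.0)')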


class DistPathLocator(Locator):
    """
    This locator finds installed distributions in a path. It can be useful for
    adding to an :class:`AggregatingLocator`.
    """
    def __init__(self, distpath, **kwargs):
        """
        Initialise an instance.

        :param distpath: A :class:`DistributionPath` instance to search.
        """
        super(DistPathLocator, self).__init__(**kwargs)
        assert isinstance(distpath, DistributionPath)
        self.distpath = distpath

    def _get_project(self, name):
        dist = self.distpath.get_distribution(name)
        if dist is None:
            result = {'urls': {}, 'digests': {}}
        else:
            result = {
                dist.version: dist,
                'urls': {dist.version: set([dist.source_url])},
                'digests': {dist.version: set([None])}
            }
        return result


class AggregatingLocator(Locator):
    """
    This class allows you to chain and/or merge a list of locators.
    """
    def __init__(self, *locators, **kwargs):
        """
        Initialise an instance.

        :param locators: The list of locators to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * merge - if False (the default), the first successful
                         search from any of the locators is returned. If True,
                         the results from all locators are merged (this can be
                         slow).
        """
        self.merge = kwargs.pop('merge', False)
        self.locators = locators
        super(AggregatingLocator, self).__init__(**kwargs)

    def clear_cache(self):
        super(AggregatingLocator, self).clear_cache()
        for locator in self.locators:
            locator.clear_cache()

    def _set_scheme(self, value):
        self._scheme = value
        for locator in self.locators:
            locator.scheme = value

    scheme = property(Locator.scheme.fget, _set_scheme)

    def _get_project(self, name):
        result = {}
        for locator in self.locators:
            d = locator.get_project(name)
            if d:
                if self.merge:
                    files = result.get('urls', {})
                    digests = result.get('digests', {})
                    # next line could overwrite result['urls'], result['digests']
                    result.update(d)
                    df = result.get('urls')
                    if files and df:
                        for k, v in files.items():
                            if k in df:
                                df[k] |= v
                            else:
                                df[k] = v
                    dd = result.get('digests')
                    if digests and dd:
                        dd.update(digests)
                else:
                    # See issue #18. If any dists are found and we're looking
                    # for specific constraints, we only return something if
                    # a match is found. For example, if a DirectoryLocator
                    # returns just foo (1.0) while we're looking for
                    # foo (>= 2.0), we'll pretend there was nothing there so
                    # that subsequent locators can be queried. Otherwise we
                    # would just return foo (1.0) which would then lead to a
                    # failure to find foo (>= 2.0), because other locators
                    # weren't searched. Note that this only matters when
                    # merge=False.
                    if self.matcher is None:
                        found = True
                    else:
                        found = False
                        for k in d:
                            if self.matcher.match(k):
                                found = True
                                break
                    if found:
                        result = d
                        break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for locator in self.locators:
            try:
                result |= locator.get_distribution_names()
            except NotImplementedError:
                pass
        return result
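

# Example (illustrative sketch): chaining a local directory of archives with a
# scraping locator; the path and URL below are placeholders.
#
#     chained = AggregatingLocator(
#         DirectoryLocator('/path/to/local/archives'),
#         SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
#         scheme='legacy')
#     dist = chained.locate('foo (>= 2.0)')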


# We use a legacy scheme simply because most of the dists on PyPI use legacy
# versions which don't conform to PEP 426 / PEP 440.
default_locator = AggregatingLocator(
    JSONLocator(),
    SimpleScrapingLocator('https://pypi.org/simple/',
                          timeout=3.0),
    scheme='legacy')

locate = default_locator.locate
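

# Example (illustrative sketch): the module-level ``locate`` delegates to
# default_locator; the requirement below is a placeholder.
#
#     from distlib.locators import locate
#     dist = locate('requests (>= 2.0)')
#     if dist is not None:
#         print(dist.name_and_version, dist.source_url)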


class DependencyFinder(object):
    """
    Locate dependencies for distributions.
    """

    def __init__(self, locator=None):
        """
        Initialise an instance, using the specified locator
        to locate distributions.
        """
        self.locator = locator or default_locator
        self.scheme = get_scheme(self.locator.scheme)

    def add_distribution(self, dist):
        """
        Add a distribution to the finder. This will update internal information
        about who provides what.
        :param dist: The distribution to add.
        """
        logger.debug('adding distribution %s', dist)
        name = dist.key
        self.dists_by_name[name] = dist
        self.dists[(name, dist.version)] = dist
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
            self.provided.setdefault(name, set()).add((version, dist))

    def remove_distribution(self, dist):
        """
        Remove a distribution from the finder. This will update internal
        information about who provides what.
        :param dist: The distribution to remove.
        """
        logger.debug('removing distribution %s', dist)
        name = dist.key
        del self.dists_by_name[name]
        del self.dists[(name, dist.version)]
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
            s = self.provided[name]
            s.remove((version, dist))
            if not s:
                del self.provided[name]

    def get_matcher(self, reqt):
        """
        Get a version matcher for a requirement.
        :param reqt: The requirement
        :type reqt: str
        :return: A version matcher (an instance of
                 :class:`distlib.version.Matcher`).
        """
        try:
            matcher = self.scheme.matcher(reqt)
        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat-mode if cannot read the version
            name = reqt.split()[0]
            matcher = self.scheme.matcher(name)
        return matcher

    def find_providers(self, reqt):
        """
        Find the distributions which can fulfill a requirement.

        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distributions which can fulfill the requirement.
        """
        matcher = self.get_matcher(reqt)
        name = matcher.key   # case-insensitive
        result = set()
        provided = self.provided
        if name in provided:
            for version, provider in provided[name]:
                try:
                    match = matcher.match(version)
                except UnsupportedVersionError:
                    match = False

                if match:
                    result.add(provider)
                    break
        return result

    def try_to_replace(self, provider, other, problems):
        """
        Attempt to replace one provider with another. This is typically used
        when resolving dependencies from multiple sources, e.g. A requires
        (B >= 1.0) while C requires (B >= 1.1).

        For successful replacement, ``provider`` must meet all the requirements
        which ``other`` fulfills.

        :param provider: The provider we are trying to replace with.
        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain what
                         problems prevented replacement. This is currently
                         a tuple of the literal string 'cantreplace',
                         ``provider``, ``other`` and the set of requirements
                         that ``provider`` couldn't fulfill.
        :return: True if we can replace ``other`` with ``provider``, else
                 False.
        """
        rlist = self.reqts[other]
        unmatched = set()
        for s in rlist:
            matcher = self.get_matcher(s)
            if not matcher.match(provider.version):
                unmatched.add(s)
        if unmatched:
            # can't replace other with provider
            problems.add(('cantreplace', provider, other,
                          frozenset(unmatched)))
            result = False
        else:
            # can replace other with provider
            self.remove_distribution(other)
            del self.reqts[other]
            for s in rlist:
                self.reqts.setdefault(provider, set()).add(s)
            self.add_distribution(provider)
            result = True
        return result

    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distributions returned should be such that they have the
        :attr:`requested` attribute set to ``True`` if they were
        from the ``requirement`` passed to ``find()``, and they have the
        :attr:`build_time_dependency` attribute set to ``True`` unless they
        are post-installation dependencies of the ``requirement``.

        Each problem is a tuple consisting of the string ``'unsatisfied'``
        and the requirement which couldn't be satisfied by any distribution
        known to the locator.
        """

        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key     # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                #import pdb; pdb.set_trace()
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            if meta_extras and dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems
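
# Example (illustrative sketch): resolving a requirement and its dependency
# closure with the default locator; the requirement below is a placeholder.
#
#     finder = DependencyFinder()
#     dists, problems = finder.find('requests (>= 2.0)', meta_extras=[':test:'])
#     for dist in dists:
#         print(dist.name_and_version, dist.build_time_dependency)
#     for problem in problems:
#         print(problem)   # e.g. ('unsatisfied', '<requirement>')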