# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2015 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#

import gzip
from io import BytesIO
import json
import logging
import os
import posixpath
import re
try:
    import threading
except ImportError:  # pragma: no cover
    import dummy_threading as threading
import zlib

from . import DistlibException
from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
                     queue, quote, unescape, build_opener,
                     HTTPRedirectHandler as BaseRedirectHandler, text_type,
                     Request, HTTPError, URLError)
from .database import Distribution, DistributionPath, make_dist
from .metadata import Metadata, MetadataInvalidError
from .util import (cached_property, ensure_slash, split_filename, get_project_data,
                   parse_requirement, parse_name_and_version, ServerProxy,
                   normalize_name)
from .version import get_scheme, UnsupportedVersionError
from .wheel import Wheel, is_compatible

logger = logging.getLogger(__name__)

HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
DEFAULT_INDEX = 'https://pypi.org/pypi'

def get_all_distribution_names(url=None):
    """
    Return all distribution names known by an index.
    :param url: The URL of the index.
    :return: A list of all known distribution names.
    """
    if url is None:
        url = DEFAULT_INDEX
    client = ServerProxy(url, timeout=3.0)
    try:
        return client.list_packages()
    finally:
        client('close')()
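
# Illustrative usage (not executed on import): listing the names known to an
# index. This relies on the index exposing the XML-RPC ``list_packages``
# call; PyPI has deprecated parts of its XML-RPC API, so against pypi.org
# this may be slow, throttled or unavailable.
#
#     from distlib.locators import get_all_distribution_names
#     names = get_all_distribution_names()   # defaults to DEFAULT_INDEX
#     print('index knows about %d distributions' % len(names))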

class RedirectHandler(BaseRedirectHandler):
    """
    A class to work around a bug in some Python 3.2.x releases.
    """
    # There's a bug in the base version for some 3.2.x
    # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
    # returns e.g. /abc, it bails because it says the scheme ''
    # is bogus, when actually it should use the request's
    # URL for the scheme. See Python issue #13696.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI).  Use first header.
        newurl = None
        for key in ('location', 'uri'):
            if key in headers:
                newurl = headers[key]
                break
        if newurl is None:  # pragma: no cover
            return
        urlparts = urlparse(newurl)
        if urlparts.scheme == '':
            newurl = urljoin(req.get_full_url(), newurl)
            if hasattr(headers, 'replace_header'):
                headers.replace_header(key, newurl)
            else:
                headers[key] = newurl
        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

class Locator(object):
    """
    A base class for locators - things that locate distributions.
    """
    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
    binary_extensions = ('.egg', '.exe', '.whl')
    excluded_extensions = ('.pdf',)

    # A list of tags indicating which wheels you want to match. The default
    # value of None matches against the tags compatible with the running
    # Python. If you want to match other values, set wheel_tags on a locator
    # instance to a list of tuples (pyver, abi, arch) which you want to match.
    wheel_tags = None

    downloadable_extensions = source_extensions + ('.whl',)

    def __init__(self, scheme='default'):
        """
        Initialise an instance.
        :param scheme: Because locators look for the most recent versions,
                       they need to know the version scheme to use. This
                       specifies the current PEP-recommended scheme - use
                       ``'legacy'`` if you need to support existing
                       distributions on PyPI.
        """
        self._cache = {}
        self.scheme = scheme
        # Because of bugs in some of the handlers on some of the platforms,
        # we use our own opener rather than just using urlopen.
        self.opener = build_opener(RedirectHandler())
        # If get_project() is called from locate(), the matcher instance
        # is set from the requirement passed to locate(). See issue #18 for
        # why this can be useful to know.
        self.matcher = None
        self.errors = queue.Queue()

    def get_errors(self):
        """
        Return any errors which have occurred.
        """
        result = []
        while not self.errors.empty():  # pragma: no cover
            try:
                e = self.errors.get(False)
                result.append(e)
            except queue.Empty:
                continue
            self.errors.task_done()
        return result

    def clear_errors(self):
        """
        Clear any errors which may have been logged.
        """
        # Just get the errors and throw them away
        self.get_errors()

    def clear_cache(self):
        self._cache.clear()

    def _get_scheme(self):
        return self._scheme

    def _set_scheme(self, value):
        self._scheme = value

    scheme = property(_get_scheme, _set_scheme)

    def _get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This should be implemented in subclasses.

        If called from a locate() request, self.matcher will be set to a
        matcher for the requirement to satisfy, otherwise it will be None.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This calls _get_project to do all the work, and just implements a caching layer on top.
        """
        if self._cache is None:  # pragma: no cover
            result = self._get_project(name)
        elif name in self._cache:
            result = self._cache[name]
        else:
            self.clear_errors()
            result = self._get_project(name)
            self._cache[name] = result
        return result
    def score_url(self, url):
        """
        Give a URL a score which can be used to choose preferred URLs
        for a given project release.
        """
        t = urlparse(url)
        basename = posixpath.basename(t.path)
        compatible = True
        is_wheel = basename.endswith('.whl')
        is_downloadable = basename.endswith(self.downloadable_extensions)
        if is_wheel:
            compatible = is_compatible(Wheel(basename), self.wheel_tags)
        return (t.scheme == 'https', 'pypi.org' in t.netloc,
                is_downloadable, is_wheel, compatible, basename)

    def prefer_url(self, url1, url2):
        """
        Choose one of two URLs where both are candidates for distribution
        archives for the same version of a distribution (for example,
        .tar.gz vs. zip).

        The current implementation favours https:// URLs over http://, archives
        from PyPI over those from other locations, wheel compatibility (if a
        wheel) and then the archive name.
        """
        result = url2
        if url1:
            s1 = self.score_url(url1)
            s2 = self.score_url(url2)
            if s1 > s2:
                result = url1
            if result != url2:
                logger.debug('Not replacing %r with %r', url1, url2)
            else:
                logger.debug('Replacing %r with %r', url1, url2)
        return result
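
    # Illustrative sketch (not executed): scores are tuples compared
    # lexicographically, so https beats http, pypi.org hosting beats other
    # hosts, and so on; prefer_url() keeps the higher-scoring candidate.
    #
    #     loc = Locator()
    #     loc.score_url('https://pypi.org/packages/foo-1.0.tar.gz')
    #     # -> (True, True, True, False, True, 'foo-1.0.tar.gz')
    #     loc.prefer_url('https://pypi.org/packages/foo-1.0.tar.gz',
    #                    'http://example.com/foo-1.0.zip')
    #     # -> 'https://pypi.org/packages/foo-1.0.tar.gz'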

    def split_filename(self, filename, project_name):
        """
        Attempt to split a filename into project name, version and Python version.
        """
        return split_filename(filename, project_name)

    def convert_url_to_download_info(self, url, project_name):
        """
        See if a URL is a candidate for a download URL for a project (the URL
        has typically been scraped from an HTML page).

        If it is, a dictionary is returned with keys "name", "version",
        "filename" and "url"; otherwise, None is returned.
        """
        def same_project(name1, name2):
            return normalize_name(name1) == normalize_name(name2)

        result = None
        scheme, netloc, path, params, query, frag = urlparse(url)
        if frag.lower().startswith('egg='):  # pragma: no cover
            logger.debug('%s: version hint in fragment: %r',
                         project_name, frag)
        m = HASHER_HASH.match(frag)
        if m:
            algo, digest = m.groups()
        else:
            algo, digest = None, None
        origpath = path
        if path and path[-1] == '/':  # pragma: no cover
            path = path[:-1]
        if path.endswith('.whl'):
            try:
                wheel = Wheel(path)
                if not is_compatible(wheel, self.wheel_tags):
                    logger.debug('Wheel not compatible: %s', path)
                else:
                    if project_name is None:
                        include = True
                    else:
                        include = same_project(wheel.name, project_name)
                    if include:
                        result = {
                            'name': wheel.name,
                            'version': wheel.version,
                            'filename': wheel.filename,
                            'url': urlunparse((scheme, netloc, origpath,
                                               params, query, '')),
                            'python-version': ', '.join(
                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
                        }
            except Exception as e:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
            logger.debug('Not downloadable: %s', path)
        else:  # downloadable extension
            path = filename = posixpath.basename(path)
            for ext in self.downloadable_extensions:
                if path.endswith(ext):
                    path = path[:-len(ext)]
                    t = self.split_filename(path, project_name)
                    if not t:  # pragma: no cover
                        logger.debug('No match for project/version: %s', path)
                    else:
                        name, version, pyver = t
                        if not project_name or same_project(project_name, name):
                            result = {
                                'name': name,
                                'version': version,
                                'filename': filename,
                                'url': urlunparse((scheme, netloc, origpath,
                                                   params, query, '')),
                                #'packagetype': 'sdist',
                            }
                            if pyver:  # pragma: no cover
                                result['python-version'] = pyver
                    break
        if result and algo:
            result['%s_digest' % algo] = digest
        return result
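
    # Illustrative sketch (not executed): a successful conversion. The
    # filename, host and digest below are made up for the example.
    #
    #     loc = Locator()
    #     loc.convert_url_to_download_info(
    #         'https://pypi.org/packages/foo-1.0.tar.gz#sha256=abc123', 'foo')
    #     # -> {'name': 'foo', 'version': '1.0',
    #     #     'filename': 'foo-1.0.tar.gz',
    #     #     'url': 'https://pypi.org/packages/foo-1.0.tar.gz',
    #     #     'sha256_digest': 'abc123'}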

    def _get_digest(self, info):
        """
        Get a digest from a dictionary by looking at a "digests" dictionary
        or keys of the form 'algo_digest'.

        Returns a 2-tuple (algo, digest) if found, else None. Currently
        looks only for SHA256, then MD5.
        """
        result = None
        if 'digests' in info:
            digests = info['digests']
            for algo in ('sha256', 'md5'):
                if algo in digests:
                    result = (algo, digests[algo])
                    break
        if not result:
            for algo in ('sha256', 'md5'):
                key = '%s_digest' % algo
                if key in info:
                    result = (algo, info[key])
                    break
        return result
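
    # Illustrative sketch (not executed): the two shapes of digest metadata
    # this method understands, with made-up digest values.
    #
    #     loc = Locator()
    #     loc._get_digest({'digests': {'sha256': 'dead...', 'md5': 'beef...'}})
    #     # -> ('sha256', 'dead...')   (sha256 is preferred over md5)
    #     loc._get_digest({'md5_digest': 'beef...'})
    #     # -> ('md5', 'beef...')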

    def _update_version_data(self, result, info):
        """
        Update a result dictionary (the final result from _get_project) with a
        dictionary for a specific version, which typically holds information
        gleaned from a filename or URL for an archive for the distribution.
        """
        name = info.pop('name')
        version = info.pop('version')
        if version in result:
            dist = result[version]
            md = dist.metadata
        else:
            dist = make_dist(name, version, scheme=self.scheme)
            md = dist.metadata
        dist.digest = digest = self._get_digest(info)
        url = info['url']
        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
        result['urls'].setdefault(version, set()).add(url)
        dist.locator = self
        result[version] = dist

    def locate(self, requirement, prereleases=False):
        """
        Find the most recent distribution which matches the given
        requirement.

        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
                            'foo (>= 1.0, < 2.0, != 1.3)'
        :param prereleases: If ``True``, allow pre-release versions
                            to be located. Otherwise, pre-release versions
                            are not returned.
        :return: A :class:`Distribution` instance, or ``None`` if no such
                 distribution could be located.
        """
        result = None
        r = parse_requirement(requirement)
        if r is None:  # pragma: no cover
            raise DistlibException('Not a valid requirement: %r' % requirement)
        scheme = get_scheme(self.scheme)
        self.matcher = matcher = scheme.matcher(r.requirement)
        logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
        versions = self.get_project(r.name)
        if len(versions) > 2:   # urls and digests keys are present
            # sometimes, versions are invalid
            slist = []
            vcls = matcher.version_class
            for k in versions:
                if k in ('urls', 'digests'):
                    continue
                try:
                    if not matcher.match(k):
                        pass  # logger.debug('%s did not match %r', matcher, k)
                    else:
                        if prereleases or not vcls(k).is_prerelease:
                            slist.append(k)
                        # else:
                            # logger.debug('skipping pre-release '
                                         # 'version %s of %s', k, matcher.name)
                except Exception:  # pragma: no cover
                    logger.warning('error matching %s with %r', matcher, k)
                    pass # slist.append(k)
            if len(slist) > 1:
                slist = sorted(slist, key=scheme.key)
            if slist:
                logger.debug('sorted list: %s', slist)
                version = slist[-1]
                result = versions[version]
        if result:
            if r.extras:
                result.extras = r.extras
            result.download_urls = versions.get('urls', {}).get(version, set())
            d = {}
            sd = versions.get('digests', {})
            for url in result.download_urls:
                if url in sd:  # pragma: no cover
                    d[url] = sd[url]
            result.digests = d
        self.matcher = None
        return result
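
    # Illustrative sketch (not executed): locating the newest matching
    # version. The base class leaves _get_project() abstract, so a concrete
    # locator is needed; ``default_locator`` is defined later in this module.
    #
    #     from distlib.locators import default_locator
    #     dist = default_locator.locate('requests (>= 2.0)')
    #     if dist is not None:
    #         print(dist.name_and_version, dist.source_url)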


class PyPIRPCLocator(Locator):
    """
    This locator uses XML-RPC to locate distributions. It therefore
    cannot be used with simple mirrors (that only mirror file content).
    """
    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The URL to use for XML-RPC.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIRPCLocator, self).__init__(**kwargs)
        self.base_url = url
        self.client = ServerProxy(url, timeout=3.0)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        return set(self.client.list_packages())

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        versions = self.client.package_releases(name, True)
        for v in versions:
            urls = self.client.release_urls(name, v)
            data = self.client.release_data(name, v)
            metadata = Metadata(scheme=self.scheme)
            metadata.name = data['name']
            metadata.version = data['version']
            metadata.license = data.get('license')
            metadata.keywords = data.get('keywords', [])
            metadata.summary = data.get('summary')
            dist = Distribution(metadata)
            if urls:
                info = urls[0]
                metadata.source_url = info['url']
                dist.digest = self._get_digest(info)
                dist.locator = self
                result[v] = dist
                for info in urls:
                    url = info['url']
                    digest = self._get_digest(info)
                    result['urls'].setdefault(v, set()).add(url)
                    result['digests'][url] = digest
        return result

class PyPIJSONLocator(Locator):
    """
    This locator uses PyPI's JSON interface. It's very limited in functionality
    and probably not worth using.
    """
    def __init__(self, url, **kwargs):
        super(PyPIJSONLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        url = urljoin(self.base_url, '%s/json' % quote(name))
        try:
            resp = self.opener.open(url)
            data = resp.read().decode() # for now
            d = json.loads(data)
            md = Metadata(scheme=self.scheme)
            data = d['info']
            md.name = data['name']
            md.version = data['version']
            md.license = data.get('license')
            md.keywords = data.get('keywords', [])
            md.summary = data.get('summary')
            dist = Distribution(md)
            dist.locator = self
            urls = d['urls']
            result[md.version] = dist
            for info in urls:
                url = info['url']
                dist.download_urls.add(url)
                dist.digests[url] = self._get_digest(info)
                result['urls'].setdefault(md.version, set()).add(url)
                result['digests'][url] = self._get_digest(info)
            # Now get other releases
            for version, infos in d['releases'].items():
                if version == md.version:
                    continue    # already done
                omd = Metadata(scheme=self.scheme)
                omd.name = md.name
                omd.version = version
                odist = Distribution(omd)
                odist.locator = self
                result[version] = odist
                for info in infos:
                    url = info['url']
                    odist.download_urls.add(url)
                    odist.digests[url] = self._get_digest(info)
                    result['urls'].setdefault(version, set()).add(url)
                    result['digests'][url] = self._get_digest(info)
        except Exception as e:
            self.errors.put(text_type(e))
            logger.exception('JSON fetch failed: %s', e)
        return result
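
# Illustrative usage (not executed on import): querying PyPI's JSON API.
# The base URL is normalised with ensure_slash(), and per-project URLs are
# built as <base>/<name>/json.
#
#     from distlib.locators import PyPIJSONLocator
#     locator = PyPIJSONLocator('https://pypi.org/pypi/')
#     versions = locator.get_project('requests')
#     # keys are version strings, plus the special 'urls'/'digests' entries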


class Page(object):
    """
    This class represents a scraped HTML page.
    """
    # The following slightly hairy-looking regex just looks for the contents of
    # an anchor link, which has an attribute "href" either immediately preceded
    # or immediately followed by a "rel" attribute. The attribute values can be
    # declared with double quotes, single quotes or no quotes - which leads to
    # the length of the expression.
    _href = re.compile("""
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
""", re.I | re.S | re.X)
    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)

    def __init__(self, data, url):
        """
        Initialise an instance with the Unicode page contents and the URL they
        came from.
        """
        self.data = data
        self.base_url = self.url = url
        m = self._base.search(self.data)
        if m:
            self.base_url = m.group(1)

    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    @cached_property
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.
        """
        def clean(url):
            "Tidy up a URL."
            scheme, netloc, path, params, query, frag = urlparse(url)
            return urlunparse((scheme, netloc, quote(path),
                               params, query, frag))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result
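
    # Illustrative sketch (not executed): scraping links out of an HTML
    # fragment; the markup and host are made up for the example.
    #
    #     page = Page('<a rel="download" href="/pkg/foo-1.0.tar.gz">foo</a>',
    #                 'https://example.com/simple/foo/')
    #     page.links
    #     # -> [('https://example.com/pkg/foo-1.0.tar.gz', 'download')]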


class SimpleScrapingLocator(Locator):
    """
    A locator which scrapes HTML pages to locate downloads for a distribution.
    This runs multiple threads to do the I/O; performance is at least as good
    as pip's PackageFinder, which works in an analogous fashion.
    """

    # These are used to deal with various Content-Encoding schemes.
    decoders = {
        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
        'none': lambda b: b,
    }

    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
        """
        Initialise an instance.
        :param url: The root URL to use for scraping.
        :param timeout: The timeout, in seconds, to be applied to requests.
                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads to use for I/O.
                            This defaults to 10.
        :param kwargs: Passed to the superclass.
        """
        super(SimpleScrapingLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)
        self.timeout = timeout
        self._page_cache = {}
        self._seen = set()
        self._to_fetch = queue.Queue()
        self._bad_hosts = set()
        self.skip_externals = False
        self.num_workers = num_workers
        self._lock = threading.RLock()
        # See issue #45: we need to be resilient when the locator is used
        # in a thread, e.g. with concurrent.futures. We can't use self._lock
        # as it is for coordinating our internal threads - the ones created
        # in _prepare_threads.
        self._gplock = threading.RLock()
        self.platform_check = False  # See issue #112

    def _prepare_threads(self):
        """
        Threads are created only when get_project is called, and terminate
        before it returns. They are there primarily to parallelise I/O (i.e.
        fetching web pages).
        """
        self._threads = []
        for i in range(self.num_workers):
            t = threading.Thread(target=self._fetch)
            t.daemon = True
            t.start()
            self._threads.append(t)

    def _wait_threads(self):
        """
        Tell all the threads to terminate (by sending a sentinel value) and
        wait for them to do so.
        """
        # Note that you need two loops, since you can't say which
        # thread will get each sentinel
        for t in self._threads:
            self._to_fetch.put(None)    # sentinel
        for t in self._threads:
            t.join()
        self._threads = []

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        with self._gplock:
            self.result = result
            self.project_name = name
            url = urljoin(self.base_url, '%s/' % quote(name))
            self._seen.clear()
            self._page_cache.clear()
            self._prepare_threads()
            try:
                logger.debug('Queueing %s', url)
                self._to_fetch.put(url)
                self._to_fetch.join()
            finally:
                self._wait_threads()
            del self.result
        return result

    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)

    def _is_platform_dependent(self, url):
        """
        Does a URL refer to a platform-specific download?
        """
        return self.platform_dependent.search(url)

    def _process_download(self, url):
        """
        See if a URL is a suitable download for a project.

        If it is, register information in the result dictionary (for
        _get_project) about the specific version it's for.

        Note that the return value isn't actually used other than as a boolean
        value.
        """
        if self.platform_check and self._is_platform_dependent(url):
            info = None
        else:
            info = self.convert_url_to_download_info(url, self.project_name)
        logger.debug('process_download: %s -> %s', url, info)
        if info:
            with self._lock:    # needed because self.result is shared
                self._update_version_data(self.result, info)
        return info

    def _should_queue(self, link, referrer, rel):
        """
        Determine whether a link URL from a referring page and with a
        particular "rel" attribute should be queued for scraping.
        """
        scheme, netloc, path, _, _, _ = urlparse(link)
        if path.endswith(self.source_extensions + self.binary_extensions +
                         self.excluded_extensions):
            result = False
        elif self.skip_externals and not link.startswith(self.base_url):
            result = False
        elif not referrer.startswith(self.base_url):
            result = False
        elif rel not in ('homepage', 'download'):
            result = False
        elif scheme not in ('http', 'https', 'ftp'):
            result = False
        elif self._is_platform_dependent(link):
            result = False
        else:
            host = netloc.split(':', 1)[0]
            if host.lower() == 'localhost':
                result = False
            else:
                result = True
        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
                     referrer, result)
        return result

    def _fetch(self):
        """
        Get a URL to fetch from the work queue, get the HTML page, examine its
        links for download candidates and candidates for further scraping.

        This is a handy method to run in a thread.
        """
        while True:
            url = self._to_fetch.get()
            try:
                if url:
                    page = self.get_page(url)
                    if page is None:    # e.g. after an error
                        continue
                    for link, rel in page.links:
                        if link not in self._seen:
                            try:
                                self._seen.add(link)
                                if (not self._process_download(link) and
                                    self._should_queue(link, url, rel)):
                                    logger.debug('Queueing %s from %s', link, url)
                                    self._to_fetch.put(link)
                            except MetadataInvalidError:  # e.g. invalid versions
                                pass
            except Exception as e:  # pragma: no cover
                self.errors.put(text_type(e))
            finally:
                # always do this, to avoid hangs :-)
                self._to_fetch.task_done()
            if not url:
                #logger.debug('Sentinel seen, quitting.')
                break

    def get_page(self, url):
        """
        Get the HTML for a URL, possibly from an in-memory cache.

        XXX TODO Note: this cache is never actually cleared. It's assumed that
        the data won't get stale over the lifetime of a locator instance (not
        necessarily true for the default_locator).
        """
        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
        scheme, netloc, path, _, _, _ = urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            url = urljoin(ensure_slash(url), 'index.html')

        if url in self._page_cache:
            result = self._page_cache[url]
            logger.debug('Returning %s from cache: %s', url, result)
        else:
            host = netloc.split(':', 1)[0]
            result = None
            if host in self._bad_hosts:
                logger.debug('Skipping %s due to bad host %s', url, host)
            else:
                req = Request(url, headers={'Accept-encoding': 'identity'})
                try:
                    logger.debug('Fetching %s', url)
                    resp = self.opener.open(req, timeout=self.timeout)
                    logger.debug('Fetched %s', url)
                    headers = resp.info()
                    content_type = headers.get('Content-Type', '')
                    if HTML_CONTENT_TYPE.match(content_type):
                        final_url = resp.geturl()
                        data = resp.read()
                        encoding = headers.get('Content-Encoding')
                        if encoding:
                            decoder = self.decoders[encoding]   # fail if not found
                            data = decoder(data)
                        encoding = 'utf-8'
                        m = CHARSET.search(content_type)
                        if m:
                            encoding = m.group(1)
                        try:
                            data = data.decode(encoding)
                        except UnicodeError:  # pragma: no cover
                            data = data.decode('latin-1')    # fallback
                        result = Page(data, final_url)
                        self._page_cache[final_url] = result
                except HTTPError as e:
                    if e.code != 404:
                        logger.exception('Fetch failed: %s: %s', url, e)
                except URLError as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                    with self._lock:
                        self._bad_hosts.add(host)
                except Exception as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                finally:
                    self._page_cache[url] = result   # even if None (failure)
        return result

    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        page = self.get_page(self.base_url)
        if not page:
            raise DistlibException('Unable to get %s' % self.base_url)
        for match in self._distname_re.finditer(page.data):
            result.add(match.group(1))
        return result

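# Illustrative usage (not executed on import): scraping a simple index.
#
#     from distlib.locators import SimpleScrapingLocator
#     locator = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0)
#     versions = locator.get_project('requests')
#     # maps version strings to Distribution instances, plus the special
#     # 'urls' and 'digests' entries
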
class DirectoryLocator(Locator):
    """
    This class locates distributions in a directory tree.
    """

    def __init__(self, path, **kwargs):
        """
        Initialise an instance.
        :param path: The root of the directory tree to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * recursive - if True (the default), subdirectories are
                         recursed into. If False, only the top-level directory
                         is searched.
        """
        self.recursive = kwargs.pop('recursive', True)
        super(DirectoryLocator, self).__init__(**kwargs)
        path = os.path.abspath(path)
        if not os.path.isdir(path):  # pragma: no cover
            raise DistlibException('Not a directory: %r' % path)
        self.base_dir = path

    def should_include(self, filename, parent):
        """
        Should a filename be considered as a candidate for a distribution
        archive? As well as the filename, the directory which contains it
        is provided, though not used by the current implementation.
        """
        return filename.endswith(self.downloadable_extensions)

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, name)
                    if info:
                        self._update_version_data(result, info)
            if not self.recursive:
                break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, None)
                    if info:
                        result.add(info['name'])
            if not self.recursive:
                break
        return result
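
# Illustrative usage (not executed on import): indexing a local directory of
# archives; the path here is made up for the example.
#
#     from distlib.locators import DirectoryLocator
#     locator = DirectoryLocator('/srv/archives', recursive=False)
#     print(locator.get_distribution_names())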

class JSONLocator(Locator):
    """
    This locator uses special extended metadata (not available on PyPI) and is
    the basis of performant dependency resolution in distlib. Other locators
    require archive downloads before dependencies can be determined! As you
    might imagine, that can be slow.
    """
    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        data = get_project_data(name)
        if data:
            for info in data.get('files', []):
                if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
                    continue
                # We don't store summary in project metadata as it makes
                # the data bigger for no benefit during dependency
                # resolution
                dist = make_dist(data['name'], info['version'],
                                 summary=data.get('summary',
                                                  'Placeholder for summary'),
                                 scheme=self.scheme)
                md = dist.metadata
                md.source_url = info['url']
                # TODO SHA256 digest
                if 'digest' in info and info['digest']:
                    dist.digest = ('md5', info['digest'])
                md.dependencies = info.get('requirements', {})
                dist.exports = info.get('exports', {})
                result[dist.version] = dist
                result['urls'].setdefault(dist.version, set()).add(info['url'])
        return result

class DistPathLocator(Locator):
    """
    This locator finds installed distributions in a path. It can be useful for
    adding to an :class:`AggregatingLocator`.
    """
    def __init__(self, distpath, **kwargs):
        """
        Initialise an instance.

        :param distpath: A :class:`DistributionPath` instance to search.
        """
        super(DistPathLocator, self).__init__(**kwargs)
        assert isinstance(distpath, DistributionPath)
        self.distpath = distpath

    def _get_project(self, name):
        dist = self.distpath.get_distribution(name)
        if dist is None:
            result = {'urls': {}, 'digests': {}}
        else:
            result = {
                dist.version: dist,
                'urls': {dist.version: set([dist.source_url])},
                'digests': {dist.version: set([None])}
            }
        return result


class AggregatingLocator(Locator):
    """
    This class allows you to chain and/or merge a list of locators.
    """
    def __init__(self, *locators, **kwargs):
        """
        Initialise an instance.

        :param locators: The list of locators to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * merge - if False (the default), the first successful
                         search from any of the locators is returned. If True,
                         the results from all locators are merged (this can be
                         slow).
        """
        self.merge = kwargs.pop('merge', False)
        self.locators = locators
        super(AggregatingLocator, self).__init__(**kwargs)

    def clear_cache(self):
        super(AggregatingLocator, self).clear_cache()
        for locator in self.locators:
            locator.clear_cache()

    def _set_scheme(self, value):
        self._scheme = value
        for locator in self.locators:
            locator.scheme = value

    scheme = property(Locator.scheme.fget, _set_scheme)

    def _get_project(self, name):
        result = {}
        for locator in self.locators:
            d = locator.get_project(name)
            if d:
                if self.merge:
                    files = result.get('urls', {})
                    digests = result.get('digests', {})
                    # next line could overwrite result['urls'], result['digests']
                    result.update(d)
                    df = result.get('urls')
                    if files and df:
                        for k, v in files.items():
                            if k in df:
                                df[k] |= v
                            else:
                                df[k] = v
                    dd = result.get('digests')
                    if digests and dd:
                        dd.update(digests)
                else:
                    # See issue #18. If any dists are found and we're looking
                    # for specific constraints, we only return something if
                    # a match is found. For example, if a DirectoryLocator
                    # returns just foo (1.0) while we're looking for
                    # foo (>= 2.0), we'll pretend there was nothing there so
                    # that subsequent locators can be queried. Otherwise we
                    # would just return foo (1.0) which would then lead to a
                    # failure to find foo (>= 2.0), because other locators
                    # weren't searched. Note that this only matters when
                    # merge=False.
                    if self.matcher is None:
                        found = True
                    else:
                        found = False
                        for k in d:
                            if self.matcher.match(k):
                                found = True
                                break
                    if found:
                        result = d
                        break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for locator in self.locators:
            try:
                result |= locator.get_distribution_names()
            except NotImplementedError:
                pass
        return result
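
# Illustrative sketch (not executed): chaining locators. With merge=False
# (the default) the first locator yielding a usable match wins; with
# merge=True the results of all locators are combined. The local path is
# made up for the example.
#
#     from distlib.locators import (AggregatingLocator, DirectoryLocator,
#                                   SimpleScrapingLocator)
#     locator = AggregatingLocator(
#         DirectoryLocator('/srv/archives'),
#         SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
#         scheme='legacy')
#     dist = locator.locate('flask')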


# We use a legacy scheme simply because most of the dists on PyPI use legacy
# versions which don't conform to PEP 426 / PEP 440.
default_locator = AggregatingLocator(
                    JSONLocator(),
                    SimpleScrapingLocator('https://pypi.org/simple/',
                                          timeout=3.0),
                    scheme='legacy')

locate = default_locator.locate


class DependencyFinder(object):
    """
    Locate dependencies for distributions.
    """

    def __init__(self, locator=None):
        """
        Initialise an instance, using the specified locator
        to locate distributions.
        """
        self.locator = locator or default_locator
        self.scheme = get_scheme(self.locator.scheme)

    def add_distribution(self, dist):
        """
        Add a distribution to the finder. This will update internal information
        about who provides what.
        :param dist: The distribution to add.
        """
        logger.debug('adding distribution %s', dist)
        name = dist.key
        self.dists_by_name[name] = dist
        self.dists[(name, dist.version)] = dist
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
            self.provided.setdefault(name, set()).add((version, dist))

    def remove_distribution(self, dist):
        """
        Remove a distribution from the finder. This will update internal
        information about who provides what.
        :param dist: The distribution to remove.
        """
        logger.debug('removing distribution %s', dist)
        name = dist.key
        del self.dists_by_name[name]
        del self.dists[(name, dist.version)]
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
            s = self.provided[name]
            s.remove((version, dist))
            if not s:
                del self.provided[name]

    def get_matcher(self, reqt):
        """
        Get a version matcher for a requirement.
        :param reqt: The requirement
        :type reqt: str
        :return: A version matcher (an instance of
                 :class:`distlib.version.Matcher`).
        """
        try:
            matcher = self.scheme.matcher(reqt)
        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat-mode if cannot read the version
            name = reqt.split()[0]
            matcher = self.scheme.matcher(name)
        return matcher

    def find_providers(self, reqt):
        """
        Find the distributions which can fulfill a requirement.

        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distributions which can fulfill the requirement.
        """
        matcher = self.get_matcher(reqt)
        name = matcher.key   # case-insensitive
        result = set()
        provided = self.provided
        if name in provided:
            for version, provider in provided[name]:
                try:
                    match = matcher.match(version)
                except UnsupportedVersionError:
                    match = False

                if match:
                    result.add(provider)
                    break
        return result

    def try_to_replace(self, provider, other, problems):
        """
        Attempt to replace one provider with another. This is typically used
        when resolving dependencies from multiple sources, e.g. A requires
        (B >= 1.0) while C requires (B >= 1.1).

        For successful replacement, ``provider`` must meet all the requirements
        which ``other`` fulfills.

        :param provider: The provider we are trying to replace with.
        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain what
                         problems prevented replacement. This is currently
                         a tuple of the literal string 'cantreplace',
                         ``provider``, ``other`` and the set of requirements
                         that ``provider`` couldn't fulfill.
        :return: True if we can replace ``other`` with ``provider``, else
                 False.
        """
        rlist = self.reqts[other]
        unmatched = set()
        for s in rlist:
            matcher = self.get_matcher(s)
            if not matcher.match(provider.version):
                unmatched.add(s)
        if unmatched:
            # can't replace other with provider
            problems.add(('cantreplace', provider, other,
                          frozenset(unmatched)))
            result = False
        else:
            # can replace other with provider
            self.remove_distribution(other)
            del self.reqts[other]
            for s in rlist:
                self.reqts.setdefault(provider, set()).add(s)
            self.add_distribution(provider)
            result = True
        return result

    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distributions returned should be such that they have the
        :attr:`requested` attribute set to ``True`` if they were
        from the ``requirement`` passed to ``find()``, and they have the
        :attr:`build_time_dependency` attribute set to ``True`` unless they
        are post-installation dependencies of the ``requirement``.

        Each problem is a tuple consisting of the string ``'unsatisfied'``
        and the requirement which couldn't be satisfied by any distribution
        known to the locator.
        """

        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key     # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            if meta_extras and dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems

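
# Illustrative sketch (not executed): resolving a distribution and its
# dependency graph with the default locator.
#
#     from distlib.locators import DependencyFinder
#     finder = DependencyFinder()   # uses default_locator
#     dists, problems = finder.find('requests (>= 2.0)')
#     for d in sorted(dists, key=lambda d: d.key):
#         print(d.name_and_version)
#     if problems:
#         print('unresolved:', problems)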