1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2012-2015 Vinay Sajip.
4# Licensed to the Python Software Foundation under a contributor agreement.
5# See LICENSE.txt and CONTRIBUTORS.txt.
6#
7
8import gzip
9from io import BytesIO
10import json
11import logging
12import os
13import posixpath
14import re
15try:
16    import threading
17except ImportError:  # pragma: no cover
18    import dummy_threading as threading
19import zlib
20
21from . import DistlibException
22from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
23                     queue, quote, unescape, string_types, build_opener,
24                     HTTPRedirectHandler as BaseRedirectHandler, text_type,
25                     Request, HTTPError, URLError)
26from .database import Distribution, DistributionPath, make_dist
27from .metadata import Metadata, MetadataInvalidError
28from .util import (cached_property, parse_credentials, ensure_slash,
29                   split_filename, get_project_data, parse_requirement,
30                   parse_name_and_version, ServerProxy, normalize_name)
31from .version import get_scheme, UnsupportedVersionError
32from .wheel import Wheel, is_compatible
33
34logger = logging.getLogger(__name__)
35
36HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
37CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
38HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
39DEFAULT_INDEX = 'https://pypi.org/pypi'
40
41def get_all_distribution_names(url=None):
42    """
43    Return all distribution names known by an index.
44    :param url: The URL of the index.
45    :return: A list of all known distribution names.
46    """
47    if url is None:
48        url = DEFAULT_INDEX
49    client = ServerProxy(url, timeout=3.0)
50    try:
51        return client.list_packages()
52    finally:
53        client('close')()
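
# Illustrative usage (a hedged sketch: it needs network access to the
# index's XML-RPC interface, which PyPI has deprecated and may throttle):
#
#   names = get_all_distribution_names()
#   print(len(names))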
54
55class RedirectHandler(BaseRedirectHandler):
56    """
57    A class to work around a bug in some Python 3.2.x releases.
58    """
59    # There's a bug in the base version for some 3.2.x
60    # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
61    # returns e.g. /abc, it bails because it says the scheme ''
62    # is bogus, when actually it should use the request's
63    # URL for the scheme. See Python issue #13696.
64    def http_error_302(self, req, fp, code, msg, headers):
65        # Some servers (incorrectly) return multiple Location headers
66        # (so probably same goes for URI).  Use first header.
67        newurl = None
68        for key in ('location', 'uri'):
69            if key in headers:
70                newurl = headers[key]
71                break
72        if newurl is None:  # pragma: no cover
73            return
74        urlparts = urlparse(newurl)
75        if urlparts.scheme == '':
76            newurl = urljoin(req.get_full_url(), newurl)
77            if hasattr(headers, 'replace_header'):
78                headers.replace_header(key, newurl)
79            else:
80                headers[key] = newurl
81        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
82                                                  headers)
83
84    http_error_301 = http_error_303 = http_error_307 = http_error_302
85
86class Locator(object):
87    """
88    A base class for locators - things that locate distributions.
89    """
90    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
91    binary_extensions = ('.egg', '.exe', '.whl')
92    excluded_extensions = ('.pdf',)
93
94    # A list of tags indicating which wheels you want to match. The default
95    # value of None matches against the tags compatible with the running
96    # Python. If you want to match other values, set wheel_tags on a locator
97    # instance to a list of tuples (pyver, abi, arch) which you want to match.
98    wheel_tags = None
99
100    downloadable_extensions = source_extensions + ('.whl',)
101
102    def __init__(self, scheme='default'):
103        """
104        Initialise an instance.
105        :param scheme: Because locators look for most recent versions, they
106                       need to know the version scheme to use. This specifies
107                       the current PEP-recommended scheme - use ``'legacy'``
108                       if you need to support existing distributions on PyPI.
109        """
110        self._cache = {}
111        self.scheme = scheme
112        # Because of bugs in some of the handlers on some of the platforms,
113        # we use our own opener rather than just using urlopen.
114        self.opener = build_opener(RedirectHandler())
115        # If get_project() is called from locate(), the matcher instance
116        # is set from the requirement passed to locate(). See issue #18 for
117        # why this can be useful to know.
118        self.matcher = None
119        self.errors = queue.Queue()
120
121    def get_errors(self):
122        """
123        Return any errors which have occurred.
124        """
125        result = []
126        while not self.errors.empty():  # pragma: no cover
127            try:
128                e = self.errors.get(False)
129                result.append(e)
            except queue.Empty:
131                continue
132            self.errors.task_done()
133        return result
134
135    def clear_errors(self):
136        """
137        Clear any errors which may have been logged.
138        """
139        # Just get the errors and throw them away
140        self.get_errors()
141
142    def clear_cache(self):
143        self._cache.clear()
144
145    def _get_scheme(self):
146        return self._scheme
147
148    def _set_scheme(self, value):
149        self._scheme = value
150
151    scheme = property(_get_scheme, _set_scheme)
152
153    def _get_project(self, name):
154        """
155        For a given project, get a dictionary mapping available versions to Distribution
156        instances.
157
158        This should be implemented in subclasses.
159
160        If called from a locate() request, self.matcher will be set to a
161        matcher for the requirement to satisfy, otherwise it will be None.
162        """
163        raise NotImplementedError('Please implement in the subclass')
164
165    def get_distribution_names(self):
166        """
167        Return all the distribution names known to this locator.
168        """
169        raise NotImplementedError('Please implement in the subclass')
170
171    def get_project(self, name):
172        """
173        For a given project, get a dictionary mapping available versions to Distribution
174        instances.
175
176        This calls _get_project to do all the work, and just implements a caching layer on top.
177        """
178        if self._cache is None:  # pragma: no cover
179            result = self._get_project(name)
180        elif name in self._cache:
181            result = self._cache[name]
182        else:
183            self.clear_errors()
184            result = self._get_project(name)
185            self._cache[name] = result
186        return result
187
188    def score_url(self, url):
189        """
        Give a URL a score which can be used to choose preferred URLs
191        for a given project release.
192        """
193        t = urlparse(url)
194        basename = posixpath.basename(t.path)
195        compatible = True
196        is_wheel = basename.endswith('.whl')
197        is_downloadable = basename.endswith(self.downloadable_extensions)
198        if is_wheel:
199            compatible = is_compatible(Wheel(basename), self.wheel_tags)
200        return (t.scheme == 'https', 'pypi.org' in t.netloc,
201                is_downloadable, is_wheel, compatible, basename)
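
    # Since score tuples compare lexicographically, an https wheel hosted on
    # pypi.org outranks an http sdist hosted elsewhere (illustrative URLs,
    # assuming the wheel is compatible with the running Python):
    #
    #   score_url('https://pypi.org/.../foo-1.0-py3-none-any.whl') >
    #   score_url('http://example.com/foo-1.0.tar.gz')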
202
203    def prefer_url(self, url1, url2):
204        """
205        Choose one of two URLs where both are candidates for distribution
206        archives for the same version of a distribution (for example,
207        .tar.gz vs. zip).
208
209        The current implementation favours https:// URLs over http://, archives
210        from PyPI over those from other locations, wheel compatibility (if a
211        wheel) and then the archive name.
212        """
213        result = url2
214        if url1:
215            s1 = self.score_url(url1)
216            s2 = self.score_url(url2)
217            if s1 > s2:
218                result = url1
219            if result != url2:
220                logger.debug('Not replacing %r with %r', url1, url2)
221            else:
222                logger.debug('Replacing %r with %r', url1, url2)
223        return result
224
225    def split_filename(self, filename, project_name):
226        """
        Attempt to split a filename into project name, version and Python version.
228        """
229        return split_filename(filename, project_name)
230
231    def convert_url_to_download_info(self, url, project_name):
232        """
233        See if a URL is a candidate for a download URL for a project (the URL
234        has typically been scraped from an HTML page).
235
236        If it is, a dictionary is returned with keys "name", "version",
237        "filename" and "url"; otherwise, None is returned.
238        """
239        def same_project(name1, name2):
240            return normalize_name(name1) == normalize_name(name2)
241
242        result = None
243        scheme, netloc, path, params, query, frag = urlparse(url)
244        if frag.lower().startswith('egg='):  # pragma: no cover
245            logger.debug('%s: version hint in fragment: %r',
246                         project_name, frag)
247        m = HASHER_HASH.match(frag)
248        if m:
249            algo, digest = m.groups()
250        else:
251            algo, digest = None, None
252        origpath = path
253        if path and path[-1] == '/':  # pragma: no cover
254            path = path[:-1]
255        if path.endswith('.whl'):
256            try:
257                wheel = Wheel(path)
258                if not is_compatible(wheel, self.wheel_tags):
259                    logger.debug('Wheel not compatible: %s', path)
260                else:
261                    if project_name is None:
262                        include = True
263                    else:
264                        include = same_project(wheel.name, project_name)
265                    if include:
266                        result = {
267                            'name': wheel.name,
268                            'version': wheel.version,
269                            'filename': wheel.filename,
270                            'url': urlunparse((scheme, netloc, origpath,
271                                               params, query, '')),
272                            'python-version': ', '.join(
273                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
274                        }
            except Exception:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
277        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
278            logger.debug('Not downloadable: %s', path)
279        else:  # downloadable extension
280            path = filename = posixpath.basename(path)
281            for ext in self.downloadable_extensions:
282                if path.endswith(ext):
283                    path = path[:-len(ext)]
284                    t = self.split_filename(path, project_name)
285                    if not t:  # pragma: no cover
286                        logger.debug('No match for project/version: %s', path)
287                    else:
288                        name, version, pyver = t
289                        if not project_name or same_project(project_name, name):
290                            result = {
291                                'name': name,
292                                'version': version,
293                                'filename': filename,
294                                'url': urlunparse((scheme, netloc, origpath,
295                                                   params, query, '')),
297                            }
298                            if pyver:  # pragma: no cover
299                                result['python-version'] = pyver
300                    break
301        if result and algo:
302            result['%s_digest' % algo] = digest
303        return result
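
    # For example, a compatible wheel URL such as
    # 'https://example.com/foo-1.0-py2.py3-none-any.whl#sha256=abcd' would
    # yield (assuming the wheel's tags match the running Python) roughly:
    #
    #   {'name': 'foo', 'version': '1.0',
    #    'filename': 'foo-1.0-py2.py3-none-any.whl',
    #    'url': 'https://example.com/foo-1.0-py2.py3-none-any.whl',
    #    'python-version': '2, 3', 'sha256_digest': 'abcd'}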
304
305    def _get_digest(self, info):
306        """
307        Get a digest from a dictionary by looking at a "digests" dictionary
308        or keys of the form 'algo_digest'.
309
310        Returns a 2-tuple (algo, digest) if found, else None. Currently
311        looks only for SHA256, then MD5.
312        """
313        result = None
314        if 'digests' in info:
315            digests = info['digests']
316            for algo in ('sha256', 'md5'):
317                if algo in digests:
318                    result = (algo, digests[algo])
319                    break
320        if not result:
321            for algo in ('sha256', 'md5'):
322                key = '%s_digest' % algo
323                if key in info:
324                    result = (algo, info[key])
325                    break
326        return result
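
    # e.g. _get_digest({'digests': {'sha256': 'ff...', 'md5': '00...'}})
    # would return ('sha256', 'ff...'), since SHA256 is preferred over MD5.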
327
328    def _update_version_data(self, result, info):
329        """
330        Update a result dictionary (the final result from _get_project) with a
331        dictionary for a specific version, which typically holds information
332        gleaned from a filename or URL for an archive for the distribution.
333        """
334        name = info.pop('name')
335        version = info.pop('version')
        if version in result:
            dist = result[version]
        else:
            dist = make_dist(name, version, scheme=self.scheme)
        md = dist.metadata
342        dist.digest = digest = self._get_digest(info)
343        url = info['url']
344        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
        result['urls'].setdefault(version, set()).add(url)
348        dist.locator = self
349        result[version] = dist
350
351    def locate(self, requirement, prereleases=False):
352        """
353        Find the most recent distribution which matches the given
354        requirement.
355
356        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
357                            'foo (>= 1.0, < 2.0, != 1.3)'
358        :param prereleases: If ``True``, allow pre-release versions
359                            to be located. Otherwise, pre-release versions
360                            are not returned.
361        :return: A :class:`Distribution` instance, or ``None`` if no such
362                 distribution could be located.
363        """
364        result = None
365        r = parse_requirement(requirement)
366        if r is None:  # pragma: no cover
367            raise DistlibException('Not a valid requirement: %r' % requirement)
368        scheme = get_scheme(self.scheme)
369        self.matcher = matcher = scheme.matcher(r.requirement)
370        logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
371        versions = self.get_project(r.name)
372        if len(versions) > 2:   # urls and digests keys are present
373            # sometimes, versions are invalid
374            slist = []
375            vcls = matcher.version_class
376            for k in versions:
377                if k in ('urls', 'digests'):
378                    continue
379                try:
380                    if not matcher.match(k):
381                        logger.debug('%s did not match %r', matcher, k)
382                    else:
383                        if prereleases or not vcls(k).is_prerelease:
384                            slist.append(k)
385                        else:
386                            logger.debug('skipping pre-release '
387                                         'version %s of %s', k, matcher.name)
388                except Exception:  # pragma: no cover
389                    logger.warning('error matching %s with %r', matcher, k)
391            if len(slist) > 1:
392                slist = sorted(slist, key=scheme.key)
393            if slist:
394                logger.debug('sorted list: %s', slist)
395                version = slist[-1]
396                result = versions[version]
397        if result:
398            if r.extras:
399                result.extras = r.extras
400            result.download_urls = versions.get('urls', {}).get(version, set())
401            d = {}
402            sd = versions.get('digests', {})
403            for url in result.download_urls:
404                if url in sd:  # pragma: no cover
405                    d[url] = sd[url]
406            result.digests = d
407        self.matcher = None
408        return result
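
# A minimal usage sketch for locate() on any concrete subclass
# (DirectoryLocator is assumed here purely for illustration):
#
#   locator = DirectoryLocator('/path/to/archives')
#   dist = locator.locate('foo (>= 1.0, < 2.0)')
#   if dist is not None:
#       print(dist.name_and_version, dist.source_url)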
409
410
411class PyPIRPCLocator(Locator):
412    """
413    This locator uses XML-RPC to locate distributions. It therefore
414    cannot be used with simple mirrors (that only mirror file content).
415    """
416    def __init__(self, url, **kwargs):
417        """
418        Initialise an instance.
419
420        :param url: The URL to use for XML-RPC.
421        :param kwargs: Passed to the superclass constructor.
422        """
423        super(PyPIRPCLocator, self).__init__(**kwargs)
424        self.base_url = url
425        self.client = ServerProxy(url, timeout=3.0)
426
427    def get_distribution_names(self):
428        """
429        Return all the distribution names known to this locator.
430        """
431        return set(self.client.list_packages())
432
433    def _get_project(self, name):
434        result = {'urls': {}, 'digests': {}}
435        versions = self.client.package_releases(name, True)
436        for v in versions:
437            urls = self.client.release_urls(name, v)
438            data = self.client.release_data(name, v)
439            metadata = Metadata(scheme=self.scheme)
440            metadata.name = data['name']
441            metadata.version = data['version']
442            metadata.license = data.get('license')
443            metadata.keywords = data.get('keywords', [])
444            metadata.summary = data.get('summary')
445            dist = Distribution(metadata)
446            if urls:
447                info = urls[0]
448                metadata.source_url = info['url']
449                dist.digest = self._get_digest(info)
450                dist.locator = self
451                result[v] = dist
452                for info in urls:
453                    url = info['url']
454                    digest = self._get_digest(info)
455                    result['urls'].setdefault(v, set()).add(url)
456                    result['digests'][url] = digest
457        return result
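
# Hedged sketch (PyPI's XML-RPC API is deprecated, so treat this as
# illustrative only):
#
#   locator = PyPIRPCLocator('https://pypi.org/pypi')
#   names = locator.get_distribution_names()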
458
459class PyPIJSONLocator(Locator):
460    """
461    This locator uses PyPI's JSON interface. It's very limited in functionality
462    and probably not worth using.
463    """
464    def __init__(self, url, **kwargs):
465        super(PyPIJSONLocator, self).__init__(**kwargs)
466        self.base_url = ensure_slash(url)
467
468    def get_distribution_names(self):
469        """
470        Return all the distribution names known to this locator.
471        """
472        raise NotImplementedError('Not available from this locator')
473
474    def _get_project(self, name):
475        result = {'urls': {}, 'digests': {}}
476        url = urljoin(self.base_url, '%s/json' % quote(name))
477        try:
478            resp = self.opener.open(url)
            data = resp.read().decode()  # assume UTF-8, the JSON default
480            d = json.loads(data)
481            md = Metadata(scheme=self.scheme)
482            data = d['info']
483            md.name = data['name']
484            md.version = data['version']
485            md.license = data.get('license')
486            md.keywords = data.get('keywords', [])
487            md.summary = data.get('summary')
488            dist = Distribution(md)
489            dist.locator = self
490            urls = d['urls']
491            result[md.version] = dist
            for info in urls:
493                url = info['url']
494                dist.download_urls.add(url)
495                dist.digests[url] = self._get_digest(info)
496                result['urls'].setdefault(md.version, set()).add(url)
497                result['digests'][url] = self._get_digest(info)
498            # Now get other releases
499            for version, infos in d['releases'].items():
500                if version == md.version:
501                    continue    # already done
502                omd = Metadata(scheme=self.scheme)
503                omd.name = md.name
504                omd.version = version
505                odist = Distribution(omd)
506                odist.locator = self
507                result[version] = odist
508                for info in infos:
509                    url = info['url']
510                    odist.download_urls.add(url)
511                    odist.digests[url] = self._get_digest(info)
512                    result['urls'].setdefault(version, set()).add(url)
513                    result['digests'][url] = self._get_digest(info)
522        except Exception as e:
523            self.errors.put(text_type(e))
524            logger.exception('JSON fetch failed: %s', e)
525        return result
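
# Hedged sketch: pointed at PyPI, this fetches
# https://pypi.org/pypi/<name>/json (network access assumed):
#
#   locator = PyPIJSONLocator('https://pypi.org/pypi/')
#   versions = locator.get_project('requests')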
526
527
528class Page(object):
529    """
530    This class represents a scraped HTML page.
531    """
532    # The following slightly hairy-looking regex just looks for the contents of
533    # an anchor link, which has an attribute "href" either immediately preceded
534    # or immediately followed by a "rel" attribute. The attribute values can be
535    # declared with double quotes, single quotes or no quotes - which leads to
536    # the length of the expression.
537    _href = re.compile("""
538(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
539href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
540(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
541""", re.I | re.S | re.X)
542    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)
543
544    def __init__(self, data, url):
545        """
546        Initialise an instance with the Unicode page contents and the URL they
547        came from.
548        """
549        self.data = data
550        self.base_url = self.url = url
551        m = self._base.search(self.data)
552        if m:
553            self.base_url = m.group(1)
554
555    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
556
557    @cached_property
558    def links(self):
559        """
560        Return the URLs of all the links on a page together with information
561        about their "rel" attribute, for determining which ones to treat as
562        downloads and which ones to queue for further scraping.
563        """
564        def clean(url):
565            "Tidy up an URL."
566            scheme, netloc, path, params, query, frag = urlparse(url)
567            return urlunparse((scheme, netloc, quote(path),
568                               params, query, frag))
569
570        result = set()
571        for match in self._href.finditer(self.data):
572            d = match.groupdict('')
573            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
574                   d['rel4'] or d['rel5'] or d['rel6'])
575            url = d['url1'] or d['url2'] or d['url3']
576            url = urljoin(self.base_url, url)
577            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%02x' % ord(m.group(0)), url)
579            result.add((url, rel))
580        # We sort the result, hoping to bring the most recent versions
581        # to the front
582        result = sorted(result, key=lambda t: t[0], reverse=True)
583        return result
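
# A small parsing sketch (the HTML snippet is made up):
#
#   html = '<a rel="download" href="/pkgs/foo-1.0.tar.gz">foo</a>'
#   page = Page(html, 'https://example.com/simple/foo/')
#   page.links  # -> [('https://example.com/pkgs/foo-1.0.tar.gz', 'download')]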
584
585
586class SimpleScrapingLocator(Locator):
587    """
588    A locator which scrapes HTML pages to locate downloads for a distribution.
589    This runs multiple threads to do the I/O; performance is at least as good
590    as pip's PackageFinder, which works in an analogous fashion.
591    """
592
593    # These are used to deal with various Content-Encoding schemes.
594    decoders = {
595        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
597        'none': lambda b: b,
598    }
599
600    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
601        """
602        Initialise an instance.
603        :param url: The root URL to use for scraping.
604        :param timeout: The timeout, in seconds, to be applied to requests.
605                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads to use for I/O.
                            This defaults to 10.
608        :param kwargs: Passed to the superclass.
609        """
610        super(SimpleScrapingLocator, self).__init__(**kwargs)
611        self.base_url = ensure_slash(url)
612        self.timeout = timeout
613        self._page_cache = {}
614        self._seen = set()
615        self._to_fetch = queue.Queue()
616        self._bad_hosts = set()
617        self.skip_externals = False
618        self.num_workers = num_workers
619        self._lock = threading.RLock()
620        # See issue #45: we need to be resilient when the locator is used
621        # in a thread, e.g. with concurrent.futures. We can't use self._lock
622        # as it is for coordinating our internal threads - the ones created
623        # in _prepare_threads.
624        self._gplock = threading.RLock()
625        self.platform_check = False  # See issue #112
626
627    def _prepare_threads(self):
628        """
629        Threads are created only when get_project is called, and terminate
630        before it returns. They are there primarily to parallelise I/O (i.e.
631        fetching web pages).
632        """
633        self._threads = []
634        for i in range(self.num_workers):
635            t = threading.Thread(target=self._fetch)
            t.daemon = True
637            t.start()
638            self._threads.append(t)
639
640    def _wait_threads(self):
641        """
642        Tell all the threads to terminate (by sending a sentinel value) and
643        wait for them to do so.
644        """
645        # Note that you need two loops, since you can't say which
646        # thread will get each sentinel
647        for t in self._threads:
648            self._to_fetch.put(None)    # sentinel
649        for t in self._threads:
650            t.join()
651        self._threads = []
652
653    def _get_project(self, name):
654        result = {'urls': {}, 'digests': {}}
655        with self._gplock:
656            self.result = result
657            self.project_name = name
658            url = urljoin(self.base_url, '%s/' % quote(name))
659            self._seen.clear()
660            self._page_cache.clear()
661            self._prepare_threads()
662            try:
663                logger.debug('Queueing %s', url)
664                self._to_fetch.put(url)
665                self._to_fetch.join()
666            finally:
667                self._wait_threads()
668            del self.result
669        return result
670
671    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
672                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)
673
674    def _is_platform_dependent(self, url):
675        """
        Does a URL refer to a platform-specific download?
677        """
678        return self.platform_dependent.search(url)
679
680    def _process_download(self, url):
681        """
        See if a URL is a suitable download for a project.
683
684        If it is, register information in the result dictionary (for
685        _get_project) about the specific version it's for.
686
687        Note that the return value isn't actually used other than as a boolean
688        value.
689        """
690        if self.platform_check and self._is_platform_dependent(url):
691            info = None
692        else:
693            info = self.convert_url_to_download_info(url, self.project_name)
694        logger.debug('process_download: %s -> %s', url, info)
695        if info:
696            with self._lock:    # needed because self.result is shared
697                self._update_version_data(self.result, info)
698        return info
699
700    def _should_queue(self, link, referrer, rel):
701        """
702        Determine whether a link URL from a referring page and with a
703        particular "rel" attribute should be queued for scraping.
704        """
705        scheme, netloc, path, _, _, _ = urlparse(link)
706        if path.endswith(self.source_extensions + self.binary_extensions +
707                         self.excluded_extensions):
708            result = False
709        elif self.skip_externals and not link.startswith(self.base_url):
710            result = False
711        elif not referrer.startswith(self.base_url):
712            result = False
713        elif rel not in ('homepage', 'download'):
714            result = False
715        elif scheme not in ('http', 'https', 'ftp'):
716            result = False
717        elif self._is_platform_dependent(link):
718            result = False
719        else:
720            host = netloc.split(':', 1)[0]
721            if host.lower() == 'localhost':
722                result = False
723            else:
724                result = True
725        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
726                     referrer, result)
727        return result
728
729    def _fetch(self):
730        """
731        Get a URL to fetch from the work queue, get the HTML page, examine its
732        links for download candidates and candidates for further scraping.
733
734        This is a handy method to run in a thread.
735        """
736        while True:
737            url = self._to_fetch.get()
738            try:
739                if url:
740                    page = self.get_page(url)
741                    if page is None:    # e.g. after an error
742                        continue
743                    for link, rel in page.links:
744                        if link not in self._seen:
745                            try:
746                                self._seen.add(link)
747                                if (not self._process_download(link) and
748                                    self._should_queue(link, url, rel)):
749                                    logger.debug('Queueing %s from %s', link, url)
750                                    self._to_fetch.put(link)
751                            except MetadataInvalidError:  # e.g. invalid versions
752                                pass
753            except Exception as e:  # pragma: no cover
754                self.errors.put(text_type(e))
755            finally:
756                # always do this, to avoid hangs :-)
757                self._to_fetch.task_done()
            if not url:
                # Sentinel seen; terminate this worker thread.
                break
761
762    def get_page(self, url):
763        """
        Get the HTML for a URL, possibly from an in-memory cache.
765
766        XXX TODO Note: this cache is never actually cleared. It's assumed that
767        the data won't get stale over the lifetime of a locator instance (not
768        necessarily true for the default_locator).
769        """
770        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
771        scheme, netloc, path, _, _, _ = urlparse(url)
772        if scheme == 'file' and os.path.isdir(url2pathname(path)):
773            url = urljoin(ensure_slash(url), 'index.html')
774
775        if url in self._page_cache:
776            result = self._page_cache[url]
777            logger.debug('Returning %s from cache: %s', url, result)
778        else:
779            host = netloc.split(':', 1)[0]
780            result = None
781            if host in self._bad_hosts:
782                logger.debug('Skipping %s due to bad host %s', url, host)
783            else:
784                req = Request(url, headers={'Accept-encoding': 'identity'})
785                try:
786                    logger.debug('Fetching %s', url)
787                    resp = self.opener.open(req, timeout=self.timeout)
788                    logger.debug('Fetched %s', url)
789                    headers = resp.info()
790                    content_type = headers.get('Content-Type', '')
791                    if HTML_CONTENT_TYPE.match(content_type):
792                        final_url = resp.geturl()
793                        data = resp.read()
794                        encoding = headers.get('Content-Encoding')
795                        if encoding:
796                            decoder = self.decoders[encoding]   # fail if not found
797                            data = decoder(data)
798                        encoding = 'utf-8'
799                        m = CHARSET.search(content_type)
800                        if m:
801                            encoding = m.group(1)
802                        try:
803                            data = data.decode(encoding)
804                        except UnicodeError:  # pragma: no cover
805                            data = data.decode('latin-1')    # fallback
806                        result = Page(data, final_url)
807                        self._page_cache[final_url] = result
808                except HTTPError as e:
809                    if e.code != 404:
810                        logger.exception('Fetch failed: %s: %s', url, e)
811                except URLError as e:  # pragma: no cover
812                    logger.exception('Fetch failed: %s: %s', url, e)
813                    with self._lock:
814                        self._bad_hosts.add(host)
815                except Exception as e:  # pragma: no cover
816                    logger.exception('Fetch failed: %s: %s', url, e)
817                finally:
818                    self._page_cache[url] = result   # even if None (failure)
819        return result
820
821    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')
822
823    def get_distribution_names(self):
824        """
825        Return all the distribution names known to this locator.
826        """
827        result = set()
828        page = self.get_page(self.base_url)
829        if not page:
830            raise DistlibException('Unable to get %s' % self.base_url)
831        for match in self._distname_re.finditer(page.data):
832            result.add(match.group(1))
833        return result
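
# Hedged usage sketch (network access to the simple index assumed):
#
#   locator = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0)
#   versions = locator.get_project('requests')
#   # result keys: version strings plus the special 'urls'/'digests' entries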
834
835class DirectoryLocator(Locator):
836    """
837    This class locates distributions in a directory tree.
838    """
839
840    def __init__(self, path, **kwargs):
841        """
842        Initialise an instance.
843        :param path: The root of the directory tree to search.
844        :param kwargs: Passed to the superclass constructor,
845                       except for:
846                       * recursive - if True (the default), subdirectories are
847                         recursed into. If False, only the top-level directory
                         is searched.
849        """
850        self.recursive = kwargs.pop('recursive', True)
851        super(DirectoryLocator, self).__init__(**kwargs)
852        path = os.path.abspath(path)
853        if not os.path.isdir(path):  # pragma: no cover
854            raise DistlibException('Not a directory: %r' % path)
855        self.base_dir = path
856
857    def should_include(self, filename, parent):
858        """
859        Should a filename be considered as a candidate for a distribution
860        archive? As well as the filename, the directory which contains it
861        is provided, though not used by the current implementation.
862        """
863        return filename.endswith(self.downloadable_extensions)
864
865    def _get_project(self, name):
866        result = {'urls': {}, 'digests': {}}
867        for root, dirs, files in os.walk(self.base_dir):
868            for fn in files:
869                if self.should_include(fn, root):
870                    fn = os.path.join(root, fn)
871                    url = urlunparse(('file', '',
872                                      pathname2url(os.path.abspath(fn)),
873                                      '', '', ''))
874                    info = self.convert_url_to_download_info(url, name)
875                    if info:
876                        self._update_version_data(result, info)
877            if not self.recursive:
878                break
879        return result
880
881    def get_distribution_names(self):
882        """
883        Return all the distribution names known to this locator.
884        """
885        result = set()
886        for root, dirs, files in os.walk(self.base_dir):
887            for fn in files:
888                if self.should_include(fn, root):
889                    fn = os.path.join(root, fn)
890                    url = urlunparse(('file', '',
891                                      pathname2url(os.path.abspath(fn)),
892                                      '', '', ''))
893                    info = self.convert_url_to_download_info(url, None)
894                    if info:
895                        result.add(info['name'])
896            if not self.recursive:
897                break
898        return result
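
# Sketch: index a local directory of archives (the path is illustrative):
#
#   locator = DirectoryLocator('/var/cache/archives', recursive=False)
#   dist = locator.locate('foo (== 1.0)')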
899
900class JSONLocator(Locator):
901    """
902    This locator uses special extended metadata (not available on PyPI) and is
903    the basis of performant dependency resolution in distlib. Other locators
904    require archive downloads before dependencies can be determined! As you
905    might imagine, that can be slow.
906    """
907    def get_distribution_names(self):
908        """
909        Return all the distribution names known to this locator.
910        """
911        raise NotImplementedError('Not available from this locator')
912
913    def _get_project(self, name):
914        result = {'urls': {}, 'digests': {}}
915        data = get_project_data(name)
916        if data:
917            for info in data.get('files', []):
918                if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
919                    continue
920                # We don't store summary in project metadata as it makes
921                # the data bigger for no benefit during dependency
922                # resolution
923                dist = make_dist(data['name'], info['version'],
924                                 summary=data.get('summary',
925                                                  'Placeholder for summary'),
926                                 scheme=self.scheme)
927                md = dist.metadata
928                md.source_url = info['url']
929                # TODO SHA256 digest
930                if 'digest' in info and info['digest']:
931                    dist.digest = ('md5', info['digest'])
932                md.dependencies = info.get('requirements', {})
933                dist.exports = info.get('exports', {})
934                result[dist.version] = dist
935                result['urls'].setdefault(dist.version, set()).add(info['url'])
936        return result
937
938class DistPathLocator(Locator):
939    """
940    This locator finds installed distributions in a path. It can be useful for
941    adding to an :class:`AggregatingLocator`.
942    """
943    def __init__(self, distpath, **kwargs):
944        """
945        Initialise an instance.
946
947        :param distpath: A :class:`DistributionPath` instance to search.
948        """
949        super(DistPathLocator, self).__init__(**kwargs)
950        assert isinstance(distpath, DistributionPath)
951        self.distpath = distpath
952
953    def _get_project(self, name):
954        dist = self.distpath.get_distribution(name)
955        if dist is None:
956            result = {'urls': {}, 'digests': {}}
957        else:
958            result = {
959                dist.version: dist,
960                'urls': {dist.version: set([dist.source_url])},
961                'digests': {dist.version: set([None])}
962            }
963        return result
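
# Sketch: query distributions already installed on the default path
# (DistributionPath is imported above from .database):
#
#   locator = DistPathLocator(DistributionPath())
#   result = locator.get_project('pip')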
964
965
966class AggregatingLocator(Locator):
967    """
968    This class allows you to chain and/or merge a list of locators.
969    """
970    def __init__(self, *locators, **kwargs):
971        """
972        Initialise an instance.
973
974        :param locators: The list of locators to search.
975        :param kwargs: Passed to the superclass constructor,
976                       except for:
977                       * merge - if False (the default), the first successful
978                         search from any of the locators is returned. If True,
979                         the results from all locators are merged (this can be
980                         slow).
981        """
982        self.merge = kwargs.pop('merge', False)
983        self.locators = locators
984        super(AggregatingLocator, self).__init__(**kwargs)
985
986    def clear_cache(self):
987        super(AggregatingLocator, self).clear_cache()
988        for locator in self.locators:
989            locator.clear_cache()
990
991    def _set_scheme(self, value):
992        self._scheme = value
993        for locator in self.locators:
994            locator.scheme = value
995
996    scheme = property(Locator.scheme.fget, _set_scheme)
997
998    def _get_project(self, name):
999        result = {}
1000        for locator in self.locators:
1001            d = locator.get_project(name)
1002            if d:
1003                if self.merge:
1004                    files = result.get('urls', {})
1005                    digests = result.get('digests', {})
1006                    # next line could overwrite result['urls'], result['digests']
1007                    result.update(d)
1008                    df = result.get('urls')
1009                    if files and df:
1010                        for k, v in files.items():
1011                            if k in df:
1012                                df[k] |= v
1013                            else:
1014                                df[k] = v
1015                    dd = result.get('digests')
1016                    if digests and dd:
1017                        dd.update(digests)
1018                else:
1019                    # See issue #18. If any dists are found and we're looking
1020                    # for specific constraints, we only return something if
1021                    # a match is found. For example, if a DirectoryLocator
1022                    # returns just foo (1.0) while we're looking for
1023                    # foo (>= 2.0), we'll pretend there was nothing there so
1024                    # that subsequent locators can be queried. Otherwise we
1025                    # would just return foo (1.0) which would then lead to a
1026                    # failure to find foo (>= 2.0), because other locators
1027                    # weren't searched. Note that this only matters when
1028                    # merge=False.
1029                    if self.matcher is None:
1030                        found = True
1031                    else:
1032                        found = False
1033                        for k in d:
1034                            if self.matcher.match(k):
1035                                found = True
1036                                break
1037                    if found:
1038                        result = d
1039                        break
1040        return result
1041
1042    def get_distribution_names(self):
1043        """
1044        Return all the distribution names known to this locator.
1045        """
1046        result = set()
1047        for locator in self.locators:
1048            try:
1049                result |= locator.get_distribution_names()
1050            except NotImplementedError:
1051                pass
1052        return result
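
# Sketch: chain a local directory with a PyPI scrape; the first locator
# that satisfies a query wins unless merge=True (paths illustrative):
#
#   locator = AggregatingLocator(
#       DirectoryLocator('/path/to/archives'),
#       SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
#       scheme='legacy')
#   dist = locator.locate('foo (>= 1.0)')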
1053
1054
1055# We use a legacy scheme simply because most of the dists on PyPI use legacy
1056# versions which don't conform to PEP 426 / PEP 440.
1057default_locator = AggregatingLocator(
1058                    JSONLocator(),
1059                    SimpleScrapingLocator('https://pypi.org/simple/',
1060                                          timeout=3.0),
1061                    scheme='legacy')
1062
1063locate = default_locator.locate
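
# Convenience sketch using the module-level default locator (network
# access to PyPI assumed; results depend on the live index):
#
#   from distlib.locators import locate
#   dist = locate('requests (>= 2.0)')
#   if dist is not None:
#       print(dist.name_and_version, sorted(dist.download_urls))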
1064
1065NAME_VERSION_RE = re.compile(r'(?P<name>[\w-]+)\s*'
1066                             r'\(\s*(==\s*)?(?P<ver>[^)]+)\)$')
1067
1068class DependencyFinder(object):
1069    """
1070    Locate dependencies for distributions.
1071    """
1072
1073    def __init__(self, locator=None):
1074        """
1075        Initialise an instance, using the specified locator
1076        to locate distributions.
1077        """
1078        self.locator = locator or default_locator
1079        self.scheme = get_scheme(self.locator.scheme)
1080
1081    def add_distribution(self, dist):
1082        """
1083        Add a distribution to the finder. This will update internal information
1084        about who provides what.
1085        :param dist: The distribution to add.
1086        """
1087        logger.debug('adding distribution %s', dist)
1088        name = dist.key
1089        self.dists_by_name[name] = dist
1090        self.dists[(name, dist.version)] = dist
1091        for p in dist.provides:
1092            name, version = parse_name_and_version(p)
1093            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
1094            self.provided.setdefault(name, set()).add((version, dist))
1095
1096    def remove_distribution(self, dist):
1097        """
1098        Remove a distribution from the finder. This will update internal
1099        information about who provides what.
1100        :param dist: The distribution to remove.
1101        """
1102        logger.debug('removing distribution %s', dist)
1103        name = dist.key
1104        del self.dists_by_name[name]
1105        del self.dists[(name, dist.version)]
1106        for p in dist.provides:
1107            name, version = parse_name_and_version(p)
1108            logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
1109            s = self.provided[name]
1110            s.remove((version, dist))
1111            if not s:
1112                del self.provided[name]
1113
1114    def get_matcher(self, reqt):
1115        """
1116        Get a version matcher for a requirement.
1117        :param reqt: The requirement
1118        :type reqt: str
1119        :return: A version matcher (an instance of
1120                 :class:`distlib.version.Matcher`).
1121        """
1122        try:
1123            matcher = self.scheme.matcher(reqt)
1124        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat mode - if the version can't be parsed, match on the
            # name alone
1126            name = reqt.split()[0]
1127            matcher = self.scheme.matcher(name)
1128        return matcher
1129
1130    def find_providers(self, reqt):
1131        """
1132        Find the distributions which can fulfill a requirement.
1133
        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distributions which can fulfill the requirement.
1137        """
1138        matcher = self.get_matcher(reqt)
1139        name = matcher.key   # case-insensitive
1140        result = set()
1141        provided = self.provided
1142        if name in provided:
1143            for version, provider in provided[name]:
1144                try:
1145                    match = matcher.match(version)
1146                except UnsupportedVersionError:
1147                    match = False
1148
1149                if match:
1150                    result.add(provider)
1151                    break
1152        return result
1153
1154    def try_to_replace(self, provider, other, problems):
1155        """
1156        Attempt to replace one provider with another. This is typically used
1157        when resolving dependencies from multiple sources, e.g. A requires
1158        (B >= 1.0) while C requires (B >= 1.1).
1159
1160        For successful replacement, ``provider`` must meet all the requirements
1161        which ``other`` fulfills.
1162
1163        :param provider: The provider we are trying to replace with.
1164        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain the
                         problems which prevented replacement. This is
                         currently a tuple of the literal string
                         'cantreplace', ``provider``, ``other`` and the set
                         of requirements that ``provider`` couldn't fulfill.
1170        :return: True if we can replace ``other`` with ``provider``, else
1171                 False.
1172        """
1173        rlist = self.reqts[other]
1174        unmatched = set()
1175        for s in rlist:
1176            matcher = self.get_matcher(s)
1177            if not matcher.match(provider.version):
1178                unmatched.add(s)
1179        if unmatched:
1180            # can't replace other with provider
1181            problems.add(('cantreplace', provider, other,
1182                          frozenset(unmatched)))
1183            result = False
1184        else:
1185            # can replace other with provider
1186            self.remove_distribution(other)
1187            del self.reqts[other]
1188            for s in rlist:
1189                self.reqts.setdefault(provider, set()).add(s)
1190            self.add_distribution(provider)
1191            result = True
1192        return result
1193
1194    def find(self, requirement, meta_extras=None, prereleases=False):
1195        """
1196        Find a distribution and all distributions it depends on.
1197
1198        :param requirement: The requirement specifying the distribution to
1199                            find, or a Distribution instance.
1200        :param meta_extras: A list of meta extras such as :test:, :build: and
1201                            so on.
1202        :param prereleases: If ``True``, allow pre-release versions to be
1203                            returned - otherwise, don't return prereleases
1204                            unless they're all that's available.
1205
1206        Return a set of :class:`Distribution` instances and a set of
1207        problems.
1208
        The distributions returned should be such that they have the
        :attr:`requested` attribute set to ``True`` if they were
        from the ``requirement`` passed to ``find()``, and they have the
        :attr:`build_time_dependency` attribute set to ``True`` unless they
        are post-installation dependencies of the ``requirement``.
1214
        Each problem is a tuple consisting of the string ``'unsatisfied'``
        and the requirement which couldn't be satisfied by any distribution
        known to the locator, or a ``'cantreplace'`` tuple as described for
        :meth:`try_to_replace`.
1218        """
1219
1220        self.provided = {}
1221        self.dists = {}
1222        self.dists_by_name = {}
1223        self.reqts = {}
1224
1225        meta_extras = set(meta_extras or [])
1226        if ':*:' in meta_extras:
1227            meta_extras.remove(':*:')
1228            # :meta: and :run: are implicitly included
1229            meta_extras |= set([':test:', ':build:', ':dev:'])
1230
1231        if isinstance(requirement, Distribution):
1232            dist = odist = requirement
1233            logger.debug('passed %s as requirement', odist)
1234        else:
1235            dist = odist = self.locator.locate(requirement,
1236                                               prereleases=prereleases)
1237            if dist is None:
1238                raise DistlibException('Unable to locate %r' % requirement)
1239            logger.debug('located %s', odist)
1240        dist.requested = True
1241        problems = set()
1242        todo = set([dist])
1243        install_dists = set([odist])
1244        while todo:
1245            dist = todo.pop()
1246            name = dist.key     # case-insensitive
1247            if name not in self.dists_by_name:
1248                self.add_distribution(dist)
1249            else:
1251                other = self.dists_by_name[name]
1252                if other != dist:
1253                    self.try_to_replace(dist, other, problems)
1254
1255            ireqts = dist.run_requires | dist.meta_requires
1256            sreqts = dist.build_requires
1257            ereqts = set()
1258            if meta_extras and dist in install_dists:
1259                for key in ('test', 'build', 'dev'):
1260                    e = ':%s:' % key
1261                    if e in meta_extras:
1262                        ereqts |= getattr(dist, '%s_requires' % key)
1263            all_reqts = ireqts | sreqts | ereqts
1264            for r in all_reqts:
1265                providers = self.find_providers(r)
1266                if not providers:
1267                    logger.debug('No providers found for %r', r)
1268                    provider = self.locator.locate(r, prereleases=prereleases)
1269                    # If no provider is found and we didn't consider
1270                    # prereleases, consider them now.
1271                    if provider is None and not prereleases:
1272                        provider = self.locator.locate(r, prereleases=True)
1273                    if provider is None:
1274                        logger.debug('Cannot satisfy %r', r)
1275                        problems.add(('unsatisfied', r))
1276                    else:
1277                        n, v = provider.key, provider.version
1278                        if (n, v) not in self.dists:
1279                            todo.add(provider)
1280                        providers.add(provider)
1281                        if r in ireqts and dist in install_dists:
1282                            install_dists.add(provider)
1283                            logger.debug('Adding %s to install_dists',
1284                                         provider.name_and_version)
1285                for p in providers:
1286                    name = p.key
1287                    if name not in self.dists_by_name:
1288                        self.reqts.setdefault(p, set()).add(r)
1289                    else:
1290                        other = self.dists_by_name[name]
1291                        if other != p:
1292                            # see if other can be replaced by p
1293                            self.try_to_replace(p, other, problems)
1294
1295        dists = set(self.dists.values())
1296        for dist in dists:
1297            dist.build_time_dependency = dist not in install_dists
1298            if dist.build_time_dependency:
1299                logger.debug('%s is a build-time dependency only.',
1300                             dist.name_and_version)
1301        logger.debug('find done for %s', odist)
1302        return dists, problems
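
# An end-to-end dependency-resolution sketch (network access assumed;
# output depends on the index contents at call time):
#
#   finder = DependencyFinder()   # uses default_locator
#   dists, problems = finder.find('requests (>= 2.0)')
#   for d in dists:
#       print(d.name_and_version, d.build_time_dependency)
#   if problems:
#       print('unresolved:', problems)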
1303