1"""PyPI and direct package downloading"""
2import sys
3import os
4import re
5import shutil
6import socket
7import base64
8import hashlib
9import itertools
10import warnings
11from functools import wraps
12
13from setuptools.extern import six
14from setuptools.extern.six.moves import urllib, http_client, configparser, map
15
16import setuptools
17from pkg_resources import (
18    CHECKOUT_DIST, Distribution, BINARY_DIST, normalize_path, SOURCE_DIST,
19    Environment, find_distributions, safe_name, safe_version,
20    to_filename, Requirement, DEVELOP_DIST, EGG_DIST,
21)
22from setuptools import ssl_support
23from distutils import log
24from distutils.errors import DistutilsError
25from fnmatch import translate
26from setuptools.py27compat import get_all_headers
27from setuptools.py33compat import unescape
28from setuptools.wheel import Wheel
29
30__metaclass__ = type
31
# Matches a URL fragment of the form ``egg=<identifier>`` and captures the
# identifier.
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.+!]+)$')
# Captures the target of an ``href`` attribute, quoted or unquoted
# (case-insensitive).
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
# Matches a download link followed by its "MD5 hash" link; the groups are the
# download URL, the link text and the 32-digit hexadecimal digest.
PYPI_MD5 = re.compile(
    r'<a href="([^"#]+)">([^<]+)</a>\n\s+\(<a (?:title="MD5 hash"\n\s+)'
    r'href="[^?]+\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\)'
)
# Bound ``match`` method: recognizes a leading ``scheme:`` whose name is at
# least two characters long and captures the scheme name.
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
# Archive suffixes that distros_for_location() treats as source distributions.
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()

# Names exported by ``from <this module> import *``.
__all__ = [
    'PackageIndex', 'distros_for_url', 'parse_bdist_wininst',
    'interpret_distro_name',
]

# NOTE(review): not referenced in the visible part of this file; presumably a
# socket timeout in seconds applied elsewhere -- confirm.
_SOCKET_TIMEOUT = 15

# User-Agent string built from the setuptools version and the running
# interpreter's major.minor version.
_tmpl = "setuptools/{setuptools.__version__} Python-urllib/{py_major}"
user_agent = _tmpl.format(
    py_major='{}.{}'.format(*sys.version_info), setuptools=setuptools)
51
52
def parse_requirement_arg(spec):
    """Parse `spec` into a ``Requirement``.

    Raises ``DistutilsError`` (chained to the underlying ``ValueError``)
    when `spec` is not a valid requirement string.
    """
    try:
        requirement = Requirement.parse(spec)
    except ValueError as exc:
        message = "Not a URL, existing file, or requirement spec: %r" % (spec,)
        raise DistutilsError(message) from exc
    return requirement
60
61
def parse_bdist_wininst(name):
    """Split a possible bdist_wininst ``.exe`` file name into its parts.

    Returns a ``(base, py_version, platform)`` tuple.  ``py_version`` is
    ``None`` when the name carries no ``-pyX.Y`` tag.  All three items are
    ``None`` when `name` is not a recognized installer name.
    """
    lowered = name.lower()
    if not lowered.endswith('.exe'):
        return None, None, None

    for plat in ('win32', 'win-amd64'):
        # e.g. "pkg-1.0.win32.exe"
        untagged = '.%s.exe' % plat
        if lowered.endswith(untagged):
            return name[:-len(untagged)], None, plat

        # e.g. "pkg-1.0.win32-py2.7.exe": the tag is followed by exactly
        # seven characters (three for the version, four for ".exe").
        tagged = '.%s-py' % plat
        width = len(tagged) + 7
        if lowered.startswith(tagged, -width):
            return name[:-width], name[-7:-4], plat

    return None, None, None
84
85
def egg_info_for_url(url):
    """Return ``(basename, fragment)`` for `url`.

    The basename is the unquoted last path segment, except for SourceForge
    ``.../download`` URLs, where the segment before it is used.  A ``#``
    that only appears after unquoting the basename splits it into basename
    and fragment.
    """
    parsed = urllib.parse.urlparse(url)
    segments = parsed.path.split('/')
    fragment = parsed.fragment

    base = urllib.parse.unquote(segments[-1])
    if parsed.netloc == 'sourceforge.net' and base == 'download':  # XXX Yuck
        base = urllib.parse.unquote(segments[-2])

    if '#' in base:
        base, fragment = base.split('#', 1)
    return base, fragment
95
96
def distros_for_url(url, metadata=None):
    """Yield the distributions that `url` might refer to.

    Candidates derived from the URL's basename come first; if the URL has an
    ``#egg=...`` fragment, the interpretations of that fragment follow with
    ``CHECKOUT_DIST`` precedence.
    """
    base, fragment = egg_info_for_url(url)
    yield from distros_for_location(url, base, metadata)

    if not fragment:
        return
    egg_match = EGG_FRAGMENT.match(fragment)
    if egg_match:
        yield from interpret_distro_name(
            url, egg_match.group(1), metadata, precedence=CHECKOUT_DIST
        )
109
110
def distros_for_location(location, basename, metadata=None):
    """Return the distributions that `basename` (found at `location`) may be.

    Eggs and compatible wheels give exactly one distribution; Windows
    installers and source archives give every name/version interpretation
    from ``interpret_distro_name``.  Unrecognized names give an empty list.
    """
    if basename.endswith('.egg.zip'):
        basename = basename[:-len('.zip')]
    has_dash = '-' in basename

    if has_dash and basename.endswith('.egg'):
        # an egg name has a single, unambiguous interpretation
        return [Distribution.from_location(location, basename, metadata)]

    if has_dash and basename.endswith('.whl'):
        wheel = Wheel(basename)
        if wheel.is_compatible():
            return [Distribution(
                location=location,
                project_name=wheel.project_name,
                version=wheel.version,
                # rank wheels above eggs
                precedence=EGG_DIST + 1,
            )]
        return []

    if basename.endswith('.exe'):
        win_base, py_ver, platform = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, platform
            )

    # Fall back to the source archive extensions; the first match wins.
    suffix = next((ext for ext in EXTENSIONS if basename.endswith(ext)), None)
    if suffix is None:
        return []  # no extension matched
    return interpret_distro_name(location, basename[:-len(suffix)], metadata)
142
143
def distros_for_filename(filename, metadata=None):
    """Return the possible distributions for a local `filename`.

    The normalized path is used as the location and the file's basename
    decides how it is interpreted.
    """
    location = normalize_path(filename)
    basename = os.path.basename(filename)
    return distros_for_location(location, basename, metadata)
149
150
def interpret_distro_name(
        location, basename, metadata, py_version=None, precedence=SOURCE_DIST,
        platform=None
):
    """Yield every plausible name/version reading of a distribution name.

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """
    # Archive names do not say where the project name ends and the version
    # begins ("adns-python-1.1.0", "egenix-mx-commercial", ...), so every
    # split point is offered: "adns" + "python-1.1.0", "adns-python" +
    # "1.1.0", and "adns-python-1.1.0" with no version.  The bogus readings
    # are normally harmless: a version such as "python-1.1.0" sorts below
    # any numeric version, so it is unlikely to satisfy a request for a real
    # "adns" project.  It remains a potential problem; in the long run
    # archive names should use "safe" names and versions.

    pieces = basename.split('-')

    if not py_version:
        has_py_tag = any(
            re.match(r'py\d\.\d$', piece) for piece in pieces[2:]
        )
        if has_py_tag:
            # a bdist_dumb archive, not an sdist -- nothing to offer
            return

    for split_at in range(1, len(pieces) + 1):
        project = '-'.join(pieces[:split_at])
        version = '-'.join(pieces[split_at:])
        yield Distribution(
            location, metadata, project, version,
            py_version=py_version, precedence=precedence,
            platform=platform
        )
184
185
186# From Python 2.7 docs
def unique_everseen(iterable, key=None):
    """Yield the elements of `iterable` in order, skipping repeats.

    An element counts as a repeat when it (or ``key(element)`` if `key` is
    given) has been seen earlier in the iteration.
    """
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D
    # unique_everseen('ABBCcAD', str.lower) --> A B C D
    seen = set()
    for element in iterable:
        marker = element if key is None else key(element)
        if marker in seen:
            continue
        seen.add(marker)
        yield element
203
204
def unique_values(func):
    """
    Decorate `func`, which returns an iterable, so that callers receive an
    iterable that yields each distinct item only once, in first-seen order.
    """

    @wraps(func)
    def deduplicated(*args, **kwargs):
        results = func(*args, **kwargs)
        return unique_everseen(results)

    return deduplicated
216
217
# Matches any tag that carries a ``rel`` attribute (case-insensitive); group 1
# is the text between the angle brackets and group 2 is the rel value.
REL = re.compile(r"""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
# this line is here to fix emacs' cruddy broken syntax highlighting
220
221
@unique_values
def find_external_links(url, page):
    """Yield the homepage and download URLs advertised in `page`.

    Links come from tags whose ``rel`` value includes ``homepage`` or
    ``download``, and from the first ``href`` following a "Home Page" or
    "Download URL" table header.  All are resolved relative to `url`.
    """
    wanted = {'homepage', 'download'}

    for rel_match in REL.finditer(page):
        tag, rel = rel_match.groups()
        rels = {value.strip() for value in rel.lower().split(',')}
        if rels & wanted:
            for href in HREF.finditer(tag):
                yield urllib.parse.urljoin(url, htmldecode(href.group(1)))

    for marker in ("<th>Home Page", "<th>Download URL"):
        start = page.find(marker)
        if start == -1:
            continue
        href = HREF.search(page, start)
        if href:
            yield urllib.parse.urljoin(url, htmldecode(href.group(1)))
239
240
class ContentChecker:
    """
    Do-nothing checker that spells out the content-checking interface:
    it accepts any data, always validates, and reports nothing.
    """

    def feed(self, block):
        """
        Accept a block of downloaded data; this base class ignores it.
        """

    def is_valid(self):
        """
        Tell whether the content seen so far is acceptable (always True
        here); implementations return False when validation fails.
        """
        return True

    def report(self, reporter, template):
        """
        Hook for calling `reporter` with the checker's details substituted
        into `template`; this base class reports nothing.
        """
264
265
class HashChecker(ContentChecker):
    """
    Content checker that hashes the fed data and compares the hex digest
    with an expected value, typically taken from a URL fragment such as
    ``#sha256=<hex digest>``.
    """

    pattern = re.compile(
        r'(?P<hash_name>sha1|sha224|sha384|sha256|sha512|md5)='
        r'(?P<expected>[a-f0-9]+)'
    )

    def __init__(self, hash_name, expected):
        self.hash_name = hash_name
        self.hash = hashlib.new(hash_name)
        self.expected = expected

    @classmethod
    def from_url(cls, url):
        "Build a checker from `url`'s fragment, or a null ContentChecker"
        fragment = urllib.parse.urlparse(url)[-1]
        found = cls.pattern.search(fragment) if fragment else None
        if not found:
            return ContentChecker()
        return cls(**found.groupdict())

    def feed(self, block):
        # accumulate the data into the running hash
        self.hash.update(block)

    def is_valid(self):
        actual = self.hash.hexdigest()
        return actual == self.expected

    def report(self, reporter, template):
        # `template` has one %s slot, filled with the hash algorithm name
        return reporter(template % self.hash_name)
297
298
299class PackageIndex(Environment):
300    """A distribution index that scans web pages for download URLs"""
301
302    def __init__(
303            self, index_url="https://pypi.org/simple/", hosts=('*',),
304            ca_bundle=None, verify_ssl=True, *args, **kw
305    ):
306        Environment.__init__(self, *args, **kw)
307        self.index_url = index_url + "/" [:not index_url.endswith('/')]
308        self.scanned_urls = {}
309        self.fetched_urls = {}
310        self.package_pages = {}
311        self.allows = re.compile('|'.join(map(translate, hosts))).match
312        self.to_scan = []
313        use_ssl = (
314            verify_ssl
315            and ssl_support.is_available
316            and (ca_bundle or ssl_support.find_ca_bundle())
317        )
318        if use_ssl:
319            self.opener = ssl_support.opener_for(ca_bundle)
320        else:
321            self.opener = urllib.request.urlopen
322
    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it

        `url` is first interpreted as a direct link to a distribution; any
        distributions recognized from it are added to this index.  If none
        are recognized and `retrieve` is true, the page is fetched, every
        ``href`` found in it is processed in turn (without retrieval), and
        pages under ``index_url`` are additionally passed to
        ``process_index()``.  Strings without a URL scheme are handed to
        ``process_filename()`` instead.
        """
        # Each URL is evaluated once, unless the caller now wants the page.
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True
        if not URL_SCHEME(url):
            # no scheme: treat it as a local file or directory
            self.process_filename(url)
            return
        else:
            dists = list(distros_for_url(url))
            if dists:
                # a direct download link, but only usable from allowed hosts
                if not self.url_ok(url):
                    return
                self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            list(map(self.add, dists))
            return  # don't need the actual page

        if not self.url_ok(url):
            # remember the refusal so the URL is not considered again
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        self.fetched_urls[url] = True  # prevent multiple fetch attempts
        # "%%s" survives the first substitution and is filled in by
        # open_url() with the error detail.
        tmpl = "Download error on %s: %%s -- Some packages may not be found!"
        f = self.open_url(url, tmpl % url)
        if f is None:
            return
        # open_url() returns HTTPError objects instead of raising them
        if isinstance(f, urllib.error.HTTPError) and f.code == 401:
            self.info("Authentication error: %s" % f.msg)
        # also mark the final URL (after any redirect) as fetched
        self.fetched_urls[f.url] = True
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()  # not html, we can't process it
            return

        base = f.url  # handle redirects
        page = f.read()
        if not isinstance(page, str):
            # In Python 3 and got bytes but want str.
            if isinstance(f, urllib.error.HTTPError):
                # Errors have no charset, assume latin1:
                charset = 'latin-1'
            else:
                charset = f.headers.get_param('charset') or 'latin-1'
            page = page.decode(charset, "ignore")
        f.close()
        # Evaluate every link on the page; relative links resolve against
        # the final URL.
        for match in HREF.finditer(page):
            link = urllib.parse.urljoin(base, htmldecode(match.group(1)))
            self.process_url(link)
        # Pages under the index URL get the extra package-page processing,
        # unless the response was a 404.
        if url.startswith(self.index_url) and getattr(f, 'code', None) != 404:
            page = self.process_index(url, page)
375
376    def process_filename(self, fn, nested=False):
377        # process filenames or directories
378        if not os.path.exists(fn):
379            self.warn("Not found: %s", fn)
380            return
381
382        if os.path.isdir(fn) and not nested:
383            path = os.path.realpath(fn)
384            for item in os.listdir(path):
385                self.process_filename(os.path.join(path, item), True)
386
387        dists = distros_for_filename(fn)
388        if dists:
389            self.debug("Found: %s", fn)
390            list(map(self.add, dists))
391
392    def url_ok(self, url, fatal=False):
393        s = URL_SCHEME(url)
394        is_file = s and s.group(1).lower() == 'file'
395        if is_file or self.allows(urllib.parse.urlparse(url)[1]):
396            return True
397        msg = (
398            "\nNote: Bypassing %s (disallowed host; see "
399            "http://bit.ly/2hrImnY for details).\n")
400        if fatal:
401            raise DistutilsError(msg % url)
402        else:
403            self.warn(msg, url)
404
405    def scan_egg_links(self, search_path):
406        dirs = filter(os.path.isdir, search_path)
407        egg_links = (
408            (path, entry)
409            for path in dirs
410            for entry in os.listdir(path)
411            if entry.endswith('.egg-link')
412        )
413        list(itertools.starmap(self.scan_egg_link, egg_links))
414
415    def scan_egg_link(self, path, entry):
416        with open(os.path.join(path, entry)) as raw_lines:
417            # filter non-empty lines
418            lines = list(filter(None, map(str.strip, raw_lines)))
419
420        if len(lines) != 2:
421            # format is not recognized; punt
422            return
423
424        egg_path, setup_path = lines
425
426        for dist in find_distributions(os.path.join(path, egg_path)):
427            dist.location = os.path.join(path, *lines)
428            dist.precedence = SOURCE_DIST
429            self.add(dist)
430
    def process_index(self, url, page):
        """Process the contents of a PyPI page

        Every link on `page` that points at a package page under
        ``index_url`` is recorded in ``self.package_pages``.  If `url` itself
        is such a package page, its external (homepage/download) links are
        scanned as well, and `page` is returned with PyPI's separate MD5
        links folded into ``#md5=`` fragments on the download links.  For any
        other page an empty string is returned.
        """

        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                # unquoted path segments below the index root
                parts = list(map(
                    urllib.parse.unquote, link[len(self.index_url):].split('/')
                ))
                if len(parts) == 2 and '#' not in parts[1]:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    self.package_pages.setdefault(pkg.lower(), {})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        # process an index page into the package-page index
        for match in HREF.finditer(page):
            try:
                scan(urllib.parse.urljoin(url, htmldecode(match.group(1))))
            except ValueError:
                # best effort: a link that cannot be joined/parsed is skipped
                pass

        pkg, ver = scan(url)  # ensure this page is in the page index
        if pkg:
            # process individual package page
            for new_url in find_external_links(url, page):
                # Process the found URL
                base, frag = egg_info_for_url(new_url)
                if base.endswith('.py') and not frag:
                    # A bare .py link carries no name/version; tag it with
                    # this page's, or fall back to scanning the whole index.
                    if ver:
                        new_url += '#egg=%s-%s' % (pkg, ver)
                    else:
                        self.need_version_info(url)
                self.scan_url(new_url)

            # rewrite "<link> (md5)" pairs as a single link ending in #md5=...
            return PYPI_MD5.sub(
                lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1, 3, 2), page
            )
        else:
            return ""  # no sense double-scanning non-package pages
473
474    def need_version_info(self, url):
475        self.scan_all(
476            "Page at %s links to .py file(s) without version info; an index "
477            "scan is required.", url
478        )
479
480    def scan_all(self, msg=None, *args):
481        if self.index_url not in self.fetched_urls:
482            if msg:
483                self.warn(msg, *args)
484            self.info(
485                "Scanning index of all packages (this may take a while)"
486            )
487        self.scan_url(self.index_url)
488
489    def find_packages(self, requirement):
490        self.scan_url(self.index_url + requirement.unsafe_name + '/')
491
492        if not self.package_pages.get(requirement.key):
493            # Fall back to safe version of the name
494            self.scan_url(self.index_url + requirement.project_name + '/')
495
496        if not self.package_pages.get(requirement.key):
497            # We couldn't find the target package, so search the index page too
498            self.not_found_in_index(requirement)
499
500        for url in list(self.package_pages.get(requirement.key, ())):
501            # scan each page that might be related to the desired package
502            self.scan_url(url)
503
504    def obtain(self, requirement, installer=None):
505        self.prescan()
506        self.find_packages(requirement)
507        for dist in self[requirement.key]:
508            if dist in requirement:
509                return dist
510            self.debug("%s does not match %s", requirement, dist)
511        return super(PackageIndex, self).obtain(requirement, installer)
512
513    def check_hash(self, checker, filename, tfp):
514        """
515        checker is a ContentChecker
516        """
517        checker.report(
518            self.debug,
519            "Validating %%s checksum for %s" % filename)
520        if not checker.is_valid():
521            tfp.close()
522            os.unlink(filename)
523            raise DistutilsError(
524                "%s validation failed for %s; "
525                "possible download problem?"
526                % (checker.hash.name, os.path.basename(filename))
527            )
528
529    def add_find_links(self, urls):
530        """Add `urls` to the list that will be prescanned for searches"""
531        for url in urls:
532            if (
533                self.to_scan is None  # if we have already "gone online"
534                or not URL_SCHEME(url)  # or it's a local file/directory
535                or url.startswith('file:')
536                or list(distros_for_url(url))  # or a direct package link
537            ):
538                # then go ahead and process it now
539                self.scan_url(url)
540            else:
541                # otherwise, defer retrieval till later
542                self.to_scan.append(url)
543
544    def prescan(self):
545        """Scan urls scheduled for prescanning (e.g. --find-links)"""
546        if self.to_scan:
547            list(map(self.scan_url, self.to_scan))
548        self.to_scan = None  # from now on, go ahead and process immediately
549
550    def not_found_in_index(self, requirement):
551        if self[requirement.key]:  # we've seen at least one distro
552            meth, msg = self.info, "Couldn't retrieve index page for %r"
553        else:  # no distros seen for this name, might be misspelled
554            meth, msg = (
555                self.warn,
556                "Couldn't find index page for %r (maybe misspelled?)")
557        meth(msg, requirement.unsafe_name)
558        self.scan_all()
559
560    def download(self, spec, tmpdir):
561        """Locate and/or download `spec` to `tmpdir`, returning a local path
562
563        `spec` may be a ``Requirement`` object, or a string containing a URL,
564        an existing local filename, or a project/version requirement spec
565        (i.e. the string form of a ``Requirement`` object).  If it is the URL
566        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
567        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
568        automatically created alongside the downloaded file.
569
570        If `spec` is a ``Requirement`` object or a string containing a
571        project/version requirement spec, this method returns the location of
572        a matching distribution (possibly after downloading it to `tmpdir`).
573        If `spec` is a locally existing file or directory name, it is simply
574        returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
575        of `tmpdir`, and the local filename is returned.  Various errors may be
576        raised if a problem occurs during downloading.
577        """
578        if not isinstance(spec, Requirement):
579            scheme = URL_SCHEME(spec)
580            if scheme:
581                # It's a url, download it to tmpdir
582                found = self._download_url(scheme.group(1), spec, tmpdir)
583                base, fragment = egg_info_for_url(spec)
584                if base.endswith('.py'):
585                    found = self.gen_setup(found, fragment, tmpdir)
586                return found
587            elif os.path.exists(spec):
588                # Existing file or directory, just return it
589                return spec
590            else:
591                spec = parse_requirement_arg(spec)
592        return getattr(self.fetch_distribution(spec, tmpdir), 'location', None)
593
    def fetch_distribution(
            self, requirement, tmpdir, force_scan=False, source=False,
            develop_ok=False, local_index=None):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        a warning is logged and ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.

        If `local_index` is given, it is searched with the same matching
        rules whenever the preceding step found nothing.
        """
        # process a Requirement
        self.info("Searching for %s", requirement)
        # distributions already reported as skipped, so each is warned once
        skipped = {}
        dist = None

        def find(req, env=None):
            # Search `env` (this index by default) for the first usable match
            # of `req`; returns None implicitly when there is none.
            if env is None:
                env = self
            # Find a matching distribution; may be called more than once

            for dist in env[req.key]:

                if dist.precedence == DEVELOP_DIST and not develop_ok:
                    if dist not in skipped:
                        self.warn(
                            "Skipping development or system egg: %s", dist,
                        )
                        skipped[dist] = 1
                    continue

                # must satisfy the requirement and, when `source` is set,
                # have a precedence no higher than SOURCE_DIST
                test = (
                    dist in req
                    and (dist.precedence <= SOURCE_DIST or not source)
                )
                if test:
                    loc = self.download(dist.location, tmpdir)
                    dist.download_location = loc
                    # accept the candidate only if the download produced a
                    # path that exists
                    if os.path.exists(dist.download_location):
                        return dist

        # 1. forced scan: consult the index before anything else
        if force_scan:
            self.prescan()
            self.find_packages(requirement)
            dist = find(requirement)

        # 2. the caller-supplied local index
        if not dist and local_index is not None:
            dist = find(requirement, local_index)

        # 3. what this index already knows, after running pending prescans
        if dist is None:
            if self.to_scan is not None:
                self.prescan()
            dist = find(requirement)

        # 4. last resort: scan the index, unless step 1 already did
        if dist is None and not force_scan:
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or working download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        else:
            self.info("Best match: %s", dist)
            # hand back a copy located at the downloaded path
            return dist.clone(location=dist.download_location)
669
670    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
671        """Obtain a file suitable for fulfilling `requirement`
672
673        DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
674        backward compatibility, this routine is identical but returns the
675        ``location`` of the downloaded distribution instead of a distribution
676        object.
677        """
678        dist = self.fetch_distribution(requirement, tmpdir, force_scan, source)
679        if dist is not None:
680            return dist.location
681        return None
682
683    def gen_setup(self, filename, fragment, tmpdir):
684        match = EGG_FRAGMENT.match(fragment)
685        dists = match and [
686            d for d in
687            interpret_distro_name(filename, match.group(1), None) if d.version
688        ] or []
689
690        if len(dists) == 1:  # unambiguous ``#egg`` fragment
691            basename = os.path.basename(filename)
692
693            # Make sure the file has been downloaded to the temp dir.
694            if os.path.dirname(filename) != tmpdir:
695                dst = os.path.join(tmpdir, basename)
696                from setuptools.command.easy_install import samefile
697                if not samefile(filename, dst):
698                    shutil.copy2(filename, dst)
699                    filename = dst
700
701            with open(os.path.join(tmpdir, 'setup.py'), 'w') as file:
702                file.write(
703                    "from setuptools import setup\n"
704                    "setup(name=%r, version=%r, py_modules=[%r])\n"
705                    % (
706                        dists[0].project_name, dists[0].version,
707                        os.path.splitext(basename)[0]
708                    )
709                )
710            return filename
711
712        elif match:
713            raise DistutilsError(
714                "Can't unambiguously interpret project/version identifier %r; "
715                "any dashes in the name or version should be escaped using "
716                "underscores. %r" % (fragment, dists)
717            )
718        else:
719            raise DistutilsError(
720                "Can't process plain .py files without an '#egg=name-version'"
721                " suffix to enable automatic setup script generation."
722            )
723
    # Number of bytes requested per read by _download_to().
    dl_blocksize = 8192
725
726    def _download_to(self, url, filename):
727        self.info("Downloading %s", url)
728        # Download the file
729        fp = None
730        try:
731            checker = HashChecker.from_url(url)
732            fp = self.open_url(url)
733            if isinstance(fp, urllib.error.HTTPError):
734                raise DistutilsError(
735                    "Can't download %s: %s %s" % (url, fp.code, fp.msg)
736                )
737            headers = fp.info()
738            blocknum = 0
739            bs = self.dl_blocksize
740            size = -1
741            if "content-length" in headers:
742                # Some servers return multiple Content-Length headers :(
743                sizes = get_all_headers(headers, 'Content-Length')
744                size = max(map(int, sizes))
745                self.reporthook(url, filename, blocknum, bs, size)
746            with open(filename, 'wb') as tfp:
747                while True:
748                    block = fp.read(bs)
749                    if block:
750                        checker.feed(block)
751                        tfp.write(block)
752                        blocknum += 1
753                        self.reporthook(url, filename, blocknum, bs, size)
754                    else:
755                        break
756                self.check_hash(checker, filename, tfp)
757            return headers
758        finally:
759            if fp:
760                fp.close()
761
762    def reporthook(self, url, filename, blocknum, blksize, size):
763        pass  # no-op
764
    def open_url(self, url, warning=None):
        """Open `url` and return the response object.

        ``file:`` URLs go to ``local_open``; everything else is opened with
        ``open_with_auth`` using this index's opener.  An HTTP error response
        is returned (as the ``HTTPError`` object) rather than raised.  For
        any other failure handled here: if `warning` is given it is logged
        with the error detail as its argument and ``None`` is returned
        implicitly; otherwise ``DistutilsError`` is raised, chained to the
        original exception.
        """
        if url.startswith('file:'):
            return local_open(url)
        try:
            return open_with_auth(url, self.opener)
        except (ValueError, http_client.InvalidURL) as v:
            # malformed URL: report all exception arguments joined together
            msg = ' '.join([str(arg) for arg in v.args])
            if warning:
                self.warn(warning, msg)
            else:
                raise DistutilsError('%s %s' % (url, msg)) from v
        except urllib.error.HTTPError as v:
            # callers inspect the error response themselves
            return v
        except urllib.error.URLError as v:
            if warning:
                self.warn(warning, v.reason)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v.reason)) from v
        except http_client.BadStatusLine as v:
            if warning:
                self.warn(warning, v.line)
            else:
                raise DistutilsError(
                    '%s returned a bad status line. The server might be '
                    'down, %s' %
                    (url, v.line)
                ) from v
        except (http_client.HTTPException, socket.error) as v:
            # any remaining HTTP-level or socket-level failure
            if warning:
                self.warn(warning, v)
            else:
                raise DistutilsError("Download error for %s: %s"
                                     % (url, v)) from v
799
800    def _download_url(self, scheme, url, tmpdir):
801        # Determine download filename
802        #
803        name, fragment = egg_info_for_url(url)
804        if name:
805            while '..' in name:
806                name = name.replace('..', '.').replace('\\', '_')
807        else:
808            name = "__downloaded__"  # default if URL has no path contents
809
810        if name.endswith('.egg.zip'):
811            name = name[:-4]  # strip the extra .zip before download
812
813        filename = os.path.join(tmpdir, name)
814
815        # Download the file
816        #
817        if scheme == 'svn' or scheme.startswith('svn+'):
818            return self._download_svn(url, filename)
819        elif scheme == 'git' or scheme.startswith('git+'):
820            return self._download_git(url, filename)
821        elif scheme.startswith('hg+'):
822            return self._download_hg(url, filename)
823        elif scheme == 'file':
824            return urllib.request.url2pathname(urllib.parse.urlparse(url)[2])
825        else:
826            self.url_ok(url, True)  # raises error if not allowed
827            return self._attempt_download(url, filename)
828
    def scan_url(self, url):
        """Process `url` by calling ``self.process_url(url, True)``.

        NOTE(review): the meaning of the ``True`` flag is defined by
        ``process_url``, which is not visible here -- confirm there.
        """
        self.process_url(url, True)
831
832    def _attempt_download(self, url, filename):
833        headers = self._download_to(url, filename)
834        if 'html' in headers.get('content-type', '').lower():
835            return self._download_html(url, headers, filename)
836        else:
837            return filename
838
839    def _download_html(self, url, headers, filename):
840        file = open(filename)
841        for line in file:
842            if line.strip():
843                # Check for a subversion index page
844                if re.search(r'<title>([^- ]+ - )?Revision \d+:', line):
845                    # it's a subversion index page:
846                    file.close()
847                    os.unlink(filename)
848                    return self._download_svn(url, filename)
849                break  # not an index page
850        file.close()
851        os.unlink(filename)
852        raise DistutilsError("Unexpected HTML page found at " + url)
853
854    def _download_svn(self, url, filename):
855        warnings.warn("SVN download support is deprecated", UserWarning)
856        url = url.split('#', 1)[0]  # remove any fragment for svn's sake
857        creds = ''
858        if url.lower().startswith('svn:') and '@' in url:
859            scheme, netloc, path, p, q, f = urllib.parse.urlparse(url)
860            if not netloc and path.startswith('//') and '/' in path[2:]:
861                netloc, path = path[2:].split('/', 1)
862                auth, host = _splituser(netloc)
863                if auth:
864                    if ':' in auth:
865                        user, pw = auth.split(':', 1)
866                        creds = " --username=%s --password=%s" % (user, pw)
867                    else:
868                        creds = " --username=" + auth
869                    netloc = host
870                    parts = scheme, netloc, url, p, q, f
871                    url = urllib.parse.urlunparse(parts)
872        self.info("Doing subversion checkout from %s to %s", url, filename)
873        os.system("svn checkout%s -q %s %s" % (creds, url, filename))
874        return filename
875
876    @staticmethod
877    def _vcs_split_rev_from_url(url, pop_prefix=False):
878        scheme, netloc, path, query, frag = urllib.parse.urlsplit(url)
879
880        scheme = scheme.split('+', 1)[-1]
881
882        # Some fragment identification fails
883        path = path.split('#', 1)[0]
884
885        rev = None
886        if '@' in path:
887            path, rev = path.rsplit('@', 1)
888
889        # Also, discard fragment
890        url = urllib.parse.urlunsplit((scheme, netloc, path, query, ''))
891
892        return url, rev
893
894    def _download_git(self, url, filename):
895        filename = filename.split('#', 1)[0]
896        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
897
898        self.info("Doing git clone from %s to %s", url, filename)
899        os.system("git clone --quiet %s %s" % (url, filename))
900
901        if rev is not None:
902            self.info("Checking out %s", rev)
903            os.system("git -C %s checkout --quiet %s" % (
904                filename,
905                rev,
906            ))
907
908        return filename
909
910    def _download_hg(self, url, filename):
911        filename = filename.split('#', 1)[0]
912        url, rev = self._vcs_split_rev_from_url(url, pop_prefix=True)
913
914        self.info("Doing hg clone from %s to %s", url, filename)
915        os.system("hg clone --quiet %s %s" % (url, filename))
916
917        if rev is not None:
918            self.info("Updating to %s", rev)
919            os.system("hg --cwd %s up -C -r %s -q" % (
920                filename,
921                rev,
922            ))
923
924        return filename
925
    def debug(self, msg, *args):
        """Forward `msg` and `args` to ``log.debug``."""
        log.debug(msg, *args)
928
    def info(self, msg, *args):
        """Forward `msg` and `args` to ``log.info``."""
        log.info(msg, *args)
931
    def warn(self, msg, *args):
        """Forward `msg` and `args` to ``log.warn``."""
        log.warn(msg, *args)
934
935
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
# The trailing semicolon is optional.  `entity_sub` is the compiled
# pattern's bound ``sub`` method.
entity_sub = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
939
940
def decode_entity(match):
    """Return the unescaped text for the entity reference in `match`.

    `match` is a regex match object; its entire matched text is passed to
    ``unescape``.
    """
    return unescape(match.group(0))
944
945
def htmldecode(text):
    """
    Decode HTML entities in the given text.

    Every match of the ``entity_sub`` pattern in `text` is replaced by the
    result of ``decode_entity``; the substituted text is returned.

    >>> htmldecode(
    ...     'https://../package_name-0.1.2.tar.gz'
    ...     '?tokena=A&amp;tokenb=B">package_name-0.1.2.tar.gz')
    'https://../package_name-0.1.2.tar.gz?tokena=A&tokenb=B">package_name-0.1.2.tar.gz'
    """
    return entity_sub(decode_entity, text)
956
957
def socket_timeout(timeout=15):
    """Return a decorator that runs a callable under a temporary default
    socket timeout.

    While the decorated callable runs, ``socket.setdefaulttimeout`` is set
    to `timeout` (seconds); the previous default is restored afterwards,
    even if the callable raises.  The wrapper keeps the wrapped callable's
    name, docstring and other metadata.
    """
    def _socket_timeout(func):
        @wraps(func)
        def _wrapper(*args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            try:
                return func(*args, **kwargs)
            finally:
                # Always put the process-wide default back.
                socket.setdefaulttimeout(old_timeout)

        return _wrapper

    return _socket_timeout
971
972
973def _encode_auth(auth):
974    """
975    A function compatible with Python 2.3-3.3 that will encode
976    auth from a URL suitable for an HTTP header.
977    >>> str(_encode_auth('username%3Apassword'))
978    'dXNlcm5hbWU6cGFzc3dvcmQ='
979
980    Long auth strings should not cause a newline to be inserted.
981    >>> long_auth = 'username:' + 'password'*10
982    >>> chr(10) in str(_encode_auth(long_auth))
983    False
984    """
985    auth_s = urllib.parse.unquote(auth)
986    # convert to bytes
987    auth_bytes = auth_s.encode()
988    encoded_bytes = base64.b64encode(auth_bytes)
989    # convert back to a string
990    encoded = encoded_bytes.decode()
991    # strip the trailing carriage return
992    return encoded.replace('\n', '')
993
994
class Credential:
    """
    Holder for a username and its password.

    Iterating yields the username, then the password, so an instance can
    be unpacked like a two-item tuple; ``str()`` gives
    ``username:password``.
    """

    def __init__(self, username, password):
        self.username, self.password = username, password

    def __iter__(self):
        for field in ('username', 'password'):
            yield getattr(self, field)

    def __str__(self):
        return '%(username)s:%(password)s' % self.__dict__
1010
1011
class PyPIConfig(configparser.RawConfigParser):
    """Config parser pre-loaded from the user's ``.pypirc``, if any."""

    def __init__(self):
        """
        Load from ~/.pypirc
        """
        # Every section falls back to '' for these three options.
        defaults = {
            option: '' for option in ('username', 'password', 'repository')
        }
        configparser.RawConfigParser.__init__(self, defaults)

        home = os.path.expanduser('~')
        rc = os.path.join(home, '.pypirc')
        if os.path.exists(rc):
            self.read(rc)

    @property
    def creds_by_repository(self):
        """
        Map each non-blank repository value to the ``Credential`` of its
        section.
        """
        return dict(
            self._get_repo_cred(section)
            for section in self.sections()
            if self.get(section, 'repository').strip()
        )

    def _get_repo_cred(self, section):
        """Return ``(repository, Credential)`` for `section`, stripped."""
        username = self.get(section, 'username').strip()
        password = self.get(section, 'password').strip()
        repo = self.get(section, 'repository').strip()
        return repo, Credential(username, password)

    def find_credential(self, url):
        """
        If the URL indicated appears to be a repository defined in this
        config, return the credential for that repository.

        The first repository that `url` starts with wins; ``None`` is
        returned when nothing matches.
        """
        matches = (
            cred
            for repository, cred in self.creds_by_repository.items()
            if url.startswith(repository)
        )
        return next(matches, None)
1048
1049
def open_with_auth(url, opener=urllib.request.urlopen):
    """Open a urllib2 request, handling HTTP authentication

    Credentials are taken from the URL itself (``user[:pw]@host``, for
    ``http``/``https`` only) or, when the URL carries none, from a
    repository in ``~/.pypirc`` whose address `url` starts with.  When
    credentials are found they are sent as a ``Basic`` ``Authorization``
    header and stripped from the requested URL.

    Returns whatever `opener` returns for the request.  Raises
    ``http_client.InvalidURL`` when the netloc ends with ``:``.
    """

    parsed = urllib.parse.urlparse(url)
    scheme, netloc, path, params, query, frag = parsed

    # Double scheme does not raise on macOS as revealed by a
    # failing test. We would expect "nonnumeric port". Refs #20.
    if netloc.endswith(':'):
        raise http_client.InvalidURL("nonnumeric port: ''")

    if scheme in ('http', 'https'):
        auth, address = _splituser(netloc)
    else:
        # URL-embedded credentials are only honoured for http(s), but
        # `address` must still be bound: a .pypirc match below can set
        # `auth` for any scheme, and the code that follows then uses it.
        auth, address = None, netloc

    if not auth:
        cred = PyPIConfig().find_credential(url)
        if cred:
            auth = str(cred)
            info = cred.username, url
            log.info('Authenticating as %s for %s (from .pypirc)', *info)

    if auth:
        auth = "Basic " + _encode_auth(auth)
        # Request the URL without the user:pw@ part.
        parts = scheme, address, path, params, query, frag
        new_url = urllib.parse.urlunparse(parts)
        request = urllib.request.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib.request.Request(url)

    request.add_header('User-Agent', user_agent)
    fp = opener(request)

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = urllib.parse.urlparse(fp.url)
        if s2 == scheme and h2 == address:
            parts = s2, netloc, path2, param2, query2, frag2
            fp.url = urllib.parse.urlunparse(parts)

    return fp
1094
1095
1096# copy of urllib.parse._splituser from Python 3.8
1097def _splituser(host):
1098    """splituser('user[:passwd]@host[:port]')
1099    --> 'user[:passwd]', 'host[:port]'."""
1100    user, delim, host = host.rpartition('@')
1101    return (user if delim else None), host
1102
1103
# Rebind open_with_auth to a wrapper produced by socket_timeout, passing
# _SOCKET_TIMEOUT as the timeout, to avoid freezing package_index
open_with_auth = socket_timeout(_SOCKET_TIMEOUT)(open_with_auth)
1106
1107
def fix_sf_url(url):
    """Return `url` unchanged; kept only for backward compatibility."""
    return url  # backward compatibility
1110
1111
def local_open(url):
    """Read a local path, with special support for directories

    * A regular file is opened with ``urllib.request.urlopen``.
    * A directory (URL path ending in ``/``) yields a 200 response whose
      body is the directory's ``index.html`` if it has one, otherwise a
      generated page with one link per entry (sub-directories get a
      trailing ``/``).
    * Anything else yields a 404 response.

    The directory and 404 cases are returned as ``HTTPError`` objects
    rather than raised.
    """
    url_path = urllib.parse.urlparse(url)[2]
    filename = urllib.request.url2pathname(url_path)

    if os.path.isfile(filename):
        return urllib.request.urlopen(url)

    if url_path.endswith('/') and os.path.isdir(filename):
        entries = os.listdir(filename)
        if 'index.html' in entries:
            # A ready-made index takes precedence over a generated listing.
            with open(os.path.join(filename, 'index.html'), 'r') as fp:
                body = fp.read()
        else:
            links = []
            for entry in entries:
                if os.path.isdir(os.path.join(filename, entry)):
                    entry += '/'
                links.append('<a href="{name}">{name}</a>'.format(name=entry))
            tmpl = (
                "<html><head><title>{url}</title>"
                "</head><body>{files}</body></html>")
            body = tmpl.format(url=url, files='\n'.join(links))
        status, message = 200, "OK"
    else:
        status, message, body = 404, "Path not found", "Not found"

    headers = {'content-type': 'text/html'}
    body_stream = six.StringIO(body)
    return urllib.error.HTTPError(url, status, message, headers, body_stream)
1141