comparison planemo/lib/python3.7/site-packages/distlib/locators.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2012-2015 Vinay Sajip.
4 # Licensed to the Python Software Foundation under a contributor agreement.
5 # See LICENSE.txt and CONTRIBUTORS.txt.
6 #
7
8 import gzip
9 from io import BytesIO
10 import json
11 import logging
12 import os
13 import posixpath
14 import re
15 try:
16 import threading
17 except ImportError: # pragma: no cover
18 import dummy_threading as threading
19 import zlib
20
21 from . import DistlibException
22 from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
23 queue, quote, unescape, string_types, build_opener,
24 HTTPRedirectHandler as BaseRedirectHandler, text_type,
25 Request, HTTPError, URLError)
26 from .database import Distribution, DistributionPath, make_dist
27 from .metadata import Metadata, MetadataInvalidError
28 from .util import (cached_property, parse_credentials, ensure_slash,
29 split_filename, get_project_data, parse_requirement,
30 parse_name_and_version, ServerProxy, normalize_name)
31 from .version import get_scheme, UnsupportedVersionError
32 from .wheel import Wheel, is_compatible
33
# Module-level logger shared by every locator in this file.
logger = logging.getLogger(__name__)

# Matches a leading "<algorithm>=<lowercase hex digest>" pair; applied to URL
# fragments when converting a URL to download info.
HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
# Extracts the charset parameter from a Content-Type header value.
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
# Content types which are treated as scrapeable pages when fetching URLs.
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
# Index URL used by get_all_distribution_names() when no URL is given.
DEFAULT_INDEX = 'https://pypi.org/pypi'
40
def get_all_distribution_names(url=None):
    """
    Ask an index for the names of every distribution it knows about.

    :param url: The URL of the index. When ``None``, ``DEFAULT_INDEX`` is
                used.
    :return: Whatever the index's ``list_packages()`` call returns.
    """
    index_url = DEFAULT_INDEX if url is None else url
    proxy = ServerProxy(index_url, timeout=3.0)
    try:
        names = proxy.list_packages()
    finally:
        # Release the connection whether or not the call succeeded.
        proxy('close')()
    return names
54
class RedirectHandler(BaseRedirectHandler):
    """
    A redirect handler which copes with relative redirect targets.

    Some Python 3.2.x releases (e.g. 3.2.2 on Ubuntu Oneiric) reject a
    redirect header such as ``/abc`` because its scheme is empty, when the
    scheme of the request URL should be used instead. See Python issue
    #13696.
    """

    def http_error_302(self, req, fp, code, msg, headers):
        """
        Make the redirect target absolute, then defer to the base handler.

        Returns ``None`` without redirecting when neither a ``location`` nor
        a ``uri`` header is present.
        """
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use the first header found.
        target = None
        header_name = None
        for candidate in ('location', 'uri'):
            if candidate in headers:
                header_name = candidate
                target = headers[candidate]
                break
        if target is None:  # pragma: no cover
            return
        if urlparse(target).scheme == '':
            # Relative redirect: resolve it against the request URL and
            # store the absolute form back in the headers.
            target = urljoin(req.get_full_url(), target)
            if hasattr(headers, 'replace_header'):
                headers.replace_header(header_name, target)
            else:
                headers[header_name] = target
        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302
85
class Locator(object):
    """
    A base class for locators - things that locate distributions.

    Subclasses implement :meth:`_get_project` and
    :meth:`get_distribution_names`; this class supplies caching, error
    collection, URL scoring and the :meth:`locate` entry point.
    """
    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
    binary_extensions = ('.egg', '.exe', '.whl')
    excluded_extensions = ('.pdf',)

    # A list of tags indicating which wheels you want to match. The default
    # value of None matches against the tags compatible with the running
    # Python. If you want to match other values, set wheel_tags on a locator
    # instance to a list of tuples (pyver, abi, arch) which you want to match.
    wheel_tags = None

    downloadable_extensions = source_extensions + ('.whl',)

    def __init__(self, scheme='default'):
        """
        Initialise an instance.

        :param scheme: Because locators look for most recent versions, they
                       need to know the version scheme to use. This specifies
                       the current PEP-recommended scheme - use ``'legacy'``
                       if you need to support existing distributions on PyPI.
        """
        self._cache = {}
        self.scheme = scheme
        # Because of bugs in some of the handlers on some of the platforms,
        # we use our own opener rather than just using urlopen.
        self.opener = build_opener(RedirectHandler())
        # If get_project() is called from locate(), the matcher instance
        # is set from the requirement passed to locate(). See issue #18 for
        # why this can be useful to know.
        self.matcher = None
        self.errors = queue.Queue()

    def get_errors(self):
        """
        Return any errors which have occurred.

        The error queue is drained as a side effect.

        :return: A list of the queued errors, oldest first.
        """
        result = []
        while not self.errors.empty():  # pragma: no cover
            try:
                e = self.errors.get(False)
                result.append(e)
            except queue.Empty:
                # Another consumer emptied the queue between the empty()
                # check and the get(). The exception class lives on the
                # queue module, not on Queue instances.
                continue
            self.errors.task_done()
        return result

    def clear_errors(self):
        """
        Clear any errors which may have been logged.
        """
        # Just get the errors and throw them away
        self.get_errors()

    def clear_cache(self):
        """
        Forget all cached :meth:`get_project` results.
        """
        self._cache.clear()

    def _get_scheme(self):
        return self._scheme

    def _set_scheme(self, value):
        self._scheme = value

    # Exposed as a property so that subclasses can hook scheme changes.
    scheme = property(_get_scheme, _set_scheme)

    def _get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to
        Distribution instances.

        This should be implemented in subclasses.

        If called from a locate() request, self.matcher will be set to a
        matcher for the requirement to satisfy, otherwise it will be None.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to
        Distribution instances.

        This calls _get_project to do all the work, and just implements a
        caching layer on top. Setting ``self._cache`` to ``None`` disables
        caching.
        """
        if self._cache is None:  # pragma: no cover
            result = self._get_project(name)
        elif name in self._cache:
            result = self._cache[name]
        else:
            # Errors from earlier lookups are discarded before a fresh one.
            self.clear_errors()
            result = self._get_project(name)
            self._cache[name] = result
        return result

    def score_url(self, url):
        """
        Give an url a score which can be used to choose preferred URLs
        for a given project release.

        :return: A tuple which compares greater for more desirable URLs:
                 (is https, is on pypi.org, has a downloadable extension,
                 is a wheel, is compatible, basename).
        """
        t = urlparse(url)
        basename = posixpath.basename(t.path)
        compatible = True
        is_wheel = basename.endswith('.whl')
        is_downloadable = basename.endswith(self.downloadable_extensions)
        if is_wheel:
            compatible = is_compatible(Wheel(basename), self.wheel_tags)
        return (t.scheme == 'https', 'pypi.org' in t.netloc,
                is_downloadable, is_wheel, compatible, basename)

    def prefer_url(self, url1, url2):
        """
        Choose one of two URLs where both are candidates for distribution
        archives for the same version of a distribution (for example,
        .tar.gz vs. zip).

        The current implementation favours https:// URLs over http://, archives
        from PyPI over those from other locations, wheel compatibility (if a
        wheel) and then the archive name.

        :param url1: The URL currently held; may be empty or ``None``, in
                     which case ``url2`` is returned.
        :param url2: The newly found candidate URL.
        """
        result = url2
        if url1:
            s1 = self.score_url(url1)
            s2 = self.score_url(url2)
            if s1 > s2:
                result = url1
            if result != url2:
                logger.debug('Not replacing %r with %r', url1, url2)
            else:
                logger.debug('Replacing %r with %r', url1, url2)
        return result

    def split_filename(self, filename, project_name):
        """
        Attempt to split a filename in project name, version and Python version.
        """
        return split_filename(filename, project_name)

    def convert_url_to_download_info(self, url, project_name):
        """
        See if a URL is a candidate for a download URL for a project (the URL
        has typically been scraped from an HTML page).

        If it is, a dictionary is returned with keys "name", "version",
        "filename" and "url"; otherwise, None is returned. A
        "python-version" key and an "<algo>_digest" key (taken from the URL
        fragment) are added when that information is available.

        :param url: The candidate URL.
        :param project_name: The project being looked for, or ``None`` to
                             accept any project.
        """
        def same_project(name1, name2):
            return normalize_name(name1) == normalize_name(name2)

        result = None
        scheme, netloc, path, params, query, frag = urlparse(url)
        if frag.lower().startswith('egg='):  # pragma: no cover
            logger.debug('%s: version hint in fragment: %r',
                         project_name, frag)
        m = HASHER_HASH.match(frag)
        if m:
            algo, digest = m.groups()
        else:
            algo, digest = None, None
        origpath = path
        if path and path[-1] == '/':  # pragma: no cover
            path = path[:-1]
        if path.endswith('.whl'):
            try:
                wheel = Wheel(path)
                if not is_compatible(wheel, self.wheel_tags):
                    logger.debug('Wheel not compatible: %s', path)
                else:
                    if project_name is None:
                        include = True
                    else:
                        include = same_project(wheel.name, project_name)
                    if include:
                        result = {
                            'name': wheel.name,
                            'version': wheel.version,
                            'filename': wheel.filename,
                            # The fragment is dropped from the stored URL.
                            'url': urlunparse((scheme, netloc, origpath,
                                               params, query, '')),
                            'python-version': ', '.join(
                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
                        }
            except Exception:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
            logger.debug('Not downloadable: %s', path)
        else:  # downloadable extension
            path = filename = posixpath.basename(path)
            for ext in self.downloadable_extensions:
                if path.endswith(ext):
                    path = path[:-len(ext)]
                    t = self.split_filename(path, project_name)
                    if not t:  # pragma: no cover
                        logger.debug('No match for project/version: %s', path)
                    else:
                        name, version, pyver = t
                        if not project_name or same_project(project_name, name):
                            result = {
                                'name': name,
                                'version': version,
                                'filename': filename,
                                'url': urlunparse((scheme, netloc, origpath,
                                                   params, query, '')),
                            }
                            if pyver:  # pragma: no cover
                                result['python-version'] = pyver
                    # Only the first matching extension is considered.
                    break
        if result and algo:
            result['%s_digest' % algo] = digest
        return result

    def _get_digest(self, info):
        """
        Get a digest from a dictionary by looking at a "digests" dictionary
        or keys of the form 'algo_digest'.

        Returns a 2-tuple (algo, digest) if found, else None. Currently
        looks only for SHA256, then MD5.
        """
        result = None
        if 'digests' in info:
            digests = info['digests']
            for algo in ('sha256', 'md5'):
                if algo in digests:
                    result = (algo, digests[algo])
                    break
        if not result:
            for algo in ('sha256', 'md5'):
                key = '%s_digest' % algo
                if key in info:
                    result = (algo, info[key])
                    break
        return result

    def _update_version_data(self, result, info):
        """
        Update a result dictionary (the final result from _get_project) with a
        dictionary for a specific version, which typically holds information
        gleaned from a filename or URL for an archive for the distribution.

        Note that the "name" and "version" keys are removed from ``info``.
        """
        name = info.pop('name')
        version = info.pop('version')
        if version in result:
            dist = result[version]
            md = dist.metadata
        else:
            dist = make_dist(name, version, scheme=self.scheme)
            md = dist.metadata
        dist.digest = digest = self._get_digest(info)
        url = info['url']
        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
            result['urls'].setdefault(version, set()).add(url)
        dist.locator = self
        result[version] = dist

    def locate(self, requirement, prereleases=False):
        """
        Find the most recent distribution which matches the given
        requirement.

        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
                            'foo (>= 1.0, < 2.0, != 1.3)'
        :param prereleases: If ``True``, allow pre-release versions
                            to be located. Otherwise, pre-release versions
                            are not returned.
        :return: A :class:`Distribution` instance, or ``None`` if no such
                 distribution could be located.
        :raises DistlibException: if the requirement cannot be parsed.
        """
        result = None
        r = parse_requirement(requirement)
        if r is None:  # pragma: no cover
            raise DistlibException('Not a valid requirement: %r' % requirement)
        scheme = get_scheme(self.scheme)
        self.matcher = matcher = scheme.matcher(r.requirement)
        try:
            logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
            versions = self.get_project(r.name)
            if len(versions) > 2:   # urls and digests keys are present
                # sometimes, versions are invalid
                slist = []
                vcls = matcher.version_class
                for k in versions:
                    if k in ('urls', 'digests'):
                        continue
                    try:
                        if not matcher.match(k):
                            logger.debug('%s did not match %r', matcher, k)
                        else:
                            if prereleases or not vcls(k).is_prerelease:
                                slist.append(k)
                            else:
                                logger.debug('skipping pre-release '
                                             'version %s of %s', k, matcher.name)
                    except Exception:  # pragma: no cover
                        # Unparseable versions are reported and skipped.
                        logger.warning('error matching %s with %r', matcher, k)
                if len(slist) > 1:
                    slist = sorted(slist, key=scheme.key)
                if slist:
                    logger.debug('sorted list: %s', slist)
                    version = slist[-1]
                    result = versions[version]
            if result:
                if r.extras:
                    result.extras = r.extras
                result.download_urls = versions.get('urls', {}).get(version, set())
                d = {}
                sd = versions.get('digests', {})
                for url in result.download_urls:
                    if url in sd:  # pragma: no cover
                        d[url] = sd[url]
                result.digests = d
        finally:
            # The matcher is only meaningful for the duration of a locate()
            # call; reset it even if the lookup raised.
            self.matcher = None
        return result
409
410
class PyPIRPCLocator(Locator):
    """
    A locator which talks XML-RPC to an index. Because it needs the RPC
    interface, it cannot be used with simple mirrors (which only mirror
    file content).
    """

    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The URL to use for XML-RPC.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIRPCLocator, self).__init__(**kwargs)
        self.base_url = url
        self.client = ServerProxy(url, timeout=3.0)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        names = self.client.list_packages()
        return set(names)

    def _get_project(self, name):
        """
        Build the version -> Distribution mapping for ``name`` from the
        index's release data. Releases with no files are left out.
        """
        result = {'urls': {}, 'digests': {}}
        url_map = result['urls']
        digest_map = result['digests']
        for release in self.client.package_releases(name, True):
            file_infos = self.client.release_urls(name, release)
            release_data = self.client.release_data(name, release)
            metadata = Metadata(scheme=self.scheme)
            metadata.name = release_data['name']
            metadata.version = release_data['version']
            metadata.license = release_data.get('license')
            metadata.keywords = release_data.get('keywords', [])
            metadata.summary = release_data.get('summary')
            dist = Distribution(metadata)
            if not file_infos:
                continue
            # The first listed file supplies the source URL and digest.
            first = file_infos[0]
            metadata.source_url = first['url']
            dist.digest = self._get_digest(first)
            dist.locator = self
            result[release] = dist
            for file_info in file_infos:
                file_url = file_info['url']
                file_digest = self._get_digest(file_info)
                url_map.setdefault(release, set()).add(file_url)
                digest_map[file_url] = file_digest
        return result
458
class PyPIJSONLocator(Locator):
    """
    A locator built on PyPI's JSON interface. It's very limited in
    functionality and probably not worth using.
    """

    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The base URL of the JSON interface.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIJSONLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _register_files(self, result, dist, version, file_infos):
        """
        Record each file's URL and digest on both ``dist`` and ``result``.
        """
        for file_info in file_infos:
            file_url = file_info['url']
            dist.download_urls.add(file_url)
            digest = self._get_digest(file_info)
            dist.digests[file_url] = digest
            result['urls'].setdefault(version, set()).add(file_url)
            result['digests'][file_url] = digest

    def _get_project(self, name):
        """
        Fetch ``<base_url><name>/json`` and turn it into the version ->
        Distribution mapping. Any failure is queued on ``self.errors`` and
        logged, and whatever was gathered so far is returned.
        """
        result = {'urls': {}, 'digests': {}}
        json_url = urljoin(self.base_url, '%s/json' % quote(name))
        try:
            response = self.opener.open(json_url)
            payload = json.loads(response.read().decode())  # for now
            md = Metadata(scheme=self.scheme)
            project = payload['info']
            md.name = project['name']
            md.version = project['version']
            md.license = project.get('license')
            md.keywords = project.get('keywords', [])
            md.summary = project.get('summary')
            dist = Distribution(md)
            dist.locator = self
            current_files = payload['urls']
            result[md.version] = dist
            self._register_files(result, dist, md.version, current_files)
            # Now get other releases
            for version, file_infos in payload['releases'].items():
                if version == md.version:
                    continue    # already done
                other_md = Metadata(scheme=self.scheme)
                other_md.name = md.name
                other_md.version = version
                other_dist = Distribution(other_md)
                other_dist.locator = self
                result[version] = other_dist
                self._register_files(result, other_dist, version, file_infos)
        except Exception as e:
            self.errors.put(text_type(e))
            logger.exception('JSON fetch failed: %s', e)
        return result
526
527
class Page(object):
    """
    This class represents a scraped HTML page.
    """
    # The following slightly hairy-looking regex just looks for the contents of
    # an anchor link, which has an attribute "href" either immediately preceded
    # or immediately followed by a "rel" attribute. The attribute values can be
    # declared with double quotes, single quotes or no quotes - which leads to
    # the length of the expression.
    _href = re.compile("""
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
""", re.I | re.S | re.X)
    # Picks up the target of a <base href=...> element, if the page has one.
    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)

    def __init__(self, data, url):
        """
        Initialise an instance with the Unicode page contents and the URL they
        came from.

        :param data: The page contents, as text.
        :param url: The URL the page was fetched from. Relative links are
                    resolved against it unless the page declares a
                    ``<base href>``, which then takes precedence.
        """
        self.data = data
        self.base_url = self.url = url
        m = self._base.search(self.data)
        if m:
            self.base_url = m.group(1)

    # Any character NOT in this set gets percent-escaped in link URLs.
    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    @cached_property
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.

        :return: A list of ``(url, rel)`` tuples without duplicates, sorted
                 by URL in descending order. ``rel`` is ``''`` when the link
                 has no "rel" attribute.
        """
        def escape(m):
            # Percent-escapes need two hex digits, so zero-pad: a control
            # character such as "\x01" becomes "%01", not "% 1".
            return '%%%02x' % ord(m.group(0))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(escape, url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result
584
585
class SimpleScrapingLocator(Locator):
    """
    A locator which scrapes HTML pages to locate downloads for a distribution.
    This runs multiple threads to do the I/O; performance is at least as good
    as pip's PackageFinder, which works in an analogous fashion.
    """

    # These are used to deal with various Content-Encoding schemes. Each
    # decoder takes the raw response bytes and returns the decoded bytes.
    decoders = {
        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
        'none': lambda b: b,
    }

    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
        """
        Initialise an instance.

        :param url: The root URL to use for scraping.
        :param timeout: The timeout, in seconds, to be applied to requests.
                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads you want to do I/O,
                            This defaults to 10.
        :param kwargs: Passed to the superclass.
        """
        super(SimpleScrapingLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)
        self.timeout = timeout
        self._page_cache = {}
        self._seen = set()
        self._to_fetch = queue.Queue()
        self._bad_hosts = set()
        self.skip_externals = False
        self.num_workers = num_workers
        self._lock = threading.RLock()
        # See issue #45: we need to be resilient when the locator is used
        # in a thread, e.g. with concurrent.futures. We can't use self._lock
        # as it is for coordinating our internal threads - the ones created
        # in _prepare_threads.
        self._gplock = threading.RLock()
        self.platform_check = False             # See issue #112

    def _prepare_threads(self):
        """
        Threads are created only when get_project is called, and terminate
        before it returns. They are there primarily to parallelise I/O (i.e.
        fetching web pages).
        """
        self._threads = []
        for _ in range(self.num_workers):
            t = threading.Thread(target=self._fetch)
            # Attribute form of the deprecated setDaemon(True).
            t.daemon = True
            t.start()
            self._threads.append(t)

    def _wait_threads(self):
        """
        Tell all the threads to terminate (by sending a sentinel value) and
        wait for them to do so.
        """
        # Note that you need two loops, since you can't say which
        # thread will get each sentinel
        for t in self._threads:
            self._to_fetch.put(None)    # sentinel
        for t in self._threads:
            t.join()
        self._threads = []

    def _get_project(self, name):
        """
        Scrape ``<base_url><name>/`` (and the pages queued from it) with the
        worker threads, and return the version -> Distribution mapping.
        """
        result = {'urls': {}, 'digests': {}}
        with self._gplock:
            # The workers read these attributes while scraping.
            self.result = result
            self.project_name = name
            url = urljoin(self.base_url, '%s/' % quote(name))
            self._seen.clear()
            self._page_cache.clear()
            self._prepare_threads()
            try:
                logger.debug('Queueing %s', url)
                self._to_fetch.put(url)
                self._to_fetch.join()
            finally:
                self._wait_threads()
            del self.result
        return result

    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)

    def _is_platform_dependent(self, url):
        """
        Does an URL refer to a platform-specific download?

        Returns the regex match object (truthy) or ``None``.
        """
        return self.platform_dependent.search(url)

    def _process_download(self, url):
        """
        See if an URL is a suitable download for a project.

        If it is, register information in the result dictionary (for
        _get_project) about the specific version it's for.

        Note that the return value isn't actually used other than as a boolean
        value.
        """
        if self.platform_check and self._is_platform_dependent(url):
            info = None
        else:
            info = self.convert_url_to_download_info(url, self.project_name)
        logger.debug('process_download: %s -> %s', url, info)
        if info:
            with self._lock:    # needed because self.result is shared
                self._update_version_data(self.result, info)
        return info

    def _should_queue(self, link, referrer, rel):
        """
        Determine whether a link URL from a referring page and with a
        particular "rel" attribute should be queued for scraping.
        """
        scheme, netloc, path, _, _, _ = urlparse(link)
        if path.endswith(self.source_extensions + self.binary_extensions +
                         self.excluded_extensions):
            result = False
        elif self.skip_externals and not link.startswith(self.base_url):
            result = False
        elif not referrer.startswith(self.base_url):
            result = False
        elif rel not in ('homepage', 'download'):
            result = False
        elif scheme not in ('http', 'https', 'ftp'):
            result = False
        elif self._is_platform_dependent(link):
            result = False
        else:
            host = netloc.split(':', 1)[0]
            if host.lower() == 'localhost':
                result = False
            else:
                result = True
        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
                     referrer, result)
        return result

    def _fetch(self):
        """
        Get a URL to fetch from the work queue, get the HTML page, examine its
        links for download candidates and candidates for further scraping.

        This is a handy method to run in a thread. It loops until it
        receives a falsy sentinel from the queue.
        """
        while True:
            url = self._to_fetch.get()
            try:
                if url:
                    page = self.get_page(url)
                    if page is None:    # e.g. after an error
                        continue
                    for link, rel in page.links:
                        if link not in self._seen:
                            try:
                                self._seen.add(link)
                                if (not self._process_download(link) and
                                        self._should_queue(link, url, rel)):
                                    logger.debug('Queueing %s from %s', link, url)
                                    self._to_fetch.put(link)
                            except MetadataInvalidError:  # e.g. invalid versions
                                pass
            except Exception as e:  # pragma: no cover
                self.errors.put(text_type(e))
            finally:
                # always do this, to avoid hangs :-)
                self._to_fetch.task_done()
            if not url:
                # Sentinel seen, quitting.
                break

    def get_page(self, url):
        """
        Get the HTML for an URL, possibly from an in-memory cache.

        The cache is emptied at the start of each ``_get_project`` call and
        is otherwise never invalidated, so pages fetched through
        ``get_distribution_names`` may get stale over the lifetime of a
        locator instance.

        :return: A :class:`Page`, or ``None`` if the URL could not be
                 fetched, is on a host already marked bad, or is not an
                 HTML/XML page. Failures are cached as ``None`` too.
        """
        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
        scheme, netloc, path, _, _, _ = urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            url = urljoin(ensure_slash(url), 'index.html')

        if url in self._page_cache:
            result = self._page_cache[url]
            logger.debug('Returning %s from cache: %s', url, result)
        else:
            host = netloc.split(':', 1)[0]
            result = None
            if host in self._bad_hosts:
                logger.debug('Skipping %s due to bad host %s', url, host)
            else:
                req = Request(url, headers={'Accept-encoding': 'identity'})
                try:
                    logger.debug('Fetching %s', url)
                    resp = self.opener.open(req, timeout=self.timeout)
                    logger.debug('Fetched %s', url)
                    headers = resp.info()
                    content_type = headers.get('Content-Type', '')
                    if HTML_CONTENT_TYPE.match(content_type):
                        final_url = resp.geturl()
                        data = resp.read()
                        encoding = headers.get('Content-Encoding')
                        if encoding:
                            decoder = self.decoders[encoding]   # fail if not found
                            data = decoder(data)
                        encoding = 'utf-8'
                        m = CHARSET.search(content_type)
                        if m:
                            encoding = m.group(1)
                        try:
                            data = data.decode(encoding)
                        except UnicodeError:  # pragma: no cover
                            data = data.decode('latin-1')    # fallback
                        result = Page(data, final_url)
                        # Cache under the post-redirect URL as well.
                        self._page_cache[final_url] = result
                except HTTPError as e:
                    if e.code != 404:
                        logger.exception('Fetch failed: %s: %s', url, e)
                except URLError as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                    with self._lock:
                        self._bad_hosts.add(host)
                except Exception as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                finally:
                    self._page_cache[url] = result   # even if None (failure)
        return result

    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.

        :raises DistlibException: if the base URL's page cannot be fetched.
        """
        result = set()
        page = self.get_page(self.base_url)
        if not page:
            raise DistlibException('Unable to get %s' % self.base_url)
        for match in self._distname_re.finditer(page.data):
            result.add(match.group(1))
        return result
834
class DirectoryLocator(Locator):
    """
    A locator which finds distribution archives in a directory tree.
    """

    def __init__(self, path, **kwargs):
        """
        Initialise an instance.

        :param path: The root of the directory tree to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * recursive - if True (the default), subdirectories are
                         recursed into. If False, only the top-level directory
                         is searched,
        :raises DistlibException: if ``path`` is not a directory.
        """
        self.recursive = kwargs.pop('recursive', True)
        super(DirectoryLocator, self).__init__(**kwargs)
        root_dir = os.path.abspath(path)
        if not os.path.isdir(root_dir):  # pragma: no cover
            raise DistlibException('Not a directory: %r' % root_dir)
        self.base_dir = root_dir

    def should_include(self, filename, parent):
        """
        Should a filename be considered as a candidate for a distribution
        archive? As well as the filename, the directory which contains it
        is provided, though not used by the current implementation.
        """
        return filename.endswith(self.downloadable_extensions)

    def _candidate_urls(self):
        """
        Yield a ``file:`` URL for every file accepted by should_include(),
        walking only the top-level directory when not recursive.
        """
        for parent, _subdirs, filenames in os.walk(self.base_dir):
            for filename in filenames:
                if not self.should_include(filename, parent):
                    continue
                full_path = os.path.abspath(os.path.join(parent, filename))
                yield urlunparse(('file', '', pathname2url(full_path),
                                  '', '', ''))
            if not self.recursive:
                break

    def _get_project(self, name):
        """
        Collect version data for every archive in the tree which belongs to
        project ``name``.
        """
        result = {'urls': {}, 'digests': {}}
        for archive_url in self._candidate_urls():
            info = self.convert_url_to_download_info(archive_url, name)
            if info:
                self._update_version_data(result, info)
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        names = set()
        for archive_url in self._candidate_urls():
            info = self.convert_url_to_download_info(archive_url, None)
            if info:
                names.add(info['name'])
        return names
899
class JSONLocator(Locator):
    """
    This locator uses special extended metadata (not available on PyPI) and is
    the basis of performant dependency resolution in distlib. Other locators
    require archive downloads before dependencies can be determined! As you
    might imagine, that can be slow.
    """

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        """
        Build the version -> Distribution mapping from the project data
        returned by get_project_data(). Only source sdists are considered.
        """
        result = {'urls': {}, 'digests': {}}
        data = get_project_data(name)
        if not data:
            return result
        for file_info in data.get('files', []):
            is_source_sdist = (file_info['ptype'] == 'sdist' and
                               file_info['pyversion'] == 'source')
            if not is_source_sdist:
                continue
            # We don't store summary in project metadata as it makes
            # the data bigger for no benefit during dependency
            # resolution
            summary = data.get('summary', 'Placeholder for summary')
            dist = make_dist(data['name'], file_info['version'],
                             summary=summary, scheme=self.scheme)
            md = dist.metadata
            md.source_url = file_info['url']
            # TODO SHA256 digest
            digest = file_info.get('digest')
            if digest:
                dist.digest = ('md5', digest)
            md.dependencies = file_info.get('requirements', {})
            dist.exports = file_info.get('exports', {})
            result[dist.version] = dist
            result['urls'].setdefault(dist.version, set()).add(file_info['url'])
        return result
937
class DistPathLocator(Locator):
    """
    A locator for distributions which are already installed on a path. It
    can be useful for adding to an :class:`AggregatingLocator`.
    """

    def __init__(self, distpath, **kwargs):
        """
        Initialise an instance.

        :param distpath: A :class:`DistributionPath` instance to search.
        :param kwargs: Passed to the superclass constructor.
        """
        super(DistPathLocator, self).__init__(**kwargs)
        assert isinstance(distpath, DistributionPath)
        self.distpath = distpath

    def _get_project(self, name):
        """
        Return a mapping holding the single installed distribution called
        ``name``, or an empty mapping if there is none.
        """
        dist = self.distpath.get_distribution(name)
        if dist is None:
            return {'urls': {}, 'digests': {}}
        # NOTE(review): 'digests' is keyed by version here, whereas the
        # other locators in this file key it by URL - confirm intent.
        return {
            dist.version: dist,
            'urls': {dist.version: {dist.source_url}},
            'digests': {dist.version: {None}},
        }
964
965
class AggregatingLocator(Locator):
    """
    This class allows you to chain and/or merge a list of locators.
    """
    def __init__(self, *locators, **kwargs):
        """
        Initialise an instance.

        :param locators: The list of locators to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * merge - if False (the default), the first successful
                         search from any of the locators is returned. If True,
                         the results from all locators are merged (this can be
                         slow).
        """
        self.merge = kwargs.pop('merge', False)
        # NOTE(review): self.locators is assigned before calling the
        # superclass constructor, presumably because that constructor sets
        # ``scheme``, whose setter below iterates over self.locators -
        # confirm against Locator.__init__.
        self.locators = locators
        super(AggregatingLocator, self).__init__(**kwargs)

    def clear_cache(self):
        """
        Clear this locator's cache and the cache of every chained locator.
        """
        super(AggregatingLocator, self).clear_cache()
        for locator in self.locators:
            locator.clear_cache()

    def _set_scheme(self, value):
        # Keep every chained locator on the same scheme as the aggregate.
        self._scheme = value
        for locator in self.locators:
            locator.scheme = value

    scheme = property(Locator.scheme.fget, _set_scheme)

    def _get_project(self, name):
        """
        Query the chained locators for ``name``.

        With ``merge=True`` the results of all locators are combined: for a
        version found by several locators the later locator's distribution
        wins, URL sets are unioned per version, and for digests an earlier
        locator's entry wins over a later one.

        With ``merge=False`` the first acceptable result is returned as-is.

        :param name: The name of the project to look up.
        :return: A project dictionary (possibly empty).
        """
        # Keys of a project dictionary which are not versions.
        bookkeeping = ('urls', 'digests')
        result = {}
        for locator in self.locators:
            d = locator.get_project(name)
            if not d:
                continue
            if self.merge:
                prior_urls = result.get('urls', {})
                prior_digests = result.get('digests', {})
                # The next line overwrites result['urls'] and
                # result['digests'] with the ones from d, so both are
                # rebuilt below from the prior values and from d.
                result.update(d)
                # Fresh containers are used so that nothing already merged
                # is lost when d's entries are empty, and so that the
                # dictionaries and sets owned by the individual locators
                # are never mutated.
                urls = {}
                for source in (d.get('urls') or {}, prior_urls):
                    for version, links in source.items():
                        urls.setdefault(version, set()).update(links)
                digests = dict(d.get('digests') or {})
                digests.update(prior_digests)
                result['urls'] = urls
                result['digests'] = digests
            else:
                # See issue #18. If any dists are found and we're looking
                # for specific constraints, we only return something if
                # a match is found. For example, if a DirectoryLocator
                # returns just foo (1.0) while we're looking for
                # foo (>= 2.0), we'll pretend there was nothing there so
                # that subsequent locators can be queried. Otherwise we
                # would just return foo (1.0) which would then lead to a
                # failure to find foo (>= 2.0), because other locators
                # weren't searched. Note that this only matters when
                # merge=False.
                if self.matcher is None:
                    found = True
                else:
                    found = False
                    for k in d:
                        # Only version keys may be offered to the matcher.
                        if k in bookkeeping:
                            continue
                        try:
                            if self.matcher.match(k):
                                found = True
                                break
                        except UnsupportedVersionError:
                            # A version the scheme cannot parse is treated
                            # as not matching, as in
                            # DependencyFinder.find_providers.
                            continue
                if found:
                    result = d
                    break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.

        Chained locators which cannot enumerate their distributions (i.e.
        which raise ``NotImplementedError``) are skipped.
        """
        result = set()
        for locator in self.locators:
            try:
                result |= locator.get_distribution_names()
            except NotImplementedError:
                pass
        return result
1053
1054
# Most of the dists on PyPI use legacy versions which don't conform to
# PEP 426 / PEP 440, so the default locator simply uses the legacy scheme.
default_locator = AggregatingLocator(
    JSONLocator(),
    SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
    scheme='legacy')

# Module-level shortcut to the default locator's locate() method.
locate = default_locator.locate

# Matches text of the form "name (version)" or "name (== version)".
NAME_VERSION_RE = re.compile(
    r'(?P<name>[\w-]+)\s*\(\s*(==\s*)?(?P<ver>[^)]+)\)$')
1067
class DependencyFinder(object):
    """
    Locate dependencies for distributions.

    Working state is kept on the instance and is reset at the start of
    every :meth:`find` call:

    * ``provided`` maps a provided name to a set of ``(version, dist)``
      tuples.
    * ``dists`` maps ``(name, version)`` to a distribution.
    * ``dists_by_name`` maps a distribution's key to a distribution.
    * ``reqts`` maps a distribution to the set of requirements recorded
      for it.
    """

    def __init__(self, locator=None):
        """
        Initialise an instance, using the specified locator
        to locate distributions.

        :param locator: The locator to use. If not specified, the module's
                        default locator is used.
        """
        self.locator = locator or default_locator
        self.scheme = get_scheme(self.locator.scheme)
        # Create the working state up front so that the helper methods
        # can be used on a fresh instance, before find() has been called.
        self._reset_state()

    def _reset_state(self):
        """
        (Re)initialise the bookkeeping used while resolving dependencies.
        """
        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

    def add_distribution(self, dist):
        """
        Add a distribution to the finder. This will update internal information
        about who provides what.
        :param dist: The distribution to add.
        """
        logger.debug('adding distribution %s', dist)
        name = dist.key
        self.dists_by_name[name] = dist
        self.dists[(name, dist.version)] = dist
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
            self.provided.setdefault(name, set()).add((version, dist))

    def remove_distribution(self, dist):
        """
        Remove a distribution from the finder. This will update internal
        information about who provides what.
        :param dist: The distribution to remove.
        """
        logger.debug('removing distribution %s', dist)
        name = dist.key
        del self.dists_by_name[name]
        del self.dists[(name, dist.version)]
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
            s = self.provided[name]
            s.remove((version, dist))
            # Don't leave empty sets behind for names nobody provides.
            if not s:
                del self.provided[name]

    def get_matcher(self, reqt):
        """
        Get a version matcher for a requirement.
        :param reqt: The requirement
        :type reqt: str
        :return: A version matcher (an instance of
                 :class:`distlib.version.Matcher`).
        """
        try:
            matcher = self.scheme.matcher(reqt)
        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat-mode if cannot read the version
            # Fall back to a matcher built from the first
            # whitespace-separated token of the requirement only.
            name = reqt.split()[0]
            matcher = self.scheme.matcher(name)
        return matcher

    def find_providers(self, reqt):
        """
        Find the distributions which can fulfill a requirement.

        The search stops at the first provider whose version matches, so
        the returned set holds at most one distribution.

        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distribution which can fulfill the requirement.
        """
        matcher = self.get_matcher(reqt)
        name = matcher.key   # case-insensitive
        result = set()
        provided = self.provided
        if name in provided:
            for version, provider in provided[name]:
                try:
                    match = matcher.match(version)
                except UnsupportedVersionError:
                    # A version which can't be parsed can't match.
                    match = False

                if match:
                    result.add(provider)
                    break
        return result

    def try_to_replace(self, provider, other, problems):
        """
        Attempt to replace one provider with another. This is typically used
        when resolving dependencies from multiple sources, e.g. A requires
        (B >= 1.0) while C requires (B >= 1.1).

        For successful replacement, ``provider`` must meet all the requirements
        which ``other`` fulfills.

        :param provider: The provider we are trying to replace with.
        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain what
                         problems prevented replacement. This is currently
                         a tuple of the literal string 'cantreplace',
                         ``provider``, ``other``  and the set of requirements
                         that ``provider`` couldn't fulfill.
        :return: True if we can replace ``other`` with ``provider``, else
                 False.
        """
        rlist = self.reqts[other]
        unmatched = set()
        for s in rlist:
            matcher = self.get_matcher(s)
            if not matcher.match(provider.version):
                unmatched.add(s)
        if unmatched:
            # can't replace other with provider
            problems.add(('cantreplace', provider, other,
                          frozenset(unmatched)))
            result = False
        else:
            # can replace other with provider
            self.remove_distribution(other)
            del self.reqts[other]
            # The requirements recorded for other now belong to provider.
            for s in rlist:
                self.reqts.setdefault(provider, set()).add(s)
            self.add_distribution(provider)
            result = True
        return result

    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on. The special value ``':*:'`` stands for
                            :test:, :build: and :dev: together.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.
        :raises DistlibException: if ``requirement`` is not a Distribution
                                  and cannot be located.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distribution located for (or passed as) ``requirement`` has its
        :attr:`requested` attribute set to ``True``. Every distribution
        returned has the :attr:`build_time_dependency` attribute set to
        ``True`` unless it is a post-installation dependency of the
        ``requirement``.

        Each problem is a tuple: either the string ``'unsatisfied'`` and
        the requirement which couldn't be satisfied by any distribution
        known to the locator, or a ``'cantreplace'`` tuple as described in
        :meth:`try_to_replace`.
        """

        # Every call starts from a clean slate.
        self._reset_state()

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key     # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            # Meta extras are only honoured for distributions which are
            # going to be installed.
            if meta_extras and dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems