comparison lib/python3.8/site-packages/pip/_internal/index/collector.py @ 1:64071f2a4cf0 draft default tip

Deleted selected files
author guerler
date Mon, 27 Jul 2020 03:55:49 -0400
parents 9e54283cc701
children
1 """
2 The main purpose of this module is to expose LinkCollector.collect_links().
3 """
4
5 import cgi
6 import itertools
7 import logging
8 import mimetypes
9 import os
10 from collections import OrderedDict
11
12 from pip._vendor import html5lib, requests
13 from pip._vendor.distlib.compat import unescape
14 from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
15 from pip._vendor.six.moves.urllib import parse as urllib_parse
16 from pip._vendor.six.moves.urllib import request as urllib_request
17
18 from pip._internal.models.link import Link
19 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
20 from pip._internal.utils.misc import redact_auth_from_url
21 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
22 from pip._internal.utils.urls import path_to_url, url_to_path
23 from pip._internal.vcs import is_url, vcs
24
25 if MYPY_CHECK_RUNNING:
26 from typing import (
27 Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
28 Union,
29 )
30 import xml.etree.ElementTree
31
32 from pip._vendor.requests import Response
33
34 from pip._internal.models.search_scope import SearchScope
35 from pip._internal.network.session import PipSession
36
37 HTMLElement = xml.etree.ElementTree.Element
38 ResponseHeaders = MutableMapping[str, str]
39
40
41 logger = logging.getLogger(__name__)
42
43
44 def _match_vcs_scheme(url):
45 # type: (str) -> Optional[str]
46 """Look for VCS schemes in the URL.
47
48 Returns the matched VCS scheme, or None if there's no match.
49 """
50 for scheme in vcs.schemes:
51 if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
52 return scheme
53 return None
54
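# A minimal sketch of the matching rule above (illustrative only, not part of
# the original module): assuming vcs.schemes contains an entry such as 'git',
# a URL like 'git+https://github.com/pypa/pip.git' matches because the
# character right after the scheme is '+'; a plain https URL would yield None.
def _example_match_vcs_scheme():
    # type: () -> Optional[str]
    return _match_vcs_scheme('git+https://github.com/pypa/pip.git')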
55
56 def _is_url_like_archive(url):
57 # type: (str) -> bool
58 """Return whether the URL looks like an archive.
59 """
60 filename = Link(url).filename
61 for bad_ext in ARCHIVE_EXTENSIONS:
62 if filename.endswith(bad_ext):
63 return True
64 return False
65
66
67 class _NotHTML(Exception):
68 def __init__(self, content_type, request_desc):
69 # type: (str, str) -> None
70 super(_NotHTML, self).__init__(content_type, request_desc)
71 self.content_type = content_type
72 self.request_desc = request_desc
73
74
75 def _ensure_html_header(response):
76 # type: (Response) -> None
77 """Check the Content-Type header to ensure the response contains HTML.
78
79 Raises `_NotHTML` if the content type is not text/html.
80 """
81 content_type = response.headers.get("Content-Type", "")
82 if not content_type.lower().startswith("text/html"):
83 raise _NotHTML(content_type, response.request.method)
84
85
86 class _NotHTTP(Exception):
87 pass
88
89
90 def _ensure_html_response(url, session):
91 # type: (str, PipSession) -> None
92 """Send a HEAD request to the URL, and ensure the response contains HTML.
93
94 Raises `_NotHTTP` if the URL is not available for a HEAD request, or
95 `_NotHTML` if the content type is not text/html.
96 """
97 scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
98 if scheme not in {'http', 'https'}:
99 raise _NotHTTP()
100
101 resp = session.head(url, allow_redirects=True)
102 resp.raise_for_status()
103
104 _ensure_html_header(resp)
105
106
107 def _get_html_response(url, session):
108 # type: (str, PipSession) -> Response
109 """Access an HTML page with GET, and return the response.
110
111 This consists of three parts:
112
113 1. If the URL looks suspiciously like an archive, send a HEAD first to
114 check the Content-Type is HTML, to avoid downloading a large file.
115 Raise `_NotHTTP` if the content type cannot be determined, or
116 `_NotHTML` if it is not HTML.
117 2. Actually perform the request. Raise HTTP exceptions on network failures.
118 3. Check the Content-Type header to make sure we got HTML, and raise
119 `_NotHTML` otherwise.
120 """
121 if _is_url_like_archive(url):
122 _ensure_html_response(url, session=session)
123
124 logger.debug('Getting page %s', redact_auth_from_url(url))
125
126 resp = session.get(
127 url,
128 headers={
129 "Accept": "text/html",
130 # We don't want to blindly return cached data for
131 # /simple/, because authors generally expect that
132 # twine upload && pip install will function, but if
133 # they've done a pip install in the last ~10 minutes
134 # it won't. Thus by setting this to zero we will not
135 # blindly use any cached data. However, the benefit of
136 # using max-age=0 instead of no-cache is that we will
137 # still support conditional requests, so we will still
138 # minimize traffic sent in cases where the page hasn't
139 # changed at all; we will just always incur the round
140 # trip for the conditional GET now instead of only
141 # once per 10 minutes.
142 # For more information, please see pypa/pip#5670.
143 "Cache-Control": "max-age=0",
144 },
145 )
146 resp.raise_for_status()
147
148 # The check for archives above only works if the URL ends with
149 # something that looks like an archive. However, that is not a
150 # requirement of a URL. Unless we issue a HEAD request on every
151 # URL we cannot know ahead of time for sure if something is HTML
152 # or not. However, we can check after we've downloaded it.
153 _ensure_html_header(resp)
154
155 return resp
156
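# A minimal sketch (not part of the original module) of calling the helper
# above directly; the index URL is illustrative only, and real callers are
# expected to handle _NotHTTP, _NotHTML and the requests exceptions the same
# way _get_html_page() does further below.
def _example_get_simple_page(session):
    # type: (PipSession) -> Response
    resp = _get_html_response('https://pypi.org/simple/pip/', session=session)
    # If we get here, the response passed the Content-Type check and is HTML.
    return resp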
157
158 def _get_encoding_from_headers(headers):
159 # type: (ResponseHeaders) -> Optional[str]
160 """Determine if we have any encoding information in our headers.
161 """
162 if headers and "Content-Type" in headers:
163 content_type, params = cgi.parse_header(headers["Content-Type"])
164 if "charset" in params:
165 return params['charset']
166 return None
167
168
169 def _determine_base_url(document, page_url):
170 # type: (HTMLElement, str) -> str
171 """Determine the HTML document's base URL.
172
173 This looks for a ``<base>`` tag in the HTML document. If present, its href
174 attribute denotes the base URL of anchor tags in the document. If there is
175 no such tag (or if it does not have a valid href attribute), the HTML
176 file's URL is used as the base URL.
177
178 :param document: An HTML document representation. The current
179 implementation expects the result of ``html5lib.parse()``.
180 :param page_url: The URL of the HTML document.
181 """
182 for base in document.findall(".//base"):
183 href = base.get("href")
184 if href is not None:
185 return href
186 return page_url
187
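# A small sketch of the lookup described above (illustrative only, not part
# of the original module). It reuses the module-level html5lib import; the
# mirror URL is made up.
def _example_determine_base_url():
    # type: () -> str
    document = html5lib.parse(
        b'<html><head><base href="https://mirror.example/simple/pip/">'
        b'</head><body></body></html>',
        namespaceHTMLElements=False,
    )
    # Returns the <base> href; with no <base> tag the page URL itself
    # ('https://pypi.org/simple/pip/') would be returned instead.
    return _determine_base_url(document, 'https://pypi.org/simple/pip/')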
188
189 def _clean_link(url):
190 # type: (str) -> str
191 """Makes sure a link is fully encoded. That is, if a ' ' shows up in
192 the link, it will be rewritten to %20 (while not over-quoting
193 % or other characters)."""
194 # Split the URL into parts according to the general structure
195 # `scheme://netloc/path;parameters?query#fragment`. Note that the
196 # `netloc` can be empty and the URI will then refer to a local
197 # filesystem path.
198 result = urllib_parse.urlparse(url)
199 # In both cases below we unquote prior to quoting to make sure
200 # nothing is double quoted.
201 if result.netloc == "":
202 # On Windows the path part might contain a drive letter which
203 # should not be quoted. On Linux where drive letters do not
204 # exist, the colon should be quoted. We rely on urllib.request
205 # to do the right thing here.
206 path = urllib_request.pathname2url(
207 urllib_request.url2pathname(result.path))
208 else:
209 # In addition to the `/` character we protect `@` so that
210 # revision strings in VCS URLs are properly parsed.
211 path = urllib_parse.quote(urllib_parse.unquote(result.path), safe="/@")
212 return urllib_parse.urlunparse(result._replace(path=path))
213
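# A minimal sketch (not part of the original module) of the quoting behaviour
# described above; both example URLs are made up.
def _example_clean_link():
    # type: () -> Tuple[str, str]
    return (
        # The space is rewritten to %20 ...
        _clean_link('https://example.com/demo 1.0.tar.gz'),
        # ... while an already-encoded path is unquoted first, so the %20
        # below is not double-quoted.
        _clean_link('https://example.com/demo%201.0.tar.gz'),
    )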
214
215 def _create_link_from_element(
216 anchor, # type: HTMLElement
217 page_url, # type: str
218 base_url, # type: str
219 ):
220 # type: (...) -> Optional[Link]
221 """
222 Convert an anchor element in a simple repository page to a Link.
223 """
224 href = anchor.get("href")
225 if not href:
226 return None
227
228 url = _clean_link(urllib_parse.urljoin(base_url, href))
229 pyrequire = anchor.get('data-requires-python')
230 pyrequire = unescape(pyrequire) if pyrequire else None
231
232 yanked_reason = anchor.get('data-yanked')
233 if yanked_reason:
234 # This is a unicode string in Python 2 (and 3).
235 yanked_reason = unescape(yanked_reason)
236
237 link = Link(
238 url,
239 comes_from=page_url,
240 requires_python=pyrequire,
241 yanked_reason=yanked_reason,
242 )
243
244 return link
245
246
247 def parse_links(page):
248 # type: (HTMLPage) -> Iterable[Link]
249 """
250 Parse an HTML document, and yield its anchor elements as Link objects.
251 """
252 document = html5lib.parse(
253 page.content,
254 transport_encoding=page.encoding,
255 namespaceHTMLElements=False,
256 )
257
258 url = page.url
259 base_url = _determine_base_url(document, url)
260 for anchor in document.findall(".//a"):
261 link = _create_link_from_element(
262 anchor,
263 page_url=url,
264 base_url=base_url,
265 )
266 if link is None:
267 continue
268 yield link
269
270
271 class HTMLPage(object):
272 """Represents one page, along with its URL"""
273
274 def __init__(
275 self,
276 content, # type: bytes
277 encoding, # type: Optional[str]
278 url, # type: str
279 ):
280 # type: (...) -> None
281 """
282 :param encoding: the encoding to decode the given content.
283 :param url: the URL from which the HTML was downloaded.
284 """
285 self.content = content
286 self.encoding = encoding
287 self.url = url
288
289 def __str__(self):
290 # type: () -> str
291 return redact_auth_from_url(self.url)
292
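# A minimal sketch (not part of the original module) of running a simple-API
# style page through parse_links(); the HTML snippet and URLs are
# illustrative only.
def _example_parse_links():
    # type: () -> List[Link]
    page = HTMLPage(
        content=(
            b'<html><body>'
            b'<a href="/packages/demo-1.0.tar.gz"'
            b' data-requires-python="&gt;=3.5">demo-1.0.tar.gz</a>'
            b'</body></html>'
        ),
        encoding='utf-8',
        url='https://pypi.org/simple/demo/',
    )
    # Each anchor becomes a Link whose URL is joined against the page URL
    # (or a <base> href, if present) and whose data-requires-python value
    # is unescaped into requires_python.
    return list(parse_links(page))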
293
294 def _handle_get_page_fail(
295 link, # type: Link
296 reason, # type: Union[str, Exception]
297 meth=None # type: Optional[Callable[..., None]]
298 ):
299 # type: (...) -> None
300 if meth is None:
301 meth = logger.debug
302 meth("Could not fetch URL %s: %s - skipping", link, reason)
303
304
305 def _make_html_page(response):
306 # type: (Response) -> HTMLPage
307 encoding = _get_encoding_from_headers(response.headers)
308 return HTMLPage(response.content, encoding=encoding, url=response.url)
309
310
311 def _get_html_page(link, session=None):
312 # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
313 if session is None:
314 raise TypeError(
315 "_get_html_page() missing 1 required keyword argument: 'session'"
316 )
317
318 url = link.url.split('#', 1)[0]
319
320 # Check for VCS schemes that do not support lookup as web pages.
321 vcs_scheme = _match_vcs_scheme(url)
322 if vcs_scheme:
323 logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
324 return None
325
326 # Tack index.html onto file:// URLs that point to directories
327 scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
328 if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
329 # add trailing slash if not present so urljoin doesn't trim
330 # final segment
331 if not url.endswith('/'):
332 url += '/'
333 url = urllib_parse.urljoin(url, 'index.html')
334 logger.debug(' file: URL is directory, getting %s', url)
335
336 try:
337 resp = _get_html_response(url, session=session)
338 except _NotHTTP:
339 logger.debug(
340 'Skipping page %s because it looks like an archive, and cannot '
341 'be checked by HEAD.', link,
342 )
343 except _NotHTML as exc:
344 logger.debug(
345 'Skipping page %s because the %s request got Content-Type: %s',
346 link, exc.request_desc, exc.content_type,
347 )
348 except HTTPError as exc:
349 _handle_get_page_fail(link, exc)
350 except RetryError as exc:
351 _handle_get_page_fail(link, exc)
352 except SSLError as exc:
353 reason = "There was a problem confirming the ssl certificate: "
354 reason += str(exc)
355 _handle_get_page_fail(link, reason, meth=logger.info)
356 except requests.ConnectionError as exc:
357 _handle_get_page_fail(link, "connection error: %s" % exc)
358 except requests.Timeout:
359 _handle_get_page_fail(link, "timed out")
360 else:
361 return _make_html_page(resp)
362 return None
363
364
365 def _remove_duplicate_links(links):
366 # type: (Iterable[Link]) -> List[Link]
367 """
368 Return a list of links, with duplicates removed and ordering preserved.
369 """
370 # We preserve the ordering when removing duplicates because we can.
371 return list(OrderedDict.fromkeys(links))
372
373
374 def group_locations(locations, expand_dir=False):
375 # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
376 """
377 Divide a list of locations into two groups: "files" (archives) and "urls."
378
379 :return: A pair of lists (files, urls).
380 """
381 files = []
382 urls = []
383
384 # puts the url for the given file path into the appropriate list
385 def sort_path(path):
386 # type: (str) -> None
387 url = path_to_url(path)
388 if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
389 urls.append(url)
390 else:
391 files.append(url)
392
393 for url in locations:
394
395 is_local_path = os.path.exists(url)
396 is_file_url = url.startswith('file:')
397
398 if is_local_path or is_file_url:
399 if is_local_path:
400 path = url
401 else:
402 path = url_to_path(url)
403 if os.path.isdir(path):
404 if expand_dir:
405 path = os.path.realpath(path)
406 for item in os.listdir(path):
407 sort_path(os.path.join(path, item))
408 elif is_file_url:
409 urls.append(url)
410 else:
411 logger.warning(
412 "Path '{0}' is ignored: "
413 "it is a directory.".format(path),
414 )
415 elif os.path.isfile(path):
416 sort_path(path)
417 else:
418 logger.warning(
419 "Url '%s' is ignored: it is neither a file "
420 "nor a directory.", url,
421 )
422 elif is_url(url):
423 # Only add url with clear scheme
424 urls.append(url)
425 else:
426 logger.warning(
427 "Url '%s' is ignored. It is either a non-existing "
428 "path or lacks a specific scheme.", url,
429 )
430
431 return files, urls
432
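# A minimal sketch (not part of the original module) of how locations get
# split; the archive path and index URL are made up, and the comment assumes
# the archive actually exists on disk.
def _example_group_locations():
    # type: () -> Tuple[List[str], List[str]]
    # Assuming /tmp/demo-1.0.tar.gz exists, it ends up in `files` as a
    # file:// URL, while the index URL ends up in `urls`. A local directory
    # is only expanded into its individual archives when expand_dir=True.
    return group_locations(
        ['/tmp/demo-1.0.tar.gz', 'https://pypi.org/simple/'],
        expand_dir=False,
    )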
433
434 class CollectedLinks(object):
435
436 """
437 Encapsulates the return value of a call to LinkCollector.collect_links().
438
439 The return value includes both URLs to project pages containing package
440 links, as well as individual package Link objects collected from other
441 sources.
442
443 This info is stored separately as:
444
445 (1) links from the configured file locations,
446 (2) links from the configured find_links, and
447 (3) urls to HTML project pages, as described by the PEP 503 simple
448 repository API.
449 """
450
451 def __init__(
452 self,
453 files, # type: List[Link]
454 find_links, # type: List[Link]
455 project_urls, # type: List[Link]
456 ):
457 # type: (...) -> None
458 """
459 :param files: Links from file locations.
460 :param find_links: Links from find_links.
461 :param project_urls: URLs to HTML project pages, as described by
462 the PEP 503 simple repository API.
463 """
464 self.files = files
465 self.find_links = find_links
466 self.project_urls = project_urls
467
468
469 class LinkCollector(object):
470
471 """
472 Responsible for collecting Link objects from all configured locations,
473 making network requests as needed.
474
475 The class's main method is its collect_links() method.
476 """
477
478 def __init__(
479 self,
480 session, # type: PipSession
481 search_scope, # type: SearchScope
482 ):
483 # type: (...) -> None
484 self.search_scope = search_scope
485 self.session = session
486
487 @property
488 def find_links(self):
489 # type: () -> List[str]
490 return self.search_scope.find_links
491
492 def fetch_page(self, location):
493 # type: (Link) -> Optional[HTMLPage]
494 """
495 Fetch an HTML page containing package links.
496 """
497 return _get_html_page(location, session=self.session)
498
499 def collect_links(self, project_name):
500 # type: (str) -> CollectedLinks
501 """Find all available links for the given project name.
502
503 :return: All the Link objects (unfiltered), as a CollectedLinks object.
504 """
505 search_scope = self.search_scope
506 index_locations = search_scope.get_index_urls_locations(project_name)
507 index_file_loc, index_url_loc = group_locations(index_locations)
508 fl_file_loc, fl_url_loc = group_locations(
509 self.find_links, expand_dir=True,
510 )
511
512 file_links = [
513 Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
514 ]
515
516 # We trust every directly linked archive in find_links
517 find_link_links = [Link(url, '-f') for url in self.find_links]
518
519 # We trust every url that the user has given us whether it was given
520 # via --index-url or --find-links.
521 # We want to filter out anything that does not have a secure origin.
522 url_locations = [
523 link for link in itertools.chain(
524 (Link(url) for url in index_url_loc),
525 (Link(url) for url in fl_url_loc),
526 )
527 if self.session.is_secure_origin(link)
528 ]
529
530 url_locations = _remove_duplicate_links(url_locations)
531 lines = [
532 '{} location(s) to search for versions of {}:'.format(
533 len(url_locations), project_name,
534 ),
535 ]
536 for link in url_locations:
537 lines.append('* {}'.format(link))
538 logger.debug('\n'.join(lines))
539
540 return CollectedLinks(
541 files=file_links,
542 find_links=find_link_links,
543 project_urls=url_locations,
544 )
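# A minimal usage sketch (not part of the original module). It assumes
# SearchScope.create() and a no-argument PipSession() behave as in this pip
# version; the index URL and project name are illustrative only.
def _example_collect_links():
    # type: () -> CollectedLinks
    from pip._internal.models.search_scope import SearchScope
    from pip._internal.network.session import PipSession

    search_scope = SearchScope.create(
        find_links=[],
        index_urls=['https://pypi.org/simple'],
    )
    collector = LinkCollector(
        session=PipSession(),
        search_scope=search_scope,
    )
    # Returns direct file links, find_links links and the secure project
    # page URLs that will later be fetched and parsed.
    return collector.collect_links('pip')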