guerler / hhblits (Mercurial repository)
Comparison of lib/python3.8/site-packages/pip/_internal/index/collector.py at changeset 1:64071f2a4cf0 (draft, default, tip)
Changeset description: Deleted selected files
| field | value |
|---|---|
| author | guerler |
| date | Mon, 27 Jul 2020 03:55:49 -0400 |
| parents | 9e54283cc701 |
| children | (none) |
The comparison is between the parent revision 0:9e54283cc701 and this changeset, 1:64071f2a4cf0. The file was deleted, so only its content at the parent revision is shown below.
| 1 """ | |
| 2 The main purpose of this module is to expose LinkCollector.collect_links(). | |
| 3 """ | |
| 4 | |
| 5 import cgi | |
| 6 import itertools | |
| 7 import logging | |
| 8 import mimetypes | |
| 9 import os | |
| 10 from collections import OrderedDict | |
| 11 | |
| 12 from pip._vendor import html5lib, requests | |
| 13 from pip._vendor.distlib.compat import unescape | |
| 14 from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError | |
| 15 from pip._vendor.six.moves.urllib import parse as urllib_parse | |
| 16 from pip._vendor.six.moves.urllib import request as urllib_request | |
| 17 | |
| 18 from pip._internal.models.link import Link | |
| 19 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS | |
| 20 from pip._internal.utils.misc import redact_auth_from_url | |
| 21 from pip._internal.utils.typing import MYPY_CHECK_RUNNING | |
| 22 from pip._internal.utils.urls import path_to_url, url_to_path | |
| 23 from pip._internal.vcs import is_url, vcs | |
| 24 | |
| 25 if MYPY_CHECK_RUNNING: | |
| 26 from typing import ( | |
| 27 Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple, | |
| 28 Union, | |
| 29 ) | |
| 30 import xml.etree.ElementTree | |
| 31 | |
| 32 from pip._vendor.requests import Response | |
| 33 | |
| 34 from pip._internal.models.search_scope import SearchScope | |
| 35 from pip._internal.network.session import PipSession | |
| 36 | |
| 37 HTMLElement = xml.etree.ElementTree.Element | |
| 38 ResponseHeaders = MutableMapping[str, str] | |
| 39 | |
| 40 | |
| 41 logger = logging.getLogger(__name__) | |
| 42 | |
| 43 | |
| 44 def _match_vcs_scheme(url): | |
| 45 # type: (str) -> Optional[str] | |
| 46 """Look for VCS schemes in the URL. | |
| 47 | |
| 48 Returns the matched VCS scheme, or None if there's no match. | |
| 49 """ | |
| 50 for scheme in vcs.schemes: | |
| 51 if url.lower().startswith(scheme) and url[len(scheme)] in '+:': | |
| 52 return scheme | |
| 53 return None | |
| 54 | |
| 55 | |
| 56 def _is_url_like_archive(url): | |
| 57 # type: (str) -> bool | |
| 58 """Return whether the URL looks like an archive. | |
| 59 """ | |
| 60 filename = Link(url).filename | |
| 61 for bad_ext in ARCHIVE_EXTENSIONS: | |
| 62 if filename.endswith(bad_ext): | |
| 63 return True | |
| 64 return False | |
| 65 | |
| 66 | |
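As an aside (not part of the deleted file), the two helpers above classify URLs roughly as follows. The sketch assumes this pip vintage's private module layout, and the archive URL is made up.

```python
from pip._internal.index.collector import _is_url_like_archive, _match_vcs_scheme

print(_match_vcs_scheme("git+https://github.com/pypa/pip.git"))    # 'git'
print(_match_vcs_scheme("https://pypi.org/simple/pip/"))           # None
print(_is_url_like_archive("https://example.com/pkg-1.0.tar.gz"))  # True
print(_is_url_like_archive("https://pypi.org/simple/pip/"))        # False
```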
```python
class _NotHTML(Exception):
    def __init__(self, content_type, request_desc):
        # type: (str, str) -> None
        super(_NotHTML, self).__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc


def _ensure_html_header(response):
    # type: (Response) -> None
    """Check the Content-Type header to ensure the response contains HTML.

    Raises `_NotHTML` if the content type is not text/html.
    """
    content_type = response.headers.get("Content-Type", "")
    if not content_type.lower().startswith("text/html"):
        raise _NotHTML(content_type, response.request.method)


class _NotHTTP(Exception):
    pass


def _ensure_html_response(url, session):
    # type: (str, PipSession) -> None
    """Send a HEAD request to the URL, and ensure the response contains HTML.

    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
    `_NotHTML` if the content type is not text/html.
    """
    scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
    if scheme not in {'http', 'https'}:
        raise _NotHTTP()

    resp = session.head(url, allow_redirects=True)
    resp.raise_for_status()

    _ensure_html_header(resp)


def _get_html_response(url, session):
    # type: (str, PipSession) -> Response
    """Access an HTML page with GET, and return the response.

    This consists of three parts:

    1. If the URL looks suspiciously like an archive, send a HEAD first to
       check the Content-Type is HTML, to avoid downloading a large file.
       Raise `_NotHTTP` if the content type cannot be determined, or
       `_NotHTML` if it is not HTML.
    2. Actually perform the request. Raise HTTP exceptions on network failures.
    3. Check the Content-Type header to make sure we got HTML, and raise
       `_NotHTML` otherwise.
    """
    if _is_url_like_archive(url):
        _ensure_html_response(url, session=session)

    logger.debug('Getting page %s', redact_auth_from_url(url))

    resp = session.get(
        url,
        headers={
            "Accept": "text/html",
            # We don't want to blindly return cached data for /simple/,
            # because authors generally expect that
            # `twine upload && pip install` will function, but if they've
            # done a pip install in the last ~10 minutes it won't. By
            # setting this to zero we will not blindly use any cached
            # data. The benefit of using max-age=0 instead of no-cache is
            # that we still support conditional requests, so we still
            # minimize traffic sent in cases where the page hasn't changed
            # at all; we just always incur the round trip for the
            # conditional GET now instead of only once per 10 minutes.
            # For more information, please see pypa/pip#5670.
            "Cache-Control": "max-age=0",
        },
    )
    resp.raise_for_status()

    # The check for archives above only works if the URL ends with
    # something that looks like an archive. However, that is not a
    # requirement of a URL. Unless we issue a HEAD request on every URL,
    # we cannot know ahead of time for sure if something is HTML or not.
    # However, we can check after we've downloaded it.
    _ensure_html_header(resp)

    return resp
```
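The fetch path can be exercised directly. The following sketch is illustrative only: it assumes this pip vintage is installed, that the private helper is importable as shown, and that network access to pypi.org is available.

```python
from pip._internal.network.session import PipSession
from pip._internal.index.collector import _get_html_response

session = PipSession()
# Sends a GET with "Accept: text/html" and "Cache-Control: max-age=0",
# then verifies that the response's Content-Type is text/html.
resp = _get_html_response("https://pypi.org/simple/pip/", session=session)
print(resp.status_code, resp.headers.get("Content-Type"))
```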
```python
def _get_encoding_from_headers(headers):
    # type: (ResponseHeaders) -> Optional[str]
    """Determine if we have any encoding information in our headers.
    """
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            return params['charset']
    return None
```
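For instance (illustrative only, assuming the same private import path):

```python
from pip._internal.index.collector import _get_encoding_from_headers

print(_get_encoding_from_headers({"Content-Type": "text/html; charset=ISO-8859-1"}))  # 'ISO-8859-1'
print(_get_encoding_from_headers({"Content-Type": "text/html"}))                      # None
```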
```python
def _determine_base_url(document, page_url):
    # type: (HTMLElement, str) -> str
    """Determine the HTML document's base URL.

    This looks for a ``<base>`` tag in the HTML document. If present, its href
    attribute denotes the base URL of anchor tags in the document. If there is
    no such tag (or if it does not have a valid href attribute), the HTML
    file's URL is used as the base URL.

    :param document: An HTML document representation. The current
        implementation expects the result of ``html5lib.parse()``.
    :param page_url: The URL of the HTML document.
    """
    for base in document.findall(".//base"):
        href = base.get("href")
        if href is not None:
            return href
    return page_url
```
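A quick sketch of the base-URL lookup (illustrative only; mirror.example is a made-up host, and the vendored html5lib is invoked exactly as parse_links does further down):

```python
from pip._vendor import html5lib
from pip._internal.index.collector import _determine_base_url

html = (
    b'<html><head><base href="https://mirror.example/simple/pip/"></head>'
    b'<body><a href="pip-20.0.tar.gz">pip-20.0.tar.gz</a></body></html>'
)
document = html5lib.parse(html, transport_encoding="utf-8", namespaceHTMLElements=False)

# With a <base> tag, its href wins; without one, the page URL is returned.
print(_determine_base_url(document, "https://pypi.org/simple/pip/"))
# 'https://mirror.example/simple/pip/'
```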
```python
def _clean_link(url):
    # type: (str) -> str
    """Makes sure a link is fully encoded. That is, if a ' ' shows up in
    the link, it will be rewritten to %20 (while not over-quoting
    % or other characters)."""
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`. Note that the
    # `netloc` can be empty and the URI will then refer to a local
    # filesystem path.
    result = urllib_parse.urlparse(url)
    # In both cases below we unquote prior to quoting to make sure
    # nothing is double quoted.
    if result.netloc == "":
        # On Windows the path part might contain a drive letter which
        # should not be quoted. On Linux where drive letters do not
        # exist, the colon should be quoted. We rely on urllib.request
        # to do the right thing here.
        path = urllib_request.pathname2url(
            urllib_request.url2pathname(result.path))
    else:
        # In addition to the `/` character we protect `@` so that
        # revision strings in VCS URLs are properly parsed.
        path = urllib_parse.quote(urllib_parse.unquote(result.path), safe="/@")
    return urllib_parse.urlunparse(result._replace(path=path))
```
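For example (illustrative only; files.example is a made-up host):

```python
from pip._internal.index.collector import _clean_link

# A space in the path is percent-encoded ...
print(_clean_link("https://files.example/some dir/pkg-1.0.tar.gz"))
# 'https://files.example/some%20dir/pkg-1.0.tar.gz'

# ... while VCS revision markers and the fragment survive untouched.
print(_clean_link("git+https://files.example/repo.git@v1.0#egg=pkg"))
# 'git+https://files.example/repo.git@v1.0#egg=pkg'
```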
```python
def _create_link_from_element(
    anchor,    # type: HTMLElement
    page_url,  # type: str
    base_url,  # type: str
):
    # type: (...) -> Optional[Link]
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _clean_link(urllib_parse.urljoin(base_url, href))
    pyrequire = anchor.get('data-requires-python')
    pyrequire = unescape(pyrequire) if pyrequire else None

    yanked_reason = anchor.get('data-yanked')
    if yanked_reason:
        # This is a unicode string in Python 2 (and 3).
        yanked_reason = unescape(yanked_reason)

    link = Link(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
    )

    return link


def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    url = page.url
    base_url = _determine_base_url(document, url)
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        if link is None:
            continue
        yield link


class HTMLPage(object):
    """Represents one page, along with its URL"""

    def __init__(
        self,
        content,   # type: bytes
        encoding,  # type: Optional[str]
        url,       # type: str
    ):
        # type: (...) -> None
        """
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        """
        self.content = content
        self.encoding = encoding
        self.url = url

    def __str__(self):
        # type: () -> str
        return redact_auth_from_url(self.url)
```
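Put together, an HTMLPage built from raw bytes can be fed to parse_links(). The snippet below is illustrative only ('pkg' and its files are invented) and assumes the same private import path:

```python
from pip._internal.index.collector import HTMLPage, parse_links

content = (
    b'<html><body>'
    b'<a href="pkg-1.0.tar.gz" data-requires-python="&gt;=3.6">pkg-1.0.tar.gz</a>'
    b'<a href="pkg-1.1.tar.gz" data-yanked="broken metadata">pkg-1.1.tar.gz</a>'
    b'</body></html>'
)
page = HTMLPage(content, encoding="utf-8", url="https://pypi.org/simple/pkg/")

for link in parse_links(page):
    # The first anchor yields requires_python='>=3.6'; the second yields
    # yanked_reason='broken metadata'. Relative hrefs are joined to the page URL.
    print(link.url, link.requires_python, link.yanked_reason)
```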
```python
def _handle_get_page_fail(
    link,  # type: Link
    reason,  # type: Union[str, Exception]
    meth=None  # type: Optional[Callable[..., None]]
):
    # type: (...) -> None
    if meth is None:
        meth = logger.debug
    meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_html_page(response):
    # type: (Response) -> HTMLPage
    encoding = _get_encoding_from_headers(response.headers)
    return HTMLPage(response.content, encoding=encoding, url=response.url)


def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
    if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith('/'):
            url += '/'
        url = urllib_parse.urljoin(url, 'index.html')
        logger.debug(' file: URL is directory, getting %s', url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.debug(
            'Skipping page %s because it looks like an archive, and cannot '
            'be checked by HEAD.', link,
        )
    except _NotHTML as exc:
        logger.debug(
            'Skipping page %s because the %s request got Content-Type: %s',
            link, exc.request_desc, exc.content_type,
        )
    except HTTPError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, "connection error: %s" % exc)
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp)
    return None
```
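Because every failure is logged and swallowed, callers see either an HTMLPage or None. The check below is illustrative only and makes no network request, since VCS links are rejected up front:

```python
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession
from pip._internal.index.collector import _get_html_page

# A VCS link is skipped before any request is made, so this is offline-safe.
page = _get_html_page(Link("git+https://github.com/pypa/pip.git"), session=PipSession())
print(page)  # None
```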
```python
def _remove_duplicate_links(links):
    # type: (Iterable[Link]) -> List[Link]
    """
    Return a list of links, with duplicates removed and ordering preserved.
    """
    # We preserve the ordering when removing duplicates because we can.
    return list(OrderedDict.fromkeys(links))


def group_locations(locations, expand_dir=False):
    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
    """
    Divide a list of locations into two groups: "files" (archives) and "urls."

    :return: A pair of lists (files, urls).
    """
    files = []
    urls = []

    # puts the url for the given file path into the appropriate list
    def sort_path(path):
        # type: (str) -> None
        url = path_to_url(path)
        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
            urls.append(url)
        else:
            files.append(url)

    for url in locations:

        is_local_path = os.path.exists(url)
        is_file_url = url.startswith('file:')

        if is_local_path or is_file_url:
            if is_local_path:
                path = url
            else:
                path = url_to_path(url)
            if os.path.isdir(path):
                if expand_dir:
                    path = os.path.realpath(path)
                    for item in os.listdir(path):
                        sort_path(os.path.join(path, item))
                elif is_file_url:
                    urls.append(url)
                else:
                    logger.warning(
                        "Path '{0}' is ignored: "
                        "it is a directory.".format(path),
                    )
            elif os.path.isfile(path):
                sort_path(path)
            else:
                logger.warning(
                    "Url '%s' is ignored: it is neither a file "
                    "nor a directory.", url,
                )
        elif is_url(url):
            # Only add url with clear scheme
            urls.append(url)
        else:
            logger.warning(
                "Url '%s' is ignored. It is either a non-existing "
                "path or lacks a specific scheme.", url,
            )

    return files, urls
```
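A small illustration of the split (not part of the file; the wheel name and temporary directory are invented):

```python
import os
import tempfile

from pip._internal.index.collector import group_locations

# Build a throwaway directory holding one (empty) wheel file.
tmp = tempfile.mkdtemp()
open(os.path.join(tmp, "pkg-1.0-py3-none-any.whl"), "wb").close()

files, urls = group_locations([tmp, "https://pypi.org/simple/"], expand_dir=True)
print(files)  # e.g. ['file:///tmp/tmpXXXXXX/pkg-1.0-py3-none-any.whl']
print(urls)   # ['https://pypi.org/simple/']
```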
```python
class CollectedLinks(object):

    """
    Encapsulates the return value of a call to LinkCollector.collect_links().

    The return value includes both URLs to project pages containing package
    links, as well as individual package Link objects collected from other
    sources.

    This info is stored separately as:

    (1) links from the configured file locations,
    (2) links from the configured find_links, and
    (3) urls to HTML project pages, as described by the PEP 503 simple
        repository API.
    """

    def __init__(
        self,
        files,         # type: List[Link]
        find_links,    # type: List[Link]
        project_urls,  # type: List[Link]
    ):
        # type: (...) -> None
        """
        :param files: Links from file locations.
        :param find_links: Links from find_links.
        :param project_urls: URLs to HTML project pages, as described by
            the PEP 503 simple repository API.
        """
        self.files = files
        self.find_links = find_links
        self.project_urls = project_urls


class LinkCollector(object):

    """
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.

    The class's main method is its collect_links() method.
    """

    def __init__(
        self,
        session,       # type: PipSession
        search_scope,  # type: SearchScope
    ):
        # type: (...) -> None
        self.search_scope = search_scope
        self.session = session

    @property
    def find_links(self):
        # type: () -> List[str]
        return self.search_scope.find_links

    def fetch_page(self, location):
        # type: (Link) -> Optional[HTMLPage]
        """
        Fetch an HTML page containing package links.
        """
        return _get_html_page(location, session=self.session)

    def collect_links(self, project_name):
        # type: (str) -> CollectedLinks
        """Find all available links for the given project name.

        :return: All the Link objects (unfiltered), as a CollectedLinks object.
        """
        search_scope = self.search_scope
        index_locations = search_scope.get_index_urls_locations(project_name)
        index_file_loc, index_url_loc = group_locations(index_locations)
        fl_file_loc, fl_url_loc = group_locations(
            self.find_links, expand_dir=True,
        )

        file_links = [
            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
        ]

        # We trust every directly linked archive in find_links
        find_link_links = [Link(url, '-f') for url in self.find_links]

        # We trust every url that the user has given us whether it was given
        # via --index-url or --find-links.
        # We want to filter out anything that does not have a secure origin.
        url_locations = [
            link for link in itertools.chain(
                (Link(url) for url in index_url_loc),
                (Link(url) for url in fl_url_loc),
            )
            if self.session.is_secure_origin(link)
        ]

        url_locations = _remove_duplicate_links(url_locations)
        lines = [
            '{} location(s) to search for versions of {}:'.format(
                len(url_locations), project_name,
            ),
        ]
        for link in url_locations:
            lines.append('* {}'.format(link))
        logger.debug('\n'.join(lines))

        return CollectedLinks(
            files=file_links,
            find_links=find_link_links,
            project_urls=url_locations,
        )
```
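End to end, the collector is driven from a PipSession and a SearchScope. A hedged usage sketch (not part of the deleted file): it assumes this pip vintage exposes SearchScope.create and PipSession as imported at the top of the module, and it performs real network requests against pypi.org.

```python
from pip._internal.index.collector import LinkCollector, parse_links
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession

search_scope = SearchScope.create(
    find_links=[],                            # no local wheel directories
    index_urls=["https://pypi.org/simple/"],  # the default PEP 503 index
)
collector = LinkCollector(session=PipSession(), search_scope=search_scope)

collected = collector.collect_links("pip")
print(collected.project_urls)   # the project page(s), e.g. the /simple/pip/ page

page = collector.fetch_page(collected.project_urls[0])
if page is not None:
    print(sum(1 for _ in parse_links(page)))  # number of candidate links found
```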
