annotate lib/python3.8/site-packages/pip/_internal/index/collector.py @ 0:9e54283cc701 draft

"planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
author guerler
date Mon, 27 Jul 2020 03:47:31 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
1 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
2 The main purpose of this module is to expose LinkCollector.collect_links().
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
3 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
4
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
5 import cgi
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
6 import itertools
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
7 import logging
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
8 import mimetypes
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
9 import os
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
10 from collections import OrderedDict
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
11
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
12 from pip._vendor import html5lib, requests
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
13 from pip._vendor.distlib.compat import unescape
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
14 from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
15 from pip._vendor.six.moves.urllib import parse as urllib_parse
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
16 from pip._vendor.six.moves.urllib import request as urllib_request
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
17
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
18 from pip._internal.models.link import Link
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
19 from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
20 from pip._internal.utils.misc import redact_auth_from_url
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
21 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
22 from pip._internal.utils.urls import path_to_url, url_to_path
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
23 from pip._internal.vcs import is_url, vcs
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
24
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
25 if MYPY_CHECK_RUNNING:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
26 from typing import (
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
27 Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
28 Union,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
29 )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
30 import xml.etree.ElementTree
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
31
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
32 from pip._vendor.requests import Response
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
33
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
34 from pip._internal.models.search_scope import SearchScope
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
35 from pip._internal.network.session import PipSession
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
36
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
37 HTMLElement = xml.etree.ElementTree.Element
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
38 ResponseHeaders = MutableMapping[str, str]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
39
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
40
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
41 logger = logging.getLogger(__name__)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
42
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
43
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _match_vcs_scheme(url):
    # type: (str) -> Optional[str]
    """Look for VCS schemes in the URL.

    A scheme matches when the lower-cased URL starts with it and the
    character immediately following the scheme is ``+`` or ``:``.

    :param url: The URL to inspect.
    :return: The first matching entry of ``vcs.schemes``, or None if
        there's no match.
    """
    # Lower-case once instead of on every loop iteration.
    lowered = url.lower()
    for scheme in vcs.schemes:
        if not lowered.startswith(scheme):
            continue
        # Slice instead of indexing: when the URL is nothing but the scheme
        # there is no following character, and ``url[len(scheme)]`` would
        # raise IndexError rather than report "no match". The comparison is
        # against a tuple because an empty string is "in" every string.
        if url[len(scheme):len(scheme) + 1] in ('+', ':'):
            return scheme
    return None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
54
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
55
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _is_url_like_archive(url):
    # type: (str) -> bool
    """Tell whether the URL's filename ends with a known archive extension.

    :param url: The URL to inspect.
    :return: True if the filename of ``Link(url)`` ends with any entry of
        ``ARCHIVE_EXTENSIONS``, False otherwise.
    """
    name = Link(url).filename
    return any(name.endswith(ext) for ext in ARCHIVE_EXTENSIONS)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
65
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
66
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
67 class _NotHTML(Exception):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
68 def __init__(self, content_type, request_desc):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
69 # type: (str, str) -> None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
70 super(_NotHTML, self).__init__(content_type, request_desc)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
71 self.content_type = content_type
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
72 self.request_desc = request_desc
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
73
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
74
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _ensure_html_header(response):
    # type: (Response) -> None
    """Verify from the Content-Type header that the response is HTML.

    A missing header is treated as an empty content type. The comparison
    is case-insensitive and only looks at the start of the value, so
    trailing parameters after ``text/html`` are accepted.

    :param response: The response whose headers are checked.
    :raises _NotHTML: If the content type does not start with text/html.
    """
    content_type = response.headers.get("Content-Type", "")
    if content_type.lower().startswith("text/html"):
        return
    raise _NotHTML(content_type, response.request.method)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
84
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
85
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
86 class _NotHTTP(Exception):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
87 pass
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
88
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
89
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _ensure_html_response(url, session):
    # type: (str, PipSession) -> None
    """Issue a HEAD request for the URL and check that it serves HTML.

    :param url: The URL to probe.
    :param session: The session used to send the HEAD request.
    :raises _NotHTTP: If the URL's scheme is neither http nor https, in
        which case no request is sent.
    :raises _NotHTML: If the response's content type is not text/html.

    Errors raised by ``raise_for_status()`` on the HEAD response are not
    caught here.
    """
    # Only the scheme is needed; the other URL components are irrelevant.
    url_scheme = urllib_parse.urlsplit(url).scheme
    if url_scheme not in {'http', 'https'}:
        raise _NotHTTP()

    head_response = session.head(url, allow_redirects=True)
    head_response.raise_for_status()

    _ensure_html_header(head_response)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
105
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
106
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _get_html_response(url, session):
    # type: (str, PipSession) -> Response
    """Fetch an HTML page with GET and hand back the response.

    The work is done in three steps:

    1. When the URL looks like an archive, a HEAD request is sent first to
       confirm the Content-Type is HTML, so that a large file is not
       downloaded by accident. This step raises `_NotHTTP` if the content
       type cannot be determined, or `_NotHTML` if it is not HTML.
    2. The GET request itself is performed. HTTP exceptions are raised on
       network failures.
    3. The Content-Type header of the GET response is checked, and
       `_NotHTML` is raised if it is not HTML.

    :param url: The URL of the page to fetch.
    :param session: The session used for the HEAD and GET requests.
    :return: The response to the GET request.
    """
    if _is_url_like_archive(url):
        _ensure_html_response(url, session=session)

    logger.debug('Getting page %s', redact_auth_from_url(url))

    request_headers = {
        "Accept": "text/html",
        # Cached data for /simple/ must not be returned blindly: authors
        # generally expect `twine upload && pip install` to work, and it
        # would not if they had run a pip install within the last ~10
        # minutes. max-age=0 is preferred over no-cache because
        # conditional requests stay supported, which keeps traffic low
        # when the page has not changed; the price is that the round trip
        # for the conditional GET is now paid every time instead of once
        # per 10 minutes.
        # For more information, please see pypa/pip#5670.
        "Cache-Control": "max-age=0",
    }
    page_response = session.get(url, headers=request_headers)
    page_response.raise_for_status()

    # The archive check at the top only helps when the URL happens to end
    # in something that looks like an archive, which a URL is not required
    # to do. Short of sending a HEAD request for every URL there is no way
    # to know in advance whether the target is HTML, but it can be
    # verified once the response has arrived.
    _ensure_html_header(page_response)

    return page_response
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
156
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
157
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
158 def _get_encoding_from_headers(headers):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
159 # type: (ResponseHeaders) -> Optional[str]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
160 """Determine if we have any encoding information in our headers.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
161 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
162 if headers and "Content-Type" in headers:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
163 content_type, params = cgi.parse_header(headers["Content-Type"])
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
164 if "charset" in params:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
165 return params['charset']
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
166 return None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
167
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
168
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
169 def _determine_base_url(document, page_url):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
170 # type: (HTMLElement, str) -> str
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
171 """Determine the HTML document's base URL.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
172
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
173 This looks for a ``<base>`` tag in the HTML document. If present, its href
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
174 attribute denotes the base URL of anchor tags in the document. If there is
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
175 no such tag (or if it does not have a valid href attribute), the HTML
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
176 file's URL is used as the base URL.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
177
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
178 :param document: An HTML document representation. The current
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
179 implementation expects the result of ``html5lib.parse()``.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
180 :param page_url: The URL of the HTML document.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
181 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
182 for base in document.findall(".//base"):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
183 href = base.get("href")
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
184 if href is not None:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
185 return href
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
186 return page_url
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
187
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
188
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
189 def _clean_link(url):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
190 # type: (str) -> str
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
191 """Makes sure a link is fully encoded. That is, if a ' ' shows up in
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
192 the link, it will be rewritten to %20 (while not over-quoting
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
193 % or other characters)."""
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
194 # Split the URL into parts according to the general structure
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
195 # `scheme://netloc/path;parameters?query#fragment`. Note that the
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
196 # `netloc` can be empty and the URI will then refer to a local
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
197 # filesystem path.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
198 result = urllib_parse.urlparse(url)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
199 # In both cases below we unquote prior to quoting to make sure
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
200 # nothing is double quoted.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
201 if result.netloc == "":
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
202 # On Windows the path part might contain a drive letter which
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
203 # should not be quoted. On Linux where drive letters do not
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
204 # exist, the colon should be quoted. We rely on urllib.request
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
205 # to do the right thing here.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
206 path = urllib_request.pathname2url(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
207 urllib_request.url2pathname(result.path))
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
208 else:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
209 # In addition to the `/` character we protect `@` so that
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
210 # revision strings in VCS URLs are properly parsed.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
211 path = urllib_parse.quote(urllib_parse.unquote(result.path), safe="/@")
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
212 return urllib_parse.urlunparse(result._replace(path=path))
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
213
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
214
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
215 def _create_link_from_element(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
216 anchor, # type: HTMLElement
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
217 page_url, # type: str
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
218 base_url, # type: str
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
219 ):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
220 # type: (...) -> Optional[Link]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
221 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
222 Convert an anchor element in a simple repository page to a Link.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
223 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
224 href = anchor.get("href")
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
225 if not href:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
226 return None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
227
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
228 url = _clean_link(urllib_parse.urljoin(base_url, href))
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
229 pyrequire = anchor.get('data-requires-python')
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
230 pyrequire = unescape(pyrequire) if pyrequire else None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
231
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
232 yanked_reason = anchor.get('data-yanked')
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
233 if yanked_reason:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
234 # This is a unicode string in Python 2 (and 3).
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
235 yanked_reason = unescape(yanked_reason)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
236
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
237 link = Link(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
238 url,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
239 comes_from=page_url,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
240 requires_python=pyrequire,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
241 yanked_reason=yanked_reason,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
242 )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
243
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
244 return link
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
245
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
246
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse the HTML held by ``page`` and yield a Link per usable anchor.

    Anchors for which _create_link_from_element() returns None are skipped.
    """
    tree = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    page_url = page.url
    base = _determine_base_url(tree, page_url)
    for element in tree.findall(".//a"):
        candidate = _create_link_from_element(
            element,
            page_url=page_url,
            base_url=base,
        )
        if candidate is not None:
            yield candidate
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
269
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
270
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
class HTMLPage(object):
    """A page's raw content, kept together with its encoding and URL"""

    def __init__(
        self,
        content,   # type: bytes
        encoding,  # type: Optional[str]
        url,       # type: str
    ):
        # type: (...) -> None
        """
        :param content: the raw page body.
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        """
        self.url = url
        self.encoding = encoding
        self.content = content

    def __str__(self):
        # type: () -> str
        # The display form is the URL passed through redact_auth_from_url().
        displayed = redact_auth_from_url(self.url)
        return displayed
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
292
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
293
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
294 def _handle_get_page_fail(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
295 link, # type: Link
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
296 reason, # type: Union[str, Exception]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
297 meth=None # type: Optional[Callable[..., None]]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
298 ):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
299 # type: (...) -> None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
300 if meth is None:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
301 meth = logger.debug
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
302 meth("Could not fetch URL %s: %s - skipping", link, reason)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
303
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
304
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def _make_html_page(response):
    # type: (Response) -> HTMLPage
    """
    Wrap a response in an HTMLPage.

    The page takes its content and URL from the response, and its encoding
    from whatever _get_encoding_from_headers() returns for the headers.
    """
    # Work out the encoding first, before the body and URL are read.
    page_encoding = _get_encoding_from_headers(response.headers)
    return HTMLPage(
        response.content,
        encoding=page_encoding,
        url=response.url,
    )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
309
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
310
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
311 def _get_html_page(link, session=None):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
312 # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
313 if session is None:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
314 raise TypeError(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
315 "_get_html_page() missing 1 required keyword argument: 'session'"
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
316 )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
317
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
318 url = link.url.split('#', 1)[0]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
319
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
320 # Check for VCS schemes that do not support lookup as web pages.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
321 vcs_scheme = _match_vcs_scheme(url)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
322 if vcs_scheme:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
323 logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
324 return None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
325
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
326 # Tack index.html onto file:// URLs that point to directories
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
327 scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
328 if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
329 # add trailing slash if not present so urljoin doesn't trim
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
330 # final segment
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
331 if not url.endswith('/'):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
332 url += '/'
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
333 url = urllib_parse.urljoin(url, 'index.html')
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
334 logger.debug(' file: URL is directory, getting %s', url)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
335
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
336 try:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
337 resp = _get_html_response(url, session=session)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
338 except _NotHTTP:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
339 logger.debug(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
340 'Skipping page %s because it looks like an archive, and cannot '
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
341 'be checked by HEAD.', link,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
342 )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
343 except _NotHTML as exc:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
344 logger.debug(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
345 'Skipping page %s because the %s request got Content-Type: %s',
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
346 link, exc.request_desc, exc.content_type,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
347 )
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
348 except HTTPError as exc:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
349 _handle_get_page_fail(link, exc)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
350 except RetryError as exc:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
351 _handle_get_page_fail(link, exc)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
352 except SSLError as exc:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
353 reason = "There was a problem confirming the ssl certificate: "
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
354 reason += str(exc)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
355 _handle_get_page_fail(link, reason, meth=logger.info)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
356 except requests.ConnectionError as exc:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
357 _handle_get_page_fail(link, "connection error: %s" % exc)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
358 except requests.Timeout:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
359 _handle_get_page_fail(link, "timed out")
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
360 else:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
361 return _make_html_page(resp)
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
362 return None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
363
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
364
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
365 def _remove_duplicate_links(links):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
366 # type: (Iterable[Link]) -> List[Link]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
367 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
368 Return a list of links, with duplicates removed and ordering preserved.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
369 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
370 # We preserve the ordering when removing duplicates because we can.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
371 return list(OrderedDict.fromkeys(links))
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
372
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
373
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
def group_locations(locations, expand_dir=False):
    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
    """
    Divide a list of locations into two groups: "files" (archives) and "urls."

    Each location is handled as follows:

    * An existing local path or a ``file:`` URL that is a regular file is
      converted to a URL and put in "urls" if its guessed MIME type is
      ``text/html``, otherwise in "files".
    * One that is a directory has each of its entries sorted the same way
      when ``expand_dir`` is true. Otherwise it is added to "urls" as-is if
      it was given as a ``file:`` URL, or ignored with a warning.
    * Any other location is added to "urls" if is_url() accepts it, and is
      otherwise ignored with a warning.

    :param locations: local paths and/or URLs to classify.
    :param expand_dir: whether to list directories and sort their entries.
    :return: A pair of lists (files, urls).
    """
    files = []
    urls = []

    # puts the url for the given file path into the appropriate list
    def sort_path(path):
        # type: (str) -> None
        url = path_to_url(path)
        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
            urls.append(url)
        else:
            files.append(url)

    for url in locations:

        is_local_path = os.path.exists(url)
        is_file_url = url.startswith('file:')

        if is_local_path or is_file_url:
            if is_local_path:
                path = url
            else:
                path = url_to_path(url)
            if os.path.isdir(path):
                if expand_dir:
                    path = os.path.realpath(path)
                    for item in os.listdir(path):
                        sort_path(os.path.join(path, item))
                elif is_file_url:
                    urls.append(url)
                else:
                    # Arguments are passed lazily, matching the other
                    # warnings in this function.
                    logger.warning(
                        "Path '%s' is ignored: it is a directory.", path,
                    )
            elif os.path.isfile(path):
                sort_path(path)
            else:
                logger.warning(
                    "Url '%s' is ignored: it is neither a file "
                    "nor a directory.", url,
                )
        elif is_url(url):
            # Only add url with clear scheme
            urls.append(url)
        else:
            logger.warning(
                "Url '%s' is ignored. It is either a non-existing "
                "path or lacks a specific scheme.", url,
            )

    return files, urls
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
432
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
433
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
434 class CollectedLinks(object):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
435
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
436 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
437 Encapsulates the return value of a call to LinkCollector.collect_links().
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
438
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
439 The return value includes both URLs to project pages containing package
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
440 links, as well as individual package Link objects collected from other
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
441 sources.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
442
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
443 This info is stored separately as:
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
444
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
445 (1) links from the configured file locations,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
446 (2) links from the configured find_links, and
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
447 (3) urls to HTML project pages, as described by the PEP 503 simple
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
448 repository API.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
449 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
450
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
451 def __init__(
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
452 self,
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
453 files, # type: List[Link]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
454 find_links, # type: List[Link]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
455 project_urls, # type: List[Link]
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
456 ):
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
457 # type: (...) -> None
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
458 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
459 :param files: Links from file locations.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
460 :param find_links: Links from find_links.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
461 :param project_urls: URLs to HTML project pages, as described by
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
462 the PEP 503 simple repository API.
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
463 """
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
464 self.files = files
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
465 self.find_links = find_links
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
466 self.project_urls = project_urls
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
467
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
468
9e54283cc701 "planemo upload commit d12c32a45bcd441307e632fca6d9af7d60289d44"
guerler
parents:
diff changeset
class LinkCollector(object):

    """
    Gathers Link objects from all configured locations, making network
    requests as needed.

    collect_links() is the main entry point of the class.
    """

    def __init__(
        self,
        session,  # type: PipSession
        search_scope,  # type: SearchScope
    ):
        # type: (...) -> None
        """
        :param session: Session passed on when fetching pages and asked
            whether a link has a secure origin.
        :param search_scope: Provides the find_links entries and the index
            URL locations for a project.
        """
        self.session = session
        self.search_scope = search_scope

    @property
    def find_links(self):
        # type: () -> List[str]
        """The find_links entries of the underlying search scope."""
        scope = self.search_scope
        return scope.find_links

    def fetch_page(self, location):
        # type: (Link) -> Optional[HTMLPage]
        """
        Retrieve the HTML page of package links found at ``location``,
        using this collector's session.
        """
        page = _get_html_page(location, session=self.session)
        return page

    def collect_links(self, project_name):
        # type: (str) -> CollectedLinks
        """Gather every available link for ``project_name``.

        The links are not filtered here, except that page URLs without a
        secure origin (according to the session) are discarded.

        :param project_name: Name of the project to collect links for.
        :return: A CollectedLinks object holding the file links, the
            find_links links and the de-duplicated project page URLs.
        """
        scope = self.search_scope
        index_files, index_urls = group_locations(
            scope.get_index_urls_locations(project_name)
        )
        find_links_files, find_links_urls = group_locations(
            self.find_links, expand_dir=True,
        )

        file_links = []
        for location in itertools.chain(index_files, find_links_files):
            file_links.append(Link(location))

        # Archives linked directly from find_links are always trusted.
        find_link_links = []
        for location in self.find_links:
            find_link_links.append(Link(location, '-f'))

        # Every URL supplied by the user (through --index-url or
        # --find-links) is trusted, but anything that does not have a
        # secure origin is left out.
        page_links = []
        for location in itertools.chain(index_urls, find_links_urls):
            candidate = Link(location)
            if self.session.is_secure_origin(candidate):
                page_links.append(candidate)
        page_links = _remove_duplicate_links(page_links)

        # Emit a single debug record: a summary line plus one bullet per URL.
        summary = '{} location(s) to search for versions of {}:'.format(
            len(page_links), project_name,
        )
        bullets = ['* {}'.format(link) for link in page_links]
        logger.debug('\n'.join([summary] + bullets))

        return CollectedLinks(
            files=file_links,
            find_links=find_link_links,
            project_urls=page_links,
        )