comparison env/lib/python3.9/site-packages/schema_salad/fetcher.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """Resource fetching."""
import logging
import os
import re
import sys
import urllib
import urllib.parse
import urllib.request
from typing import List, Optional

import requests

from .exceptions import ValidationException
from .utils import CacheType
13
# Matches a leading "/X:" drive-letter prefix at the start of a URL path
# (e.g. "/C:" in "/C:/Users/fred"); group 1 is the drive letter.
_re_drive = re.compile(r"/([a-zA-Z]):")
# Module-wide logger, registered under the name "salad".
_logger = logging.getLogger("salad")
16
17
class Fetcher:
    """Base interface for fetching documents and resolving references.

    ``fetch_text``, ``check_exists`` and ``urljoin`` raise
    ``NotImplementedError`` here and must be overridden by a subclass.
    """

    # URL schemes reported by supported_schemes().
    schemes = ["file", "http", "https", "mailto"]

    def __init__(
        self,
        cache: CacheType,
        session: Optional[requests.sessions.Session],
    ) -> None:
        """Accept the standard constructor arguments; nothing is stored here."""
        pass

    def fetch_text(self, url: str, content_types: Optional[List[str]] = None) -> str:
        """Retrieve the resource at ``url`` as a string (not implemented here)."""
        raise NotImplementedError()

    def check_exists(self, url: str) -> bool:
        """Report whether ``url`` refers to an existing resource (not implemented here)."""
        raise NotImplementedError()

    def urljoin(self, base_url: str, url: str) -> str:
        """Resolve ``url`` against ``base_url`` (not implemented here)."""
        raise NotImplementedError()

    def supported_schemes(self) -> List[str]:
        """Return the ``schemes`` list of this fetcher."""
        return self.schemes
39
40
class DefaultFetcher(Fetcher):
    """Fetcher for ``file:`` URLs and, when a session is given, ``http(s):`` URLs."""

    def __init__(
        self,
        cache: CacheType,
        session: Optional[requests.sessions.Session],
    ) -> None:
        """Store the cache and the optional HTTP session.

        :param cache: mapping keyed by URL. String values are returned by
            ``fetch_text`` as already-fetched text; ``check_exists`` stores
            ``True`` for HTTP(S) URLs it has verified.
        :param session: session used for ``http``/``https`` requests. When it
            is ``None`` those schemes fall through to the "Unsupported scheme"
            error.
        """
        self.cache = cache
        self.session = session

    def fetch_text(self, url: str, content_types: Optional[List[str]] = None) -> str:
        """Retrieve the given resource as a string.

        :param url: ``http``, ``https`` or ``file`` URL to read.
        :param content_types: acceptable media types; sent as the ``Accept``
            header for HTTP(S) requests. A response with a different
            content-type only produces a warning.
        :returns: the cached text if the cache holds a string for ``url``,
            otherwise the response body or the UTF-8 decoded file contents.
        :raises ValidationException: if the request or the file read fails,
            or if the scheme is not handled.
        """
        cached = self.cache.get(url, None)
        # check_exists() stores True in the cache, so only str entries are text.
        if isinstance(cached, str):
            return cached

        split = urllib.parse.urlsplit(url)
        scheme, path = split.scheme, split.path

        if scheme in ("http", "https") and self.session is not None:
            try:
                headers = {}
                if content_types:
                    headers["Accept"] = ", ".join(content_types) + ", */*;q=0.8"
                resp = self.session.get(url, headers=headers)
                resp.raise_for_status()
            except Exception as e:
                raise ValidationException(f"Error fetching {url}: {e}") from e
            if content_types and "content-type" in resp.headers:
                # Keep only the media type: drop parameters such as
                # "; charset=utf-8" and any whitespace around the type.
                content_type = resp.headers["content-type"].split(";")[0].strip()
                if content_type not in content_types:
                    _logger.warning(
                        f"While fetching {url}, got content-type of "
                        f"'{content_type}'. Expected one of {content_types}."
                    )
            return resp.text

        if scheme == "file":
            try:
                # On Windows, url.path will be /drive:/path ; on Unix systems,
                # /path. As we want drive:/path instead of /drive:/path on
                # Windows, remove the leading / when what follows it is
                # itself an absolute path.
                if os.path.isabs(path[1:]):
                    path = path[1:]
                with open(urllib.request.url2pathname(path), encoding="utf-8") as fp:
                    return fp.read()
            except OSError as err:
                # The OS error text already names the file when it matches the
                # path we tried; otherwise add the URL for context.
                if err.filename == path:
                    raise ValidationException(str(err)) from err
                raise ValidationException(f"Error reading {url}: {err}") from err

        raise ValidationException(f"Unsupported scheme in url: {url}")

    def check_exists(self, url: str) -> bool:
        """Report whether ``url`` refers to an existing resource.

        :returns: ``True`` for cached URLs, for HTTP(S) URLs whose ``HEAD``
            request succeeds, for existing local files and for any ``mailto``
            URL; ``False`` when the ``HEAD`` request fails or the file is
            missing.
        :raises ValidationException: if the scheme is not handled.
        """
        if url in self.cache:
            return True

        split = urllib.parse.urlsplit(url)
        scheme, path = split.scheme, split.path

        if scheme in ("http", "https") and self.session is not None:
            try:
                resp = self.session.head(url)
                resp.raise_for_status()
            except Exception:
                # Best effort: any failure is reported as "does not exist".
                return False
            self.cache[url] = True
            return True
        if scheme == "file":
            return os.path.exists(urllib.request.url2pathname(path))
        if scheme == "mailto":
            return True
        raise ValidationException(f"Unsupported scheme in url: {url}")

    def urljoin(self, base_url: str, url: str) -> str:
        """Resolve ``url`` against ``base_url``.

        Identifiers starting with ``_:`` are returned unchanged. On Windows,
        drive-letter paths relative to a ``file:`` base get special handling.

        :raises ValidationException: if a non-``file`` base would resolve to a
            ``file:`` URL or, on Windows, to a drive-letter path.
        """
        if url.startswith("_:"):
            return url

        basesplit = urllib.parse.urlsplit(base_url)
        # Note that on Windows this might split "C:" with "C" as URI scheme.
        split = urllib.parse.urlsplit(url)
        if basesplit.scheme and basesplit.scheme != "file" and split.scheme == "file":
            raise ValidationException(
                f"Not resolving potential remote exploit {url} from base {base_url}"
            )

        if sys.platform == "win32":
            if base_url == url:
                return url

            # A one-character "scheme" is taken to be a drive letter.
            has_drive = len(split.scheme) == 1

            if basesplit.scheme == "file":
                # Special handling of relative file references on Windows
                # as urllib seems to not be quite up to the job

                # netloc MIGHT appear in equivalents of UNC Strings
                # \\server1.example.com\path as
                # file:///server1.example.com/path
                # https://tools.ietf.org/html/rfc8089#appendix-E.3.2
                # (TODO: test this)
                netloc = split.netloc or basesplit.netloc

                # Check if url is a local path like "C:/Users/fred"
                # or actually an absolute URI like http://example.com/fred
                if has_drive:
                    # Assume split.scheme is actually a drive, e.g. "C:"
                    # so we'll recombine into a path
                    path_with_drive = urllib.parse.urlunsplit(
                        (split.scheme, "", split.path, "", "")
                    )
                    # Compose new file:/// URI with path_with_drive
                    # .. carrying over any #fragment (?query just in case..)
                    return urllib.parse.urlunsplit(
                        ("file", netloc, path_with_drive, split.query, split.fragment)
                    )
                if not split.scheme and not netloc and split.path.startswith("/"):
                    # Relative - but does it have a drive?
                    base_drive = _re_drive.match(basesplit.path)
                    drive = _re_drive.match(split.path)
                    if base_drive and not drive:
                        # Keep drive letter from base_url
                        # https://tools.ietf.org/html/rfc8089#appendix-E.2.1
                        # e.g. urljoin("file:///D:/bar/a.txt", "/foo/b.txt")
                        #          == file:///D:/foo/b.txt
                        path_with_drive = f"/{base_drive.group(1)}:{split.path}"
                        return urllib.parse.urlunsplit(
                            (
                                "file",
                                netloc,
                                path_with_drive,
                                split.query,
                                split.fragment,
                            )
                        )

                # else: fall-through to resolve as relative URI
            elif has_drive:
                # Base is http://something but url is C:/something - which urllib
                # would wrongly resolve as an absolute path that could later be used
                # to access local files
                raise ValidationException(
                    f"Not resolving potential remote exploit {url} from base {base_url}"
                )

        return urllib.parse.urljoin(base_url, url)