planemo/lib/python3.7/site-packages/cachecontrol/controller.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
"""
The httplib2 algorithms ported for use with requests.
"""
import logging
import re
import calendar
import time
from email.utils import parsedate_tz

from requests.structures import CaseInsensitiveDict

from .cache import DictCache
from .serialize import Serializer


logger = logging.getLogger(__name__)

URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")


def parse_uri(uri):
    """Parses a URI using the regex given in Appendix B of RFC 3986.

    (scheme, authority, path, query, fragment) = parse_uri(uri)
    """
    groups = URI.match(uri).groups()
    return (groups[1], groups[3], groups[4], groups[6], groups[8])
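# For example (a sketch of the tuple this regex yields):
#   parse_uri("http://example.com/path?q=1#frag")
#   -> ('http', 'example.com', '/path', 'q=1', 'frag')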


class CacheController(object):
    """An interface to see if a request should be cached or not.
    """
    def __init__(self, cache=None, cache_etags=True, serializer=None):
        self.cache = cache or DictCache()
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
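    # A construction sketch (the arguments shown are just the defaults
    # from the signature above):
    #   controller = CacheController(cache=DictCache(), cache_etags=True)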

    @classmethod
    def _urlnorm(cls, uri):
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri
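    # For example (a sketch): _urlnorm("HTTP://Example.COM/a?b=1#frag")
    # returns "http://example.com/a?b=1"; scheme and host are lowercased
    # and the fragment is dropped.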

    @classmethod
    def cache_url(cls, uri):
        return cls._urlnorm(uri)

    def parse_cache_control(self, headers):
        """
        Parse the cache control headers returning a dictionary with values
        for the different directives.
        """
        retval = {}

        cc_header = 'cache-control'
        if 'Cache-Control' in headers:
            cc_header = 'Cache-Control'

        if cc_header in headers:
            parts = headers[cc_header].split(',')
            parts_with_args = [
                tuple([x.strip().lower() for x in part.split("=", 1)])
                for part in parts if -1 != part.find("=")
            ]
            parts_wo_args = [
                (name.strip().lower(), 1)
                for name in parts if -1 == name.find("=")
            ]
            retval = dict(parts_with_args + parts_wo_args)
        return retval
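    # For example (a sketch): a header value of "max-age=3600, no-store"
    # parses to {'max-age': '3600', 'no-store': 1}. Directive values stay
    # strings here; bare directives map to the integer 1.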

    def cached_request(self, request):
        """
        Return a cached response if it exists in the cache, otherwise
        return False.
        """
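        # Typical flow (a sketch): a caching adapter consults this method
        # first and only talks to the network when it returns False.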
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if 'no-cache' in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        # Parsed Cache-Control values are strings, so compare against the
        # string '0' (comparing to the integer 0 could never match).
        if cc.get('max-age') == '0':
            logger.debug('Request header has "max-age" as 0, cache bypassed')
            return False

        # Request allows serving from the cache, let's see if we find something
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug('No cache entry available')
            return False

        # Check whether it can be deserialized
        resp = self.serializer.loads(request, cache_data)
        if not resp:
            logger.warning('Cache entry deserialization failed, entry ignored')
            return False

        # If we have a cached 301, return it immediately. We don't
        # need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        # See:
        # https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if resp.status == 301:
            msg = ('Returning cached "301 Moved Permanently" response '
                   '(ignoring date and etag information)')
            logger.debug(msg)
            return resp

        headers = CaseInsensitiveDict(resp.headers)
        if not headers or 'date' not in headers:
            if 'etag' not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug('Purging cached response: no date or etag')
                self.cache.delete(cache_url)
            logger.debug('Ignoring cached response: no date')
            return False

        now = time.time()
        date = calendar.timegm(
            parsedate_tz(headers['date'])
        )
        current_age = max(0, now - date)
        logger.debug('Current age based on date: %i', current_age)

        # TODO: There is an assumption that the result will be a
        # urllib3 response object. This may not be best since we
        # could probably avoid instantiating or constructing the
        # response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
            freshness_lifetime = int(resp_cc['max-age'])
            logger.debug('Freshness lifetime from max-age: %i',
                         freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif 'expires' in headers:
            expires = parsedate_tz(headers['expires'])
            if expires is not None:
                expire_time = calendar.timegm(expires) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i",
                             freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        if 'max-age' in cc:
            try:
                freshness_lifetime = int(cc['max-age'])
                logger.debug('Freshness lifetime from request max-age: %i',
                             freshness_lifetime)
            except ValueError:
                freshness_lifetime = 0

        if 'min-fresh' in cc:
            try:
                min_fresh = int(cc['min-fresh'])
            except ValueError:
                min_fresh = 0
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug('Adjusted current age from min-fresh: %i',
                         current_age)
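        # Worked example (hypothetical numbers): a response dated 100s ago
        # with "Cache-Control: max-age=300" gives freshness_lifetime = 300
        # and current_age = 100; a request "min-fresh=60" bumps current_age
        # to 160, still below 300, so the entry counts as fresh below.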

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug('%i > %i', freshness_lifetime, current_age)
            return resp

        # We're not fresh. If we don't have an Etag, clear it out.
        if 'etag' not in headers:
            logger.debug(
                'The cached response is "stale" with no etag, purging'
            )
            self.cache.delete(cache_url)

        # return the original handler
        return False

    def conditional_headers(self, request):
        cache_url = self.cache_url(request.url)
        resp = self.serializer.loads(request, self.cache.get(cache_url))
        new_headers = {}

        if resp:
            headers = CaseInsensitiveDict(resp.headers)

            if 'etag' in headers:
                new_headers['If-None-Match'] = headers['ETag']

            if 'last-modified' in headers:
                new_headers['If-Modified-Since'] = headers['Last-Modified']

        return new_headers
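    # For example (a sketch): a cached response that carried
    # 'ETag: "abc123"' yields {'If-None-Match': '"abc123"'} here, ready to
    # merge into a revalidation request.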

    def cache_response(self, request, response, body=None):
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
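        # Decision sketch: a 200 response carrying a Date header and
        # "Cache-Control: max-age=3600" gets stored below, while a
        # "no-store" directive purges any existing entry for the URL.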
        # From httplib2: Don't cache 206's since we aren't going to
        # handle byte range requests
        cacheable_status_codes = [200, 203, 300, 301]
        if response.status not in cacheable_status_codes:
            logger.debug(
                'Status code %s not in %s',
                response.status,
                cacheable_status_codes
            )
            return

        response_headers = CaseInsensitiveDict(response.headers)

        # If we've been given a body and the response carries a valid
        # Content-Length, check that the body matches the declared size;
        # if it doesn't, skip caching the mismatched payload.
        if (body is not None and
                "content-length" in response_headers and
                response_headers["content-length"].isdigit() and
                int(response_headers["content-length"]) != len(body)):
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(response_headers)

        cache_url = self.cache_url(request.url)
        logger.debug('Updating cache with response from "%s"', cache_url)

        # Delete it from the cache if we happen to have it stored there
        no_store = False
        if cc.get('no-store'):
            no_store = True
            logger.debug('Response header has "no-store"')
        if cc_req.get('no-store'):
            no_store = True
            logger.debug('Request header has "no-store"')
        if no_store and self.cache.get(cache_url):
            logger.debug('Purging existing cache entry to honor "no-store"')
            self.cache.delete(cache_url)

        # If we've been given an etag, then keep the response
        if self.cache_etags and 'etag' in response_headers:
            logger.debug('Caching due to etag')
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body=body),
            )

        # Add any 301s to the cache. We do this before looking at
        # the Date headers.
        elif response.status == 301:
            logger.debug('Caching permanent redirect')
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response)
            )

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif 'date' in response_headers:
            # cache when there is a max-age > 0
            if cc and cc.get('max-age'):
                if cc['max-age'].isdigit() and int(cc['max-age']) > 0:
                    logger.debug('Caching b/c date exists and max-age > 0')
                    self.cache.set(
                        cache_url,
                        self.serializer.dumps(request, response, body=body),
                    )

            # If the response can expire, it means we should cache it
            # in the meantime.
            elif 'expires' in response_headers:
                if response_headers['expires']:
                    logger.debug('Caching b/c of expires header')
                    self.cache.set(
                        cache_url,
                        self.serializer.dumps(request, response, body=body),
                    )

    def update_cached_response(self, request, response):
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        cache_url = self.cache_url(request.url)

        cached_response = self.serializer.loads(
            request,
            self.cache.get(cache_url)
        )

        if not cached_response:
            # we didn't have a cached response
            return response

        # Let's update our headers with the headers from the new response:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out ones we know might be problematic due to
        # typical assumptions.
        excluded_headers = [
            "content-length",
        ]

        cached_response.headers.update(
            dict((k, v) for k, v in response.headers.items()
                 if k.lower() not in excluded_headers)
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self.cache.set(
            cache_url,
            self.serializer.dumps(request, cached_response),
        )

        return cached_response
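

# A minimal usage sketch (hypothetical example, not part of the module):
#
#   controller = CacheController()
#   controller.cache_url("HTTP://Example.COM/resource")
#   # -> "http://example.com/resource", the normalized cache key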