comparison planemo/lib/python3.7/site-packages/future/backports/http/cookiejar.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children | |
comparison
| -1:000000000000 | 0:d30785e31577 |
|---|---|
1 r"""HTTP cookie handling for web clients. | |
2 | |
3 This is a backport of the Py3.3 ``http.cookiejar`` module for | |
4 python-future. | |
5 | |
6 This module has (now fairly distant) origins in Gisle Aas' Perl module | |
7 HTTP::Cookies, from the libwww-perl library. | |
8 | |
9 Docstrings, comments and debug strings in this code refer to the | |
10 attributes of the HTTP cookie system as cookie-attributes, to distinguish | |
11 them clearly from Python attributes. | |
12 | |
13 Class diagram (note that BSDDBCookieJar and the MSIE* classes are not | |
14 distributed with the Python standard library, but are available from | |
15 http://wwwsearch.sf.net/): | |
16 | |
17 CookieJar____ | |
18 / \ \ | |
19 FileCookieJar \ \ | |
20 / | \ \ \ | |
21 MozillaCookieJar | LWPCookieJar \ \ | |
22 | | \ | |
23 | ---MSIEBase | \ | |
24 | / | | \ | |
25 | / MSIEDBCookieJar BSDDBCookieJar | |
26 |/ | |
27 MSIECookieJar | |
28 | |
29 """ | |
30 | |
31 from __future__ import unicode_literals | |
32 from __future__ import print_function | |
33 from __future__ import division | |
34 from __future__ import absolute_import | |
35 from future.builtins import filter, int, map, open, str | |
36 from future.utils import as_native_str, PY2 | |
37 | |
38 __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy', | |
39 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar'] | |
40 | |
41 import copy | |
42 import datetime | |
43 import re | |
44 if PY2: | |
45 re.ASCII = 0 | |
46 import time | |
47 from future.backports.urllib.parse import urlparse, urlsplit, quote | |
48 from future.backports.http.client import HTTP_PORT | |
49 try: | |
50 import threading as _threading | |
51 except ImportError: | |
52 import dummy_threading as _threading | |
53 from calendar import timegm | |
54 | |
55 debug = False # set to True to enable debugging via the logging module | |
56 logger = None | |
57 | |
58 def _debug(*args): | |
59 if not debug: | |
60 return | |
61 global logger | |
62 if not logger: | |
63 import logging | |
64 logger = logging.getLogger("http.cookiejar") | |
65 return logger.debug(*args) | |
66 | |
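The module-level `debug` flag above gates all `_debug()` calls through the standard `logging` machinery. A minimal sketch of switching it on (illustrative only; assumes the backport is importable under its python-future path):

```python
import logging

from future.backports.http import cookiejar

# Route the "http.cookiejar" logger somewhere visible, then flip the flag.
logging.basicConfig(level=logging.DEBUG)
cookiejar.debug = True   # _debug() is a no-op until this is True
```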
67 | |
68 DEFAULT_HTTP_PORT = str(HTTP_PORT) | |
69 MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar " | |
70 "instance initialised with one)") | |
71 | |
72 def _warn_unhandled_exception(): | |
73 # There are a few catch-all except: statements in this module, for | |
74 # catching input that's bad in unexpected ways. Warn if any | |
75 # exceptions are caught there. | |
76 import io, warnings, traceback | |
77 f = io.StringIO() | |
78 traceback.print_exc(None, f) | |
79 msg = f.getvalue() | |
80 warnings.warn("http.cookiejar bug!\n%s" % msg, stacklevel=2) | |
81 | |
82 | |
83 # Date/time conversion | |
84 # ----------------------------------------------------------------------------- | |
85 | |
86 EPOCH_YEAR = 1970 | |
87 def _timegm(tt): | |
88 year, month, mday, hour, min, sec = tt[:6] | |
89 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and | |
90 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)): | |
91 return timegm(tt) | |
92 else: | |
93 return None | |
94 | |
95 DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] | |
96 MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", | |
97 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] | |
98 MONTHS_LOWER = [] | |
99 for month in MONTHS: MONTHS_LOWER.append(month.lower()) | |
100 | |
101 def time2isoz(t=None): | |
102 """Return a string representing time in seconds since epoch, t. | |
103 | |
104 If the function is called without an argument, it will use the current | |
105 time. | |
106 | |
107 The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ", | |
108 representing Universal Time (UTC, aka GMT). An example of this format is: | |
109 | |
110 1994-11-24 08:49:37Z | |
111 | |
112 """ | |
113 if t is None: | |
114 dt = datetime.datetime.utcnow() | |
115 else: | |
116 dt = datetime.datetime.utcfromtimestamp(t) | |
117 return "%04d-%02d-%02d %02d:%02d:%02dZ" % ( | |
118 dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) | |
119 | |
120 def time2netscape(t=None): | |
121 """Return a string representing time in seconds since epoch, t. | |
122 | |
123 If the function is called without an argument, it will use the current | |
124 time. | |
125 | |
126 The format of the returned string is like this: | |
127 | |
128 Wed, DD-Mon-YYYY HH:MM:SS GMT | |
129 | |
130 """ | |
131 if t is None: | |
132 dt = datetime.datetime.utcnow() | |
133 else: | |
134 dt = datetime.datetime.utcfromtimestamp(t) | |
135 return "%s %02d-%s-%04d %02d:%02d:%02d GMT" % ( | |
136 DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1], | |
137 dt.year, dt.hour, dt.minute, dt.second) | |
138 | |
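A short illustration of the two formatters above for a fixed timestamp (a sketch, assuming the same import path used elsewhere on this page):

```python
from future.backports.http.cookiejar import time2isoz, time2netscape

t = 786240000  # 1994-12-01 00:00:00 UTC
print(time2isoz(t))       # 1994-12-01 00:00:00Z
print(time2netscape(t))   # Thu 01-Dec-1994 00:00:00 GMT (per the format string above)
```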
139 | |
140 UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None} | |
141 | |
142 TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII) | |
143 def offset_from_tz_string(tz): | |
144 offset = None | |
145 if tz in UTC_ZONES: | |
146 offset = 0 | |
147 else: | |
148 m = TIMEZONE_RE.search(tz) | |
149 if m: | |
150 offset = 3600 * int(m.group(2)) | |
151 if m.group(3): | |
152 offset = offset + 60 * int(m.group(3)) | |
153 if m.group(1) == '-': | |
154 offset = -offset | |
155 return offset | |
156 | |
157 def _str2time(day, mon, yr, hr, min, sec, tz): | |
158 # translate month name to number | |
159 # month numbers start with 1 (January) | |
160 try: | |
161 mon = MONTHS_LOWER.index(mon.lower())+1 | |
162 except ValueError: | |
163 # maybe it's already a number | |
164 try: | |
165 imon = int(mon) | |
166 except ValueError: | |
167 return None | |
168 if 1 <= imon <= 12: | |
169 mon = imon | |
170 else: | |
171 return None | |
172 | |
173 # make sure clock elements are defined | |
174 if hr is None: hr = 0 | |
175 if min is None: min = 0 | |
176 if sec is None: sec = 0 | |
177 | |
178 yr = int(yr) | |
179 day = int(day) | |
180 hr = int(hr) | |
181 min = int(min) | |
182 sec = int(sec) | |
183 | |
184 if yr < 1000: | |
185 # find "obvious" year | |
186 cur_yr = time.localtime(time.time())[0] | |
187 m = cur_yr % 100 | |
188 tmp = yr | |
189 yr = yr + cur_yr - m | |
190 m = m - tmp | |
191 if abs(m) > 50: | |
192 if m > 0: yr = yr + 100 | |
193 else: yr = yr - 100 | |
194 | |
195 # convert UTC time tuple to seconds since epoch (not timezone-adjusted) | |
196 t = _timegm((yr, mon, day, hr, min, sec, tz)) | |
197 | |
198 if t is not None: | |
199 # adjust time using timezone string, to get absolute time since epoch | |
200 if tz is None: | |
201 tz = "UTC" | |
202 tz = tz.upper() | |
203 offset = offset_from_tz_string(tz) | |
204 if offset is None: | |
205 return None | |
206 t = t - offset | |
207 | |
208 return t | |
209 | |
210 STRICT_DATE_RE = re.compile( | |
211 r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) " | |
212 "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII) | |
213 WEEKDAY_RE = re.compile( | |
214 r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII) | |
215 LOOSE_HTTP_DATE_RE = re.compile( | |
216 r"""^ | |
217 (\d\d?) # day | |
218 (?:\s+|[-\/]) | |
219 (\w+) # month | |
220 (?:\s+|[-\/]) | |
221 (\d+) # year | |
222 (?: | |
223 (?:\s+|:) # separator before clock | |
224 (\d\d?):(\d\d) # hour:min | |
225 (?::(\d\d))? # optional seconds | |
226 )? # optional clock | |
227 \s* | |
228 ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone | |
229 \s* | |
230 (?:\(\w+\))? # ASCII representation of timezone in parens. | |
231 \s*$""", re.X | re.ASCII) | |
232 def http2time(text): | |
233 """Returns time in seconds since epoch of time represented by a string. | |
234 | |
235 Return value is an integer. | |
236 | |
237 None is returned if the format of str is unrecognized, the time is outside | |
238 the representable range, or the timezone string is not recognized. If the | |
239 string contains no timezone, UTC is assumed. | |
240 | |
241 The timezone in the string may be numerical (like "-0800" or "+0100") or a | |
242 string timezone (like "UTC", "GMT", "BST" or "EST"). Currently, only the | |
243 timezone strings equivalent to UTC (zero offset) are known to the function. | |
244 | |
245 The function loosely parses the following formats: | |
246 | |
247 Wed, 09 Feb 1994 22:23:32 GMT -- HTTP format | |
248 Tuesday, 08-Feb-94 14:15:29 GMT -- old rfc850 HTTP format | |
249 Tuesday, 08-Feb-1994 14:15:29 GMT -- broken rfc850 HTTP format | |
250 09 Feb 1994 22:23:32 GMT -- HTTP format (no weekday) | |
251 08-Feb-94 14:15:29 GMT -- rfc850 format (no weekday) | |
252 08-Feb-1994 14:15:29 GMT -- broken rfc850 format (no weekday) | |
253 | |
254 The parser ignores leading and trailing whitespace. The time may be | |
255 absent. | |
256 | |
257 If the year is given with only 2 digits, the function will select the | |
258 century that makes the year closest to the current date. | |
259 | |
260 """ | |
261 # fast exit for strictly conforming string | |
262 m = STRICT_DATE_RE.search(text) | |
263 if m: | |
264 g = m.groups() | |
265 mon = MONTHS_LOWER.index(g[1].lower()) + 1 | |
266 tt = (int(g[2]), mon, int(g[0]), | |
267 int(g[3]), int(g[4]), float(g[5])) | |
268 return _timegm(tt) | |
269 | |
270 # No, we need some messy parsing... | |
271 | |
272 # clean up | |
273 text = text.lstrip() | |
274 text = WEEKDAY_RE.sub("", text, 1) # Useless weekday | |
275 | |
276 # tz is time zone specifier string | |
277 day, mon, yr, hr, min, sec, tz = [None]*7 | |
278 | |
279 # loose regexp parse | |
280 m = LOOSE_HTTP_DATE_RE.search(text) | |
281 if m is not None: | |
282 day, mon, yr, hr, min, sec, tz = m.groups() | |
283 else: | |
284 return None # bad format | |
285 | |
286 return _str2time(day, mon, yr, hr, min, sec, tz) | |
287 | |
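A quick sketch of `http2time()` on the formats listed in its docstring (illustrative; results are seconds since the epoch):

```python
from future.backports.http.cookiejar import http2time

print(http2time("09 Feb 1994 22:23:32 GMT"))   # 760832612
print(http2time("09-Feb-94 14:15:29 GMT"))     # 760803329 (two-digit year resolved to 1994)
print(http2time("not a date"))                 # None
```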
288 ISO_DATE_RE = re.compile( | |
289 """^ | |
290 (\d{4}) # year | |
291 [-\/]? | |
292 (\d\d?) # numerical month | |
293 [-\/]? | |
294 (\d\d?) # day | |
295 (?: | |
296 (?:\s+|[-:Tt]) # separator before clock | |
297 (\d\d?):?(\d\d) # hour:min | |
298 (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) | |
299 )? # optional clock | |
300 \s* | |
301 ([-+]?\d\d?:?(:?\d\d)? | |
302 |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) | |
303 \s*$""", re.X | re. ASCII) | |
304 def iso2time(text): | |
305 """ | |
306 As for http2time, but parses the ISO 8601 formats: | |
307 | |
308 1994-02-03 14:15:29 -0100 -- ISO 8601 format | |
309 1994-02-03 14:15:29 -- zone is optional | |
310 1994-02-03 -- only date | |
311 1994-02-03T14:15:29 -- Use T as separator | |
312 19940203T141529Z -- ISO 8601 compact format | |
313 19940203 -- only date | |
314 | |
315 """ | |
316 # clean up | |
317 text = text.lstrip() | |
318 | |
319 # tz is time zone specifier string | |
320 day, mon, yr, hr, min, sec, tz = [None]*7 | |
321 | |
322 # loose regexp parse | |
323 m = ISO_DATE_RE.search(text) | |
324 if m is not None: | |
325 # XXX there's an extra bit of the timezone I'm ignoring here: is | |
326 # this the right thing to do? | |
327 yr, mon, day, hr, min, sec, tz, _ = m.groups() | |
328 else: | |
329 return None # bad format | |
330 | |
331 return _str2time(day, mon, yr, hr, min, sec, tz) | |
332 | |
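`iso2time()` follows the same contract for the ISO 8601 variants listed above (illustrative sketch):

```python
from future.backports.http.cookiejar import iso2time

print(iso2time("1994-02-09 22:23:32Z"))   # 760832612
print(iso2time("19940209T222332Z"))       # 760832612 (compact form)
print(iso2time("1994-02-09"))             # 760752000 (date only, midnight UTC)
```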
333 | |
334 # Header parsing | |
335 # ----------------------------------------------------------------------------- | |
336 | |
337 def unmatched(match): | |
338 """Return unmatched part of re.Match object.""" | |
339 start, end = match.span(0) | |
340 return match.string[:start]+match.string[end:] | |
341 | |
342 HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)") | |
343 HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"") | |
344 HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)") | |
345 HEADER_ESCAPE_RE = re.compile(r"\\(.)") | |
346 def split_header_words(header_values): | |
347 r"""Parse header values into a list of lists containing key,value pairs. | |
348 | |
349 The function knows how to deal with ",", ";" and "=" as well as quoted | |
350 values after "=". A list of space separated tokens are parsed as if they | |
351 were separated by ";". | |
352 | |
353 If the header_values passed as argument contains multiple values, then they | |
354 are treated as if they were a single value separated by comma ",". | |
355 | |
356 This means that this function is useful for parsing header fields that | |
357 follow this syntax (BNF as from the HTTP/1.1 specification, but we relax | |
358 the requirement for tokens). | |
359 | |
360 headers = #header | |
361 header = (token | parameter) *( [";"] (token | parameter)) | |
362 | |
363 token = 1*<any CHAR except CTLs or separators> | |
364 separators = "(" | ")" | "<" | ">" | "@" | |
365 | "," | ";" | ":" | "\" | <"> | |
366 | "/" | "[" | "]" | "?" | "=" | |
367 | "{" | "}" | SP | HT | |
368 | |
369 quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) | |
370 qdtext = <any TEXT except <">> | |
371 quoted-pair = "\" CHAR | |
372 | |
373 parameter = attribute "=" value | |
374 attribute = token | |
375 value = token | quoted-string | |
376 | |
377 Each header is represented by a list of key/value pairs. The value for a | |
378 simple token (not part of a parameter) is None. Syntactically incorrect | |
379 headers will not necessarily be parsed as you would want. | |
380 | |
381 This is easier to describe with some examples: | |
382 | |
383 >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz']) | |
384 [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]] | |
385 >>> split_header_words(['text/html; charset="iso-8859-1"']) | |
386 [[('text/html', None), ('charset', 'iso-8859-1')]] | |
387 >>> split_header_words([r'Basic realm="\"foo\bar\""']) | |
388 [[('Basic', None), ('realm', '"foobar"')]] | |
389 | |
390 """ | |
391 assert not isinstance(header_values, str) | |
392 result = [] | |
393 for text in header_values: | |
394 orig_text = text | |
395 pairs = [] | |
396 while text: | |
397 m = HEADER_TOKEN_RE.search(text) | |
398 if m: | |
399 text = unmatched(m) | |
400 name = m.group(1) | |
401 m = HEADER_QUOTED_VALUE_RE.search(text) | |
402 if m: # quoted value | |
403 text = unmatched(m) | |
404 value = m.group(1) | |
405 value = HEADER_ESCAPE_RE.sub(r"\1", value) | |
406 else: | |
407 m = HEADER_VALUE_RE.search(text) | |
408 if m: # unquoted value | |
409 text = unmatched(m) | |
410 value = m.group(1) | |
411 value = value.rstrip() | |
412 else: | |
413 # no value, a lone token | |
414 value = None | |
415 pairs.append((name, value)) | |
416 elif text.lstrip().startswith(","): | |
417 # concatenated headers, as per RFC 2616 section 4.2 | |
418 text = text.lstrip()[1:] | |
419 if pairs: result.append(pairs) | |
420 pairs = [] | |
421 else: | |
422 # skip junk | |
423 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text) | |
424 assert nr_junk_chars > 0, ( | |
425 "split_header_words bug: '%s', '%s', %s" % | |
426 (orig_text, text, pairs)) | |
427 text = non_junk | |
428 if pairs: result.append(pairs) | |
429 return result | |
430 | |
431 HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])") | |
432 def join_header_words(lists): | |
433 """Do the inverse (almost) of the conversion done by split_header_words. | |
434 | |
435 Takes a list of lists of (key, value) pairs and produces a single header | |
436 value. Attribute values are quoted if needed. | |
437 | |
438 >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]]) | |
439 'text/plain; charset="iso-8859/1"' | |
440 >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]]) | |
441 'text/plain, charset="iso-8859/1"' | |
442 | |
443 """ | |
444 headers = [] | |
445 for pairs in lists: | |
446 attr = [] | |
447 for k, v in pairs: | |
448 if v is not None: | |
449 if not re.search(r"^\w+$", v): | |
450 v = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", v) # escape " and \ | |
451 v = '"%s"' % v | |
452 k = "%s=%s" % (k, v) | |
453 attr.append(k) | |
454 if attr: headers.append("; ".join(attr)) | |
455 return ", ".join(headers) | |
456 | |
457 def strip_quotes(text): | |
458 if text.startswith('"'): | |
459 text = text[1:] | |
460 if text.endswith('"'): | |
461 text = text[:-1] | |
462 return text | |
463 | |
464 def parse_ns_headers(ns_headers): | |
465 """Ad-hoc parser for Netscape protocol cookie-attributes. | |
466 | |
467 The old Netscape cookie format for Set-Cookie can for instance contain | |
468 an unquoted "," in the expires field, so we have to use this ad-hoc | |
469 parser instead of split_header_words. | |
470 | |
471 XXX This may not make the best possible effort to parse all the crap | |
472 that Netscape Cookie headers contain. Ronald Tschalar's HTTPClient | |
473 parser is probably better, so could do worse than following that if | |
474 this ever gives any trouble. | |
475 | |
476 Currently, this is also used for parsing RFC 2109 cookies. | |
477 | |
478 """ | |
479 known_attrs = ("expires", "domain", "path", "secure", | |
480 # RFC 2109 attrs (may turn up in Netscape cookies, too) | |
481 "version", "port", "max-age") | |
482 | |
483 result = [] | |
484 for ns_header in ns_headers: | |
485 pairs = [] | |
486 version_set = False | |
487 for ii, param in enumerate(re.split(r";\s*", ns_header)): | |
488 param = param.rstrip() | |
489 if param == "": continue | |
490 if "=" not in param: | |
491 k, v = param, None | |
492 else: | |
493 k, v = re.split(r"\s*=\s*", param, 1) | |
494 k = k.lstrip() | |
495 if ii != 0: | |
496 lc = k.lower() | |
497 if lc in known_attrs: | |
498 k = lc | |
499 if k == "version": | |
500 # This is an RFC 2109 cookie. | |
501 v = strip_quotes(v) | |
502 version_set = True | |
503 if k == "expires": | |
504 # convert expires date to seconds since epoch | |
505 v = http2time(strip_quotes(v)) # None if invalid | |
506 pairs.append((k, v)) | |
507 | |
508 if pairs: | |
509 if not version_set: | |
510 pairs.append(("version", "0")) | |
511 result.append(pairs) | |
512 | |
513 return result | |
514 | |
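A rough illustration of what `parse_ns_headers()` produces for a typical Netscape-style `Set-Cookie` value (a sketch; the header contents are made up):

```python
from future.backports.http.cookiejar import parse_ns_headers

print(parse_ns_headers(["sid=abc123; Path=/; Secure; Max-Age=3600"]))
# [[('sid', 'abc123'), ('path', '/'), ('secure', None),
#   ('max-age', '3600'), ('version', '0')]]
```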
515 | |
516 IPV4_RE = re.compile(r"\.\d+$", re.ASCII) | |
517 def is_HDN(text): | |
518 """Return True if text is a host domain name.""" | |
519 # XXX | |
520 # This may well be wrong. Which RFC is HDN defined in, if any (for | |
521 # the purposes of RFC 2965)? | |
522 # For the current implementation, what about IPv6? Remember to look | |
523 # at other uses of IPV4_RE also, if change this. | |
524 if IPV4_RE.search(text): | |
525 return False | |
526 if text == "": | |
527 return False | |
528 if text[0] == "." or text[-1] == ".": | |
529 return False | |
530 return True | |
531 | |
532 def domain_match(A, B): | |
533 """Return True if domain A domain-matches domain B, according to RFC 2965. | |
534 | |
535 A and B may be host domain names or IP addresses. | |
536 | |
537 RFC 2965, section 1: | |
538 | |
539 Host names can be specified either as an IP address or a HDN string. | |
540 Sometimes we compare one host name with another. (Such comparisons SHALL | |
541 be case-insensitive.) Host A's name domain-matches host B's if | |
542 | |
543 * their host name strings string-compare equal; or | |
544 | |
545 * A is a HDN string and has the form NB, where N is a non-empty | |
546 name string, B has the form .B', and B' is a HDN string. (So, | |
547 x.y.com domain-matches .Y.com but not Y.com.) | |
548 | |
549 Note that domain-match is not a commutative operation: a.b.c.com | |
550 domain-matches .c.com, but not the reverse. | |
551 | |
552 """ | |
553 # Note that, if A or B are IP addresses, the only relevant part of the | |
554 # definition of the domain-match algorithm is the direct string-compare. | |
555 A = A.lower() | |
556 B = B.lower() | |
557 if A == B: | |
558 return True | |
559 if not is_HDN(A): | |
560 return False | |
561 i = A.rfind(B) | |
562 if i == -1 or i == 0: | |
563 # A does not have form NB, or N is the empty string | |
564 return False | |
565 if not B.startswith("."): | |
566 return False | |
567 if not is_HDN(B[1:]): | |
568 return False | |
569 return True | |
570 | |
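The RFC 2965 rules quoted above boil down to behaviour like the following (illustrative):

```python
from future.backports.http.cookiejar import domain_match

print(domain_match("x.y.com", ".y.com"))  # True:  A has the form N + ".y.com"
print(domain_match("y.com", ".y.com"))    # False: A does not have the form NB
print(domain_match("x.y.com", "y.com"))   # False: B lacks the leading dot
```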
571 def liberal_is_HDN(text): | |
572 """Return True if text is a sort-of-like a host domain name. | |
573 | |
574 For accepting/blocking domains. | |
575 | |
576 """ | |
577 if IPV4_RE.search(text): | |
578 return False | |
579 return True | |
580 | |
581 def user_domain_match(A, B): | |
582 """For blocking/accepting domains. | |
583 | |
584 A and B may be host domain names or IP addresses. | |
585 | |
586 """ | |
587 A = A.lower() | |
588 B = B.lower() | |
589 if not (liberal_is_HDN(A) and liberal_is_HDN(B)): | |
590 if A == B: | |
591 # equal IP addresses | |
592 return True | |
593 return False | |
594 initial_dot = B.startswith(".") | |
595 if initial_dot and A.endswith(B): | |
596 return True | |
597 if not initial_dot and A == B: | |
598 return True | |
599 return False | |
600 | |
601 cut_port_re = re.compile(r":\d+$", re.ASCII) | |
602 def request_host(request): | |
603 """Return request-host, as defined by RFC 2965. | |
604 | |
605 Variation from RFC: returned value is lowercased, for convenient | |
606 comparison. | |
607 | |
608 """ | |
609 url = request.get_full_url() | |
610 host = urlparse(url)[1] | |
611 if host == "": | |
612 host = request.get_header("Host", "") | |
613 | |
614 # remove port, if present | |
615 host = cut_port_re.sub("", host, 1) | |
616 return host.lower() | |
617 | |
618 def eff_request_host(request): | |
619 """Return a tuple (request-host, effective request-host name). | |
620 | |
621 As defined by RFC 2965, except both are lowercased. | |
622 | |
623 """ | |
624 erhn = req_host = request_host(request) | |
625 if req_host.find(".") == -1 and not IPV4_RE.search(req_host): | |
626 erhn = req_host + ".local" | |
627 return req_host, erhn | |
628 | |
629 def request_path(request): | |
630 """Path component of request-URI, as defined by RFC 2965.""" | |
631 url = request.get_full_url() | |
632 parts = urlsplit(url) | |
633 path = escape_path(parts.path) | |
634 if not path.startswith("/"): | |
635 # fix bad RFC 2396 absoluteURI | |
636 path = "/" + path | |
637 return path | |
638 | |
639 def request_port(request): | |
640 host = request.host | |
641 i = host.find(':') | |
642 if i >= 0: | |
643 port = host[i+1:] | |
644 try: | |
645 int(port) | |
646 except ValueError: | |
647 _debug("nonnumeric port: '%s'", port) | |
648 return None | |
649 else: | |
650 port = DEFAULT_HTTP_PORT | |
651 return port | |
652 | |
653 # Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't | |
654 # need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738). | |
655 HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()" | |
656 ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])") | |
657 def uppercase_escaped_char(match): | |
658 return "%%%s" % match.group(1).upper() | |
659 def escape_path(path): | |
660 """Escape any invalid characters in HTTP URL, and uppercase all escapes.""" | |
661 # There's no knowing what character encoding was used to create URLs | |
662 # containing %-escapes, but since we have to pick one to escape invalid | |
663 # path characters, we pick UTF-8, as recommended in the HTML 4.0 | |
664 # specification: | |
665 # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1 | |
666 # And here, kind of: draft-fielding-uri-rfc2396bis-03 | |
667 # (And in draft IRI specification: draft-duerst-iri-05) | |
668 # (And here, for new URI schemes: RFC 2718) | |
669 path = quote(path, HTTP_PATH_SAFE) | |
670 path = ESCAPED_CHAR_RE.sub(uppercase_escaped_char, path) | |
671 return path | |
672 | |
673 def reach(h): | |
674 """Return reach of host h, as defined by RFC 2965, section 1. | |
675 | |
676 The reach R of a host name H is defined as follows: | |
677 | |
678 * If | |
679 | |
680 - H is the host domain name of a host; and, | |
681 | |
682 - H has the form A.B; and | |
683 | |
684 - A has no embedded (that is, interior) dots; and | |
685 | |
686 - B has at least one embedded dot, or B is the string "local". | |
687 then the reach of H is .B. | |
688 | |
689 * Otherwise, the reach of H is H. | |
690 | |
691 >>> reach("www.acme.com") | |
692 '.acme.com' | |
693 >>> reach("acme.com") | |
694 'acme.com' | |
695 >>> reach("acme.local") | |
696 '.local' | |
697 | |
698 """ | |
699 i = h.find(".") | |
700 if i >= 0: | |
701 #a = h[:i] # this line is only here to show what a is | |
702 b = h[i+1:] | |
703 i = b.find(".") | |
704 if is_HDN(h) and (i >= 0 or b == "local"): | |
705 return "."+b | |
706 return h | |
707 | |
708 def is_third_party(request): | |
709 """ | |
710 | |
711 RFC 2965, section 3.3.6: | |
712 | |
713 An unverifiable transaction is to a third-party host if its request- | |
714 host U does not domain-match the reach R of the request-host O in the | |
715 origin transaction. | |
716 | |
717 """ | |
718 req_host = request_host(request) | |
719 if not domain_match(req_host, reach(request.get_origin_req_host())): | |
720 return True | |
721 else: | |
722 return False | |
723 | |
724 | |
725 class Cookie(object): | |
726 """HTTP Cookie. | |
727 | |
728 This class represents both Netscape and RFC 2965 cookies. | |
729 | |
730 This is deliberately a very simple class. It just holds attributes. It's | |
731 possible to construct Cookie instances that don't comply with the cookie | |
732 standards. CookieJar.make_cookies is the factory function for Cookie | |
733 objects -- it deals with cookie parsing, supplying defaults, and | |
734 normalising to the representation used in this class. CookiePolicy is | |
735 responsible for checking them to see whether they should be accepted from | |
736 and returned to the server. | |
737 | |
738 Note that the port may be present in the headers, but unspecified ("Port" | |
739 rather than"Port=80", for example); if this is the case, port is None. | |
740 | |
741 """ | |
742 | |
743 def __init__(self, version, name, value, | |
744 port, port_specified, | |
745 domain, domain_specified, domain_initial_dot, | |
746 path, path_specified, | |
747 secure, | |
748 expires, | |
749 discard, | |
750 comment, | |
751 comment_url, | |
752 rest, | |
753 rfc2109=False, | |
754 ): | |
755 | |
756 if version is not None: version = int(version) | |
757 if expires is not None: expires = int(expires) | |
758 if port is None and port_specified is True: | |
759 raise ValueError("if port is None, port_specified must be false") | |
760 | |
761 self.version = version | |
762 self.name = name | |
763 self.value = value | |
764 self.port = port | |
765 self.port_specified = port_specified | |
766 # normalise case, as per RFC 2965 section 3.3.3 | |
767 self.domain = domain.lower() | |
768 self.domain_specified = domain_specified | |
769 # Sigh. We need to know whether the domain given in the | |
770 # cookie-attribute had an initial dot, in order to follow RFC 2965 | |
771 # (as clarified in draft errata). Needed for the returned $Domain | |
772 # value. | |
773 self.domain_initial_dot = domain_initial_dot | |
774 self.path = path | |
775 self.path_specified = path_specified | |
776 self.secure = secure | |
777 self.expires = expires | |
778 self.discard = discard | |
779 self.comment = comment | |
780 self.comment_url = comment_url | |
781 self.rfc2109 = rfc2109 | |
782 | |
783 self._rest = copy.copy(rest) | |
784 | |
785 def has_nonstandard_attr(self, name): | |
786 return name in self._rest | |
787 def get_nonstandard_attr(self, name, default=None): | |
788 return self._rest.get(name, default) | |
789 def set_nonstandard_attr(self, name, value): | |
790 self._rest[name] = value | |
791 | |
792 def is_expired(self, now=None): | |
793 if now is None: now = time.time() | |
794 if (self.expires is not None) and (self.expires <= now): | |
795 return True | |
796 return False | |
797 | |
798 def __str__(self): | |
799 if self.port is None: p = "" | |
800 else: p = ":"+self.port | |
801 limit = self.domain + p + self.path | |
802 if self.value is not None: | |
803 namevalue = "%s=%s" % (self.name, self.value) | |
804 else: | |
805 namevalue = self.name | |
806 return "<Cookie %s for %s>" % (namevalue, limit) | |
807 | |
808 @as_native_str() | |
809 def __repr__(self): | |
810 args = [] | |
811 for name in ("version", "name", "value", | |
812 "port", "port_specified", | |
813 "domain", "domain_specified", "domain_initial_dot", | |
814 "path", "path_specified", | |
815 "secure", "expires", "discard", "comment", "comment_url", | |
816 ): | |
817 attr = getattr(self, name) | |
818 ### Python-Future: | |
819 # Avoid u'...' prefixes for unicode strings: | |
820 if isinstance(attr, str): | |
821 attr = str(attr) | |
822 ### | |
823 args.append(str("%s=%s") % (name, repr(attr))) | |
824 args.append("rest=%s" % repr(self._rest)) | |
825 args.append("rfc2109=%s" % repr(self.rfc2109)) | |
826 return "Cookie(%s)" % ", ".join(args) | |
827 | |
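`CookieJar.make_cookies()` is the normal way these objects get built, but a `Cookie` can also be constructed directly, for example in tests (a minimal sketch; all values are made up):

```python
from future.backports.http.cookiejar import Cookie, CookieJar

c = Cookie(version=0, name="sid", value="abc123",
           port=None, port_specified=False,
           domain="www.example.com", domain_specified=False,
           domain_initial_dot=False,
           path="/", path_specified=True,
           secure=False, expires=None, discard=True,
           comment=None, comment_url=None, rest={})
jar = CookieJar()
jar.set_cookie(c)
print(c)   # <Cookie sid=abc123 for www.example.com/>
```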
828 | |
829 class CookiePolicy(object): | |
830 """Defines which cookies get accepted from and returned to server. | |
831 | |
832 May also modify cookies, though this is probably a bad idea. | |
833 | |
834 The subclass DefaultCookiePolicy defines the standard rules for Netscape | |
835 and RFC 2965 cookies -- override that if you want a customised policy. | |
836 | |
837 """ | |
838 def set_ok(self, cookie, request): | |
839 """Return true if (and only if) cookie should be accepted from server. | |
840 | |
841 Currently, pre-expired cookies never get this far -- the CookieJar | |
842 class deletes such cookies itself. | |
843 | |
844 """ | |
845 raise NotImplementedError() | |
846 | |
847 def return_ok(self, cookie, request): | |
848 """Return true if (and only if) cookie should be returned to server.""" | |
849 raise NotImplementedError() | |
850 | |
851 def domain_return_ok(self, domain, request): | |
852 """Return false if cookies should not be returned, given cookie domain. | |
853 """ | |
854 return True | |
855 | |
856 def path_return_ok(self, path, request): | |
857 """Return false if cookies should not be returned, given cookie path. | |
858 """ | |
859 return True | |
860 | |
861 | |
862 class DefaultCookiePolicy(CookiePolicy): | |
863 """Implements the standard rules for accepting and returning cookies.""" | |
864 | |
865 DomainStrictNoDots = 1 | |
866 DomainStrictNonDomain = 2 | |
867 DomainRFC2965Match = 4 | |
868 | |
869 DomainLiberal = 0 | |
870 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain | |
871 | |
872 def __init__(self, | |
873 blocked_domains=None, allowed_domains=None, | |
874 netscape=True, rfc2965=False, | |
875 rfc2109_as_netscape=None, | |
876 hide_cookie2=False, | |
877 strict_domain=False, | |
878 strict_rfc2965_unverifiable=True, | |
879 strict_ns_unverifiable=False, | |
880 strict_ns_domain=DomainLiberal, | |
881 strict_ns_set_initial_dollar=False, | |
882 strict_ns_set_path=False, | |
883 ): | |
884 """Constructor arguments should be passed as keyword arguments only.""" | |
885 self.netscape = netscape | |
886 self.rfc2965 = rfc2965 | |
887 self.rfc2109_as_netscape = rfc2109_as_netscape | |
888 self.hide_cookie2 = hide_cookie2 | |
889 self.strict_domain = strict_domain | |
890 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable | |
891 self.strict_ns_unverifiable = strict_ns_unverifiable | |
892 self.strict_ns_domain = strict_ns_domain | |
893 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar | |
894 self.strict_ns_set_path = strict_ns_set_path | |
895 | |
896 if blocked_domains is not None: | |
897 self._blocked_domains = tuple(blocked_domains) | |
898 else: | |
899 self._blocked_domains = () | |
900 | |
901 if allowed_domains is not None: | |
902 allowed_domains = tuple(allowed_domains) | |
903 self._allowed_domains = allowed_domains | |
904 | |
905 def blocked_domains(self): | |
906 """Return the sequence of blocked domains (as a tuple).""" | |
907 return self._blocked_domains | |
908 def set_blocked_domains(self, blocked_domains): | |
909 """Set the sequence of blocked domains.""" | |
910 self._blocked_domains = tuple(blocked_domains) | |
911 | |
912 def is_blocked(self, domain): | |
913 for blocked_domain in self._blocked_domains: | |
914 if user_domain_match(domain, blocked_domain): | |
915 return True | |
916 return False | |
917 | |
918 def allowed_domains(self): | |
919 """Return None, or the sequence of allowed domains (as a tuple).""" | |
920 return self._allowed_domains | |
921 def set_allowed_domains(self, allowed_domains): | |
922 """Set the sequence of allowed domains, or None.""" | |
923 if allowed_domains is not None: | |
924 allowed_domains = tuple(allowed_domains) | |
925 self._allowed_domains = allowed_domains | |
926 | |
927 def is_not_allowed(self, domain): | |
928 if self._allowed_domains is None: | |
929 return False | |
930 for allowed_domain in self._allowed_domains: | |
931 if user_domain_match(domain, allowed_domain): | |
932 return False | |
933 return True | |
934 | |
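Illustrative use of the block/allow lists handled above when building a policy (domain names are made up):

```python
from future.backports.http.cookiejar import CookieJar, DefaultCookiePolicy

policy = DefaultCookiePolicy(
    blocked_domains=["ads.example.com", ".tracker.example"],
    strict_ns_domain=DefaultCookiePolicy.DomainStrict)
jar = CookieJar(policy)
print(policy.is_blocked("ads.example.com"))   # True
print(policy.is_blocked("www.example.com"))   # False
```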
935 def set_ok(self, cookie, request): | |
936 """ | |
937 If you override .set_ok(), be sure to call this method. If it returns | |
938 false, so should your subclass (assuming your subclass wants to be more | |
939 strict about which cookies to accept). | |
940 | |
941 """ | |
942 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) | |
943 | |
944 assert cookie.name is not None | |
945 | |
946 for n in "version", "verifiability", "name", "path", "domain", "port": | |
947 fn_name = "set_ok_"+n | |
948 fn = getattr(self, fn_name) | |
949 if not fn(cookie, request): | |
950 return False | |
951 | |
952 return True | |
953 | |
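As the note above suggests, a stricter subclass should call the base `set_ok()` first and only add checks on top. A hypothetical sketch:

```python
from future.backports.http.cookiejar import DefaultCookiePolicy, is_third_party

class VerifiedOnlyPolicy(DefaultCookiePolicy):
    """Illustrative policy that additionally refuses all third-party cookies."""
    def set_ok(self, cookie, request):
        if not DefaultCookiePolicy.set_ok(self, cookie, request):
            return False
        # extra, illustrative restriction on top of the standard rules
        return not (request.unverifiable and is_third_party(request))
```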
954 def set_ok_version(self, cookie, request): | |
955 if cookie.version is None: | |
956 # Version is always set to 0 by parse_ns_headers if it's a Netscape | |
957 # cookie, so this must be an invalid RFC 2965 cookie. | |
958 _debug(" Set-Cookie2 without version attribute (%s=%s)", | |
959 cookie.name, cookie.value) | |
960 return False | |
961 if cookie.version > 0 and not self.rfc2965: | |
962 _debug(" RFC 2965 cookies are switched off") | |
963 return False | |
964 elif cookie.version == 0 and not self.netscape: | |
965 _debug(" Netscape cookies are switched off") | |
966 return False | |
967 return True | |
968 | |
969 def set_ok_verifiability(self, cookie, request): | |
970 if request.unverifiable and is_third_party(request): | |
971 if cookie.version > 0 and self.strict_rfc2965_unverifiable: | |
972 _debug(" third-party RFC 2965 cookie during " | |
973 "unverifiable transaction") | |
974 return False | |
975 elif cookie.version == 0 and self.strict_ns_unverifiable: | |
976 _debug(" third-party Netscape cookie during " | |
977 "unverifiable transaction") | |
978 return False | |
979 return True | |
980 | |
981 def set_ok_name(self, cookie, request): | |
982 # Try and stop servers setting V0 cookies designed to hack other | |
983 # servers that know both V0 and V1 protocols. | |
984 if (cookie.version == 0 and self.strict_ns_set_initial_dollar and | |
985 cookie.name.startswith("$")): | |
986 _debug(" illegal name (starts with '$'): '%s'", cookie.name) | |
987 return False | |
988 return True | |
989 | |
990 def set_ok_path(self, cookie, request): | |
991 if cookie.path_specified: | |
992 req_path = request_path(request) | |
993 if ((cookie.version > 0 or | |
994 (cookie.version == 0 and self.strict_ns_set_path)) and | |
995 not req_path.startswith(cookie.path)): | |
996 _debug(" path attribute %s is not a prefix of request " | |
997 "path %s", cookie.path, req_path) | |
998 return False | |
999 return True | |
1000 | |
1001 def set_ok_domain(self, cookie, request): | |
1002 if self.is_blocked(cookie.domain): | |
1003 _debug(" domain %s is in user block-list", cookie.domain) | |
1004 return False | |
1005 if self.is_not_allowed(cookie.domain): | |
1006 _debug(" domain %s is not in user allow-list", cookie.domain) | |
1007 return False | |
1008 if cookie.domain_specified: | |
1009 req_host, erhn = eff_request_host(request) | |
1010 domain = cookie.domain | |
1011 if self.strict_domain and (domain.count(".") >= 2): | |
1012 # XXX This should probably be compared with the Konqueror | |
1013 # (kcookiejar.cpp) and Mozilla implementations, but it's a | |
1014 # losing battle. | |
1015 i = domain.rfind(".") | |
1016 j = domain.rfind(".", 0, i) | |
1017 if j == 0: # domain like .foo.bar | |
1018 tld = domain[i+1:] | |
1019 sld = domain[j+1:i] | |
1020 if sld.lower() in ("co", "ac", "com", "edu", "org", "net", | |
1021 "gov", "mil", "int", "aero", "biz", "cat", "coop", | |
1022 "info", "jobs", "mobi", "museum", "name", "pro", | |
1023 "travel", "eu") and len(tld) == 2: | |
1024 # domain like .co.uk | |
1025 _debug(" country-code second level domain %s", domain) | |
1026 return False | |
1027 if domain.startswith("."): | |
1028 undotted_domain = domain[1:] | |
1029 else: | |
1030 undotted_domain = domain | |
1031 embedded_dots = (undotted_domain.find(".") >= 0) | |
1032 if not embedded_dots and domain != ".local": | |
1033 _debug(" non-local domain %s contains no embedded dot", | |
1034 domain) | |
1035 return False | |
1036 if cookie.version == 0: | |
1037 if (not erhn.endswith(domain) and | |
1038 (not erhn.startswith(".") and | |
1039 not ("."+erhn).endswith(domain))): | |
1040 _debug(" effective request-host %s (even with added " | |
1041 "initial dot) does not end with %s", | |
1042 erhn, domain) | |
1043 return False | |
1044 if (cookie.version > 0 or | |
1045 (self.strict_ns_domain & self.DomainRFC2965Match)): | |
1046 if not domain_match(erhn, domain): | |
1047 _debug(" effective request-host %s does not domain-match " | |
1048 "%s", erhn, domain) | |
1049 return False | |
1050 if (cookie.version > 0 or | |
1051 (self.strict_ns_domain & self.DomainStrictNoDots)): | |
1052 host_prefix = req_host[:-len(domain)] | |
1053 if (host_prefix.find(".") >= 0 and | |
1054 not IPV4_RE.search(req_host)): | |
1055 _debug(" host prefix %s for domain %s contains a dot", | |
1056 host_prefix, domain) | |
1057 return False | |
1058 return True | |
1059 | |
1060 def set_ok_port(self, cookie, request): | |
1061 if cookie.port_specified: | |
1062 req_port = request_port(request) | |
1063 if req_port is None: | |
1064 req_port = "80" | |
1065 else: | |
1066 req_port = str(req_port) | |
1067 for p in cookie.port.split(","): | |
1068 try: | |
1069 int(p) | |
1070 except ValueError: | |
1071 _debug(" bad port %s (not numeric)", p) | |
1072 return False | |
1073 if p == req_port: | |
1074 break | |
1075 else: | |
1076 _debug(" request port (%s) not found in %s", | |
1077 req_port, cookie.port) | |
1078 return False | |
1079 return True | |
1080 | |
1081 def return_ok(self, cookie, request): | |
1082 """ | |
1083 If you override .return_ok(), be sure to call this method. If it | |
1084 returns false, so should your subclass (assuming your subclass wants to | |
1085 be more strict about which cookies to return). | |
1086 | |
1087 """ | |
1088 # Path has already been checked by .path_return_ok(), and domain | |
1089 # blocking done by .domain_return_ok(). | |
1090 _debug(" - checking cookie %s=%s", cookie.name, cookie.value) | |
1091 | |
1092 for n in "version", "verifiability", "secure", "expires", "port", "domain": | |
1093 fn_name = "return_ok_"+n | |
1094 fn = getattr(self, fn_name) | |
1095 if not fn(cookie, request): | |
1096 return False | |
1097 return True | |
1098 | |
1099 def return_ok_version(self, cookie, request): | |
1100 if cookie.version > 0 and not self.rfc2965: | |
1101 _debug(" RFC 2965 cookies are switched off") | |
1102 return False | |
1103 elif cookie.version == 0 and not self.netscape: | |
1104 _debug(" Netscape cookies are switched off") | |
1105 return False | |
1106 return True | |
1107 | |
1108 def return_ok_verifiability(self, cookie, request): | |
1109 if request.unverifiable and is_third_party(request): | |
1110 if cookie.version > 0 and self.strict_rfc2965_unverifiable: | |
1111 _debug(" third-party RFC 2965 cookie during unverifiable " | |
1112 "transaction") | |
1113 return False | |
1114 elif cookie.version == 0 and self.strict_ns_unverifiable: | |
1115 _debug(" third-party Netscape cookie during unverifiable " | |
1116 "transaction") | |
1117 return False | |
1118 return True | |
1119 | |
1120 def return_ok_secure(self, cookie, request): | |
1121 if cookie.secure and request.type != "https": | |
1122 _debug(" secure cookie with non-secure request") | |
1123 return False | |
1124 return True | |
1125 | |
1126 def return_ok_expires(self, cookie, request): | |
1127 if cookie.is_expired(self._now): | |
1128 _debug(" cookie expired") | |
1129 return False | |
1130 return True | |
1131 | |
1132 def return_ok_port(self, cookie, request): | |
1133 if cookie.port: | |
1134 req_port = request_port(request) | |
1135 if req_port is None: | |
1136 req_port = "80" | |
1137 for p in cookie.port.split(","): | |
1138 if p == req_port: | |
1139 break | |
1140 else: | |
1141 _debug(" request port %s does not match cookie port %s", | |
1142 req_port, cookie.port) | |
1143 return False | |
1144 return True | |
1145 | |
1146 def return_ok_domain(self, cookie, request): | |
1147 req_host, erhn = eff_request_host(request) | |
1148 domain = cookie.domain | |
1149 | |
1150 # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't | |
1151 if (cookie.version == 0 and | |
1152 (self.strict_ns_domain & self.DomainStrictNonDomain) and | |
1153 not cookie.domain_specified and domain != erhn): | |
1154 _debug(" cookie with unspecified domain does not string-compare " | |
1155 "equal to request domain") | |
1156 return False | |
1157 | |
1158 if cookie.version > 0 and not domain_match(erhn, domain): | |
1159 _debug(" effective request-host name %s does not domain-match " | |
1160 "RFC 2965 cookie domain %s", erhn, domain) | |
1161 return False | |
1162 if cookie.version == 0 and not ("."+erhn).endswith(domain): | |
1163 _debug(" request-host %s does not match Netscape cookie domain " | |
1164 "%s", req_host, domain) | |
1165 return False | |
1166 return True | |
1167 | |
1168 def domain_return_ok(self, domain, request): | |
1169 # Liberal check of the domain. This is here as an optimization to avoid | |
1170 # having to load lots of MSIE cookie files unless necessary. | |
1171 req_host, erhn = eff_request_host(request) | |
1172 if not req_host.startswith("."): | |
1173 req_host = "."+req_host | |
1174 if not erhn.startswith("."): | |
1175 erhn = "."+erhn | |
1176 if not (req_host.endswith(domain) or erhn.endswith(domain)): | |
1177 #_debug(" request domain %s does not match cookie domain %s", | |
1178 # req_host, domain) | |
1179 return False | |
1180 | |
1181 if self.is_blocked(domain): | |
1182 _debug(" domain %s is in user block-list", domain) | |
1183 return False | |
1184 if self.is_not_allowed(domain): | |
1185 _debug(" domain %s is not in user allow-list", domain) | |
1186 return False | |
1187 | |
1188 return True | |
1189 | |
1190 def path_return_ok(self, path, request): | |
1191 _debug("- checking cookie path=%s", path) | |
1192 req_path = request_path(request) | |
1193 if not req_path.startswith(path): | |
1194 _debug(" %s does not path-match %s", req_path, path) | |
1195 return False | |
1196 return True | |
1197 | |
1198 | |
1199 def vals_sorted_by_key(adict): | |
1200 keys = sorted(adict.keys()) | |
1201 return map(adict.get, keys) | |
1202 | |
1203 def deepvalues(mapping): | |
1204 """Iterates over nested mapping, depth-first, in sorted order by key.""" | |
1205 values = vals_sorted_by_key(mapping) | |
1206 for obj in values: | |
1207 mapping = False | |
1208 try: | |
1209 obj.items | |
1210 except AttributeError: | |
1211 pass | |
1212 else: | |
1213 mapping = True | |
1214 for subobj in deepvalues(obj): | |
1215 yield subobj | |
1216 if not mapping: | |
1217 yield obj | |
1218 | |
1219 | |
1220 # Used as second parameter to dict.get() method, to distinguish absent | |
1221 # dict key from one with a None value. | |
1222 class Absent(object): pass | |
1223 | |
1224 class CookieJar(object): | |
1225 """Collection of HTTP cookies. | |
1226 | |
1227 You may not need to know about this class: try | |
1228 urllib.request.build_opener(HTTPCookieProcessor).open(url). | |
1229 """ | |
1230 | |
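The pattern the docstring alludes to, spelled out (a sketch using the Python 3 stdlib opener names; the backported jar exposes the same `extract_cookies`/`add_cookie_header` interface the processor expects):

```python
from urllib.request import build_opener, HTTPCookieProcessor
from future.backports.http.cookiejar import CookieJar

jar = CookieJar()
opener = build_opener(HTTPCookieProcessor(jar))
# response = opener.open("http://example.com/")   # cookies collect in `jar`
```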
1231 non_word_re = re.compile(r"\W") | |
1232 quote_re = re.compile(r"([\"\\])") | |
1233 strict_domain_re = re.compile(r"\.?[^.]*") | |
1234 domain_re = re.compile(r"[^.]*") | |
1235 dots_re = re.compile(r"^\.+") | |
1236 | |
1237 magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII) | |
1238 | |
1239 def __init__(self, policy=None): | |
1240 if policy is None: | |
1241 policy = DefaultCookiePolicy() | |
1242 self._policy = policy | |
1243 | |
1244 self._cookies_lock = _threading.RLock() | |
1245 self._cookies = {} | |
1246 | |
1247 def set_policy(self, policy): | |
1248 self._policy = policy | |
1249 | |
1250 def _cookies_for_domain(self, domain, request): | |
1251 cookies = [] | |
1252 if not self._policy.domain_return_ok(domain, request): | |
1253 return [] | |
1254 _debug("Checking %s for cookies to return", domain) | |
1255 cookies_by_path = self._cookies[domain] | |
1256 for path in cookies_by_path.keys(): | |
1257 if not self._policy.path_return_ok(path, request): | |
1258 continue | |
1259 cookies_by_name = cookies_by_path[path] | |
1260 for cookie in cookies_by_name.values(): | |
1261 if not self._policy.return_ok(cookie, request): | |
1262 _debug(" not returning cookie") | |
1263 continue | |
1264 _debug(" it's a match") | |
1265 cookies.append(cookie) | |
1266 return cookies | |
1267 | |
1268 def _cookies_for_request(self, request): | |
1269 """Return a list of cookies to be returned to server.""" | |
1270 cookies = [] | |
1271 for domain in self._cookies.keys(): | |
1272 cookies.extend(self._cookies_for_domain(domain, request)) | |
1273 return cookies | |
1274 | |
1275 def _cookie_attrs(self, cookies): | |
1276 """Return a list of cookie-attributes to be returned to server. | |
1277 | |
1278 like ['foo="bar"; $Path="/"', ...] | |
1279 | |
1280 The $Version attribute is also added when appropriate (currently only | |
1281 once per request). | |
1282 | |
1283 """ | |
1284 # add cookies in order of most specific (ie. longest) path first | |
1285 cookies.sort(key=lambda a: len(a.path), reverse=True) | |
1286 | |
1287 version_set = False | |
1288 | |
1289 attrs = [] | |
1290 for cookie in cookies: | |
1291 # set version of Cookie header | |
1292 # XXX | |
1293 # What should it be if multiple matching Set-Cookie headers have | |
1294 # different versions themselves? | |
1295 # Answer: there is no answer; was supposed to be settled by | |
1296 # RFC 2965 errata, but that may never appear... | |
1297 version = cookie.version | |
1298 if not version_set: | |
1299 version_set = True | |
1300 if version > 0: | |
1301 attrs.append("$Version=%s" % version) | |
1302 | |
1303 # quote cookie value if necessary | |
1304 # (not for Netscape protocol, which already has any quotes | |
1305 # intact, due to the poorly-specified Netscape Cookie: syntax) | |
1306 if ((cookie.value is not None) and | |
1307 self.non_word_re.search(cookie.value) and version > 0): | |
1308 value = self.quote_re.sub(r"\\\1", cookie.value) | |
1309 else: | |
1310 value = cookie.value | |
1311 | |
1312 # add cookie-attributes to be returned in Cookie header | |
1313 if cookie.value is None: | |
1314 attrs.append(cookie.name) | |
1315 else: | |
1316 attrs.append("%s=%s" % (cookie.name, value)) | |
1317 if version > 0: | |
1318 if cookie.path_specified: | |
1319 attrs.append('$Path="%s"' % cookie.path) | |
1320 if cookie.domain.startswith("."): | |
1321 domain = cookie.domain | |
1322 if (not cookie.domain_initial_dot and | |
1323 domain.startswith(".")): | |
1324 domain = domain[1:] | |
1325 attrs.append('$Domain="%s"' % domain) | |
1326 if cookie.port is not None: | |
1327 p = "$Port" | |
1328 if cookie.port_specified: | |
1329 p = p + ('="%s"' % cookie.port) | |
1330 attrs.append(p) | |
1331 | |
1332 return attrs | |
1333 | |
1334 def add_cookie_header(self, request): | |
1335 """Add correct Cookie: header to request (urllib.request.Request object). | |
1336 | |
1337 The Cookie2 header is also added unless policy.hide_cookie2 is true. | |
1338 | |
1339 """ | |
1340 _debug("add_cookie_header") | |
1341 self._cookies_lock.acquire() | |
1342 try: | |
1343 | |
1344 self._policy._now = self._now = int(time.time()) | |
1345 | |
1346 cookies = self._cookies_for_request(request) | |
1347 | |
1348 attrs = self._cookie_attrs(cookies) | |
1349 if attrs: | |
1350 if not request.has_header("Cookie"): | |
1351 request.add_unredirected_header( | |
1352 "Cookie", "; ".join(attrs)) | |
1353 | |
1354 # if necessary, advertise that we know RFC 2965 | |
1355 if (self._policy.rfc2965 and not self._policy.hide_cookie2 and | |
1356 not request.has_header("Cookie2")): | |
1357 for cookie in cookies: | |
1358 if cookie.version != 1: | |
1359 request.add_unredirected_header("Cookie2", '$Version="1"') | |
1360 break | |
1361 | |
1362 finally: | |
1363 self._cookies_lock.release() | |
1364 | |
1365 self.clear_expired_cookies() | |
1366 | |
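Putting the pieces together: `add_cookie_header()` fills in the `Cookie` header on a urllib-style request object (an offline sketch with a hand-made cookie):

```python
from urllib.request import Request
from future.backports.http.cookiejar import Cookie, CookieJar

jar = CookieJar()
jar.set_cookie(Cookie(0, "sid", "abc123", None, False,
                      "www.example.com", False, False, "/", True,
                      False, None, True, None, None, {}))
req = Request("http://www.example.com/")
jar.add_cookie_header(req)
print(req.get_header("Cookie"))   # sid=abc123
```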
1367 def _normalized_cookie_tuples(self, attrs_set): | |
1368 """Return list of tuples containing normalised cookie information. | |
1369 | |
1370 attrs_set is the list of lists of key,value pairs extracted from | |
1371 the Set-Cookie or Set-Cookie2 headers. | |
1372 | |
1373 Tuples are name, value, standard, rest, where name and value are the | |
1374 cookie name and value, standard is a dictionary containing the standard | |
1375 cookie-attributes (discard, secure, version, expires or max-age, | |
1376 domain, path and port) and rest is a dictionary containing the rest of | |
1377 the cookie-attributes. | |
1378 | |
1379 """ | |
1380 cookie_tuples = [] | |
1381 | |
1382 boolean_attrs = "discard", "secure" | |
1383 value_attrs = ("version", | |
1384 "expires", "max-age", | |
1385 "domain", "path", "port", | |
1386 "comment", "commenturl") | |
1387 | |
1388 for cookie_attrs in attrs_set: | |
1389 name, value = cookie_attrs[0] | |
1390 | |
1391 # Build dictionary of standard cookie-attributes (standard) and | |
1392 # dictionary of other cookie-attributes (rest). | |
1393 | |
1394 # Note: expiry time is normalised to seconds since epoch. V0 | |
1395 # cookies should have the Expires cookie-attribute, and V1 cookies | |
1396 # should have Max-Age, but since V1 includes RFC 2109 cookies (and | |
1397 # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we | |
1398 # accept either (but prefer Max-Age). | |
1399 max_age_set = False | |
1400 | |
1401 bad_cookie = False | |
1402 | |
1403 standard = {} | |
1404 rest = {} | |
1405 for k, v in cookie_attrs[1:]: | |
1406 lc = k.lower() | |
1407 # don't lose case distinction for unknown fields | |
1408 if lc in value_attrs or lc in boolean_attrs: | |
1409 k = lc | |
1410 if k in boolean_attrs and v is None: | |
1411 # boolean cookie-attribute is present, but has no value | |
1412 # (like "discard", rather than "port=80") | |
1413 v = True | |
1414 if k in standard: | |
1415 # only first value is significant | |
1416 continue | |
1417 if k == "domain": | |
1418 if v is None: | |
1419 _debug(" missing value for domain attribute") | |
1420 bad_cookie = True | |
1421 break | |
1422 # RFC 2965 section 3.3.3 | |
1423 v = v.lower() | |
1424 if k == "expires": | |
1425 if max_age_set: | |
1426 # Prefer max-age to expires (like Mozilla) | |
1427 continue | |
1428 if v is None: | |
1429 _debug(" missing or invalid value for expires " | |
1430 "attribute: treating as session cookie") | |
1431 continue | |
1432 if k == "max-age": | |
1433 max_age_set = True | |
1434 try: | |
1435 v = int(v) | |
1436 except ValueError: | |
1437 _debug(" missing or invalid (non-numeric) value for " | |
1438 "max-age attribute") | |
1439 bad_cookie = True | |
1440 break | |
1441 # convert RFC 2965 Max-Age to seconds since epoch | |
1442 # XXX Strictly you're supposed to follow RFC 2616 | |
1443 # age-calculation rules. Remember that zero Max-Age | |
1444 # is a request to discard (old and new) cookie, though. | |
1445 k = "expires" | |
1446 v = self._now + v | |
1447 if (k in value_attrs) or (k in boolean_attrs): | |
1448 if (v is None and | |
1449 k not in ("port", "comment", "commenturl")): | |
1450 _debug(" missing value for %s attribute" % k) | |
1451 bad_cookie = True | |
1452 break | |
1453 standard[k] = v | |
1454 else: | |
1455 rest[k] = v | |
1456 | |
1457 if bad_cookie: | |
1458 continue | |
1459 | |
1460 cookie_tuples.append((name, value, standard, rest)) | |
1461 | |
1462 return cookie_tuples | |
1463 | |
1464 def _cookie_from_cookie_tuple(self, tup, request): | |
1465 # standard is dict of standard cookie-attributes, rest is dict of the | |
1466 # rest of them | |
1467 name, value, standard, rest = tup | |
1468 | |
1469 domain = standard.get("domain", Absent) | |
1470 path = standard.get("path", Absent) | |
1471 port = standard.get("port", Absent) | |
1472 expires = standard.get("expires", Absent) | |
1473 | |
1474 # set the easy defaults | |
1475 version = standard.get("version", None) | |
1476 if version is not None: | |
1477 try: | |
1478 version = int(version) | |
1479 except ValueError: | |
1480 return None # invalid version, ignore cookie | |
1481 secure = standard.get("secure", False) | |
1482 # (discard is also set if expires is Absent) | |
1483 discard = standard.get("discard", False) | |
1484 comment = standard.get("comment", None) | |
1485 comment_url = standard.get("commenturl", None) | |
1486 | |
1487 # set default path | |
1488 if path is not Absent and path != "": | |
1489 path_specified = True | |
1490 path = escape_path(path) | |
1491 else: | |
1492 path_specified = False | |
1493 path = request_path(request) | |
1494 i = path.rfind("/") | |
1495 if i != -1: | |
1496 if version == 0: | |
1497 # Netscape spec parts company from reality here | |
1498 path = path[:i] | |
1499 else: | |
1500 path = path[:i+1] | |
1501 if len(path) == 0: path = "/" | |
1502 | |
1503 # set default domain | |
1504 domain_specified = domain is not Absent | |
1505 # but first we have to remember whether it starts with a dot | |
1506 domain_initial_dot = False | |
1507 if domain_specified: | |
1508 domain_initial_dot = bool(domain.startswith(".")) | |
1509 if domain is Absent: | |
1510 req_host, erhn = eff_request_host(request) | |
1511 domain = erhn | |
1512 elif not domain.startswith("."): | |
1513 domain = "."+domain | |
1514 | |
1515 # set default port | |
1516 port_specified = False | |
1517 if port is not Absent: | |
1518 if port is None: | |
1519 # Port attr present, but has no value: default to request port. | |
1520 # Cookie should then only be sent back on that port. | |
1521 port = request_port(request) | |
1522 else: | |
1523 port_specified = True | |
1524 port = re.sub(r"\s+", "", port) | |
1525 else: | |
1526 # No port attr present. Cookie can be sent back on any port. | |
1527 port = None | |
1528 | |
1529 # set default expires and discard | |
1530 if expires is Absent: | |
1531 expires = None | |
1532 discard = True | |
1533 elif expires <= self._now: | |
1534 # Expiry date in past is request to delete cookie. This can't be | |
1535 # in DefaultCookiePolicy, because we can't delete cookies there. | |
1536 try: | |
1537 self.clear(domain, path, name) | |
1538 except KeyError: | |
1539 pass | |
1540 _debug("Expiring cookie, domain='%s', path='%s', name='%s'", | |
1541 domain, path, name) | |
1542 return None | |
1543 | |
1544 return Cookie(version, | |
1545 name, value, | |
1546 port, port_specified, | |
1547 domain, domain_specified, domain_initial_dot, | |
1548 path, path_specified, | |
1549 secure, | |
1550 expires, | |
1551 discard, | |
1552 comment, | |
1553 comment_url, | |
1554 rest) | |
1555 | |
1556 def _cookies_from_attrs_set(self, attrs_set, request): | |
1557 cookie_tuples = self._normalized_cookie_tuples(attrs_set) | |
1558 | |
1559 cookies = [] | |
1560 for tup in cookie_tuples: | |
1561 cookie = self._cookie_from_cookie_tuple(tup, request) | |
1562 if cookie: cookies.append(cookie) | |
1563 return cookies | |
1564 | |
1565 def _process_rfc2109_cookies(self, cookies): | |
1566 rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None) | |
1567 if rfc2109_as_ns is None: | |
1568 rfc2109_as_ns = not self._policy.rfc2965 | |
1569 for cookie in cookies: | |
1570 if cookie.version == 1: | |
1571 cookie.rfc2109 = True | |
1572 if rfc2109_as_ns: | |
1573 # treat 2109 cookies as Netscape cookies rather than | |
1574 # as RFC2965 cookies | |
1575 cookie.version = 0 | |
1576 | |
1577 def make_cookies(self, response, request): | |
1578 """Return sequence of Cookie objects extracted from response object.""" | |
1579 # get cookie-attributes for RFC 2965 and Netscape protocols | |
1580 headers = response.info() | |
1581 rfc2965_hdrs = headers.get_all("Set-Cookie2", []) | |
1582 ns_hdrs = headers.get_all("Set-Cookie", []) | |
1583 | |
1584 rfc2965 = self._policy.rfc2965 | |
1585 netscape = self._policy.netscape | |
1586 | |
1587 if ((not rfc2965_hdrs and not ns_hdrs) or | |
1588 (not ns_hdrs and not rfc2965) or | |
1589 (not rfc2965_hdrs and not netscape) or | |
1590 (not netscape and not rfc2965)): | |
1591 return [] # no relevant cookie headers: quick exit | |
1592 | |
1593 try: | |
1594 cookies = self._cookies_from_attrs_set( | |
1595 split_header_words(rfc2965_hdrs), request) | |
1596 except Exception: | |
1597 _warn_unhandled_exception() | |
1598 cookies = [] | |
1599 | |
1600 if ns_hdrs and netscape: | |
1601 try: | |
1602 # RFC 2109 and Netscape cookies | |
1603 ns_cookies = self._cookies_from_attrs_set( | |
1604 parse_ns_headers(ns_hdrs), request) | |
1605 except Exception: | |
1606 _warn_unhandled_exception() | |
1607 ns_cookies = [] | |
1608 self._process_rfc2109_cookies(ns_cookies) | |
1609 | |
1610 # Look for Netscape cookies (from Set-Cookie headers) that match | |
1611 # corresponding RFC 2965 cookies (from Set-Cookie2 headers). | |
1612 # For each match, keep the RFC 2965 cookie and ignore the Netscape | |
1613 # cookie (RFC 2965 section 9.1). Actually, RFC 2109 cookies are | |
1614 # bundled in with the Netscape cookies for this purpose, which is | |
1615 # reasonable behaviour. | |
1616 if rfc2965: | |
1617 lookup = {} | |
1618 for cookie in cookies: | |
1619 lookup[(cookie.domain, cookie.path, cookie.name)] = None | |
1620 | |
1621 def no_matching_rfc2965(ns_cookie, lookup=lookup): | |
1622 key = ns_cookie.domain, ns_cookie.path, ns_cookie.name | |
1623 return key not in lookup | |
1624 ns_cookies = filter(no_matching_rfc2965, ns_cookies) | |
1625 | |
1626 if ns_cookies: | |
1627 cookies.extend(ns_cookies) | |
1628 | |
1629 return cookies | |
1630 | |
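# Illustrative sketch of the two header families make_cookies() above
# reconciles (the header values are made up):
#   Set-Cookie2: sid="abc"; Version=1; Path="/"    -> RFC 2965 cookie
#   Set-Cookie:  sid=abc; path=/                   -> Netscape/RFC 2109 cookie
# When both describe the same (domain, path, name), only the RFC 2965 cookie
# is kept, as per RFC 2965 section 9.1.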
1631 def set_cookie_if_ok(self, cookie, request): | |
1632 """Set a cookie if policy says it's OK to do so.""" | |
1633 self._cookies_lock.acquire() | |
1634 try: | |
1635 self._policy._now = self._now = int(time.time()) | |
1636 | |
1637 if self._policy.set_ok(cookie, request): | |
1638 self.set_cookie(cookie) | |
1639 | |
1640 | |
1641 finally: | |
1642 self._cookies_lock.release() | |
1643 | |
1644 def set_cookie(self, cookie): | |
1645 """Set a cookie, without checking whether or not it should be set.""" | |
1646 c = self._cookies | |
1647 self._cookies_lock.acquire() | |
1648 try: | |
1649 if cookie.domain not in c: c[cookie.domain] = {} | |
1650 c2 = c[cookie.domain] | |
1651 if cookie.path not in c2: c2[cookie.path] = {} | |
1652 c3 = c2[cookie.path] | |
1653 c3[cookie.name] = cookie | |
1654 finally: | |
1655 self._cookies_lock.release() | |
1656 | |
1657 def extract_cookies(self, response, request): | |
1658 """Extract cookies from response, where allowable given the request.""" | |
1659 _debug("extract_cookies: %s", response.info()) | |
1660 self._cookies_lock.acquire() | |
1661 try: | |
1662 self._policy._now = self._now = int(time.time()) | |
1663 | |
1664 for cookie in self.make_cookies(response, request): | |
1665 if self._policy.set_ok(cookie, request): | |
1666 _debug(" setting cookie: %s", cookie) | |
1667 self.set_cookie(cookie) | |
1668 finally: | |
1669 self._cookies_lock.release() | |
1670 | |
1671 def clear(self, domain=None, path=None, name=None): | |
1672 """Clear some cookies. | |
1673 | |
1674 Invoking this method without arguments will clear all cookies. If | |
1675 given a single argument, only cookies belonging to that domain will be | |
1676 removed. If given two arguments, cookies belonging to the specified | |
1677 path within that domain are removed. If given three arguments, then | |
1678 the cookie with the specified name, path and domain is removed. | |
1679 | |
1680 Raises KeyError if no matching cookie exists. | |
1681 | |
1682 """ | |
1683 if name is not None: | |
1684 if (domain is None) or (path is None): | |
1685 raise ValueError( | |
1686 "domain and path must be given to remove a cookie by name") | |
1687 del self._cookies[domain][path][name] | |
1688 elif path is not None: | |
1689 if domain is None: | |
1690 raise ValueError( | |
1691 "domain must be given to remove cookies by path") | |
1692 del self._cookies[domain][path] | |
1693 elif domain is not None: | |
1694 del self._cookies[domain] | |
1695 else: | |
1696 self._cookies = {} | |
1697 | |
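# Hedged usage sketch for clear() above; the domain, path and name values are
# placeholders:
#   jar.clear()                                   # remove every cookie
#   jar.clear(".example.com")                     # remove one whole domain
#   jar.clear(".example.com", "/", "sessionid")   # remove one named cookie
# Passing a name without both domain and path raises ValueError; clearing a
# cookie that does not exist raises KeyError.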
1698 def clear_session_cookies(self): | |
1699 """Discard all session cookies. | |
1700 | |
1701 Note that the .save() method won't save session cookies anyway, unless | |
1702 you ask otherwise by passing a true ignore_discard argument. | |
1703 | |
1704 """ | |
1705 self._cookies_lock.acquire() | |
1706 try: | |
1707 for cookie in self: | |
1708 if cookie.discard: | |
1709 self.clear(cookie.domain, cookie.path, cookie.name) | |
1710 finally: | |
1711 self._cookies_lock.release() | |
1712 | |
1713 def clear_expired_cookies(self): | |
1714 """Discard all expired cookies. | |
1715 | |
1716 You probably don't need to call this method: expired cookies are never | |
1717 sent back to the server (provided you're using DefaultCookiePolicy); | |
1718 this method is called by CookieJar itself every so often; and the | |
1719 .save() method won't save expired cookies anyway (unless you ask | |
1720 otherwise by passing a true ignore_expires argument). | |
1721 | |
1722 """ | |
1723 self._cookies_lock.acquire() | |
1724 try: | |
1725 now = time.time() | |
1726 for cookie in self: | |
1727 if cookie.is_expired(now): | |
1728 self.clear(cookie.domain, cookie.path, cookie.name) | |
1729 finally: | |
1730 self._cookies_lock.release() | |
1731 | |
1732 def __iter__(self): | |
1733 return deepvalues(self._cookies) | |
1734 | |
1735 def __len__(self): | |
1736 """Return number of contained cookies.""" | |
1737 i = 0 | |
1738 for cookie in self: i = i + 1 | |
1739 return i | |
1740 | |
1741 @as_native_str() | |
1742 def __repr__(self): | |
1743 r = [] | |
1744 for cookie in self: r.append(repr(cookie)) | |
1745 return "<%s[%s]>" % (self.__class__, ", ".join(r)) | |
1746 | |
1747 def __str__(self): | |
1748 r = [] | |
1749 for cookie in self: r.append(str(cookie)) | |
1750 return "<%s[%s]>" % (self.__class__, ", ".join(r)) | |
1751 | |
1752 | |
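# Hedged end-to-end sketch of the in-memory CookieJar API defined above.  The
# cookie attribute values are placeholders and nothing here runs at import
# time; set_cookie() is used because it bypasses the policy check.
def _cookiejar_usage_sketch():
    jar = CookieJar()
    session_cookie = Cookie(
        0, "sessionid", "abc123",       # version, name, value
        None, False,                    # port, port_specified
        ".example.com", True, True,     # domain, domain_specified, domain_initial_dot
        "/", False,                     # path, path_specified
        False,                          # secure
        None,                           # expires (None means a session cookie)
        True,                           # discard
        None, None, {})                 # comment, comment_url, rest
    jar.set_cookie(session_cookie)
    assert len(jar) == 1
    jar.clear_session_cookies()         # drops every cookie whose discard flag is set
    assert len(jar) == 0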
1753 # derives from IOError for backwards-compatibility with Python 2.4.0 | |
1754 class LoadError(IOError): pass | |
1755 | |
1756 class FileCookieJar(CookieJar): | |
1757 """CookieJar that can be loaded from and saved to a file.""" | |
1758 | |
1759 def __init__(self, filename=None, delayload=False, policy=None): | |
1760 """ | |
1761 Cookies are NOT loaded from the named file until either the .load() or | |
1762 .revert() method is called. | |
1763 | |
1764 """ | |
1765 CookieJar.__init__(self, policy) | |
1766 if filename is not None: | |
1767 try: | |
1768 filename+"" | |
1769 except: | |
1770 raise ValueError("filename must be string-like") | |
1771 self.filename = filename | |
1772 self.delayload = bool(delayload) | |
1773 | |
1774 def save(self, filename=None, ignore_discard=False, ignore_expires=False): | |
1775 """Save cookies to a file.""" | |
1776 raise NotImplementedError() | |
1777 | |
1778 def load(self, filename=None, ignore_discard=False, ignore_expires=False): | |
1779 """Load cookies from a file.""" | |
1780 if filename is None: | |
1781 if self.filename is not None: filename = self.filename | |
1782 else: raise ValueError(MISSING_FILENAME_TEXT) | |
1783 | |
1784 f = open(filename) | |
1785 try: | |
1786 self._really_load(f, filename, ignore_discard, ignore_expires) | |
1787 finally: | |
1788 f.close() | |
1789 | |
1790 def revert(self, filename=None, | |
1791 ignore_discard=False, ignore_expires=False): | |
1792 """Clear all cookies and reload cookies from a saved file. | |
1793 | |
1794 Raises LoadError (or IOError) if reversion is not successful; the | |
1795 object's state will not be altered if this happens. | |
1796 | |
1797 """ | |
1798 if filename is None: | |
1799 if self.filename is not None: filename = self.filename | |
1800 else: raise ValueError(MISSING_FILENAME_TEXT) | |
1801 | |
1802 self._cookies_lock.acquire() | |
1803 try: | |
1804 | |
1805 old_state = copy.deepcopy(self._cookies) | |
1806 self._cookies = {} | |
1807 try: | |
1808 self.load(filename, ignore_discard, ignore_expires) | |
1809 except (LoadError, IOError): | |
1810 self._cookies = old_state | |
1811 raise | |
1812 | |
1813 finally: | |
1814 self._cookies_lock.release() | |
1815 | |
1816 | |
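# revert() above is all-or-nothing: if load() raises LoadError or IOError, the
# jar's previous in-memory cookies are restored.  Hedged sketch (the concrete
# subclass and file name are placeholders):
#   jar = LWPCookieJar("cookies.lwp")
#   try:
#       jar.revert()                 # reload the jar from the file
#   except (LoadError, IOError):
#       pass                         # jar still holds whatever it held before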
1817 def lwp_cookie_str(cookie): | |
1818 """Return string representation of Cookie in an the LWP cookie file format. | |
1819 | |
1820 Actually, the format is extended a bit -- see module docstring. | |
1821 | |
1822 """ | |
1823 h = [(cookie.name, cookie.value), | |
1824 ("path", cookie.path), | |
1825 ("domain", cookie.domain)] | |
1826 if cookie.port is not None: h.append(("port", cookie.port)) | |
1827 if cookie.path_specified: h.append(("path_spec", None)) | |
1828 if cookie.port_specified: h.append(("port_spec", None)) | |
1829 if cookie.domain_initial_dot: h.append(("domain_dot", None)) | |
1830 if cookie.secure: h.append(("secure", None)) | |
1831 if cookie.expires: h.append(("expires", | |
1832 time2isoz(float(cookie.expires)))) | |
1833 if cookie.discard: h.append(("discard", None)) | |
1834 if cookie.comment: h.append(("comment", cookie.comment)) | |
1835 if cookie.comment_url: h.append(("commenturl", cookie.comment_url)) | |
1836 | |
1837 keys = sorted(cookie._rest.keys()) | |
1838 for k in keys: | |
1839 h.append((k, str(cookie._rest[k]))) | |
1840 | |
1841 h.append(("version", str(cookie.version))) | |
1842 | |
1843 return join_header_words([h]) | |
1844 | |
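# A line produced by lwp_cookie_str() above looks roughly like this
# (illustrative values; exact quoting is decided by join_header_words(), and
# as_lwp_str() below prefixes each such line with "Set-Cookie3: "):
#   sid=abc123; path="/"; domain=".example.com"; path_spec; domain_dot;
#   expires="2025-12-31 23:59:59Z"; version=0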
1845 class LWPCookieJar(FileCookieJar): | |
1846 """ | |
1847 The LWPCookieJar saves a sequence of "Set-Cookie3" lines. | |
1848 "Set-Cookie3" is the format used by the libwww-perl libary, not known | |
1849 to be compatible with any browser, but which is easy to read and | |
1850 doesn't lose information about RFC 2965 cookies. | |
1851 | |
1852 Additional methods | |
1853 | |
1854 as_lwp_str(ignore_discard=True, ignore_expires=True) | |
1855 | |
1856 """ | |
magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)  # matches the "#LWP-Cookies-2.0" header written by save(); _really_load() below needs this attribute
1857 | |
1858 def as_lwp_str(self, ignore_discard=True, ignore_expires=True): | |
1859 """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers. | |
1860 | |
1861 ignore_discard and ignore_expires: see docstring for FileCookieJar.save | |
1862 | |
1863 """ | |
1864 now = time.time() | |
1865 r = [] | |
1866 for cookie in self: | |
1867 if not ignore_discard and cookie.discard: | |
1868 continue | |
1869 if not ignore_expires and cookie.is_expired(now): | |
1870 continue | |
1871 r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie)) | |
1872 return "\n".join(r+[""]) | |
1873 | |
1874 def save(self, filename=None, ignore_discard=False, ignore_expires=False): | |
1875 if filename is None: | |
1876 if self.filename is not None: filename = self.filename | |
1877 else: raise ValueError(MISSING_FILENAME_TEXT) | |
1878 | |
1879 f = open(filename, "w") | |
1880 try: | |
1881 # There really isn't an LWP Cookies 2.0 format, but this indicates | |
1882 # that there is extra information in here (domain_dot and | |
1883 # port_spec) while still being compatible with libwww-perl, I hope. | |
1884 f.write("#LWP-Cookies-2.0\n") | |
1885 f.write(self.as_lwp_str(ignore_discard, ignore_expires)) | |
1886 finally: | |
1887 f.close() | |
1888 | |
1889 def _really_load(self, f, filename, ignore_discard, ignore_expires): | |
1890 magic = f.readline() | |
1891 if not self.magic_re.search(magic): | |
1892 msg = ("%r does not look like a Set-Cookie3 (LWP) format " | |
1893 "file" % filename) | |
1894 raise LoadError(msg) | |
1895 | |
1896 now = time.time() | |
1897 | |
1898 header = "Set-Cookie3:" | |
1899 boolean_attrs = ("port_spec", "path_spec", "domain_dot", | |
1900 "secure", "discard") | |
1901 value_attrs = ("version", | |
1902 "port", "path", "domain", | |
1903 "expires", | |
1904 "comment", "commenturl") | |
1905 | |
1906 try: | |
1907 while 1: | |
1908 line = f.readline() | |
1909 if line == "": break | |
1910 if not line.startswith(header): | |
1911 continue | |
1912 line = line[len(header):].strip() | |
1913 | |
1914 for data in split_header_words([line]): | |
1915 name, value = data[0] | |
1916 standard = {} | |
1917 rest = {} | |
1918 for k in boolean_attrs: | |
1919 standard[k] = False | |
1920 for k, v in data[1:]: | |
1921 if k is not None: | |
1922 lc = k.lower() | |
1923 else: | |
1924 lc = None | |
1925 # don't lose case distinction for unknown fields | |
1926 if (lc in value_attrs) or (lc in boolean_attrs): | |
1927 k = lc | |
1928 if k in boolean_attrs: | |
1929 if v is None: v = True | |
1930 standard[k] = v | |
1931 elif k in value_attrs: | |
1932 standard[k] = v | |
1933 else: | |
1934 rest[k] = v | |
1935 | |
1936 h = standard.get | |
1937 expires = h("expires") | |
1938 discard = h("discard") | |
1939 if expires is not None: | |
1940 expires = iso2time(expires) | |
1941 if expires is None: | |
1942 discard = True | |
1943 domain = h("domain") | |
1944 domain_specified = domain.startswith(".") | |
1945 c = Cookie(h("version"), name, value, | |
1946 h("port"), h("port_spec"), | |
1947 domain, domain_specified, h("domain_dot"), | |
1948 h("path"), h("path_spec"), | |
1949 h("secure"), | |
1950 expires, | |
1951 discard, | |
1952 h("comment"), | |
1953 h("commenturl"), | |
1954 rest) | |
1955 if not ignore_discard and c.discard: | |
1956 continue | |
1957 if not ignore_expires and c.is_expired(now): | |
1958 continue | |
1959 self.set_cookie(c) | |
1960 | |
1961 except IOError: | |
1962 raise | |
1963 except Exception: | |
1964 _warn_unhandled_exception() | |
1965 raise LoadError("invalid Set-Cookie3 format file %r: %r" % | |
1966 (filename, line)) | |
1967 | |
1968 | |
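# Hedged round-trip sketch for LWPCookieJar above; the file name is a
# placeholder and the jar is assumed to have been filled elsewhere (for
# example via extract_cookies()).
def _lwp_roundtrip_sketch(path="cookies.lwp"):
    jar = LWPCookieJar(path)
    # ... populate jar ...
    jar.save(ignore_discard=True)         # also write session cookies
    restored = LWPCookieJar(path)
    restored.load(ignore_discard=True)    # read them back, session cookies included
    return len(restored)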
1969 class MozillaCookieJar(FileCookieJar): | |
1970 """ | |
1971 | |
1972 WARNING: you may want to back up your browser's cookies file if you use | |
1973 this class to save cookies. I *think* it works, but there have been | |
1974 bugs in the past! | |
1975 | |
1976 This class differs from CookieJar only in the format it uses to save and | |
1977 load cookies to and from a file. This class uses the Mozilla/Netscape | |
1978 `cookies.txt' format. lynx uses this file format, too. | |
1979 | |
1980 Don't expect cookies saved while the browser is running to be noticed by | |
1981 the browser (in fact, Mozilla on unix will overwrite your saved cookies if | |
1982 you change them on disk while it's running; on Windows, you probably can't | |
1983 save at all while the browser is running). | |
1984 | |
1985 Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to | |
1986 Netscape cookies on saving. | |
1987 | |
1988 In particular, the cookie version and port number information is lost, | |
1989 together with information about whether or not Path, Port and Discard were | |
1990 specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the | |
1991 domain as set in the HTTP header started with a dot (yes, I'm aware some | |
1992 domains in Netscape files start with a dot and some don't -- trust me, you | |
1993 really don't want to know any more about this). | |
1994 | |
1995 Note that though Mozilla and Netscape use the same format, they use | |
1996 slightly different headers. The class saves cookies using the Netscape | |
1997 header by default (Mozilla can cope with that). | |
1998 | |
1999 """ | |
2000 magic_re = re.compile("#( Netscape)? HTTP Cookie File") | |
2001 header = """\ | |
2002 # Netscape HTTP Cookie File | |
2003 # http://www.netscape.com/newsref/std/cookie_spec.html | |
2004 # This is a generated file! Do not edit. | |
2005 | |
2006 """ | |
2007 | |
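# One cookies.txt record is a single tab-separated line.  Illustrative example
# (fields: domain, domain_specified, path, secure, expires, name, value;
# expires is a Unix timestamp in seconds, and <TAB> stands for a real tab):
#   .example.com<TAB>TRUE<TAB>/<TAB>FALSE<TAB>1735689600<TAB>sid<TAB>abc123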
2008 def _really_load(self, f, filename, ignore_discard, ignore_expires): | |
2009 now = time.time() | |
2010 | |
2011 magic = f.readline() | |
2012 if not self.magic_re.search(magic): | |
2013 f.close() | |
2014 raise LoadError( | |
2015 "%r does not look like a Netscape format cookies file" % | |
2016 filename) | |
2017 | |
2018 try: | |
2019 while 1: | |
2020 line = f.readline() | |
2021 if line == "": break | |
2022 | |
2023 # last field may be absent, so keep any trailing tab | |
2024 if line.endswith("\n"): line = line[:-1] | |
2025 | |
2026 # skip comments and blank lines XXX what is $ for? | |
2027 if (line.strip().startswith(("#", "$")) or | |
2028 line.strip() == ""): | |
2029 continue | |
2030 | |
2031 domain, domain_specified, path, secure, expires, name, value = \ | |
2032 line.split("\t") | |
2033 secure = (secure == "TRUE") | |
2034 domain_specified = (domain_specified == "TRUE") | |
2035 if name == "": | |
2036 # cookies.txt regards 'Set-Cookie: foo' as a cookie | |
2037 # with no name, whereas http.cookiejar regards it as a | |
2038 # cookie with no value. | |
2039 name = value | |
2040 value = None | |
2041 | |
2042 initial_dot = domain.startswith(".") | |
2043 assert domain_specified == initial_dot | |
2044 | |
2045 discard = False | |
2046 if expires == "": | |
2047 expires = None | |
2048 discard = True | |
2049 | |
2050 # assume path_specified is false | |
2051 c = Cookie(0, name, value, | |
2052 None, False, | |
2053 domain, domain_specified, initial_dot, | |
2054 path, False, | |
2055 secure, | |
2056 expires, | |
2057 discard, | |
2058 None, | |
2059 None, | |
2060 {}) | |
2061 if not ignore_discard and c.discard: | |
2062 continue | |
2063 if not ignore_expires and c.is_expired(now): | |
2064 continue | |
2065 self.set_cookie(c) | |
2066 | |
2067 except IOError: | |
2068 raise | |
2069 except Exception: | |
2070 _warn_unhandled_exception() | |
2071 raise LoadError("invalid Netscape format cookies file %r: %r" % | |
2072 (filename, line)) | |
2073 | |
2074 def save(self, filename=None, ignore_discard=False, ignore_expires=False): | |
2075 if filename is None: | |
2076 if self.filename is not None: filename = self.filename | |
2077 else: raise ValueError(MISSING_FILENAME_TEXT) | |
2078 | |
2079 f = open(filename, "w") | |
2080 try: | |
2081 f.write(self.header) | |
2082 now = time.time() | |
2083 for cookie in self: | |
2084 if not ignore_discard and cookie.discard: | |
2085 continue | |
2086 if not ignore_expires and cookie.is_expired(now): | |
2087 continue | |
2088 if cookie.secure: secure = "TRUE" | |
2089 else: secure = "FALSE" | |
2090 if cookie.domain.startswith("."): initial_dot = "TRUE" | |
2091 else: initial_dot = "FALSE" | |
2092 if cookie.expires is not None: | |
2093 expires = str(cookie.expires) | |
2094 else: | |
2095 expires = "" | |
2096 if cookie.value is None: | |
2097 # cookies.txt regards 'Set-Cookie: foo' as a cookie | |
2098 # with no name, whereas http.cookiejar regards it as a | |
2099 # cookie with no value. | |
2100 name = "" | |
2101 value = cookie.name | |
2102 else: | |
2103 name = cookie.name | |
2104 value = cookie.value | |
2105 f.write( | |
2106 "\t".join([cookie.domain, initial_dot, cookie.path, | |
2107 secure, expires, name, value])+ | |
2108 "\n") | |
2109 finally: | |
2110 f.close() |
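# Hedged usage sketch for MozillaCookieJar above: loading a Netscape/Mozilla
# cookies.txt export.  The file name is a placeholder.
def _mozilla_load_sketch(path="cookies.txt"):
    jar = MozillaCookieJar(path)
    jar.load(ignore_discard=True, ignore_expires=True)   # keep everything in the file
    return sorted(cookie.name for cookie in jar)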