comparison planemo/lib/python3.7/site-packages/future/backports/http/cookiejar.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 r"""HTTP cookie handling for web clients.
2
3 This is a backport of the Py3.3 ``http.cookiejar`` module for
4 python-future.
5
6 This module has (now fairly distant) origins in Gisle Aas' Perl module
7 HTTP::Cookies, from the libwww-perl library.
8
9 Docstrings, comments and debug strings in this code refer to the
10 attributes of the HTTP cookie system as cookie-attributes, to distinguish
11 them clearly from Python attributes.
12
13 Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
14 distributed with the Python standard library, but are available from
15 http://wwwsearch.sf.net/):
16
                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar
28
29 """
30
31 from __future__ import unicode_literals
32 from __future__ import print_function
33 from __future__ import division
34 from __future__ import absolute_import
35 from future.builtins import filter, int, map, open, str
36 from future.utils import as_native_str, PY2
37
38 __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
39 'FileCookieJar', 'LWPCookieJar', 'LoadError', 'MozillaCookieJar']
40
41 import copy
42 import datetime
43 import re
44 if PY2:
45 re.ASCII = 0
46 import time
47 from future.backports.urllib.parse import urlparse, urlsplit, quote
48 from future.backports.http.client import HTTP_PORT
49 try:
50 import threading as _threading
51 except ImportError:
52 import dummy_threading as _threading
53 from calendar import timegm
54
debug = False   # set to True to enable debugging via the logging module
logger = None


def _debug(*args):
    """Forward *args* to ``logger.debug`` when module debugging is enabled.

    While the module-level ``debug`` flag is false this is a no-op that
    returns None.  The "http.cookiejar" logger is created on first use and
    cached in the module-level ``logger``.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("http.cookiejar")
        return logger.debug(*args)
    return None
66
67
# Textual form of HTTP_PORT (imported from the http.client backport);
# request_port() returns it when the request host carries no ":port" part.
DEFAULT_HTTP_PORT = str(HTTP_PORT)
# Message for the case where no filename is available.  Its users are not
# visible in this part of the file.
MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
                         "instance initialised with one)")
71
def _warn_unhandled_exception():
    """Warn about the exception currently being handled.

    Intended for the catch-all ``except:`` clauses of this module, which
    guard against input that is bad in unexpected ways: the formatted
    traceback is embedded in a warning so such cases do not go unnoticed.
    """
    import io
    import traceback
    import warnings

    buf = io.StringIO()
    traceback.print_exc(None, buf)
    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn("http.cookiejar bug!\n%s" % buf.getvalue(), stacklevel=2)
81
82
83 # Date/time conversion
84 # -----------------------------------------------------------------------------
85
EPOCH_YEAR = 1970


def _timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or None.

    Only the first six items of *tt* (year, month, day, hour, minute,
    second) are range-checked; the tuple is handed to calendar.timegm only
    when every field is within the accepted bounds.
    """
    year, month, mday, hour, minute, second = tt[:6]
    in_range = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= second <= 61
    )
    if not in_range:
        return None
    return timegm(tt)
94
# Weekday abbreviations, indexed like datetime.weekday() (Monday == 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# Month abbreviations; index 0 is January.
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month abbreviations for case-insensitive lookups.  Built with
# a comprehension so no stray loop variable is left in the module namespace.
MONTHS_LOWER = [name.lower() for name in MONTHS]
100
def time2isoz(t=None):
    """Format *t* (seconds since the epoch) as "YYYY-MM-DD hh:mm:ssZ".

    The current time is used when *t* is omitted.  The result is expressed
    in Universal Time (UTC, aka GMT), for example:

    1994-11-24 08:49:37Z

    """
    moment = (datetime.datetime.utcnow() if t is None
              else datetime.datetime.utcfromtimestamp(t))
    fields = (moment.year, moment.month, moment.day,
              moment.hour, moment.minute, moment.second)
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % fields
119
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        dt = datetime.datetime.utcnow()
    else:
        dt = datetime.datetime.utcfromtimestamp(t)
    # The weekday abbreviation is followed by a comma, as the documented
    # format above requires (the comma used to be missing).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[dt.weekday()], dt.day, MONTHS[dt.month-1],
        dt.year, dt.hour, dt.minute, dt.second)
138
139
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)


def offset_from_tz_string(tz):
    """Return the UTC offset, in seconds, described by the string *tz*.

    Names listed in UTC_ZONES map to 0.  Numeric zones such as "-0800",
    "+01" or "+05:30" are converted to signed seconds.  Anything else
    yields None.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if not match:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
156
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert parsed date fields to seconds since the epoch, or None.

    The arguments are the pieces captured by the date regular expressions:
    *mon* is a month name or number, the clock fields may be None (treated
    as 0), and *tz* is a timezone string or None (treated as UTC).

    Returns None when the month is not recognised, when the fields are out
    of the range accepted by _timegm(), or when the timezone string is
    unknown.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE admits fractional seconds ("29.5"); int() would raise
    # ValueError on those, so only the whole-second part is kept.
    sec = int(str(sec).partition(".")[0])

    if yr < 1000:
        # find "obvious" year: pick the century that puts the two-digit
        # year closest to the current date
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
209
# Strictly conforming HTTP date, e.g. "Wed, 09 Feb 1994 22:23:32 GMT".
# Both fragments are raw strings so that "\d" reaches the regex engine
# without triggering Python's invalid-escape-sequence warning.
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
# Leading weekday name (abbreviated or full) with optional comma.
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
# Forgiving date pattern; groups are day, month, year, hour, min, sec, tz.
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X | re.ASCII)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        day, mon_name, year, hour, minute, second = m.groups()
        try:
            mon = MONTHS_LOWER.index(mon_name.lower()) + 1
        except ValueError:
            # Shaped like a month ("Foo") but not one: the format is
            # unrecognized, so report None instead of raising.
            return None
        # All fields are plain digit runs; int() keeps the documented
        # integer return value (float() made the result a float).
        tt = (int(year), mon, int(day),
              int(hour), int(minute), int(second))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, minute, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, minute, sec, tz)
287
# ISO 8601 style dates.  Groups: year, month, day, hour, min, sec, timezone,
# and the nested minutes part of a numeric timezone.  The pattern is a raw
# string so its regex escapes are not read as (invalid) Python escapes.
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X | re.ASCII)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # leading whitespace is not significant
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # XXX the eighth group is an extra bit of the timezone that is ignored
    # here: is this the right thing to do?
    year, month, mday, hour, minute, second, zone, _ = match.groups()
    return _str2time(mday, month, year, hour, minute, second, zone)
332
333
334 # Header parsing
335 # -----------------------------------------------------------------------------
336
def unmatched(match):
    """Return the subject string of *match* with the matched span removed."""
    begin, finish = match.span(0)
    subject = match.string
    return subject[:begin] + subject[finish:]
341
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, str)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = HEADER_TOKEN_RE.search(text)
            if m:
                # Every HEADER_*_RE pattern is anchored with "^", so a match
                # always starts at offset 0 and the unmatched remainder is
                # simply what follows the match.
                text = text[m.end():]
                name = m.group(1)
                m = HEADER_QUOTED_VALUE_RE.search(text)
                if m:  # quoted value
                    text = text[m.end():]
                    value = m.group(1)
                    value = HEADER_ESCAPE_RE.sub(r"\1", value)
                else:
                    m = HEADER_VALUE_RE.search(text)
                    if m:  # unquoted value
                        text = text[m.end():]
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs: result.append(pairs)
                pairs = []
            else:
                # skip junk (raw pattern: "\s" is a regex escape, not a
                # Python string escape)
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs: result.append(pairs)
    return result
430
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    *lists* is a list of lists of (key, value) pairs; the result is a single
    header value.  Pairs within one list are joined with "; ", the lists
    themselves with ", ".  Attribute values are quoted if needed.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # A lone token has no value part.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            escaped = HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)  # escape " and \
            value = '"%s"' % escaped
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        words = [render(key, value) for key, value in pairs]
        if words:
            headers.append("; ".join(words))
    return ", ".join(headers)
456
def strip_quotes(text):
    """Remove one leading and one trailing double quote from *text*, if any.

    The two ends are handled independently, so an unbalanced quote is
    removed as well.
    """
    start = 1 if text.startswith('"') else 0
    text = text[start:]
    # The trailing quote is checked on what remains, so a lone '"' is
    # stripped only once.
    if text.endswith('"'):
        return text[:-1]
    return text
463
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  Known
    attribute names are lower-cased, an "expires" value is converted to
    seconds since the epoch (None if invalid or absent), and a
    ("version", "0") pair is appended when the header carried no version
    attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # The first pair is the cookie's own name=value; only the
            # following pairs are cookie-attributes.
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.  A bare "Version" (no
                    # "=value") leaves v as None, which must not be handed
                    # to strip_quotes().
                    if v is not None:
                        v = strip_quotes(v)
                    version_set = True
                if k == "expires":
                    # convert expires date to seconds since epoch; a bare
                    # "expires" keeps its None value
                    if v is not None:
                        v = http2time(strip_quotes(v))  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
514
515
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    looks_numeric = IPV4_RE.search(text)
    # Rejected: text ending in ".<digits>", the empty string, and names
    # that start or end with a dot.
    return not (looks_numeric
                or text == ""
                or text[0] == "."
                or text[-1] == ".")
531
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not is_HDN(A):
        return False
    # "A has the form NB" means B must be a suffix of A.  A plain substring
    # search (rfind) also accepted B in the middle of A, e.g.
    # "x.y.com.evil.org" against ".y.com".  A != B at this point, so a
    # suffix match implies that N is non-empty.
    if not A.endswith(B):
        return False
    if not B.startswith("."):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
570
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Only text that IPV4_RE matches is
    rejected.

    """
    return not IPV4_RE.search(text)
580
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  When B starts with a dot, A matches if it ends with
    B; otherwise (and whenever either side fails liberal_is_HDN) the two
    must be equal.

    """
    host = A.lower()
    pattern = B.lower()
    if not liberal_is_HDN(host) or not liberal_is_HDN(pattern):
        # only equal IP addresses match
        return host == pattern
    if pattern.startswith("."):
        return host.endswith(pattern)
    return host == pattern
600
cut_port_re = re.compile(r":\d+$", re.ASCII)


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL, falling back to the
    "Host" header when the URL has no network location.  A trailing
    ":port" is removed.

    """
    netloc = urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
617
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective name
    is the request-host with ".local" appended when the host contains no
    dot and IPV4_RE does not match it; otherwise both items are equal.

    """
    req_host = request_host(request)
    dotless = req_host.find(".") == -1
    if dotless and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
628
def request_path(request):
    """Path component of request-URI, as defined by RFC 2965.

    The path of the request's full URL is passed through escape_path() and
    given a leading "/" if it lacks one.
    """
    path = escape_path(urlsplit(request.get_full_url()).path)
    if path.startswith("/"):
        return path
    # fix bad RFC 2396 absoluteURI
    return "/" + path
638
def request_port(request):
    """Return the port of ``request.host`` as a string.

    The text after the first ":" of the host is returned when it is
    numeric; None is returned (after a debug message) when it is not.  A
    host without ":" yields DEFAULT_HTTP_PORT.
    """
    host = request.host
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
652
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the %-escape captured by *match* with upper-case hex digits."""
    hex_digits = match.group(1)
    return "%%%s" % hex_digits.upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    quoted = quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
672
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    # Split at the first dot: the part before it is A, the part after is B.
    _, dot, b = h.partition(".")
    if dot and is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
707
def is_third_party(request):
    """Return True if *request* is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
723
724
class Cookie(object):
    """HTTP Cookie.

    One class models both Netscape and RFC 2965 cookies.

    The class is deliberately simple: it only stores attributes, and it is
    possible to build instances that do not comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        # Numeric cookie-attributes are stored as ints when present.
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port_specified is True and port is None:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Whether the domain cookie-attribute had an initial dot has to be
        # remembered in order to follow RFC 2965 (as clarified in draft
        # errata); it is needed for the returned $Domain value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # Shallow copy, so later changes do not touch the caller's mapping.
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return True if the non-standard attribute *name* is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store *value* as the non-standard attribute *name*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time at or before *now*.

        *now* defaults to the current time; a cookie whose expires is None
        is never reported as expired.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = "" if self.port is None else ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    @as_native_str()
    def __repr__(self):
        field_names = (
            "version", "name", "value",
            "port", "port_specified",
            "domain", "domain_specified", "domain_initial_dot",
            "path", "path_specified",
            "secure", "expires", "discard", "comment", "comment_url",
        )
        parts = []
        for field in field_names:
            current = getattr(self, field)
            ### Python-Future:
            # Avoid u'...' prefixes for unicode strings:
            if isinstance(current, str):
                current = str(current)
            ###
            parts.append(str("%s=%s") % (field, repr(current)))
        parts.append("rest=%s" % repr(self._rest))
        parts.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(parts)
827
828
class CookiePolicy(object):
    """Decides which cookies are accepted from and returned to a server.

    A policy may also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base policy returns True for every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base policy returns True for every path.
        """
        return True
860
861
862 class DefaultCookiePolicy(CookiePolicy):
863 """Implements the standard rules for accepting and returning cookies."""
864
# Bit flags for the strict_ns_domain constructor argument.
# set_ok_domain() tests DomainStrictNoDots to reject hosts whose prefix
# before the cookie domain contains a dot.
DomainStrictNoDots = 1
# NOTE(review): not referenced in the visible part of the file -- confirm
# its meaning against the return_ok_* checks.
DomainStrictNonDomain = 2
# set_ok_domain() tests DomainRFC2965Match to apply RFC 2965 domain-matching
# to Netscape (version 0) cookies as well.
DomainRFC2965Match = 4

# No strictness bits set; this is the default for strict_ns_domain.
DomainLiberal = 0
# Combination of the two "strict" bits above.
DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
871
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             ):
    """Constructor arguments should be passed as keyword arguments only.

    blocked_domains and allowed_domains are stored as tuples; a missing
    block-list becomes an empty tuple, while a missing allow-list stays
    None.  All other arguments are stored unchanged on same-named
    attributes.
    """
    # Domain filters
    self._blocked_domains = (
        () if blocked_domains is None else tuple(blocked_domains))
    self._allowed_domains = (
        None if allowed_domains is None else tuple(allowed_domains))

    # Protocol switches
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2

    # Strictness switches
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path
904
def blocked_domains(self):
    """Return the currently blocked domains, as a tuple."""
    current = self._blocked_domains
    return current
def set_blocked_domains(self, blocked_domains):
    """Replace the blocked domains with the items of *blocked_domains*."""
    frozen = tuple(blocked_domains)
    self._blocked_domains = frozen
911
def is_blocked(self, domain):
    """Return True if *domain* matches any entry of the block-list."""
    return any(user_domain_match(domain, blocked_domain)
               for blocked_domain in self._blocked_domains)
917
def allowed_domains(self):
    """Return the allowed domains as a tuple, or None if no allow-list is set."""
    current = self._allowed_domains
    return current
def set_allowed_domains(self, allowed_domains):
    """Set the sequence of allowed domains, or None.

    A sequence is stored as a tuple; None is stored as None.
    """
    self._allowed_domains = (
        None if allowed_domains is None else tuple(allowed_domains))
926
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and *domain* matches none of it.

    Without an allow-list (None) nothing is disallowed.
    """
    allowed = self._allowed_domains
    if allowed is None:
        return False
    return not any(user_domain_match(domain, allowed_domain)
                   for allowed_domain in allowed)
934
def set_ok(self, cookie, request):
    """
    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    The cookie is accepted only if every set_ok_<aspect> check passes; the
    checks run in a fixed order and stop at the first failure.

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    aspects = ("version", "verifiability", "name", "path", "domain", "port")
    return all(getattr(self, "set_ok_" + aspect)(cookie, request)
               for aspect in aspects)
953
def set_ok_version(self, cookie, request):
    """Return True if the cookie's version is acceptable under this policy.

    Cookies without a version are rejected, as are RFC 2965 cookies
    (version > 0) when rfc2965 is off and Netscape cookies (version 0)
    when netscape is off.
    """
    if cookie.version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if cookie.version > 0 and not self.rfc2965:
        _debug(" RFC 2965 cookies are switched off")
        return False
    if cookie.version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
968
def set_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the policy refuses to accept.

    Only unverifiable, third-party requests are restricted: RFC 2965
    cookies when strict_rfc2965_unverifiable is set, Netscape cookies when
    strict_ns_unverifiable is set.
    """
    if not (request.unverifiable and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during "
               "unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during "
               "unverifiable transaction")
        return False
    return True
980
def set_ok_name(self, cookie, request):
    """Return False for Netscape cookie names starting with "$", if strict.

    The check only applies to version 0 cookies and only when
    strict_ns_set_initial_dollar is set.
    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    suspicious = (cookie.version == 0
                  and self.strict_ns_set_initial_dollar
                  and cookie.name.startswith("$"))
    if suspicious:
        _debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
989
def set_ok_path(self, cookie, request):
    """Return False if a specified cookie path must prefix the request path but does not.

    The prefix rule applies to RFC 2965 cookies (version > 0) always and to
    Netscape cookies (version 0) only when strict_ns_set_path is set.
    Cookies without an explicit path are always accepted.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    prefix_required = (cookie.version > 0 or
                       (cookie.version == 0 and self.strict_ns_set_path))
    if prefix_required and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
1000
def set_ok_domain(self, cookie, request):
    """Return True if the cookie's domain is acceptable for this request.

    The user block-list and allow-list are consulted first.  The remaining
    rules apply only when the domain was explicitly specified: the
    country-code second-level-domain rule (with strict_domain), the
    embedded-dot rule, the Netscape suffix rule for version 0 cookies,
    RFC 2965 domain-matching, and the "no dot in the host prefix" rule.
    Every rejection is reported through _debug().
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if cookie.domain_specified:
        # req_host is the lowercased request-host; erhn is the effective
        # request-host name (".local" appended to dotless hosts).
        req_host, erhn = eff_request_host(request)
        domain = cookie.domain
        if self.strict_domain and (domain.count(".") >= 2):
            # XXX This should probably be compared with the Konqueror
            # (kcookiejar.cpp) and Mozilla implementations, but it's a
            # losing battle.
            i = domain.rfind(".")
            j = domain.rfind(".", 0, i)
            if j == 0:  # domain like .foo.bar
                tld = domain[i+1:]
                sld = domain[j+1:i]
                # A well-known second-level label followed by a two-letter
                # top-level label is treated as a country-code domain.
                if sld.lower() in ("co", "ac", "com", "edu", "org", "net",
                   "gov", "mil", "int", "aero", "biz", "cat", "coop",
                   "info", "jobs", "mobi", "museum", "name", "pro",
                   "travel", "eu") and len(tld) == 2:
                    # domain like .co.uk
                    _debug(" country-code second level domain %s", domain)
                    return False
        if domain.startswith("."):
            undotted_domain = domain[1:]
        else:
            undotted_domain = domain
        # Apart from ".local", the domain must contain a dot after any
        # initial one.
        embedded_dots = (undotted_domain.find(".") >= 0)
        if not embedded_dots and domain != ".local":
            _debug(" non-local domain %s contains no embedded dot",
                   domain)
            return False
        if cookie.version == 0:
            # Netscape rule: the effective request-host, as is or with a
            # dot prepended, must end with the cookie domain.
            if (not erhn.endswith(domain) and
                (not erhn.startswith(".") and
                 not ("."+erhn).endswith(domain))):
                _debug(" effective request-host %s (even with added "
                       "initial dot) does not end with %s",
                       erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
            if not domain_match(erhn, domain):
                _debug(" effective request-host %s does not domain-match "
                       "%s", erhn, domain)
                return False
        if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
            # What precedes the cookie domain in the request-host must not
            # contain a dot, unless IPV4_RE matches the request-host.
            host_prefix = req_host[:-len(domain)]
            if (host_prefix.find(".") >= 0 and
                not IPV4_RE.search(req_host)):
                _debug(" host prefix %s for domain %s contains a dot",
                       host_prefix, domain)
                return False
    return True
1059
def set_ok_port(self, cookie, request):
    """Return False if an explicit Port list is malformed or does not
    contain the request port.

    Cookies without an explicitly specified port are always accepted.
    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    req_port = "80" if req_port is None else str(req_port)

    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            break
    else:
        # loop finished without finding the request port
        _debug(" request port (%s) not found in %s",
               req_port, cookie.port)
        return False
    return True
1080
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    check_names = ("version", "verifiability", "secure", "expires",
                   "port", "domain")
    # all() stops at the first failing return_ok_<name> check.
    return all(getattr(self, "return_ok_" + check)(cookie, request)
               for check in check_names)
1098
def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol version is switched off."""
    if cookie.version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif cookie.version == 0:
        if not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
    return True
1107
def return_ok_verifiability(self, cookie, request):
    """Return False if the cookie must not be sent on a third-party
    unverifiable request.

    Which strictness flag applies depends on the cookie version.
    """
    if not (request.unverifiable and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
1119
def return_ok_secure(self, cookie, request):
    """Return False for a secure cookie on a request whose type is not
    "https"."""
    insecure_request = request.type != "https" if cookie.secure else False
    if insecure_request:
        _debug(" secure cookie with non-secure request")
        return False
    return True
1125
def return_ok_expires(self, cookie, request):
    """Return False if the cookie is expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
1131
def return_ok_port(self, cookie, request):
    """Return False if the cookie is port-restricted and the request
    port is not in its comma-separated port list.

    A request with no determinable port is treated as port "80".
    """
    if not cookie.port:
        return True
    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    if req_port not in cookie.port.split(","):
        _debug(" request port %s does not match cookie port %s",
               req_port, cookie.port)
        return False
    return True
1145
def return_ok_domain(self, cookie, request):
    """Return False if the cookie's domain does not match the request host.

    Version > 0 cookies must domain-match the effective request-host.
    Version 0 (Netscape) cookies must have a domain that is a dot-aligned
    suffix of the effective request-host.
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Compare against a dot-prefixed domain so that a cookie for
    # "example.com" is not returned to "pythonicexample.com": a plain
    # suffix test would accept any host that merely ends with the domain
    # text (CVE-2018-20852).
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
            (self.strict_ns_domain & self.DomainStrictNonDomain) and
            not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
1167
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under `domain` should be returned
    for this request.

    This is a liberal pre-check on the domain alone; it is here as an
    optimization to avoid having to load lots of MSIE cookie files unless
    necessary.
    """
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Dot-prefix the cookie domain too, so the suffix test only succeeds
    # on a label boundary.  Otherwise cookies for "example.com" would be
    # offered to "pythonicexample.com" (CVE-2018-20852).
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain
    if not (req_host.endswith(dotted_domain) or
            erhn.endswith(dotted_domain)):
        #_debug(" request domain %s does not match cookie domain %s",
        #       req_host, domain)
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
1189
def path_return_ok(self, path, request):
    """Return True if cookies stored under `path` may be returned for
    this request.

    The cookie path must equal the request path, or be a prefix of it
    that ends on a "/" boundary.  A bare string-prefix test would let a
    cookie for "/foo" be sent to "/foobad".
    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    pathlen = len(path)
    if (req_path.startswith(path) and
            (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
        return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
1197
1198
def vals_sorted_by_key(adict):
    """Return the values of adict as a map() result, ordered by sorted key."""
    ordered_keys = list(adict.keys())
    ordered_keys.sort()
    return map(adict.get, ordered_keys)
1202
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    for item in vals_sorted_by_key(mapping):
        try:
            item.items
        except AttributeError:
            # no .items attribute: treat as a leaf value
            yield item
        else:
            # looks like a mapping: descend into it
            for leaf in deepvalues(item):
                yield leaf
1218
1219
class Absent(object):
    """Sentinel used as the second parameter to dict.get(), to distinguish
    an absent dict key from one with a None value."""
1223
class CookieJar(object):
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib.request.build_opener(HTTPCookieProcessor).open(url).

    Cookies are stored in a nested dict: domain -> path -> name -> Cookie.
    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)

    def __init__(self, policy=None):
        """Create an empty jar; policy defaults to DefaultCookiePolicy()."""
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows
        to be returned for request."""
        if not self._policy.domain_return_ok(domain, request):
            return []
        _debug("Checking %s for cookies to return", domain)
        cookies = []
        cookies_by_path = self._cookies[domain]
        for path, cookies_by_name in cookies_by_path.items():
            if not self._policy.path_return_ok(path, request):
                continue
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    _debug(" not returning cookie")
                    continue
                _debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies:
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first
        cookies.sort(key=lambda a: len(a.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                    self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                            domain.startswith(".")):
                        # the stored dot was added by us, not sent by server
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib.request.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        """
        _debug("add_cookie_header")
        with self._cookies_lock:

            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                    not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a missing domain value, a missing or non-numeric
        max-age, or a missing value for another standard attribute that
        requires one are dropped.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug(" missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a valueless Max-Age (v is None);
                        # without it the error would escape and abort
                        # parsing of every cookie in the header set.
                        _debug(" missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                            k not in ("port", "comment", "commenturl")):
                        _debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults
        from request.

        Returns None if the version is not an integer, or if the expiry
        time is not in the future (in which case any stored cookie with
        the same domain, path and name is removed).
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                return None  # invalid version, ignore cookie
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the list of Cookie objects built from attrs_set, skipping
        entries for which no cookie could be built."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Mark version 1 cookies as RFC 2109 and, if so configured,
        downgrade them to version 0.

        If the policy's rfc2109_as_netscape is missing or None, cookies
        are downgraded exactly when the policy has RFC 2965 switched off.
        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.get_all("Set-Cookie2", [])
        ns_hdrs = headers.get_all("Set-Cookie", [])

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
                (not ns_hdrs and not rfc2965) or
                (not rfc2965_hdrs and not netscape) or
                (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                rfc2965_keys = set(
                    (cookie.domain, cookie.path, cookie.name)
                    for cookie in cookies)
                # Build a real list: a lazy filter object is always truthy,
                # which would make the emptiness test below meaningless.
                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path, ns_cookie.name)
                    not in rfc2965_keys]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        with self._cookies_lock:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        with self._cookies_lock:
            # Look up the store inside the lock so a concurrent replacement
            # of self._cookies cannot leave us writing into a stale dict.
            by_path = self._cookies.setdefault(cookie.domain, {})
            by_name = by_path.setdefault(cookie.path, {})
            by_name[cookie.name] = cookie

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        _debug("extract_cookies: %s", response.info())
        with self._cookies_lock:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    _debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.
        Raises ValueError if name is given without domain and path, or path
        is given without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        with self._cookies_lock:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        with self._cookies_lock:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)

    def __iter__(self):
        """Iterate over all stored cookies, in sorted key order."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        return sum(1 for _ in self)

    @as_native_str()
    def __repr__(self):
        r = [repr(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = [str(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
1751
1752
class LoadError(IOError):
    """Raised when a cookie file cannot be loaded.

    Derives from IOError for backwards-compatibility with Python 2.4.0.
    """
1755
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            try:
                filename+""
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not turned into ValueError.
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  Parsing is delegated to self._really_load().
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename) as f:
            self._really_load(f, filename, ignore_discard, ignore_expires)

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with self._cookies_lock:
            # keep a copy so the jar can be restored if loading fails
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                self._cookies = old_state
                raise
1815
1816
def lwp_cookie_str(cookie):
    """Return string representation of Cookie in the LWP cookie file format.

    Actually, the format is extended a bit -- see module docstring.

    """
    pairs = [(cookie.name, cookie.value),
             ("path", cookie.path),
             ("domain", cookie.domain)]
    if cookie.port is not None:
        pairs.append(("port", cookie.port))

    # valueless flags, emitted only when set
    flags = (("path_spec", cookie.path_specified),
             ("port_spec", cookie.port_specified),
             ("domain_dot", cookie.domain_initial_dot),
             ("secure", cookie.secure))
    for flag_name, is_set in flags:
        if is_set:
            pairs.append((flag_name, None))

    if cookie.expires:
        pairs.append(("expires", time2isoz(float(cookie.expires))))
    if cookie.discard:
        pairs.append(("discard", None))
    if cookie.comment:
        pairs.append(("comment", cookie.comment))
    if cookie.comment_url:
        pairs.append(("commenturl", cookie.comment_url))

    # non-standard attributes, in sorted key order
    for key in sorted(cookie._rest.keys()):
        pairs.append((key, str(cookie._rest[key])))

    pairs.append(("version", str(cookie.version)))

    return join_header_words([pairs])
1844
class LWPCookieJar(FileCookieJar):
    """
    The LWPCookieJar saves a sequence of "Set-Cookie3" lines.
    "Set-Cookie3" is the format used by the libwww-perl library, not known
    to be compatible with any browser, but which is easy to read and
    doesn't lose information about RFC 2965 cookies.

    Additional methods

    as_lwp_str(ignore_discard=True, ignore_expires=True)

    """

    def as_lwp_str(self, ignore_discard=True, ignore_expires=True):
        """Return cookies as a string of "\\n"-separated "Set-Cookie3" headers.

        ignore_discard and ignore_expires: see docstring for FileCookieJar.save

        """
        now = time.time()
        r = []
        for cookie in self:
            if not ignore_discard and cookie.discard:
                continue
            if not ignore_expires and cookie.is_expired(now):
                continue
            r.append("Set-Cookie3: %s" % lwp_cookie_str(cookie))
        # the trailing empty element makes the result end with a newline
        return "\n".join(r+[""])

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies to filename in Set-Cookie3 format.

        filename defaults to self.filename; ValueError is raised if neither
        is set.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        with open(filename, "w") as f:
            # There really isn't an LWP Cookies 2.0 format, but this indicates
            # that there is extra information in here (domain_dot and
            # port_spec) while still being compatible with libwww-perl, I hope.
            f.write("#LWP-Cookies-2.0\n")
            f.write(self.as_lwp_str(ignore_discard, ignore_expires))

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Parse Set-Cookie3 lines from the open file f into this jar.

        Raises LoadError if the first line is not the LWP magic header or
        if a line cannot be parsed; IOError is re-raised unchanged.
        """
        magic = f.readline()
        if not self.magic_re.search(magic):
            msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                   "file" % filename)
            raise LoadError(msg)

        now = time.time()

        header = "Set-Cookie3:"
        boolean_attrs = ("port_spec", "path_spec", "domain_dot",
                         "secure", "discard")
        value_attrs = ("version",
                       "port", "path", "domain",
                       "expires",
                       "comment", "commenturl")

        # Bound up front so the error handler below can always report it,
        # even if the very first readline() fails.
        line = ""
        try:
            while 1:
                line = f.readline()
                if line == "":
                    break
                if not line.startswith(header):
                    continue
                line = line[len(header):].strip()

                for data in split_header_words([line]):
                    name, value = data[0]
                    standard = {}
                    rest = {}
                    for k in boolean_attrs:
                        standard[k] = False
                    for k, v in data[1:]:
                        if k is not None:
                            lc = k.lower()
                        else:
                            lc = None
                        # don't lose case distinction for unknown fields
                        if (lc in value_attrs) or (lc in boolean_attrs):
                            k = lc
                        if k in boolean_attrs:
                            if v is None:
                                v = True
                            standard[k] = v
                        elif k in value_attrs:
                            standard[k] = v
                        else:
                            rest[k] = v

                    h = standard.get
                    expires = h("expires")
                    discard = h("discard")
                    if expires is not None:
                        expires = iso2time(expires)
                    if expires is None:
                        # no usable expiry time: treat as session cookie
                        discard = True
                    domain = h("domain")
                    domain_specified = domain.startswith(".")
                    c = Cookie(h("version"), name, value,
                               h("port"), h("port_spec"),
                               domain, domain_specified, h("domain_dot"),
                               h("path"), h("path_spec"),
                               h("secure"),
                               expires,
                               discard,
                               h("comment"),
                               h("commenturl"),
                               rest)
                    if not ignore_discard and c.discard:
                        continue
                    if not ignore_expires and c.is_expired(now):
                        continue
                    self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Set-Cookie3 format file %r: %r" %
                            (filename, line))
1967
1968
class MozillaCookieJar(FileCookieJar):
    """FileCookieJar that uses the Mozilla/Netscape `cookies.txt' format.

    WARNING: you may want to back up your browser's cookies file if you use
    this class to save cookies.  I *think* it works, but there have been
    bugs in the past!

    This class differs from CookieJar only in the format it uses to save and
    load cookies to and from a file.  This class uses the Mozilla/Netscape
    `cookies.txt' format.  lynx uses this file format, too.

    Don't expect cookies saved while the browser is running to be noticed by
    the browser (in fact, Mozilla on unix will overwrite your saved cookies if
    you change them on disk while it's running; on Windows, you probably can't
    save at all while the browser is running).

    Note that the Mozilla/Netscape format will downgrade RFC2965 cookies to
    Netscape cookies on saving.

    In particular, the cookie version and port number information is lost,
    together with information about whether or not Path, Port and Discard were
    specified by the Set-Cookie2 (or Set-Cookie) header, and whether or not the
    domain as set in the HTTP header started with a dot (yes, I'm aware some
    domains in Netscape files start with a dot and some don't -- trust me, you
    really don't want to know any more about this).

    Note that though Mozilla and Netscape use the same format, they use
    slightly different headers.  The class saves cookies using the Netscape
    header by default (Mozilla can cope with that).

    """
    # Pattern the first line of a file must contain to be accepted by
    # _really_load().
    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
    # Text written at the top of every file produced by save().
    header = """\
# Netscape HTTP Cookie File
# http://www.netscape.com/newsref/std/cookie_spec.html
# This is a generated file! Do not edit.

"""

    def _really_load(self, f, filename, ignore_discard, ignore_expires):
        """Read cookies from the open file object *f* into this jar.

        *filename* is used only in error messages.  Cookies whose discard
        flag is set are skipped unless *ignore_discard* is true, and cookies
        that are already expired are skipped unless *ignore_expires* is true.

        Raises LoadError if the first line does not match ``magic_re`` or if
        a later line cannot be parsed.  IOError is re-raised unchanged.
        """
        now = time.time()

        magic = f.readline()
        if not self.magic_re.search(magic):
            f.close()
            raise LoadError(
                "%r does not look like a Netscape format cookies file" %
                filename)

        # Seed `line' with the last line read so far: the error handler below
        # formats it, and it must be bound even if the very first readline()
        # in the loop fails.
        line = magic
        try:
            # readline() returns "" only at end of file.
            for line in iter(f.readline, ""):
                # last field may be absent, so keep any trailing tab
                if line.endswith("\n"):
                    line = line[:-1]

                # skip comments and blank lines XXX what is $ for?
                stripped = line.strip()
                if stripped == "" or stripped.startswith(("#", "$")):
                    continue

                # A wrong number of tab-separated fields raises ValueError
                # here, which is reported as a LoadError below.
                (domain, domain_specified, path, secure, expires, name,
                 value) = line.split("\t")
                secure = (secure == "TRUE")
                domain_specified = (domain_specified == "TRUE")
                if name == "":
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = value
                    value = None

                initial_dot = domain.startswith(".")
                # Explicit check rather than `assert', which is stripped
                # under -O and would let inconsistent lines load silently.
                # The ValueError is turned into a LoadError by the generic
                # handler below, exactly as the AssertionError used to be.
                if domain_specified != initial_dot:
                    raise ValueError(
                        "domain flag %r is inconsistent with domain %r" %
                        (domain_specified, domain))

                # An empty expiry field marks a cookie to be discarded.
                if expires == "":
                    expires = None
                    discard = True
                else:
                    discard = False

                # assume path_specified is false
                c = Cookie(0, name, value,
                           None, False,
                           domain, domain_specified, initial_dot,
                           path, False,
                           secure,
                           expires,
                           discard,
                           None,
                           None,
                           {})
                if not ignore_discard and c.discard:
                    continue
                if not ignore_expires and c.is_expired(now):
                    continue
                self.set_cookie(c)

        except IOError:
            raise
        except Exception:
            _warn_unhandled_exception()
            raise LoadError("invalid Netscape format cookies file %r: %r" %
                            (filename, line))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Write the cookies in this jar to *filename* in cookies.txt format.

        If *filename* is None, ``self.filename`` is used; ValueError is
        raised when that is None as well.  Cookies whose discard flag is set
        are skipped unless *ignore_discard* is true, and expired cookies are
        skipped unless *ignore_expires* is true.  The file is opened in "w"
        mode, so existing content is replaced.
        """
        if filename is None:
            if self.filename is None:
                raise ValueError(MISSING_FILENAME_TEXT)
            filename = self.filename

        # The context manager closes the file even if a write fails.
        with open(filename, "w") as f:
            f.write(self.header)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue

                secure = "TRUE" if cookie.secure else "FALSE"
                initial_dot = ("TRUE" if cookie.domain.startswith(".")
                               else "FALSE")
                if cookie.expires is not None:
                    expires = str(cookie.expires)
                else:
                    expires = ""
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ""
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value

                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write("\t".join(fields) + "\n")