planemo/lib/python3.7/site-packages/future/backports/urllib/robotparser.py @ 0:d30785e31577 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author: guerler
date: Fri, 31 Jul 2020 00:18:57 -0400
from __future__ import absolute_import, division, unicode_literals
from future.builtins import str
""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""

# Was: import urllib.parse, urllib.request
from future.backports import urllib
from future.backports.urllib import parse as _parse, request as _request
urllib.parse = _parse
urllib.request = _request


__all__ = ["RobotFileParser"]

class RobotFileParser(object):
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
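
    Typical use (a sketch; the URL is a placeholder and read() needs
    network access):

        rp = RobotFileParser("http://www.example.com/robots.txt")
        rp.read()
        if rp.can_fetch("MyCrawler/1.0", "http://www.example.com/page.html"):
            pass  # the page may be fetched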

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
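
        A long-running crawler might do something like this (a sketch;
        REFRESH_SECONDS is a made-up constant, not part of this module):

            if time.time() - rp.mtime() > REFRESH_SECONDS:
                rp.read()
                rp.modified()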

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
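        # Note: urlparse() returns a 6-tuple (scheme, netloc, path, params,
        # query, fragment), so [1:3] below picks out the host and the path.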
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
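        # Failure policy, as implemented below: a 401/403 response means
        # "disallow everything", any other 4xx/5xx means "allow everything",
        # and a successful fetch is decoded as UTF-8 and parsed.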
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
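
        A minimal example of the format this method handles (illustrative):

            User-agent: *
            Disallow: /private/
            Allow: /private/public.html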
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)


    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
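        # Decision order implemented below:
        #   1. the blanket disallow/allow flags set by read() on HTTP errors
        #   2. the first matching entry, in file order
        #   3. the default ("*") entry, if any
        #   4. otherwise access is granted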
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine(object):
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
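    # Note (illustrative): applies_to() is a plain prefix test, so a
    # RuleLine built from "Disallow: /tmp" matches "/tmp", "/tmp/" and
    # "/tmpfiles" alike.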
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry(object):
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
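        # The first rule whose path matches wins; if none matches, allow.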
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True