comparison planemo/lib/python3.7/site-packages/future/backports/urllib/robotparser.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 from __future__ import absolute_import, division, unicode_literals
2 from future.builtins import str
3 """ robotparser.py
4
5 Copyright (C) 2000 Bastian Kleineidam
6
7 You can choose between two licenses when using this package:
8 1) GNU GPLv2
9 2) PSF license for Python 2.2
10
11 The robots.txt Exclusion Protocol is implemented as specified in
12 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
13 """
14
15 # Was: import urllib.parse, urllib.request
16 from future.backports import urllib
17 from future.backports.urllib import parse as _parse, request as _request
18 urllib.parse = _parse
19 urllib.request = _request
20
21
22 __all__ = ["RobotFileParser"]
23
class RobotFileParser(object):
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        # Entries for specific user agents, in the order they were parsed.
        self.entries = []
        # First entry whose user agents include "*"; consulted last.
        self.default_entry = None
        # Blanket verdicts derived by read() from the HTTP status of the fetch.
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # Cache the netloc and path components of the robots.txt URL.
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            # 401/403: robots.txt exists but is access-restricted, so treat
            # everything as disallowed; any other >= 400 status means there
            # is no usable robots.txt, so treat everything as allowed.
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            # Successful fetch: decode as UTF-8 and parse line by line.
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                # A blank line ends the current record.
                if state == 1:
                    # User-agent line(s) with no rules: discard the entry.
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                # Rule values may be %-escaped in the file.
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # A new record started without an intervening blank
                        # line: flush the previous one first.
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    # Rules appearing before any user-agent line are ignored.
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # Flush the last record if the file did not end with a blank line.
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # Normalize the URL: drop scheme/netloc, keep path and everything
        # after it, then re-quote so it compares against the quoted rule
        # paths stored in RuleLine.
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('','',parsed_url.path,
            parsed_url.params,parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        # NOTE: the default ("*") entry is intentionally not included here,
        # matching the upstream CPython behavior of this version.
        return ''.join([str(entry) + "\n" for entry in self.entries])
158
159
class RuleLine(object):
    """One "Allow:" or "Disallow:" line paired with its path.

    ``allowance`` is True for an Allow rule and False for a Disallow rule.
    The stored path is %-quoted so it can be compared against quoted URLs.
    """

    def __init__(self, path, allowance):
        # Per the robots.txt convention, an empty Disallow value means
        # "allow everything".
        if not allowance and path == '':
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule's path prefix matches *filename*."""
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        label = "Allow" if self.allowance else "Disallow"
        return label + ": " + self.path
175
176
class Entry(object):
    """One robots.txt record: a list of user-agent names together with
    the rule lines that apply to them."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        pieces = []
        for name in self.useragents:
            pieces.append("User-agent: " + name + "\n")
        for rule in self.rulelines:
            pieces.append(str(rule) + "\n")
        return ''.join(pieces)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # Only the product token (text before the first "/") is compared,
        # case-insensitively; "*" matches any agent.
        token = useragent.split("/")[0].lower()
        for name in self.useragents:
            if name == '*' or name.lower() in token:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # First matching rule wins; no matching rule means allowed.
        for rule in self.rulelines:
            if rule.applies_to(filename):
                return rule.allowance
        return True