# comparison: planemo/lib/python3.7/site-packages/future/backports/urllib/robotparser.py @ 0:d30785e31577 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler |
|---|---|
| date | Fri, 31 Jul 2020 00:18:57 -0400 |
| parents | |
| children | |
This changeset compares the null revision -1:000000000000 with 0:d30785e31577; the file is newly added, so the full listing below consists of inserted lines.
```python
from __future__ import absolute_import, division, unicode_literals
from future.builtins import str
""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""

# Was: import urllib.parse, urllib.request
from future.backports import urllib
from future.backports.urllib import parse as _parse, request as _request
urllib.parse = _parse
urllib.request = _request


__all__ = ["RobotFileParser"]

class RobotFileParser(object):
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('','',parsed_url.path,
            parsed_url.params,parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine(object):
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry(object):
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
```
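
For orientation, here is a minimal usage sketch of the class defined above. It feeds robots.txt lines straight to `parse()` instead of fetching them over the network via `read()`, and the rules, URLs, and agent names are invented for illustration; they are not part of the changeset.

```python
# Minimal usage sketch for the RobotFileParser above. The rules, URLs and
# agent names here are hypothetical illustrations, not part of the commit.
from future.backports.urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("http://example.com/robots.txt")  # read() would fetch this URL
rp.parse([                                   # feed lines directly instead
    "User-agent: FigTree",   # entry for one specific crawler
    "Disallow: /tmp",
    "",                      # blank line closes the entry
    "User-agent: *",         # catch-all entry, consulted last
    "Allow: /public",
    "Disallow: /",
])

print(rp.can_fetch("FigTree/1.0", "http://example.com/tmp/x.html"))  # False
print(rp.can_fetch("OtherBot", "http://example.com/public/a"))       # True
print(rp.can_fetch("OtherBot", "http://example.com/private"))        # False
```

Note that `Entry.allowance()` returns the verdict of the first `RuleLine` whose path prefix matches, so rule order within an entry matters: in the sketch, the catch-all entry permits `/public` only because its `Allow` line precedes `Disallow: /`.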
