comparison planemo/lib/python3.7/site-packages/bs4/diagnose.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d30785e31577
1 """Diagnostic functions, mainly for use when doing tech support."""
2
3 # Use of this source code is governed by the MIT license.
4 __license__ = "MIT"
5
6 import cProfile
7 from io import StringIO
8 from html.parser import HTMLParser
9 import bs4
10 from bs4 import BeautifulSoup, __version__
11 from bs4.builder import builder_registry
12
13 import os
14 import pstats
15 import random
16 import tempfile
17 import time
18 import traceback
19 import sys
20 import cProfile
21
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: A string containing markup that needs to be explained.
        An object with a ``read`` method is read first, and a string
        naming an existing file is replaced by that file's contents.
    :return: None; diagnostics are printed to standard output.
    """
    print(("Diagnostic running on Beautiful Soup %s" % __version__))
    print(("Python version %s" % sys.version))

    # Collect the usable parsers into a new list. Removing entries from
    # the list being iterated skips the entry that follows the removed
    # one, so a missing html5lib would hide the check for lxml.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        # lxml also provides the XML flavour of the parser.
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print(("Found html5lib version %s" % html5lib.__version__))
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    elif data.startswith(("http:", "https:")):
        print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    else:
        try:
            if os.path.exists(data):
                print(('"%s" looks like a filename. Reading data from the file.' % data))
                with open(data) as fp:
                    data = fp.read()
        except ValueError:
            # This can happen on some platforms when the 'filename' is
            # too long. Assume it's data and not a filename.
            pass
        # NOTE(review): the original indentation of this blank-line print
        # was not recoverable; it is assumed to belong to this branch.
        print("")

    for parser in basic_parsers:
        print(("Trying to parse your markup with %s" % parser))
        try:
            soup = BeautifulSoup(data, features=parser)
        except Exception:
            # A diagnostic tool should report any parser failure and
            # carry on with the next parser rather than abort.
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        else:
            print(("Here's what %s did with the markup:" % parser))
            print((soup.prettify()))

        print(("-" * 80))
92
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running. You can use this to determine whether
    an lxml-specific problem is in Beautiful Soup's lxml tree builders
    or in lxml itself.

    :param data: Some markup, as a string or as bytes. A string is
        encoded as UTF-8 before being handed to lxml.
    :param html: If True, markup will be parsed with lxml's HTML parser.
        if False, lxml's XML parser will be used.
    :param kwargs: Extra keyword arguments passed through to
        ``lxml.etree.iterparse``.
    """
    from io import BytesIO
    from lxml import etree

    # iterparse is given a binary file-like source; a text-mode StringIO
    # cannot wrap bytes input and does not yield the bytes lxml reads.
    if isinstance(data, str):
        data = data.encode("utf8")
    for event, element in etree.iterparse(BytesIO(data), html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
108
class AnnouncingParser(HTMLParser):
    """HTMLParser subclass that reports each parse event and does
    nothing else.

    Every handler prints one line made of the event's payload followed
    by an upper-case label naming the event. This gives a picture of how
    html.parser sees a given document; the easiest way to use it is to
    call `htmlparser_trace`.
    """

    def _p(self, s):
        # Single output point shared by all handlers.
        print(s)

    def handle_starttag(self, name, attrs):
        # attrs is part of the HTMLParser interface but is not reported.
        line = "%s START" % name
        self._p(line)

    def handle_endtag(self, name):
        line = "%s END" % name
        self._p(line)

    def handle_data(self, data):
        line = "%s DATA" % data
        self._p(line)

    def handle_charref(self, name):
        line = "%s CHARREF" % name
        self._p(line)

    def handle_entityref(self, name):
        line = "%s ENTITYREF" % name
        self._p(line)

    def handle_comment(self, data):
        line = "%s COMMENT" % data
        self._p(line)

    def handle_decl(self, data):
        line = "%s DECL" % data
        self._p(line)

    def unknown_decl(self, data):
        line = "%s UNKNOWN-DECL" % data
        self._p(line)

    def handle_pi(self, data):
        line = "%s PI" % data
        self._p(line)
146
def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup.
    """
    parser = AnnouncingParser()
    parser.feed(data)
    # data is the complete document, so signal end-of-input: feed() on
    # its own can leave trailing markup buffered and never announced.
    parser.close()
157
# Alphabets that rword() alternates between.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string.

    Characters at even positions are drawn from the consonants and
    characters at odd positions from the vowels.

    :param length: Number of characters to generate.
    :return: A string of `length` characters; empty when `length` is 0.
    """
    return "".join(
        random.choice(_vowels if position % 2 else _consonants)
        for position in range(length)
    )
171
def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: Number of words in the sentence.
    :return: `length` random words of 4 to 9 characters each, joined by
        single spaces.
    """
    # Iterate the range directly; wrapping it in list() only built a
    # throwaway list.
    return " ".join(rword(random.randint(4,9)) for _ in range(length))
175
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Number of generation rounds. Each round adds an
        opening tag, a run of random text, or a closing tag, or adds
        nothing at all, so the document can hold fewer pieces than this.
    :return: The pieces joined by newlines and wrapped in <html> tags.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        kind = random.randint(0,3)
        if kind == 1:
            # Some text.
            pieces.append(rsentence(random.randint(1,4)))
        elif kind in (0, 2):
            # 0 opens a randomly chosen tag, 2 closes one; nothing keeps
            # the two balanced, which is what makes the document invalid.
            template = "<%s>" if kind == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
        # kind == 3 contributes nothing to the document.
    return "<html>" + "\n".join(pieces) + "</html>"
193
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random invalid document, times Beautiful Soup on it
    with each parser configuration, then times lxml and html5lib on
    their own. A parser that fails or is not installed is reported and
    skipped instead of aborting the rest of the benchmark.

    :param num_elements: Size argument passed to `rdoc` when generating
        the document.
    :return: None; results are printed to standard output.
    """
    print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
    data = rdoc(num_elements)
    print(("Generated a large invalid HTML document (%d bytes)." % len(data)))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            # perf_counter is meant for measuring intervals and is not
            # affected by system clock adjustments.
            a = time.perf_counter()
            BeautifulSoup(data, parser)
            b = time.perf_counter()
        except Exception:
            print(("%s could not parse the markup." % parser))
            traceback.print_exc()
        else:
            print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))

    # The raw-library timings are optional: a missing library must not
    # stop the remaining measurements.
    try:
        from lxml import etree
    except ImportError:
        print("lxml is not installed or couldn't be imported.")
    else:
        a = time.perf_counter()
        etree.HTML(data)
        b = time.perf_counter()
        print(("Raw lxml parsed the markup in %.2fs." % (b-a)))

    try:
        import html5lib
    except ImportError:
        print("html5lib is not installed or couldn't be imported.")
    else:
        parser = html5lib.HTMLParser()
        a = time.perf_counter()
        parser.parse(data)
        b = time.perf_counter()
        print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
225
def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Size argument passed to `rdoc` when generating
        the document to parse.
    :param parser: Parser name handed to the BeautifulSoup constructor.
    :return: None; the profile report is printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # Profile in memory rather than through a NamedTemporaryFile: the
    # handle was never closed, and a still-open temporary file cannot be
    # reopened by name on every platform.
    profiler = cProfile.Profile()
    profiler.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace)

    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")
    # Restrict the report to html5lib/bs4 frames, at most 50 lines.
    stats.print_stats('_html5lib|bs4', 50)
239
# When this file is executed as a script, whatever markup arrives on
# standard input is diagnosed.
if __name__ == '__main__':
    markup = sys.stdin.read()
    diagnose(markup)