Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/lxml/html/html5parser.py @ 2:6af9afd405e9 draft
"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
author | shellac |
---|---|
date | Thu, 14 May 2020 14:56:58 -0400 |
parents | 26e78fe6e8c4 |
children |
comparison
equal
deleted
inserted
replaced
1:75ca89e9b81c | 2:6af9afd405e9 |
---|---|
1 """ | |
2 An interface to html5lib that mimics the lxml.html interface. | |
3 """ | |
4 import sys | |
5 import string | |
6 | |
7 from html5lib import HTMLParser as _HTMLParser | |
8 from html5lib.treebuilders.etree_lxml import TreeBuilder | |
9 from lxml import etree | |
10 from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag | |
11 | |
12 # python3 compatibility | |
13 try: | |
14 _strings = basestring | |
15 except NameError: | |
16 _strings = (bytes, str) | |
17 try: | |
18 from urllib2 import urlopen | |
19 except ImportError: | |
20 from urllib.request import urlopen | |
21 try: | |
22 from urlparse import urlparse | |
23 except ImportError: | |
24 from urllib.parse import urlparse | |
25 | |
26 | |
class HTMLParser(_HTMLParser):
    """An html5lib HTML parser that builds its tree with lxml's etree."""

    def __init__(self, strict=False, **kwargs):
        # Pin the tree builder to lxml; all other options are passed
        # straight through to html5lib's parser.
        super(HTMLParser, self).__init__(
            strict=strict, tree=TreeBuilder, **kwargs)
32 | |
33 | |
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    # Not every html5lib release ships an XHTML parser; the XHTML API
    # is only defined when one is available.
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """An html5lib XHTML parser that builds its tree with lxml's etree."""

        def __init__(self, strict=False, **kwargs):
            # Same wiring as HTMLParser: force the lxml tree builder.
            super(XHTMLParser, self).__init__(
                strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()
46 | |
47 | |
48 def _find_tag(tree, tag): | |
49 elem = tree.find(tag) | |
50 if elem is not None: | |
51 return elem | |
52 return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag)) | |
53 | |
54 | |
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a whole document from a string and return its root element.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    # html5lib rejects useChardet for input it would decode to unicode
    # anyway, so only pass the option when it is meaningful; charset
    # guessing defaults to on for byte-string input.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = True
    options = {'useChardet': guess_charset} if guess_charset is not None else {}
    return parser.parse(html, **options).getroot()
77 | |
78 | |
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If `no_leading_text`
    is true, it is an error for there to be leading text, and the
    result is always a list of elements only.

    If `guess_charset` is true, the `chardet` library will perform
    charset guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if parser is None:
        parser = html_parser

    # html5lib rejects useChardet for input it would decode to unicode
    # anyway, so only pass the option when it is meaningful; charset
    # guessing defaults to off for fragment input.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False
    options = {'useChardet': guess_charset} if guess_charset is not None else {}

    children = parser.parseFragment(html, 'div', **options)
    if no_leading_text and children and isinstance(children[0], _strings):
        if children[0].strip():
            raise etree.ParserError('There is leading text: %r' %
                                    children[0])
        del children[0]
    return children
111 | |
112 | |
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element. In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    # A wrapper element can absorb loose leading text as its .text,
    # so leading text is only an error without one.
    accept_leading_text = bool(create_parent)

    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    element = elements[0]
    tail = element.tail
    if tail and tail.strip():
        raise etree.ParserError('Element followed by text: %r' % element.tail)
    element.tail = None
    return element
155 | |
156 | |
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document: a full document is returned as a document
    root, a single element is returned bare, and anything else is returned
    wrapped in a ``div`` or ``span`` container.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # Input starting with a doctype or <html> is clearly a full document.
    lead = html[:50]
    if isinstance(lead, bytes):
        # Allow text comparison in python3.  Decoding as ascii with
        # 'replace' also covers latin-1 and utf-8 for the characters
        # we compare against.
        lead = lead.decode('ascii', 'replace')
    lead = lead.lstrip().lower()
    if lead.startswith(('<html', '<!doctype')):
        return doc

    # A non-empty <head> also means we were given a full document.
    head = _find_tag(doc, 'head')
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # A body holding exactly one element with nothing but whitespace
    # around it was probably a single element passed in.
    leading_ok = not body.text or not body.text.strip()
    trailing_ok = not body[-1].tail or not body[-1].tail.strip()
    if len(body) == 1 and leading_ok and trailing_ok:
        return body[0]

    # Otherwise the body wraps a bunch of sibling content.  Re-tag it as
    # a fake container, since <body> itself implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
209 | |
210 | |
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree. Note: this returns a tree, not an element. Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection. This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    """
    if parser is None:
        parser = html_parser
    close_fp = False  # only close handles this function opened itself
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        close_fp = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        close_fp = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        # html5lib consumes the stream eagerly, so any handle we opened
        # ourselves can be closed as soon as parsing finishes (fixes a
        # file/socket leak for path and URL inputs).
        return parser.parse(fp, **options)
    finally:
        if close_fp:
            fp.close()
245 | |
246 | |
247 def _looks_like_url(str): | |
248 scheme = urlparse(str)[0] | |
249 if not scheme: | |
250 return False | |
251 elif (sys.platform == 'win32' and | |
252 scheme in string.ascii_letters | |
253 and len(scheme) == 1): | |
254 # looks like a 'normal' absolute path | |
255 return False | |
256 else: | |
257 return True | |
258 | |
259 | |
# Shared default parser instance, used by document_fromstring(),
# fragments_fromstring(), fragment_fromstring(), fromstring() and
# parse() whenever the caller does not supply a parser.
html_parser = HTMLParser()