Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/humanfriendly/terminal/html.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 # Human friendly input/output in Python. | |
2 # | |
3 # Author: Peter Odding <peter@peterodding.com> | |
4 # Last Change: February 29, 2020 | |
5 # URL: https://humanfriendly.readthedocs.io | |
6 | |
7 """Convert HTML with simple text formatting to text with ANSI escape sequences.""" | |
8 | |
9 # Standard library modules. | |
10 import re | |
11 | |
12 # Modules included in our package. | |
13 from humanfriendly.compat import HTMLParser, StringIO, name2codepoint, unichr | |
14 from humanfriendly.text import compact_empty_lines | |
15 from humanfriendly.terminal import ANSI_COLOR_CODES, ANSI_RESET, ansi_style | |
16 | |
17 # Public identifiers that require documentation. | |
18 __all__ = ('HTMLConverter', 'html_to_ansi') | |
19 | |
20 | |
def html_to_ansi(data, callback=None):
    """
    Render HTML with simple text formatting as text with ANSI escape sequences.

    :param data: The HTML to convert (a string).
    :param callback: Optional callback that is handed to
                     :class:`HTMLConverter` as its `callback` keyword
                     argument.
    :returns: The converted text, including ANSI escape sequences (a string).

    This is a shortcut that builds a single-use :class:`HTMLConverter`
    object and calls it on `data`. The documentation of that class lists
    the supported tags and describes the conversion process.
    """
    return HTMLConverter(callback=callback)(data)
35 | |
36 | |
37 class HTMLConverter(HTMLParser): | |
38 | |
39 """ | |
40 Convert HTML with simple text formatting to text with ANSI escape sequences. | |
41 | |
42 The following text styles are supported: | |
43 | |
44 - Bold: ``<b>``, ``<strong>`` and ``<span style="font-weight: bold;">`` | |
45 - Italic: ``<i>``, ``<em>`` and ``<span style="font-style: italic;">`` | |
46 - Strike-through: ``<del>``, ``<s>`` and ``<span style="text-decoration: line-through;">`` | |
47 - Underline: ``<ins>``, ``<u>`` and ``<span style="text-decoration: underline">`` | |
48 | |
49 Colors can be specified as follows: | |
50 | |
51 - Foreground color: ``<span style="color: #RRGGBB;">`` | |
52 - Background color: ``<span style="background-color: #RRGGBB;">`` | |
53 | |
54 Here's a small demonstration: | |
55 | |
56 .. code-block:: python | |
57 | |
58 from humanfriendly.text import dedent | |
59 from humanfriendly.terminal import html_to_ansi | |
60 | |
61 print(html_to_ansi(dedent(''' | |
62 <b>Hello world!</b> | |
63 <i>Is this thing on?</i> | |
64 I guess I can <u>underline</u> or <s>strike-through</s> text? | |
65 And what about <span style="color: red">color</span>? | |
66 '''))) | |
67 | |
68 rainbow_colors = [ | |
69 '#FF0000', '#E2571E', '#FF7F00', '#FFFF00', '#00FF00', | |
70 '#96BF33', '#0000FF', '#4B0082', '#8B00FF', '#FFFFFF', | |
71 ] | |
72 html_rainbow = "".join('<span style="color: %s">o</span>' % c for c in rainbow_colors) | |
73 print(html_to_ansi("Let's try a rainbow: %s" % html_rainbow)) | |
74 | |
75 Here's what the results look like: | |
76 | |
77 .. image:: images/html-to-ansi.png | |
78 | |
79 Some more details: | |
80 | |
81 - Nested tags are supported, within reasonable limits. | |
82 | |
83 - Text in ``<code>`` and ``<pre>`` tags will be highlighted in a | |
84 different color from the main text (currently this is yellow). | |
85 | |
86 - ``<a href="URL">TEXT</a>`` is converted to the format "TEXT (URL)" where | |
87 the uppercase symbols are highlighted in light blue with an underline. | |
88 | |
89 - ``<div>``, ``<p>`` and ``<pre>`` tags are considered block level tags | |
90 and are wrapped in vertical whitespace to prevent their content from | |
91 "running into" surrounding text. This may cause runs of multiple empty | |
92 lines to be emitted. As a *workaround* the :func:`__call__()` method | |
93 will automatically call :func:`.compact_empty_lines()` on the generated | |
94 output before returning it to the caller. Of course this won't work | |
95 when `output` is set to something like :data:`sys.stdout`. | |
96 | |
97 - ``<br>`` is converted to a single plain text line break. | |
98 | |
99 Implementation notes: | |
100 | |
101 - A list of dictionaries with style information is used as a stack where | |
102 new styling can be pushed and a pop will restore the previous styling. | |
103 When new styling is pushed, it is merged with (but overrides) the current | |
104 styling. | |
105 | |
106 - If you're going to be converting a lot of HTML it might be useful from | |
107 a performance standpoint to re-use an existing :class:`HTMLConverter` | |
108 object for unrelated HTML fragments, in this case take a look at the | |
109 :func:`__call__()` method (it makes this use case very easy). | |
110 | |
111 .. versionadded:: 4.15 | |
112 :class:`humanfriendly.terminal.HTMLConverter` was added to the | |
113 `humanfriendly` package during the initial development of my new | |
114 `chat-archive <https://chat-archive.readthedocs.io/>`_ project, whose | |
115 command line interface makes for a great demonstration of the | |
116 flexibility that this feature provides (hint: check out how the search | |
117 keyword highlighting combines with the regular highlighting). | |
118 """ | |
119 | |
120 BLOCK_TAGS = ('div', 'p', 'pre') | |
121 """The names of tags that are padded with vertical whitespace.""" | |
122 | |
123 def __init__(self, *args, **kw): | |
124 """ | |
125 Initialize an :class:`HTMLConverter` object. | |
126 | |
127 :param callback: Optional keyword argument to specify a function that | |
128 will be called to process text fragments before they | |
129 are emitted on the output stream. Note that link text | |
130 and preformatted text fragments are not processed by | |
131 this callback. | |
132 :param output: Optional keyword argument to redirect the output to the | |
133 given file-like object. If this is not given a new | |
134 :class:`~python3:io.StringIO` object is created. | |
135 """ | |
136 # Hide our optional keyword arguments from the superclass. | |
137 self.callback = kw.pop("callback", None) | |
138 self.output = kw.pop("output", None) | |
139 # Initialize the superclass. | |
140 HTMLParser.__init__(self, *args, **kw) | |
141 | |
142 def __call__(self, data): | |
143 """ | |
144 Reset the parser, convert some HTML and get the text with ANSI escape sequences. | |
145 | |
146 :param data: The HTML to convert to text (a string). | |
147 :returns: The converted text (only in case `output` is | |
148 a :class:`~python3:io.StringIO` object). | |
149 """ | |
150 self.reset() | |
151 self.feed(data) | |
152 self.close() | |
153 if isinstance(self.output, StringIO): | |
154 return compact_empty_lines(self.output.getvalue()) | |
155 | |
@property
def current_style(self):
    """The style dictionary on top of the stack, or an empty dictionary when the stack is empty."""
    if not self.stack:
        return {}
    return self.stack[-1]
160 | |
def close(self):
    """
    Finish parsing and close previously opened ANSI escape sequences.

    This method overrides the same method in the superclass to ensure that
    an :data:`.ANSI_RESET` code is emitted when parsing reaches the end of
    the input but a style is still active. This is intended to prevent
    malformed HTML from messing up terminal output.
    """
    # Let the superclass process whatever input it is still buffering
    # before the styles are closed. The parser can hold back trailing text
    # (for example text ending in an unterminated "&") until close() is
    # called; emitting the reset first would put that text after the reset
    # and strip the styling it was meant to have.
    HTMLParser.close(self)
    # Only now has parsing really reached the end of the input, so any
    # style that is still on the stack was never closed by the HTML.
    if any(self.stack):
        self.output.write(ANSI_RESET)
        self.stack = []
174 | |
def emit_style(self, style=None):
    """
    Write the ANSI escape sequence for a style to the output stream.

    :param style: A dictionary with keyword arguments for
                  :func:`.ansi_style()`, or :data:`None` to emit the
                  style that is currently on top of the stack.
    """
    emit = self.output.write
    # Styles are never applied incrementally: the previous styling is
    # always cleared first and the complete new style is written after it.
    emit(ANSI_RESET)
    if style is None:
        style = self.current_style
    # An empty style needs nothing beyond the reset written above.
    if style:
        emit(ansi_style(**style))
189 | |
def handle_charref(self, value):
    """
    Process a decimal or hexadecimal numeric character reference.

    :param value: The decimal or hexadecimal value (a string), without
                  the leading ``&#``. Hexadecimal values start with
                  ``x`` or ``X``.
    :raises: :exc:`~exceptions.ValueError` when `value` is not a valid
             number.
    """
    # The hexadecimal marker may be written in either case ("&#x41;" and
    # "&#X41;" are the same reference), so both have to be recognized.
    if value[:1] in ('x', 'X'):
        codepoint = int(value[1:], 16)
    else:
        codepoint = int(value)
    self.output.write(unichr(codepoint))
197 | |
def handle_data(self, data):
    """
    Process textual data.

    :param data: The decoded text (a string).

    The text always ends up on the output stream. What differs is whether
    it is remembered as link text or run through the callback first.
    """
    if self.link_url:
        # Inside a link the text is kept verbatim, so that it can later be
        # compared against the URL of the link.
        self.link_text = data
        text = data
    elif self.callback and self.preformatted_text_level == 0:
        # Regular text (outside links and preformatted blocks) is handed
        # to the user defined callback for arbitrary pre-processing.
        text = self.callback(data)
    else:
        text = data
    self.output.write(text)
215 | |
def handle_endtag(self, tag):
    """
    Process the end of an HTML tag.

    :param tag: The name of the tag (a string).

    For tags that carry styling the style pushed by the start tag is
    popped and the previous style is restored. Closing a link additionally
    renders its URL as " (URL)" unless the link text already is that URL.
    Block level tags are followed by an empty line.
    """
    if tag in ('a', 'b', 'code', 'del', 'em', 'i', 'ins', 'pre', 's', 'strong', 'span', 'u'):
        old_style = self.current_style
        # The following conditional isn't necessary for well formed
        # HTML but prevents raising exceptions on malformed HTML.
        if self.stack:
            self.stack.pop(-1)
        new_style = self.current_style
        self.emit_style(new_style)
        if tag == 'a':
            # Either value can be missing (an anchor without a "href"
            # attribute, an anchor without text or a stray end tag), so
            # both are normalized to strings before they are compared.
            link_text = self.link_text or ''
            link_url = self.link_url or ''
            # Without a URL there is nothing to render, and the URL is
            # omitted when the link text already shows it.
            if link_url and not self.urls_match(link_text, link_url):
                self.output.write(' (')
                self.emit_style(old_style)
                self.output.write(self.render_url(link_url))
                self.emit_style(new_style)
                self.output.write(')')
            # The link is finished: forget its state so that the text that
            # follows is no longer treated as link text (which would keep
            # it away from the user defined callback).
            self.link_text = None
            self.link_url = None
        if tag in ('code', 'pre'):
            # Never drop below zero on an unmatched end tag, otherwise the
            # "not preformatted" check would fail for all following text.
            self.preformatted_text_level = max(0, self.preformatted_text_level - 1)
    if tag in self.BLOCK_TAGS:
        # Emit an empty line after block level tags.
        self.output.write('\n\n')
247 | |
def handle_entityref(self, name):
    """
    Process a named character reference.

    :param name: The name of the character reference (a string), without
                 the surrounding ``&`` and ``;``.

    Known names are replaced by the character they stand for. A name that
    is not in the entity table is written back as ``&name;`` instead of
    raising :exc:`~exceptions.KeyError`, so that a single unknown entity
    doesn't abort the whole conversion.
    """
    codepoint = name2codepoint.get(name)
    if codepoint is None:
        self.output.write('&%s;' % name)
    else:
        self.output.write(unichr(codepoint))
255 | |
def handle_starttag(self, tag, attrs):
    """
    Process the start of an HTML tag.

    :param tag: The name of the tag (a string).
    :param attrs: A list of tuples with two values each: the name of the
                  attribute and its value. The value is :data:`None` for
                  attributes that are given without a value.
    """
    if tag in self.BLOCK_TAGS:
        # Emit an empty line before block level tags.
        self.output.write('\n\n')
    if tag == 'a':
        self.push_styles(color='blue', bright=True, underline=True)
        # Store the URL that the link points to for later use, so that we
        # can render the link text before the URL (with the reasoning that
        # this is the most intuitive way to present a link in a plain text
        # interface). A missing or valueless "href" becomes an empty
        # string so that later string operations never see None.
        self.link_url = next((v for n, v in attrs if n == 'href'), None) or ''
        # Start every link with empty link text, otherwise a link without
        # text would be compared against the text of an earlier link.
        self.link_text = ''
    elif tag in ('b', 'strong'):
        self.push_styles(bold=True)
    elif tag == 'br':
        self.output.write('\n')
    elif tag in ('code', 'pre'):
        self.push_styles(color='yellow')
        self.preformatted_text_level += 1
    elif tag in ('del', 's'):
        self.push_styles(strike_through=True)
    elif tag in ('em', 'i'):
        self.push_styles(italic=True)
    elif tag in ('ins', 'u'):
        self.push_styles(underline=True)
    elif tag == 'span':
        styles = {}
        # A valueless attribute (<span style>) is reported as None.
        css = next((v for n, v in attrs if n == 'style'), None) or ''
        for rule in css.split(';'):
            name, _, value = rule.partition(':')
            # CSS property names and keywords are case insensitive.
            name = name.strip().lower()
            value = value.strip()
            keyword = value.lower()
            if name == 'background-color':
                styles['background'] = self.parse_color(value)
            elif name == 'color':
                styles['color'] = self.parse_color(value)
            elif name == 'font-style' and keyword == 'italic':
                styles['italic'] = True
            elif name == 'font-weight' and keyword == 'bold':
                styles['bold'] = True
            elif name == 'text-decoration':
                # This property accepts several space separated keywords,
                # e.g. "underline line-through" enables both decorations.
                decorations = keyword.split()
                if 'line-through' in decorations:
                    styles['strike_through'] = True
                if 'underline' in decorations:
                    styles['underline'] = True
        self.push_styles(**styles)
306 | |
def normalize_url(self, url):
    """
    Bring a URL into the form that is used for string equality comparison.

    :param url: The URL to normalize (a string).
    :returns: The normalized URL (a string): the input without a leading
              ``mailto:`` prefix.
    """
    mailto_prefix = re.compile('^mailto:')
    return mailto_prefix.sub('', url)
315 | |
def parse_color(self, value):
    """
    Convert a CSS color to something that :func:`.ansi_style()` understands.

    :param value: A string like ``rgb(1,2,3)``, ``#AABBCC``, ``#ABC`` or
                  ``yellow``.
    :returns: A tuple with three integers (red, green, blue) for ``rgb()``
              and hexadecimal notations, the lowercased color name for a
              named color in :data:`.ANSI_COLOR_CODES`, or :data:`None`
              when the value isn't recognized.
    """
    value = value.strip()
    # Parse an 'rgb(N,N,N)' expression.
    if value.lower().startswith('rgb'):
        tokens = re.findall(r'\d+', value)
        if len(tokens) == 3:
            return tuple(map(int, tokens))
    # Parse an '#XXXXXX' or '#XXX' expression.
    elif value.startswith('#'):
        value = value[1:]
        # Validate the digits up front so that malformed input results in
        # None (as documented) instead of a ValueError from int().
        if re.match(r'[0-9A-Fa-f]+\Z', value):
            if len(value) == 3:
                # In the shorthand notation every digit stands for a
                # doubled digit: '#F00' means '#FF0000', not '#0F0000'.
                value = ''.join(digit * 2 for digit in value)
            if len(value) == 6:
                return tuple(int(value[i:i + 2], 16) for i in (0, 2, 4))
    # Try to recognize a named color.
    value = value.lower()
    if value in ANSI_COLOR_CODES:
        return value
    return None
350 | |
def push_styles(self, **changes):
    """
    Push new style information onto the stack.

    :param changes: Any keyword arguments are passed on to :func:`.ansi_style()`.

    This helper of :func:`handle_starttag()` layers `changes` on top of
    the style that is currently active: the current style is copied (the
    dictionary on the stack is never modified), `changes` override the
    copied values, the result becomes the new top of the stack and the
    matching ANSI escape sequence is written to the output stream.
    """
    merged = dict(self.current_style)
    merged.update(changes)
    self.stack.append(merged)
    self.emit_style(merged)
373 | |
def render_url(self, url):
    """
    Prepare a URL for rendering on the terminal.

    :param url: The URL to simplify (a string).
    :returns: The simplified URL (a string).

    The following rewrites are applied, in this order:

    1. A leading ``mailto:`` prefix is removed.
    2. Every space is replaced by ``%20``.
    3. A closing parenthesis at the end is replaced by ``%29``.
    """
    rewrites = (
        ('^mailto:', ''),
        (' ', '%20'),
        (r'\)$', '%29'),
    )
    for pattern, replacement in rewrites:
        url = re.sub(pattern, replacement, url)
    return url
392 | |
def reset(self):
    """
    Reset the state of the HTML parser and ANSI converter.

    The superclass state, the link bookkeeping, the preformatted text
    level and the style stack are all cleared. When `output` is unset or
    a :class:`~python3:io.StringIO` object, a new
    :class:`~python3:io.StringIO` object takes its place.
    """
    # Reset the state of the superclass.
    HTMLParser.reset(self)
    # Reset our own parsing state.
    self.link_text = self.link_url = None
    self.preformatted_text_level = 0
    self.stack = []
    # An output stream chosen by the caller (something like sys.stdout) is
    # left alone: only a missing stream or one of our own StringIO buffers
    # is replaced.
    caller_owns_stream = self.output is not None and not isinstance(self.output, StringIO)
    if not caller_owns_stream:
        self.output = StringIO()
411 | |
def urls_match(self, a, b):
    """
    Check whether two URLs are equal once :func:`normalize_url()` is applied.

    :param a: A string containing a URL.
    :param b: A string containing a URL.
    :returns: :data:`True` if the normalized URLs are the same string,
              :data:`False` otherwise.

    :func:`handle_endtag()` relies on this to leave out the URL of a
    hyperlink (``<a href="...">``) when the link text is that same URL.
    """
    left = self.normalize_url(a)
    right = self.normalize_url(b)
    return left == right