comparison env/lib/python3.7/site-packages/humanfriendly/terminal/html.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # Human friendly input/output in Python.
2 #
3 # Author: Peter Odding <peter@peterodding.com>
4 # Last Change: February 29, 2020
5 # URL: https://humanfriendly.readthedocs.io
6
7 """Convert HTML with simple text formatting to text with ANSI escape sequences."""
8
9 # Standard library modules.
10 import re
11
12 # Modules included in our package.
13 from humanfriendly.compat import HTMLParser, StringIO, name2codepoint, unichr
14 from humanfriendly.text import compact_empty_lines
15 from humanfriendly.terminal import ANSI_COLOR_CODES, ANSI_RESET, ansi_style
16
17 # Public identifiers that require documentation.
18 __all__ = ('HTMLConverter', 'html_to_ansi')
19
20
def html_to_ansi(data, callback=None):
    """
    Render simple HTML markup as text decorated with ANSI escape sequences.

    This is a convenience shortcut that builds a one-off
    :class:`HTMLConverter` and immediately applies it to `data`.

    :param data: The HTML to convert (a string).
    :param callback: Optional callback that is handed to
                     :class:`HTMLConverter` unchanged.
    :returns: Text with ANSI escape sequences (a string).

    The :class:`HTMLConverter` documentation describes the conversion
    process itself (including the supported tags) and shows an example
    with a screenshot.
    """
    return HTMLConverter(callback=callback)(data)
35
36
class HTMLConverter(HTMLParser):

    """
    Convert HTML with simple text formatting to text with ANSI escape sequences.

    The following text styles are supported:

    - Bold: ``<b>``, ``<strong>`` and ``<span style="font-weight: bold;">``
    - Italic: ``<i>``, ``<em>`` and ``<span style="font-style: italic;">``
    - Strike-through: ``<del>``, ``<s>`` and ``<span style="text-decoration: line-through;">``
    - Underline: ``<ins>``, ``<u>`` and ``<span style="text-decoration: underline">``

    Colors can be specified as follows:

    - Foreground color: ``<span style="color: #RRGGBB;">``
    - Background color: ``<span style="background-color: #RRGGBB;">``

    Here's a small demonstration:

    .. code-block:: python

       from humanfriendly.text import dedent
       from humanfriendly.terminal import html_to_ansi

       print(html_to_ansi(dedent('''
         <b>Hello world!</b>
         <i>Is this thing on?</i>
         I guess I can <u>underline</u> or <s>strike-through</s> text?
         And what about <span style="color: red">color</span>?
       ''')))

       rainbow_colors = [
           '#FF0000', '#E2571E', '#FF7F00', '#FFFF00', '#00FF00',
           '#96BF33', '#0000FF', '#4B0082', '#8B00FF', '#FFFFFF',
       ]
       html_rainbow = "".join('<span style="color: %s">o</span>' % c for c in rainbow_colors)
       print(html_to_ansi("Let's try a rainbow: %s" % html_rainbow))

    Here's what the results look like:

      .. image:: images/html-to-ansi.png

    Some more details:

    - Nested tags are supported, within reasonable limits.

    - Text in ``<code>`` and ``<pre>`` tags will be highlighted in a
      different color from the main text (currently this is yellow).

    - ``<a href="URL">TEXT</a>`` is converted to the format "TEXT (URL)" where
      the uppercase symbols are highlighted in light blue with an underline.
      The URL is omitted when it equals the link text or when the ``<a>``
      tag has no ``href`` attribute.

    - ``<div>``, ``<p>`` and ``<pre>`` tags are considered block level tags
      and are wrapped in vertical whitespace to prevent their content from
      "running into" surrounding text. This may cause runs of multiple empty
      lines to be emitted. As a *workaround* the :func:`__call__()` method
      will automatically call :func:`.compact_empty_lines()` on the generated
      output before returning it to the caller. Of course this won't work
      when `output` is set to something like :data:`sys.stdout`.

    - ``<br>`` is converted to a single plain text line break.

    - Character references that cannot be resolved (unknown entity names,
      malformed or out of range numeric references) are emitted literally
      instead of raising an exception.

    Implementation notes:

    - A list of dictionaries with style information is used as a stack where
      new styling can be pushed and a pop will restore the previous styling.
      When new styling is pushed, it is merged with (but overrides) the current
      styling.

    - If you're going to be converting a lot of HTML it might be useful from
      a performance standpoint to re-use an existing :class:`HTMLConverter`
      object for unrelated HTML fragments, in this case take a look at the
      :func:`__call__()` method (it makes this use case very easy).

    .. versionadded:: 4.15
       :class:`humanfriendly.terminal.HTMLConverter` was added to the
       `humanfriendly` package during the initial development of my new
       `chat-archive <https://chat-archive.readthedocs.io/>`_ project, whose
       command line interface makes for a great demonstration of the
       flexibility that this feature provides (hint: check out how the search
       keyword highlighting combines with the regular highlighting).
    """

    BLOCK_TAGS = ('div', 'p', 'pre')
    """The names of tags that are padded with vertical whitespace."""

    def __init__(self, *args, **kw):
        """
        Initialize an :class:`HTMLConverter` object.

        :param callback: Optional keyword argument to specify a function that
                         will be called to process text fragments before they
                         are emitted on the output stream. Note that link text
                         and preformatted text fragments are not processed by
                         this callback.
        :param output: Optional keyword argument to redirect the output to the
                       given file-like object. If this is not given a new
                       :class:`~python3:io.StringIO` object is created.

        Any other positional and keyword arguments are passed on to the
        initializer of the superclass.
        """
        # Hide our optional keyword arguments from the superclass.
        self.callback = kw.pop("callback", None)
        self.output = kw.pop("output", None)
        # Initialize the superclass (the explicit call style is kept as is
        # because the parser base class is imported via the compat module).
        HTMLParser.__init__(self, *args, **kw)

    def __call__(self, data):
        """
        Reset the parser, convert some HTML and get the text with ANSI escape sequences.

        :param data: The HTML to convert to text (a string).
        :returns: The converted text (only in case `output` is
                  a :class:`~python3:io.StringIO` object,
                  otherwise :data:`None`).
        """
        self.reset()
        self.feed(data)
        self.close()
        if isinstance(self.output, StringIO):
            return compact_empty_lines(self.output.getvalue())

    @property
    def current_style(self):
        """Get the current style from the top of the stack (a dictionary)."""
        return self.stack[-1] if self.stack else {}

    def close(self):
        """
        Close previously opened ANSI escape sequences.

        This method overrides the same method in the superclass to ensure that
        an :data:`.ANSI_RESET` code is emitted when parsing reaches the end of
        the input but a style is still active. This is intended to prevent
        malformed HTML from messing up terminal output.
        """
        # The stack holds dictionaries, so any() is true as soon as at least
        # one non-empty style is still active.
        if any(self.stack):
            self.output.write(ANSI_RESET)
            self.stack = []
        HTMLParser.close(self)

    def emit_style(self, style=None):
        """
        Emit an ANSI escape sequence for the given or current style to the output stream.

        :param style: A dictionary with arguments for :func:`.ansi_style()` or
                      :data:`None`, in which case the style at the top of the
                      stack is emitted.
        """
        # Clear the current text styles.
        self.output.write(ANSI_RESET)
        # Apply a new text style?
        style = self.current_style if style is None else style
        if style:
            self.output.write(ansi_style(**style))

    def handle_charref(self, value):
        """
        Process a decimal or hexadecimal numeric character reference.

        :param value: The decimal or hexadecimal value (a string). Hexadecimal
                      values are prefixed with ``x`` or ``X``.

        References that can't be converted to a character (because they are
        malformed or out of range) are emitted literally as ``&#...;`` so
        that malformed HTML doesn't abort the conversion.
        """
        try:
            if value[:1] in ('x', 'X'):
                codepoint = int(value[1:], 16)
            else:
                codepoint = int(value)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#%s;' % value
        self.output.write(text)

    def handle_data(self, data):
        """
        Process textual data.

        :param data: The decoded text (a string).
        """
        if self.link_url:
            # Link text is captured literally so that we can reliably check
            # whether the text and the URL of the link are the same string.
            # Fragments are accumulated because nested tags inside the link
            # cause the text to be delivered in multiple pieces.
            self.link_text = (self.link_text or '') + data
        elif self.callback and self.preformatted_text_level == 0:
            # Text that is not part of a link and not preformatted text is
            # passed to the user defined callback to allow for arbitrary
            # pre-processing.
            data = self.callback(data)
        # All text is emitted on the output stream.
        self.output.write(data)

    def handle_endtag(self, tag):
        """
        Process the end of an HTML tag.

        :param tag: The name of the tag (a string).
        """
        if tag in ('a', 'b', 'code', 'del', 'em', 'i', 'ins', 'pre', 's', 'strong', 'span', 'u'):
            old_style = self.current_style
            # The following conditional isn't necessary for well formed
            # HTML but prevents raising exceptions on malformed HTML.
            if self.stack:
                self.stack.pop(-1)
            new_style = self.current_style
            # Restore the styling that was active before the tag was opened.
            self.emit_style(new_style)
            if tag == 'a':
                url = self.link_url
                # Don't render the URL when there is none (no href attribute
                # or a stray closing tag) or when it's the same as the link text.
                if url and not self.urls_match(self.link_text or '', url):
                    self.output.write(' (')
                    self.emit_style(old_style)
                    self.output.write(self.render_url(url))
                    self.emit_style(new_style)
                    self.output.write(')')
                # Forget about the link so that text following it is no
                # longer mistaken for link text by handle_data().
                self.link_text = None
                self.link_url = None
            if tag in ('code', 'pre'):
                # Never drop below zero, otherwise a stray closing tag would
                # permanently disable the user defined callback.
                self.preformatted_text_level = max(0, self.preformatted_text_level - 1)
        if tag in self.BLOCK_TAGS:
            # Emit an empty line after block level tags.
            self.output.write('\n\n')

    def handle_entityref(self, name):
        """
        Process a named character reference.

        :param name: The name of the character reference (a string).

        Unknown names are emitted literally as ``&name;`` so that malformed
        HTML doesn't abort the conversion.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            text = '&%s;' % name
        else:
            text = unichr(codepoint)
        self.output.write(text)

    def handle_starttag(self, tag, attrs):
        """
        Process the start of an HTML tag.

        :param tag: The name of the tag (a string).
        :param attrs: A list of tuples with two strings each.
        """
        if tag in self.BLOCK_TAGS:
            # Emit an empty line before block level tags.
            self.output.write('\n\n')
        if tag == 'a':
            self.push_styles(color='blue', bright=True, underline=True)
            # Store the URL that the link points to for later use, so that we
            # can render the link text before the URL (with the reasoning that
            # this is the most intuitive way to present a link in a plain text
            # interface).
            self.link_url = next((v for n, v in attrs if n == 'href'), '')
            # Start collecting the text of this link from scratch.
            self.link_text = ''
        elif tag == 'b' or tag == 'strong':
            self.push_styles(bold=True)
        elif tag == 'br':
            self.output.write('\n')
        elif tag == 'code' or tag == 'pre':
            self.push_styles(color='yellow')
            self.preformatted_text_level += 1
        elif tag == 'del' or tag == 's':
            self.push_styles(strike_through=True)
        elif tag == 'em' or tag == 'i':
            self.push_styles(italic=True)
        elif tag == 'ins' or tag == 'u':
            self.push_styles(underline=True)
        elif tag == 'span':
            styles = {}
            # An attribute without a value is reported as None by the parser.
            css = next((v for n, v in attrs if n == 'style'), "") or ""
            for rule in css.split(';'):
                name, _, value = rule.partition(':')
                # CSS property names and keywords are case insensitive.
                name = name.strip().lower()
                value = value.strip()
                keyword = value.lower()
                if name == 'background-color':
                    styles['background'] = self.parse_color(value)
                elif name == 'color':
                    styles['color'] = self.parse_color(value)
                elif name == 'font-style' and keyword == 'italic':
                    styles['italic'] = True
                elif name == 'font-weight' and keyword == 'bold':
                    styles['bold'] = True
                elif name == 'text-decoration' and keyword == 'line-through':
                    styles['strike_through'] = True
                elif name == 'text-decoration' and keyword == 'underline':
                    styles['underline'] = True
            self.push_styles(**styles)

    def normalize_url(self, url):
        """
        Normalize a URL to enable string equality comparison.

        :param url: The URL to normalize (a string).
        :returns: The normalized URL (a string).

        The only normalization applied is stripping a leading ``mailto:``.
        """
        return re.sub('^mailto:', '', url)

    def parse_color(self, value):
        """
        Convert a CSS color to something that :func:`.ansi_style()` understands.

        :param value: A string like ``rgb(1,2,3)``, ``#AABBCC``, ``#ABC`` or ``yellow``.
        :returns: A color value supported by :func:`.ansi_style()` (a tuple
                  with three integers or the name of a color) or :data:`None`
                  when the value isn't recognized.
        """
        value = value.strip().lower()
        # Parse an 'rgb(N,N,N)' expression.
        if value.startswith('rgb'):
            tokens = re.findall(r'\d+', value)
            if len(tokens) == 3:
                return tuple(map(int, tokens))
        # Parse an '#XXXXXX' or '#XXX' expression.
        elif value.startswith('#'):
            value = value[1:]
            if len(value) == 6:
                # Six hex digits (proper notation).
                digits = value
            elif len(value) == 3:
                # Three hex digits (shorthand): each digit is doubled,
                # so '#f00' means the same thing as '#ff0000'.
                digits = ''.join(digit * 2 for digit in value)
            else:
                # Any other length falls through to the named color lookup
                # below (with the '#' stripped).
                digits = None
            if digits is not None:
                try:
                    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
                except ValueError:
                    # Not actually hexadecimal.
                    return None
        # Try to recognize a named color.
        if value in ANSI_COLOR_CODES:
            return value
        return None

    def push_styles(self, **changes):
        """
        Push new style information onto the stack.

        :param changes: Any keyword arguments are passed on to :func:`.ansi_style()`.

        This method is a helper for :func:`handle_starttag()`
        that does the following:

        1. Make a copy of the current styles (from the top of the stack),
        2. Apply the given `changes` to the copy of the current styles,
        3. Add the new styles to the stack,
        4. Emit the appropriate ANSI escape sequence to the output stream.
        """
        prototype = self.current_style
        if prototype:
            # Copy before updating so the enclosing style stays untouched.
            new_style = dict(prototype)
            new_style.update(changes)
        else:
            new_style = changes
        self.stack.append(new_style)
        self.emit_style(new_style)

    def render_url(self, url):
        """
        Prepare a URL for rendering on the terminal.

        :param url: The URL to simplify (a string).
        :returns: The simplified URL (a string).

        This method pre-processes a URL before rendering on the terminal. The
        following modifications are made:

        - The ``mailto:`` prefix is stripped.
        - Spaces are converted to ``%20``.
        - A trailing parenthesis is converted to ``%29``.
        """
        url = re.sub('^mailto:', '', url)
        url = re.sub(' ', '%20', url)
        url = re.sub(r'\)$', '%29', url)
        return url

    def reset(self):
        """
        Reset the state of the HTML parser and ANSI converter.

        When `output` is a :class:`~python3:io.StringIO` object a new
        instance will be created (and the old one garbage collected).
        """
        # Reset the state of the superclass.
        HTMLParser.reset(self)
        # Reset our instance variables.
        self.link_text = None
        self.link_url = None
        self.preformatted_text_level = 0
        if self.output is None or isinstance(self.output, StringIO):
            # If the caller specified something like output=sys.stdout then it
            # doesn't make much sense to negate that choice here in reset().
            self.output = StringIO()
        self.stack = []

    def urls_match(self, a, b):
        """
        Compare two URLs for equality using :func:`normalize_url()`.

        :param a: A string containing a URL.
        :param b: A string containing a URL.
        :returns: :data:`True` if the URLs are the same, :data:`False` otherwise.

        This method is used by :func:`handle_endtag()` to omit the URL of a
        hyperlink (``<a href="...">``) when the link text is that same URL.
        """
        return self.normalize_url(a) == self.normalize_url(b)