Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/humanfriendly/terminal/html.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
line wrap: on
line diff
# Human friendly input/output in Python.
#
# Author: Peter Odding <peter@peterodding.com>
# Last Change: February 29, 2020
# URL: https://humanfriendly.readthedocs.io

"""Convert HTML with simple text formatting to text with ANSI escape sequences."""

# Standard library modules.
import re

# Modules included in our package.
from humanfriendly.compat import HTMLParser, StringIO, name2codepoint, unichr
from humanfriendly.text import compact_empty_lines
from humanfriendly.terminal import ANSI_COLOR_CODES, ANSI_RESET, ansi_style

# Public identifiers that require documentation.
__all__ = ('HTMLConverter', 'html_to_ansi')


def html_to_ansi(data, callback=None):
    """
    Convert HTML with simple text formatting to text with ANSI escape sequences.

    :param data: The HTML to convert (a string).
    :param callback: Optional callback to pass to :class:`HTMLConverter`.
    :returns: Text with ANSI escape sequences (a string).

    Please refer to the documentation of the :class:`HTMLConverter` class for
    details about the conversion process (like which tags are supported) and an
    example with a screenshot.
    """
    converter = HTMLConverter(callback=callback)
    return converter(data)


class HTMLConverter(HTMLParser):

    """
    Convert HTML with simple text formatting to text with ANSI escape sequences.

    The following text styles are supported:

    - Bold: ``<b>``, ``<strong>`` and ``<span style="font-weight: bold;">``
    - Italic: ``<i>``, ``<em>`` and ``<span style="font-style: italic;">``
    - Strike-through: ``<del>``, ``<s>`` and ``<span style="text-decoration: line-through;">``
    - Underline: ``<ins>``, ``<u>`` and ``<span style="text-decoration: underline">``

    Colors can be specified as follows:

    - Foreground color: ``<span style="color: #RRGGBB;">``
    - Background color: ``<span style="background-color: #RRGGBB;">``

    Here's a small demonstration:

    .. code-block:: python

       from humanfriendly.text import dedent
       from humanfriendly.terminal import html_to_ansi

       print(html_to_ansi(dedent('''
         <b>Hello world!</b>
         <i>Is this thing on?</i>
         I guess I can <u>underline</u> or <s>strike-through</s> text?
         And what about <span style="color: red">color</span>?
       ''')))

       rainbow_colors = [
           '#FF0000', '#E2571E', '#FF7F00', '#FFFF00', '#00FF00',
           '#96BF33', '#0000FF', '#4B0082', '#8B00FF', '#FFFFFF',
       ]
       html_rainbow = "".join('<span style="color: %s">o</span>' % c for c in rainbow_colors)
       print(html_to_ansi("Let's try a rainbow: %s" % html_rainbow))

    Here's what the results look like:

      .. image:: images/html-to-ansi.png

    Some more details:

    - Nested tags are supported, within reasonable limits.

    - Text in ``<code>`` and ``<pre>`` tags will be highlighted in a
      different color from the main text (currently this is yellow).

    - ``<a href="URL">TEXT</a>`` is converted to the format "TEXT (URL)" where
      the uppercase symbols are highlighted in light blue with an underline.

    - ``<div>``, ``<p>`` and ``<pre>`` tags are considered block level tags
      and are wrapped in vertical whitespace to prevent their content from
      "running into" surrounding text. This may cause runs of multiple empty
      lines to be emitted. As a *workaround* the :func:`__call__()` method
      will automatically call :func:`.compact_empty_lines()` on the generated
      output before returning it to the caller. Of course this won't work
      when `output` is set to something like :data:`sys.stdout`.

    - ``<br>`` is converted to a single plain text line break.

    Implementation notes:

    - A list of dictionaries with style information is used as a stack where
      new styling can be pushed and a pop will restore the previous styling.
      When new styling is pushed, it is merged with (but overrides) the current
      styling.

    - If you're going to be converting a lot of HTML it might be useful from
      a performance standpoint to re-use an existing :class:`HTMLConverter`
      object for unrelated HTML fragments, in this case take a look at the
      :func:`__call__()` method (it makes this use case very easy).

    .. versionadded:: 4.15
       :class:`humanfriendly.terminal.HTMLConverter` was added to the
       `humanfriendly` package during the initial development of my new
       `chat-archive <https://chat-archive.readthedocs.io/>`_ project, whose
       command line interface makes for a great demonstration of the
       flexibility that this feature provides (hint: check out how the search
       keyword highlighting combines with the regular highlighting).
    """

    BLOCK_TAGS = ('div', 'p', 'pre')
    """The names of tags that are padded with vertical whitespace."""

    def __init__(self, *args, **kw):
        """
        Initialize an :class:`HTMLConverter` object.

        :param callback: Optional keyword argument to specify a function that
                         will be called to process text fragments before they
                         are emitted on the output stream. Note that link text
                         and preformatted text fragments are not processed by
                         this callback.
        :param output: Optional keyword argument to redirect the output to the
                       given file-like object. If this is not given a new
                       :class:`~python3:io.StringIO` object is created.
        """
        # Hide our optional keyword arguments from the superclass.
        self.callback = kw.pop("callback", None)
        self.output = kw.pop("output", None)
        # Initialize the superclass.
        HTMLParser.__init__(self, *args, **kw)

    def __call__(self, data):
        """
        Reset the parser, convert some HTML and get the text with ANSI escape sequences.

        :param data: The HTML to convert to text (a string).
        :returns: The converted text (only in case `output` is
                  a :class:`~python3:io.StringIO` object).
        """
        self.reset()
        self.feed(data)
        self.close()
        if isinstance(self.output, StringIO):
            return compact_empty_lines(self.output.getvalue())

    @property
    def current_style(self):
        """Get the current style from the top of the stack (a dictionary)."""
        return self.stack[-1] if self.stack else {}

    def close(self):
        """
        Close previously opened ANSI escape sequences.

        This method overrides the same method in the superclass to ensure that
        an :data:`.ANSI_RESET` code is emitted when parsing reaches the end of
        the input but a style is still active. This is intended to prevent
        malformed HTML from messing up terminal output.
        """
        # Let the superclass flush any text it is still buffering (for example
        # input that ends in an unterminated '&') *before* resetting, so that
        # the trailing text is rendered in the style that is active and the
        # reset really is the last thing written.
        HTMLParser.close(self)
        if any(self.stack):
            self.output.write(ANSI_RESET)
            self.stack = []

    def emit_style(self, style=None):
        """
        Emit an ANSI escape sequence for the given or current style to the output stream.

        :param style: A dictionary with arguments for :func:`.ansi_style()` or
                      :data:`None`, in which case the style at the top of the
                      stack is emitted.
        """
        # Clear the current text styles.
        self.output.write(ANSI_RESET)
        # Apply a new text style?
        style = self.current_style if style is None else style
        if style:
            self.output.write(ansi_style(**style))

    def handle_charref(self, value):
        """
        Process a decimal or hexadecimal numeric character reference.

        :param value: The decimal or hexadecimal value (a string).

        Hexadecimal references may use a lowercase or uppercase ``x`` prefix.
        References that cannot be converted to a character (not a number or
        not a valid code point) are emitted literally instead of raising an
        exception.
        """
        try:
            if value[:1] in ('x', 'X'):
                codepoint = int(value[1:], 16)
            else:
                codepoint = int(value)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out of range reference: show what was written.
            text = '&#%s;' % value
        self.output.write(text)

    def handle_data(self, data):
        """
        Process textual data.

        :param data: The decoded text (a string).
        """
        if self.link_url:
            # Link text is captured literally so that we can reliably check
            # whether the text and the URL of the link are the same string.
            # The text may arrive in several fragments (for example when the
            # link contains nested tags) so the fragments are accumulated.
            self.link_text = (self.link_text or '') + data
        elif self.callback and self.preformatted_text_level == 0:
            # Text that is not part of a link and not preformatted text is
            # passed to the user defined callback to allow for arbitrary
            # pre-processing.
            data = self.callback(data)
        # All text is emitted unmodified on the output stream.
        self.output.write(data)

    def handle_endtag(self, tag):
        """
        Process the end of an HTML tag.

        :param tag: The name of the tag (a string).
        """
        if tag in ('a', 'b', 'code', 'del', 'em', 'i', 'ins', 'pre', 's', 'strong', 'span', 'u'):
            old_style = self.current_style
            # The following conditional isn't necessary for well formed
            # HTML but prevents raising exceptions on malformed HTML.
            if self.stack:
                self.stack.pop(-1)
            new_style = self.current_style
            self.emit_style(new_style)
            if tag == 'a':
                link_text, link_url = self.link_text, self.link_url
                # Forget about the link now that it has been closed, otherwise
                # all of the text that follows would still be treated as link
                # text (and would never be passed to the callback).
                self.link_text = None
                self.link_url = None
                # Links without a URL (no href attribute or a stray </a>) have
                # nothing to render. The URL is also omitted when it is the
                # same as the link text.
                if link_url and not self.urls_match(link_text or '', link_url):
                    self.output.write(' (')
                    self.emit_style(old_style)
                    self.output.write(self.render_url(link_url))
                    self.emit_style(new_style)
                    self.output.write(')')
            elif tag in ('code', 'pre'):
                # Never drop below zero on a stray end tag, because that
                # would disable the callback for the rest of the input.
                self.preformatted_text_level = max(0, self.preformatted_text_level - 1)
        if tag in self.BLOCK_TAGS:
            # Emit an empty line after block level tags.
            self.output.write('\n\n')

    def handle_entityref(self, name):
        """
        Process a named character reference.

        :param name: The name of the character reference (a string).

        Unknown names are emitted literally (as ``&name;``) instead of
        raising :exc:`~exceptions.KeyError`.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.output.write('&%s;' % name)
        else:
            self.output.write(unichr(codepoint))

    def handle_starttag(self, tag, attrs):
        """
        Process the start of an HTML tag.

        :param tag: The name of the tag (a string).
        :param attrs: A list of tuples with two strings each.
        """
        if tag in self.BLOCK_TAGS:
            # Emit an empty line before block level tags.
            self.output.write('\n\n')
        if tag == 'a':
            self.push_styles(color='blue', bright=True, underline=True)
            # Store the URL that the link points to for later use, so that we
            # can render the link text before the URL (with the reasoning that
            # this is the most intuitive way to present a link in a plain text
            # interface). An attribute without a value is reported as None by
            # the parser, this is normalized to an empty string.
            self.link_url = next((v for n, v in attrs if n == 'href'), '') or ''
            # Start collecting the text of this link from scratch.
            self.link_text = ''
        elif tag == 'b' or tag == 'strong':
            self.push_styles(bold=True)
        elif tag == 'br':
            self.output.write('\n')
        elif tag == 'code' or tag == 'pre':
            self.push_styles(color='yellow')
            self.preformatted_text_level += 1
        elif tag == 'del' or tag == 's':
            self.push_styles(strike_through=True)
        elif tag == 'em' or tag == 'i':
            self.push_styles(italic=True)
        elif tag == 'ins' or tag == 'u':
            self.push_styles(underline=True)
        elif tag == 'span':
            styles = {}
            # A style attribute without a value is reported as None.
            css = next((v for n, v in attrs if n == 'style'), "") or ""
            for rule in css.split(';'):
                name, _, value = rule.partition(':')
                name = name.strip()
                value = value.strip()
                if name in ('background-color', 'color'):
                    color = self.parse_color(value)
                    # Ignore colors that we can't parse so that they don't
                    # mask the color inherited from an enclosing tag.
                    if color is not None:
                        styles['background' if name == 'background-color' else 'color'] = color
                elif name == 'font-style' and value == 'italic':
                    styles['italic'] = True
                elif name == 'font-weight' and value == 'bold':
                    styles['bold'] = True
                elif name == 'text-decoration' and value == 'line-through':
                    styles['strike_through'] = True
                elif name == 'text-decoration' and value == 'underline':
                    styles['underline'] = True
            self.push_styles(**styles)

    def normalize_url(self, url):
        """
        Normalize a URL to enable string equality comparison.

        :param url: The URL to normalize (a string).
        :returns: The normalized URL (a string).
        """
        return re.sub('^mailto:', '', url)

    def parse_color(self, value):
        """
        Convert a CSS color to something that :func:`.ansi_style()` understands.

        :param value: A string like ``rgb(1,2,3)``, ``#AABBCC``, ``#ABC`` or ``yellow``.
        :returns: A color value supported by :func:`.ansi_style()` or :data:`None`.
        """
        # Parse an 'rgb(N,N,N)' expression.
        if value.startswith('rgb'):
            tokens = re.findall(r'\d+', value)
            if len(tokens) == 3:
                return tuple(map(int, tokens))
        # Parse an '#XXXXXX' expression.
        elif value.startswith('#'):
            value = value[1:]
            # Only accept real hexadecimal digits (int() by itself would raise
            # an exception on invalid input and tolerates signs).
            if re.match(r'[0-9A-Fa-f]+\Z', value):
                length = len(value)
                if length == 6:
                    # Six hex digits (proper notation).
                    return (
                        int(value[:2], 16),
                        int(value[2:4], 16),
                        int(value[4:6], 16),
                    )
                elif length == 3:
                    # Three hex digits (shorthand): every digit is doubled,
                    # so '#F80' means the same thing as '#FF8800'.
                    return tuple(int(digit * 2, 16) for digit in value)
        # Try to recognize a named color.
        value = value.lower()
        if value in ANSI_COLOR_CODES:
            return value

    def push_styles(self, **changes):
        """
        Push new style information onto the stack.

        :param changes: Any keyword arguments are passed on to :func:`.ansi_style()`.

        This method is a helper for :func:`handle_starttag()`
        that does the following:

        1. Make a copy of the current styles (from the top of the stack),
        2. Apply the given `changes` to the copy of the current styles,
        3. Add the new styles to the stack,
        4. Emit the appropriate ANSI escape sequence to the output stream.
        """
        prototype = self.current_style
        if prototype:
            new_style = dict(prototype)
            new_style.update(changes)
        else:
            new_style = changes
        self.stack.append(new_style)
        self.emit_style(new_style)

    def render_url(self, url):
        """
        Prepare a URL for rendering on the terminal.

        :param url: The URL to simplify (a string).
        :returns: The simplified URL (a string).

        This method pre-processes a URL before rendering on the terminal. The
        following modifications are made:

        - The ``mailto:`` prefix is stripped.
        - Spaces are converted to ``%20``.
        - A trailing parenthesis is converted to ``%29``.
        """
        url = re.sub('^mailto:', '', url)
        url = re.sub(' ', '%20', url)
        url = re.sub(r'\)$', '%29', url)
        return url

    def reset(self):
        """
        Reset the state of the HTML parser and ANSI converter.

        When `output` is a :class:`~python3:io.StringIO` object a new
        instance will be created (and the old one garbage collected).
        """
        # Reset the state of the superclass.
        HTMLParser.reset(self)
        # Reset our instance variables.
        self.link_text = None
        self.link_url = None
        self.preformatted_text_level = 0
        if self.output is None or isinstance(self.output, StringIO):
            # If the caller specified something like output=sys.stdout then it
            # doesn't make much sense to negate that choice here in reset().
            self.output = StringIO()
        self.stack = []

    def urls_match(self, a, b):
        """
        Compare two URLs for equality using :func:`normalize_url()`.

        :param a: A string containing a URL.
        :param b: A string containing a URL.
        :returns: :data:`True` if the URLs are the same, :data:`False` otherwise.

        This method is used by :func:`handle_endtag()` to omit the URL of a
        hyperlink (``<a href="...">``) when the link text is that same URL.
        """
        return self.normalize_url(a) == self.normalize_url(b)