Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/docutils/utils/smartquotes.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/env/lib/python3.7/site-packages/docutils/utils/smartquotes.py Sat May 02 07:14:21 2020 -0400 @@ -0,0 +1,1014 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# :Id: $Id: smartquotes.py 8376 2019-08-27 19:49:29Z milde $ +# :Copyright: © 2010 Günter Milde, +# original `SmartyPants`_: © 2003 John Gruber +# smartypants.py: © 2004, 2007 Chad Miller +# :Maintainer: docutils-develop@lists.sourceforge.net +# :License: Released under the terms of the `2-Clause BSD license`_, in short: +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notices and this notice are preserved. +# This file is offered as-is, without any warranty. +# +# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause + + +r""" +========================= +Smart Quotes for Docutils +========================= + +Synopsis +======== + +"SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and +BBEdit that easily translates plain ASCII punctuation characters into "smart" +typographic punctuation characters. + +``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_. + +* Using Unicode instead of HTML entities for typographic punctuation + characters, it works for any output format that supports Unicode. +* Supports `language specific quote characters`__. + +__ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks + + +Authors +======= + +`John Gruber`_ did all of the hard work of writing this software in Perl for +`Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ +ported it to Python to use with Pyblosxom_. +Adapted to Docutils_ by Günter Milde. + +Additional Credits +================== + +Portions of the SmartyPants original work are based on Brad Choate's nifty +MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to +this plug-in. Brad Choate is a fine hacker indeed. + +`Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta +testing of the original SmartyPants. + +`Rael Dornfest`_ ported SmartyPants to Blosxom. + +.. _Brad Choate: http://bradchoate.com/ +.. _Jeremy Hedley: http://antipixel.com/ +.. _Charles Wiltgen: http://playbacktime.com/ +.. _Rael Dornfest: http://raelity.org/ + + +Copyright and License +===================== + +SmartyPants_ license (3-Clause BSD license): + + Copyright (c) 2003 John Gruber (http://daringfireball.net/) + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name "SmartyPants" nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + + This software is provided by the copyright holders and contributors + "as is" and any express or implied warranties, including, but not + limited to, the implied warranties of merchantability and fitness for + a particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + +smartypants.py license (2-Clause BSD license): + + smartypants.py is a derivative work of SmartyPants. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + This software is provided by the copyright holders and contributors + "as is" and any express or implied warranties, including, but not + limited to, the implied warranties of merchantability and fitness for + a particular purpose are disclaimed. In no event shall the copyright + owner or contributors be liable for any direct, indirect, incidental, + special, exemplary, or consequential damages (including, but not + limited to, procurement of substitute goods or services; loss of use, + data, or profits; or business interruption) however caused and on any + theory of liability, whether in contract, strict liability, or tort + (including negligence or otherwise) arising in any way out of the use + of this software, even if advised of the possibility of such damage. + +.. _John Gruber: http://daringfireball.net/ +.. _Chad Miller: http://web.chad.org/ + +.. _Pyblosxom: http://pyblosxom.bluesock.org/ +.. _SmartyPants: http://daringfireball.net/projects/smartypants/ +.. _Movable Type: http://www.movabletype.org/ +.. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause +.. _Docutils: http://docutils.sf.net/ + +Description +=========== + +SmartyPants can perform the following transformations: + +- Straight quotes ( " and ' ) into "curly" quote characters +- Backticks-style quotes (\`\`like this'') into "curly" quote characters +- Dashes (``--`` and ``---``) into en- and em-dash entities +- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity + +This means you can write, edit, and save your posts using plain old +ASCII straight quotes, plain dashes, and plain dots, but your published +posts (and final HTML output) will appear with smart quotes, em-dashes, +and proper ellipses. + +SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, +``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to +display text where smart quotes and other "smart punctuation" would not be +appropriate, such as source code or example markup. + + +Backslash Escapes +================= + +If you need to use literal straight quotes (or plain hyphens and periods), +`smartquotes` accepts the following backslash escape sequences to force +ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it, +too. + +======== ========= +Escape Character +======== ========= +``\\`` \\ +``\\"`` \\" +``\\'`` \\' +``\\.`` \\. +``\\-`` \\- +``\\``` \\` +======== ========= + +This is useful, for example, when you want to use straight quotes as +foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. + + +Caveats +======= + +Why You Might Not Want to Use Smart Quotes in Your Weblog +--------------------------------------------------------- + +For one thing, you might not care. + +Most normal, mentally stable individuals do not take notice of proper +typographic punctuation. Many design and typography nerds, however, break +out in a nasty rash when they encounter, say, a restaurant sign that uses +a straight apostrophe to spell "Joe's". + +If you're the sort of person who just doesn't care, you might well want to +continue not caring. Using straight quotes -- and sticking to the 7-bit +ASCII character set in general -- is certainly a simpler way to live. + +Even if you *do* care about accurate typography, you still might want to +think twice before educating the quote characters in your weblog. One side +effect of publishing curly quote characters is that it makes your +weblog a bit harder for others to quote from using copy-and-paste. What +happens is that when someone copies text from your blog, the copied text +contains the 8-bit curly quote characters (as well as the 8-bit characters +for em-dashes and ellipses, if you use these options). These characters +are not standard across different text encoding methods, which is why they +need to be encoded as characters. + +People copying text from your weblog, however, may not notice that you're +using curly quotes, and they'll go ahead and paste the unencoded 8-bit +characters copied from their browser into an email message or their own +weblog. When pasted as raw "smart quotes", these characters are likely to +get mangled beyond recognition. + +That said, my own opinion is that any decent text editor or email client +makes it easy to stupefy smart quote characters into their 7-bit +equivalents, and I don't consider it my problem if you're using an +indecent text editor or email client. + + +Algorithmic Shortcomings +------------------------ + +One situation in which quotes will get curled the wrong way is when +apostrophes are used at the start of leading contractions. For example:: + + 'Twas the night before Christmas. + +In the case above, SmartyPants will turn the apostrophe into an opening +single-quote, when in fact it should be the `right single quotation mark` +character which is also "the preferred character to use for apostrophe" +(Unicode). I don't think this problem can be solved in the general case -- +every word processor I've tried gets this wrong as well. In such cases, it's +best to use the proper character for closing single-quotes (’) by hand. + +In English, the same character is used for apostrophe and closing single +quote (both plain and "smart" ones). For other locales (French, Italean, +Swiss, ...) "smart" single closing quotes differ from the curly apostrophe. + + .. class:: language-fr + + Il dit : "C'est 'super' !" + +If the apostrophe is used at the end of a word, it cannot be distinguished +from a single quote by the algorithm. Therefore, a text like:: + + .. class:: language-de-CH + + "Er sagt: 'Ich fass' es nicht.'" + +will get a single closing guillemet instead of an apostrophe. + +This can be prevented by use use of the curly apostrophe character (’) in +the source:: + + - "Er sagt: 'Ich fass' es nicht.'" + + "Er sagt: 'Ich fass’ es nicht.'" + + +Version History +=============== + +1.8.1 2017-10-25 + - Use open quote after Unicode whitespace, ZWSP, and ZWNJ. + - Code cleanup. + +1.8: 2017-04-24 + - Command line front-end. + +1.7.1: 2017-03-19 + - Update and extend language-dependent quotes. + - Differentiate apostrophe from single quote. + +1.7: 2012-11-19 + - Internationalization: language-dependent quotes. + +1.6.1: 2012-11-06 + - Refactor code, code cleanup, + - `educate_tokens()` generator as interface for Docutils. + +1.6: 2010-08-26 + - Adaption to Docutils: + - Use Unicode instead of HTML entities, + - Remove code special to pyblosxom. + +1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 + - Fixed bug where blocks of precious unalterable text was instead + interpreted. Thanks to Le Roux and Dirk van Oosterbosch. + +1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 + - Fix bogus magical quotation when there is no hint that the + user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. + - Be smarter about quotes before terminating numbers in an en-dash'ed + range. + +1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 + - Fix a date-processing bug, as reported by jacob childress. + - Begin a test-suite for ensuring correct output. + - Removed import of "string", since I didn't really need it. + (This was my first every Python program. Sue me!) + +1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 + - Abort processing if the flavour is in forbidden-list. Default of + [ "rss" ] (Idea of Wolfgang SCHNERRING.) + - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. + +1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 + - Some single quotes weren't replaced properly. Diff-tesuji played + by Benjamin GEIGER. + +1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 + - Support upcoming pyblosxom 0.9 plugin verification feature. + +1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 + - Initial release +""" +from __future__ import print_function + +options = r""" +Options +======= + +Numeric values are the easiest way to configure SmartyPants' behavior: + +:0: Suppress all transformations. (Do nothing.) + +:1: Performs default SmartyPants transformations: quotes (including + \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) + is used to signify an em-dash; there is no support for en-dashes + +:2: Same as smarty_pants="1", except that it uses the old-school typewriter + shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" + (dash dash dash) + for em-dashes. + +:3: Same as smarty_pants="2", but inverts the shorthand for dashes: + "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for + en-dashes. + +:-1: Stupefy mode. Reverses the SmartyPants transformation process, turning + the characters produced by SmartyPants into their ASCII equivalents. + E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple + double-quote (\"), "—" is turned into two dashes, etc. + + +The following single-character attribute values can be combined to toggle +individual transformations from within the smarty_pants attribute. For +example, ``"1"`` is equivalent to ``"qBde"``. + +:q: Educates normal quote characters: (") and ('). + +:b: Educates \`\`backticks'' -style double quotes. + +:B: Educates \`\`backticks'' -style double quotes and \`single' quotes. + +:d: Educates em-dashes. + +:D: Educates em-dashes and en-dashes, using old-school typewriter shorthand: + (dash dash) for en-dashes, (dash dash dash) for em-dashes. + +:i: Educates em-dashes and en-dashes, using inverted old-school typewriter + shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. + +:e: Educates ellipses. + +:w: Translates any instance of ``"`` into a normal double-quote character. + This should be of no interest to most people, but of particular interest + to anyone who writes their posts using Dreamweaver, as Dreamweaver + inexplicably uses this entity to represent a literal double-quote + character. SmartyPants only educates normal quotes, not entities (because + ordinarily, entities are used for the explicit purpose of representing the + specific character they represent). The "w" option must be used in + conjunction with one (or both) of the other quote options ("q" or "b"). + Thus, if you wish to apply all SmartyPants transformations (quotes, en- + and em-dashes, and ellipses) and also translate ``"`` entities into + regular quotes so SmartyPants can educate them, you should pass the + following to the smarty_pants attribute: +""" + + +default_smartypants_attr = "1" + + +import re, sys + +class smartchars(object): + """Smart quotes and dashes + """ + + endash = u'–' # "–" EN DASH + emdash = u'—' # "—" EM DASH + ellipsis = u'…' # "…" HORIZONTAL ELLIPSIS + apostrophe = u'’' # "’" RIGHT SINGLE QUOTATION MARK + + # quote characters (language-specific, set in __init__()) + # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks + # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen + # [3] https://fr.wikipedia.org/wiki/Guillemet + # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web + # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html + # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks + # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf + # [8] http://www.korrekturavdelingen.no/anforselstegn.htm + # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530. + # [10] http://www.typografi.org/sitat/sitatart.html + # + # See also configuration option "smartquote-locales". + quotes = {'af': u'“”‘’', + 'af-x-altquot': u'„”‚’', + 'bg': u'„“‚‘', # Bulgarian, https://bg.wikipedia.org/wiki/Кавички + 'ca': u'«»“”', + 'ca-x-altquot': u'“”‘’', + 'cs': u'„“‚‘', + 'cs-x-altquot': u'»«›‹', + 'da': u'»«›‹', + 'da-x-altquot': u'„“‚‘', + # 'da-x-altquot2': u'””’’', + 'de': u'„“‚‘', + 'de-x-altquot': u'»«›‹', + 'de-ch': u'«»‹›', + 'el': u'«»“”', + 'en': u'“”‘’', + 'en-uk-x-altquot': u'‘’“”', # Attention: " → ‘ and ' → “ ! + 'eo': u'“”‘’', + 'es': u'«»“”', + 'es-x-altquot': u'“”‘’', + 'et': u'„“‚‘', # no secondary quote listed in + 'et-x-altquot': u'«»‹›', # the sources above (wikipedia.org) + 'eu': u'«»‹›', + 'fi': u'””’’', + 'fi-x-altquot': u'»»››', + 'fr': (u'« ', u' »', u'“', u'”'), # full no-break space + 'fr-x-altquot': (u'« ', u' »', u'“', u'”'), # narrow no-break space + 'fr-ch': u'«»‹›', + 'fr-ch-x-altquot': (u'« ', u' »', u'‹ ', u' ›'), # narrow no-break space, http://typoguide.ch/ + 'gl': u'«»“”', + 'he': u'”“»«', # Hebrew is RTL, test position: + 'he-x-altquot': u'„”‚’', # low quotation marks are opening. + # 'he-x-altquot': u'“„‘‚', # RTL: low quotation marks opening + 'hr': u'„”‘’', # http://hrvatska-tipografija.com/polunavodnici/ + 'hr-x-altquot': u'»«›‹', + 'hsb': u'„“‚‘', + 'hsb-x-altquot': u'»«›‹', + 'hu': u'„”«»', + 'is': u'„“‚‘', + 'it': u'«»“”', + 'it-ch': u'«»‹›', + 'it-x-altquot': u'“”‘’', + # 'it-x-altquot2': u'“„‘‚', # [7] in headlines + 'ja': u'「」『』', + 'ko': u'《》〈〉', + 'lt': u'„“‚‘', + 'lv': u'„“‚‘', + 'mk': u'„“‚‘', # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик + 'nl': u'“”‘’', + 'nl-x-altquot': u'„”‚’', + # 'nl-x-altquot2': u'””’’', + 'nb': u'«»’’', # Norsk bokmål (canonical form 'no') + 'nn': u'«»’’', # Nynorsk [10] + 'nn-x-altquot': u'«»‘’', # [8], [10] + # 'nn-x-altquot2': u'«»«»', # [9], [10 + # 'nn-x-altquot3': u'„“‚‘', # [10] + 'no': u'«»’’', # Norsk bokmål [10] + 'no-x-altquot': u'«»‘’', # [8], [10] + # 'no-x-altquot2': u'«»«»', # [9], [10 + # 'no-x-altquot3': u'„“‚‘', # [10] + 'pl': u'„”«»', + 'pl-x-altquot': u'«»‚’', + # 'pl-x-altquot2': u'„”‚’', # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w + 'pt': u'«»“”', + 'pt-br': u'“”‘’', + 'ro': u'„”«»', + 'ru': u'«»„“', + 'sh': u'„”‚’', # Serbo-Croatian + 'sh-x-altquot': u'»«›‹', + 'sk': u'„“‚‘', # Slovak + 'sk-x-altquot': u'»«›‹', + 'sl': u'„“‚‘', # Slovenian + 'sl-x-altquot': u'»«›‹', + 'sq': u'«»‹›', # Albanian + 'sq-x-altquot': u'“„‘‚', + 'sr': u'„”’’', + 'sr-x-altquot': u'»«›‹', + 'sv': u'””’’', + 'sv-x-altquot': u'»»››', + 'tr': u'“”‘’', + 'tr-x-altquot': u'«»‹›', + # 'tr-x-altquot2': u'“„‘‚', # [7] antiquated? + 'uk': u'«»„“', + 'uk-x-altquot': u'„“‚‘', + 'zh-cn': u'“”‘’', + 'zh-tw': u'「」『』', + } + + def __init__(self, language='en'): + self.language = language + try: + (self.opquote, self.cpquote, + self.osquote, self.csquote) = self.quotes[language.lower()] + except KeyError: + self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\'' + + +def smartyPants(text, attr=default_smartypants_attr, language='en'): + """Main function for "traditional" use.""" + + return "".join([t for t in educate_tokens(tokenize(text), + attr, language)]) + + +def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'): + """Return iterator that "educates" the items of `text_tokens`. + """ + + # Parse attributes: + # 0 : do nothing + # 1 : set all + # 2 : set all, using old school en- and em- dash shortcuts + # 3 : set all, using inverted old school en and em- dash shortcuts + # + # q : quotes + # b : backtick quotes (``double'' only) + # B : backtick quotes (``double'' and `single') + # d : dashes + # D : old school dashes + # i : inverted old school dashes + # e : ellipses + # w : convert " entities to " for Dreamweaver users + + convert_quot = False # translate " entities into normal quotes? + do_dashes = False + do_backticks = False + do_quotes = False + do_ellipses = False + do_stupefy = False + + # if attr == "0": # pass tokens unchanged (see below). + if attr == "1": # Do everything, turn all options on. + do_quotes = True + do_backticks = True + do_dashes = 1 + do_ellipses = True + elif attr == "2": + # Do everything, turn all options on, use old school dash shorthand. + do_quotes = True + do_backticks = True + do_dashes = 2 + do_ellipses = True + elif attr == "3": + # Do everything, use inverted old school dash shorthand. + do_quotes = True + do_backticks = True + do_dashes = 3 + do_ellipses = True + elif attr == "-1": # Special "stupefy" mode. + do_stupefy = True + else: + if "q" in attr: do_quotes = True + if "b" in attr: do_backticks = True + if "B" in attr: do_backticks = 2 + if "d" in attr: do_dashes = 1 + if "D" in attr: do_dashes = 2 + if "i" in attr: do_dashes = 3 + if "e" in attr: do_ellipses = True + if "w" in attr: convert_quot = True + + prev_token_last_char = " " + # Last character of the previous text token. Used as + # context to curl leading quote characters correctly. + + for (ttype, text) in text_tokens: + + # skip HTML and/or XML tags as well as emtpy text tokens + # without updating the last character + if ttype == 'tag' or not text: + yield text + continue + + # skip literal text (math, literal, raw, ...) + if ttype == 'literal': + prev_token_last_char = text[-1:] + yield text + continue + + last_char = text[-1:] # Remember last char before processing. + + text = processEscapes(text) + + if convert_quot: + text = re.sub('"', '"', text) + + if do_dashes == 1: + text = educateDashes(text) + elif do_dashes == 2: + text = educateDashesOldSchool(text) + elif do_dashes == 3: + text = educateDashesOldSchoolInverted(text) + + if do_ellipses: + text = educateEllipses(text) + + # Note: backticks need to be processed before quotes. + if do_backticks: + text = educateBackticks(text, language) + + if do_backticks == 2: + text = educateSingleBackticks(text, language) + + if do_quotes: + # Replace plain quotes in context to prevent converstion to + # 2-character sequence in French. + context = prev_token_last_char.replace('"', ';').replace("'", ';') + text = educateQuotes(context+text, language)[1:] + + if do_stupefy: + text = stupefyEntities(text, language) + + # Remember last char as context for the next token + prev_token_last_char = last_char + + text = processEscapes(text, restore=True) + + yield text + + + +def educateQuotes(text, language='en'): + """ + Parameter: - text string (unicode or bytes). + - language (`BCP 47` language tag.) + Returns: The `text`, with "educated" curly quote characters. + + Example input: "Isn't this fun?" + Example output: “Isn’t this fun?“; + """ + + smart = smartchars(language) + + punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]""" + close_class = r"""[^\ \t\r\n\[\{\(\-]""" + open_class = u'[\u200B\u200C]' # ZWSP, ZWNJ + dec_dashes = r"""–|—""" + + # Special case if the very first character is a quote + # followed by punctuation at a non-word-break. + # Close the quotes by brute force: + text = re.sub(r"^'(?=%s\\B)" % (punct_class,), smart.csquote, text) + text = re.sub(r'^"(?=%s\\B)' % (punct_class,), smart.cpquote, text) + + # Special case for double sets of quotes, e.g.: + # <p>He said, "'Quoted' words in a larger quote."</p> + text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text) + text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text) + + # Special case for decade abbreviations (the '80s): + if language.startswith('en'): # TODO similar cases in other languages? + text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text) + + # Get most opening single quotes: + opening_single_quotes_regex = re.compile(u""" + (# ?<= # look behind fails: requires fixed-width pattern + \\s | # a whitespace char, or + %s | # another separating char, or + | # a non-breaking space entity, or + [\u2013 \u2014 ] | # literal dashes, or + -- | # dumb dashes, or + &[mn]dash; | # dash entities (named or + %s | # decimal or + &\\#x201[34]; # hex) + ) + ' # the quote + (?=\\w) # followed by a word character + """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) + + text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text) + + # In many locales, single closing quotes are different from apostrophe: + if smart.csquote != smart.apostrophe: + apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)", re.UNICODE) + text = apostrophe_regex.sub(smart.apostrophe, text) + # TODO: keep track of quoting level to recognize apostrophe in, e.g., + # "Ich fass' es nicht." + + closing_single_quotes_regex = re.compile(r""" + (?<=%s) + ' + """ % close_class, re.VERBOSE) + text = closing_single_quotes_regex.sub(smart.csquote, text) + + # Any remaining single quotes should be opening ones: + text = re.sub(r"""'""", smart.osquote, text) + + # Get most opening double quotes: + opening_double_quotes_regex = re.compile(u""" + ( + \\s | # a whitespace char, or + %s | # another separating char, or + | # a non-breaking space entity, or + [\u2013 \u2014 ] | # literal dashes, or + -- | # dumb dashes, or + &[mn]dash; | # dash entities (named or + %s | # decimal or + &\\#x201[34]; # hex) + ) + " # the quote + (?=\\w) # followed by a word character + """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) + + text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text) + + # Double closing quotes: + closing_double_quotes_regex = re.compile(r""" + ( + (?<=%s)" | # char indicating the quote should be closing + "(?=\s) # whitespace behind + ) + """ % (close_class,), re.VERBOSE | re.UNICODE) + text = closing_double_quotes_regex.sub(smart.cpquote, text) + + # Any remaining quotes should be opening ones. + text = re.sub(r'"', smart.opquote, text) + + return text + + +def educateBackticks(text, language='en'): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with ``backticks'' -style double quotes + translated into HTML curly quote entities. + Example input: ``Isn't this fun?'' + Example output: “Isn't this fun?“; + """ + smart = smartchars(language) + + text = re.sub(r"""``""", smart.opquote, text) + text = re.sub(r"""''""", smart.cpquote, text) + return text + + +def educateSingleBackticks(text, language='en'): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with `backticks' -style single quotes + translated into HTML curly quote entities. + + Example input: `Isn't this fun?' + Example output: ‘Isn’t this fun?’ + """ + smart = smartchars(language) + + text = re.sub(r"""`""", smart.osquote, text) + text = re.sub(r"""'""", smart.csquote, text) + return text + + +def educateDashes(text): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with each instance of "--" translated to + an em-dash character. + """ + + text = re.sub(r"""---""", smartchars.endash, text) # en (yes, backwards) + text = re.sub(r"""--""", smartchars.emdash, text) # em (yes, backwards) + return text + + +def educateDashesOldSchool(text): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with each instance of "--" translated to + an en-dash character, and each "---" translated to + an em-dash character. + """ + + text = re.sub(r"""---""", smartchars.emdash, text) + text = re.sub(r"""--""", smartchars.endash, text) + return text + + +def educateDashesOldSchoolInverted(text): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with each instance of "--" translated to + an em-dash character, and each "---" translated to + an en-dash character. Two reasons why: First, unlike the + en- and em-dash syntax supported by + EducateDashesOldSchool(), it's compatible with existing + entries written before SmartyPants 1.1, back when "--" was + only used for em-dashes. Second, em-dashes are more + common than en-dashes, and so it sort of makes sense that + the shortcut should be shorter to type. (Thanks to Aaron + Swartz for the idea.) + """ + text = re.sub(r"""---""", smartchars.endash, text) # em + text = re.sub(r"""--""", smartchars.emdash, text) # en + return text + + + +def educateEllipses(text): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with each instance of "..." translated to + an ellipsis character. + + Example input: Huh...? + Example output: Huh…? + """ + + text = re.sub(r"""\.\.\.""", smartchars.ellipsis, text) + text = re.sub(r"""\. \. \.""", smartchars.ellipsis, text) + return text + + +def stupefyEntities(text, language='en'): + """ + Parameter: String (unicode or bytes). + Returns: The `text`, with each SmartyPants character translated to + its ASCII counterpart. + + Example input: “Hello — world.” + Example output: "Hello -- world." + """ + smart = smartchars(language) + + text = re.sub(smart.endash, "-", text) # en-dash + text = re.sub(smart.emdash, "--", text) # em-dash + + text = re.sub(smart.osquote, "'", text) # open single quote + text = re.sub(smart.csquote, "'", text) # close single quote + + text = re.sub(smart.opquote, '"', text) # open double quote + text = re.sub(smart.cpquote, '"', text) # close double quote + + text = re.sub(smart.ellipsis, '...', text)# ellipsis + + return text + + +def processEscapes(text, restore=False): + r""" + Parameter: String (unicode or bytes). + Returns: The `text`, with after processing the following backslash + escape sequences. This is useful if you want to force a "dumb" + quote or other character to appear. + + Escape Value + ------ ----- + \\ \ + \" " + \' ' + \. . + \- - + \` ` + """ + replacements = ((r'\\', r'\'), + (r'\"', r'"'), + (r"\'", r'''), + (r'\.', r'.'), + (r'\-', r'-'), + (r'\`', r'`')) + if restore: + for (ch, rep) in replacements: + text = text.replace(rep, ch[1]) + else: + for (ch, rep) in replacements: + text = text.replace(ch, rep) + + return text + + +def tokenize(text): + """ + Parameter: String containing HTML markup. + Returns: An iterator that yields the tokens comprising the input + string. Each token is either a tag (possibly with nested, + tags contained therein, such as <a href="<MTFoo>">, or a + run of text between tags. Each yielded element is a + two-element tuple; the first is either 'tag' or 'text'; + the second is the actual value. + + Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin. + <http://www.bradchoate.com/past/mtregex.php> + """ + + pos = 0 + length = len(text) + # tokens = [] + + depth = 6 + nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth) + #match = r"""(?: <! ( -- .*? -- \s* )+ > ) | # comments + # (?: <\? .*? \?> ) | # directives + # %s # nested tags """ % (nested_tags,) + tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") + + token_match = tag_soup.search(text) + + previous_end = 0 + while token_match is not None: + if token_match.group(1): + yield ('text', token_match.group(1)) + + yield ('tag', token_match.group(2)) + + previous_end = token_match.end() + token_match = tag_soup.search(text, token_match.end()) + + if previous_end < len(text): + yield ('text', text[previous_end:]) + + + +if __name__ == "__main__": + + import itertools + try: + import locale # module missing in Jython + locale.setlocale(locale.LC_ALL, '') # set to user defaults + defaultlanguage = locale.getdefaultlocale()[0] + except: + defaultlanguage = 'en' + + # Normalize and drop unsupported subtags: + defaultlanguage = defaultlanguage.lower().replace('-', '_') + # split (except singletons, which mark the following tag as non-standard): + defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage) + _subtags = [subtag for subtag in defaultlanguage.split('_')] + _basetag = _subtags.pop(0) + # find all combinations of subtags + for n in range(len(_subtags), 0, -1): + for tags in itertools.combinations(_subtags, n): + _tag = '-'.join((_basetag,)+tags) + if _tag in smartchars.quotes: + defaultlanguage = _tag + break + else: + if _basetag in smartchars.quotes: + defaultlanguage = _basetag + else: + defaultlanguage = 'en' + + + import argparse + parser = argparse.ArgumentParser( + description='Filter stdin making ASCII punctuation "smart".') + # parser.add_argument("text", help="text to be acted on") + parser.add_argument("-a", "--action", default="1", + help="what to do with the input (see --actionhelp)") + parser.add_argument("-e", "--encoding", default="utf8", + help="text encoding") + parser.add_argument("-l", "--language", default=defaultlanguage, + help="text language (BCP47 tag), " + "Default: %s"% defaultlanguage) + parser.add_argument("-q", "--alternative-quotes", action="store_true", + help="use alternative quote style") + parser.add_argument("--doc", action="store_true", + help="print documentation") + parser.add_argument("--actionhelp", action="store_true", + help="list available actions") + parser.add_argument("--stylehelp", action="store_true", + help="list available quote styles") + parser.add_argument("--test", action="store_true", + help="perform short self-test") + args = parser.parse_args() + + if args.doc: + print(__doc__) + elif args.actionhelp: + print(options) + elif args.stylehelp: + print() + print("Available styles (primary open/close, secondary open/close)") + print("language tag quotes") + print("============ ======") + for key in sorted(smartchars.quotes.keys()): + print("%-14s %s" % (key, smartchars.quotes[key])) + elif args.test: + # Unit test output goes to stderr. + import unittest + + class TestSmartypantsAllAttributes(unittest.TestCase): + # the default attribute is "1", which means "all". + def test_dates(self): + self.assertEqual(smartyPants("1440-80's"), u"1440-80’s") + self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s") + self.assertEqual(smartyPants("1440---'80s"), u"1440–’80s") + self.assertEqual(smartyPants("1960's"), u"1960’s") + self.assertEqual(smartyPants("one two '60s"), u"one two ’60s") + self.assertEqual(smartyPants("'60s"), u"’60s") + + def test_educated_quotes(self): + self.assertEqual(smartyPants('"Isn\'t this fun?"'), u'“Isn’t this fun?”') + + def test_html_tags(self): + text = '<a src="foo">more</a>' + self.assertEqual(smartyPants(text), text) + + suite = unittest.TestLoader().loadTestsFromTestCase( + TestSmartypantsAllAttributes) + unittest.TextTestRunner().run(suite) + + else: + if args.alternative_quotes: + if '-x-altquot' in args.language: + args.language = args.language.replace('-x-altquot', '') + else: + args.language += '-x-altquot' + text = sys.stdin.read().decode(args.encoding) + print(smartyPants(text, attr=args.action, + language=args.language).encode(args.encoding))