Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/docutils/utils/smartquotes.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/docutils/utils/smartquotes.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1014 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -# :Id: $Id: smartquotes.py 8376 2019-08-27 19:49:29Z milde $ -# :Copyright: © 2010 Günter Milde, -# original `SmartyPants`_: © 2003 John Gruber -# smartypants.py: © 2004, 2007 Chad Miller -# :Maintainer: docutils-develop@lists.sourceforge.net -# :License: Released under the terms of the `2-Clause BSD license`_, in short: -# -# Copying and distribution of this file, with or without modification, -# are permitted in any medium without royalty provided the copyright -# notices and this notice are preserved. -# This file is offered as-is, without any warranty. -# -# .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause - - -r""" -========================= -Smart Quotes for Docutils -========================= - -Synopsis -======== - -"SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and -BBEdit that easily translates plain ASCII punctuation characters into "smart" -typographic punctuation characters. - -``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_. - -* Using Unicode instead of HTML entities for typographic punctuation - characters, it works for any output format that supports Unicode. -* Supports `language specific quote characters`__. - -__ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks - - -Authors -======= - -`John Gruber`_ did all of the hard work of writing this software in Perl for -`Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ -ported it to Python to use with Pyblosxom_. -Adapted to Docutils_ by Günter Milde. - -Additional Credits -================== - -Portions of the SmartyPants original work are based on Brad Choate's nifty -MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to -this plug-in. Brad Choate is a fine hacker indeed. - -`Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta -testing of the original SmartyPants. - -`Rael Dornfest`_ ported SmartyPants to Blosxom. - -.. _Brad Choate: http://bradchoate.com/ -.. _Jeremy Hedley: http://antipixel.com/ -.. _Charles Wiltgen: http://playbacktime.com/ -.. _Rael Dornfest: http://raelity.org/ - - -Copyright and License -===================== - -SmartyPants_ license (3-Clause BSD license): - - Copyright (c) 2003 John Gruber (http://daringfireball.net/) - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the name "SmartyPants" nor the names of its contributors - may be used to endorse or promote products derived from this - software without specific prior written permission. - - This software is provided by the copyright holders and contributors - "as is" and any express or implied warranties, including, but not - limited to, the implied warranties of merchantability and fitness for - a particular purpose are disclaimed. In no event shall the copyright - owner or contributors be liable for any direct, indirect, incidental, - special, exemplary, or consequential damages (including, but not - limited to, procurement of substitute goods or services; loss of use, - data, or profits; or business interruption) however caused and on any - theory of liability, whether in contract, strict liability, or tort - (including negligence or otherwise) arising in any way out of the use - of this software, even if advised of the possibility of such damage. - -smartypants.py license (2-Clause BSD license): - - smartypants.py is a derivative work of SmartyPants. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - This software is provided by the copyright holders and contributors - "as is" and any express or implied warranties, including, but not - limited to, the implied warranties of merchantability and fitness for - a particular purpose are disclaimed. In no event shall the copyright - owner or contributors be liable for any direct, indirect, incidental, - special, exemplary, or consequential damages (including, but not - limited to, procurement of substitute goods or services; loss of use, - data, or profits; or business interruption) however caused and on any - theory of liability, whether in contract, strict liability, or tort - (including negligence or otherwise) arising in any way out of the use - of this software, even if advised of the possibility of such damage. - -.. _John Gruber: http://daringfireball.net/ -.. _Chad Miller: http://web.chad.org/ - -.. _Pyblosxom: http://pyblosxom.bluesock.org/ -.. _SmartyPants: http://daringfireball.net/projects/smartypants/ -.. _Movable Type: http://www.movabletype.org/ -.. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause -.. _Docutils: http://docutils.sf.net/ - -Description -=========== - -SmartyPants can perform the following transformations: - -- Straight quotes ( " and ' ) into "curly" quote characters -- Backticks-style quotes (\`\`like this'') into "curly" quote characters -- Dashes (``--`` and ``---``) into en- and em-dash entities -- Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity - -This means you can write, edit, and save your posts using plain old -ASCII straight quotes, plain dashes, and plain dots, but your published -posts (and final HTML output) will appear with smart quotes, em-dashes, -and proper ellipses. - -SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, -``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to -display text where smart quotes and other "smart punctuation" would not be -appropriate, such as source code or example markup. - - -Backslash Escapes -================= - -If you need to use literal straight quotes (or plain hyphens and periods), -`smartquotes` accepts the following backslash escape sequences to force -ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it, -too. - -======== ========= -Escape Character -======== ========= -``\\`` \\ -``\\"`` \\" -``\\'`` \\' -``\\.`` \\. -``\\-`` \\- -``\\``` \\` -======== ========= - -This is useful, for example, when you want to use straight quotes as -foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. - - -Caveats -======= - -Why You Might Not Want to Use Smart Quotes in Your Weblog ---------------------------------------------------------- - -For one thing, you might not care. - -Most normal, mentally stable individuals do not take notice of proper -typographic punctuation. Many design and typography nerds, however, break -out in a nasty rash when they encounter, say, a restaurant sign that uses -a straight apostrophe to spell "Joe's". - -If you're the sort of person who just doesn't care, you might well want to -continue not caring. Using straight quotes -- and sticking to the 7-bit -ASCII character set in general -- is certainly a simpler way to live. - -Even if you *do* care about accurate typography, you still might want to -think twice before educating the quote characters in your weblog. One side -effect of publishing curly quote characters is that it makes your -weblog a bit harder for others to quote from using copy-and-paste. What -happens is that when someone copies text from your blog, the copied text -contains the 8-bit curly quote characters (as well as the 8-bit characters -for em-dashes and ellipses, if you use these options). These characters -are not standard across different text encoding methods, which is why they -need to be encoded as characters. - -People copying text from your weblog, however, may not notice that you're -using curly quotes, and they'll go ahead and paste the unencoded 8-bit -characters copied from their browser into an email message or their own -weblog. When pasted as raw "smart quotes", these characters are likely to -get mangled beyond recognition. - -That said, my own opinion is that any decent text editor or email client -makes it easy to stupefy smart quote characters into their 7-bit -equivalents, and I don't consider it my problem if you're using an -indecent text editor or email client. - - -Algorithmic Shortcomings ------------------------- - -One situation in which quotes will get curled the wrong way is when -apostrophes are used at the start of leading contractions. For example:: - - 'Twas the night before Christmas. - -In the case above, SmartyPants will turn the apostrophe into an opening -single-quote, when in fact it should be the `right single quotation mark` -character which is also "the preferred character to use for apostrophe" -(Unicode). I don't think this problem can be solved in the general case -- -every word processor I've tried gets this wrong as well. In such cases, it's -best to use the proper character for closing single-quotes (’) by hand. - -In English, the same character is used for apostrophe and closing single -quote (both plain and "smart" ones). For other locales (French, Italean, -Swiss, ...) "smart" single closing quotes differ from the curly apostrophe. - - .. class:: language-fr - - Il dit : "C'est 'super' !" - -If the apostrophe is used at the end of a word, it cannot be distinguished -from a single quote by the algorithm. Therefore, a text like:: - - .. class:: language-de-CH - - "Er sagt: 'Ich fass' es nicht.'" - -will get a single closing guillemet instead of an apostrophe. - -This can be prevented by use use of the curly apostrophe character (’) in -the source:: - - - "Er sagt: 'Ich fass' es nicht.'" - + "Er sagt: 'Ich fass’ es nicht.'" - - -Version History -=============== - -1.8.1 2017-10-25 - - Use open quote after Unicode whitespace, ZWSP, and ZWNJ. - - Code cleanup. - -1.8: 2017-04-24 - - Command line front-end. - -1.7.1: 2017-03-19 - - Update and extend language-dependent quotes. - - Differentiate apostrophe from single quote. - -1.7: 2012-11-19 - - Internationalization: language-dependent quotes. - -1.6.1: 2012-11-06 - - Refactor code, code cleanup, - - `educate_tokens()` generator as interface for Docutils. - -1.6: 2010-08-26 - - Adaption to Docutils: - - Use Unicode instead of HTML entities, - - Remove code special to pyblosxom. - -1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 - - Fixed bug where blocks of precious unalterable text was instead - interpreted. Thanks to Le Roux and Dirk van Oosterbosch. - -1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 - - Fix bogus magical quotation when there is no hint that the - user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. - - Be smarter about quotes before terminating numbers in an en-dash'ed - range. - -1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 - - Fix a date-processing bug, as reported by jacob childress. - - Begin a test-suite for ensuring correct output. - - Removed import of "string", since I didn't really need it. - (This was my first every Python program. Sue me!) - -1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 - - Abort processing if the flavour is in forbidden-list. Default of - [ "rss" ] (Idea of Wolfgang SCHNERRING.) - - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. - -1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 - - Some single quotes weren't replaced properly. Diff-tesuji played - by Benjamin GEIGER. - -1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 - - Support upcoming pyblosxom 0.9 plugin verification feature. - -1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 - - Initial release -""" -from __future__ import print_function - -options = r""" -Options -======= - -Numeric values are the easiest way to configure SmartyPants' behavior: - -:0: Suppress all transformations. (Do nothing.) - -:1: Performs default SmartyPants transformations: quotes (including - \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) - is used to signify an em-dash; there is no support for en-dashes - -:2: Same as smarty_pants="1", except that it uses the old-school typewriter - shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" - (dash dash dash) - for em-dashes. - -:3: Same as smarty_pants="2", but inverts the shorthand for dashes: - "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for - en-dashes. - -:-1: Stupefy mode. Reverses the SmartyPants transformation process, turning - the characters produced by SmartyPants into their ASCII equivalents. - E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple - double-quote (\"), "—" is turned into two dashes, etc. - - -The following single-character attribute values can be combined to toggle -individual transformations from within the smarty_pants attribute. For -example, ``"1"`` is equivalent to ``"qBde"``. - -:q: Educates normal quote characters: (") and ('). - -:b: Educates \`\`backticks'' -style double quotes. - -:B: Educates \`\`backticks'' -style double quotes and \`single' quotes. - -:d: Educates em-dashes. - -:D: Educates em-dashes and en-dashes, using old-school typewriter shorthand: - (dash dash) for en-dashes, (dash dash dash) for em-dashes. - -:i: Educates em-dashes and en-dashes, using inverted old-school typewriter - shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. - -:e: Educates ellipses. - -:w: Translates any instance of ``"`` into a normal double-quote character. - This should be of no interest to most people, but of particular interest - to anyone who writes their posts using Dreamweaver, as Dreamweaver - inexplicably uses this entity to represent a literal double-quote - character. SmartyPants only educates normal quotes, not entities (because - ordinarily, entities are used for the explicit purpose of representing the - specific character they represent). The "w" option must be used in - conjunction with one (or both) of the other quote options ("q" or "b"). - Thus, if you wish to apply all SmartyPants transformations (quotes, en- - and em-dashes, and ellipses) and also translate ``"`` entities into - regular quotes so SmartyPants can educate them, you should pass the - following to the smarty_pants attribute: -""" - - -default_smartypants_attr = "1" - - -import re, sys - -class smartchars(object): - """Smart quotes and dashes - """ - - endash = u'–' # "–" EN DASH - emdash = u'—' # "—" EM DASH - ellipsis = u'…' # "…" HORIZONTAL ELLIPSIS - apostrophe = u'’' # "’" RIGHT SINGLE QUOTATION MARK - - # quote characters (language-specific, set in __init__()) - # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks - # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen - # [3] https://fr.wikipedia.org/wiki/Guillemet - # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web - # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html - # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks - # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf - # [8] http://www.korrekturavdelingen.no/anforselstegn.htm - # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530. - # [10] http://www.typografi.org/sitat/sitatart.html - # - # See also configuration option "smartquote-locales". - quotes = {'af': u'“”‘’', - 'af-x-altquot': u'„”‚’', - 'bg': u'„“‚‘', # Bulgarian, https://bg.wikipedia.org/wiki/Кавички - 'ca': u'«»“”', - 'ca-x-altquot': u'“”‘’', - 'cs': u'„“‚‘', - 'cs-x-altquot': u'»«›‹', - 'da': u'»«›‹', - 'da-x-altquot': u'„“‚‘', - # 'da-x-altquot2': u'””’’', - 'de': u'„“‚‘', - 'de-x-altquot': u'»«›‹', - 'de-ch': u'«»‹›', - 'el': u'«»“”', - 'en': u'“”‘’', - 'en-uk-x-altquot': u'‘’“”', # Attention: " → ‘ and ' → “ ! - 'eo': u'“”‘’', - 'es': u'«»“”', - 'es-x-altquot': u'“”‘’', - 'et': u'„“‚‘', # no secondary quote listed in - 'et-x-altquot': u'«»‹›', # the sources above (wikipedia.org) - 'eu': u'«»‹›', - 'fi': u'””’’', - 'fi-x-altquot': u'»»››', - 'fr': (u'« ', u' »', u'“', u'”'), # full no-break space - 'fr-x-altquot': (u'« ', u' »', u'“', u'”'), # narrow no-break space - 'fr-ch': u'«»‹›', - 'fr-ch-x-altquot': (u'« ', u' »', u'‹ ', u' ›'), # narrow no-break space, http://typoguide.ch/ - 'gl': u'«»“”', - 'he': u'”“»«', # Hebrew is RTL, test position: - 'he-x-altquot': u'„”‚’', # low quotation marks are opening. - # 'he-x-altquot': u'“„‘‚', # RTL: low quotation marks opening - 'hr': u'„”‘’', # http://hrvatska-tipografija.com/polunavodnici/ - 'hr-x-altquot': u'»«›‹', - 'hsb': u'„“‚‘', - 'hsb-x-altquot': u'»«›‹', - 'hu': u'„”«»', - 'is': u'„“‚‘', - 'it': u'«»“”', - 'it-ch': u'«»‹›', - 'it-x-altquot': u'“”‘’', - # 'it-x-altquot2': u'“„‘‚', # [7] in headlines - 'ja': u'「」『』', - 'ko': u'《》〈〉', - 'lt': u'„“‚‘', - 'lv': u'„“‚‘', - 'mk': u'„“‚‘', # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик - 'nl': u'“”‘’', - 'nl-x-altquot': u'„”‚’', - # 'nl-x-altquot2': u'””’’', - 'nb': u'«»’’', # Norsk bokmål (canonical form 'no') - 'nn': u'«»’’', # Nynorsk [10] - 'nn-x-altquot': u'«»‘’', # [8], [10] - # 'nn-x-altquot2': u'«»«»', # [9], [10 - # 'nn-x-altquot3': u'„“‚‘', # [10] - 'no': u'«»’’', # Norsk bokmål [10] - 'no-x-altquot': u'«»‘’', # [8], [10] - # 'no-x-altquot2': u'«»«»', # [9], [10 - # 'no-x-altquot3': u'„“‚‘', # [10] - 'pl': u'„”«»', - 'pl-x-altquot': u'«»‚’', - # 'pl-x-altquot2': u'„”‚’', # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w - 'pt': u'«»“”', - 'pt-br': u'“”‘’', - 'ro': u'„”«»', - 'ru': u'«»„“', - 'sh': u'„”‚’', # Serbo-Croatian - 'sh-x-altquot': u'»«›‹', - 'sk': u'„“‚‘', # Slovak - 'sk-x-altquot': u'»«›‹', - 'sl': u'„“‚‘', # Slovenian - 'sl-x-altquot': u'»«›‹', - 'sq': u'«»‹›', # Albanian - 'sq-x-altquot': u'“„‘‚', - 'sr': u'„”’’', - 'sr-x-altquot': u'»«›‹', - 'sv': u'””’’', - 'sv-x-altquot': u'»»››', - 'tr': u'“”‘’', - 'tr-x-altquot': u'«»‹›', - # 'tr-x-altquot2': u'“„‘‚', # [7] antiquated? - 'uk': u'«»„“', - 'uk-x-altquot': u'„“‚‘', - 'zh-cn': u'“”‘’', - 'zh-tw': u'「」『』', - } - - def __init__(self, language='en'): - self.language = language - try: - (self.opquote, self.cpquote, - self.osquote, self.csquote) = self.quotes[language.lower()] - except KeyError: - self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\'' - - -def smartyPants(text, attr=default_smartypants_attr, language='en'): - """Main function for "traditional" use.""" - - return "".join([t for t in educate_tokens(tokenize(text), - attr, language)]) - - -def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'): - """Return iterator that "educates" the items of `text_tokens`. - """ - - # Parse attributes: - # 0 : do nothing - # 1 : set all - # 2 : set all, using old school en- and em- dash shortcuts - # 3 : set all, using inverted old school en and em- dash shortcuts - # - # q : quotes - # b : backtick quotes (``double'' only) - # B : backtick quotes (``double'' and `single') - # d : dashes - # D : old school dashes - # i : inverted old school dashes - # e : ellipses - # w : convert " entities to " for Dreamweaver users - - convert_quot = False # translate " entities into normal quotes? - do_dashes = False - do_backticks = False - do_quotes = False - do_ellipses = False - do_stupefy = False - - # if attr == "0": # pass tokens unchanged (see below). - if attr == "1": # Do everything, turn all options on. - do_quotes = True - do_backticks = True - do_dashes = 1 - do_ellipses = True - elif attr == "2": - # Do everything, turn all options on, use old school dash shorthand. - do_quotes = True - do_backticks = True - do_dashes = 2 - do_ellipses = True - elif attr == "3": - # Do everything, use inverted old school dash shorthand. - do_quotes = True - do_backticks = True - do_dashes = 3 - do_ellipses = True - elif attr == "-1": # Special "stupefy" mode. - do_stupefy = True - else: - if "q" in attr: do_quotes = True - if "b" in attr: do_backticks = True - if "B" in attr: do_backticks = 2 - if "d" in attr: do_dashes = 1 - if "D" in attr: do_dashes = 2 - if "i" in attr: do_dashes = 3 - if "e" in attr: do_ellipses = True - if "w" in attr: convert_quot = True - - prev_token_last_char = " " - # Last character of the previous text token. Used as - # context to curl leading quote characters correctly. - - for (ttype, text) in text_tokens: - - # skip HTML and/or XML tags as well as emtpy text tokens - # without updating the last character - if ttype == 'tag' or not text: - yield text - continue - - # skip literal text (math, literal, raw, ...) - if ttype == 'literal': - prev_token_last_char = text[-1:] - yield text - continue - - last_char = text[-1:] # Remember last char before processing. - - text = processEscapes(text) - - if convert_quot: - text = re.sub('"', '"', text) - - if do_dashes == 1: - text = educateDashes(text) - elif do_dashes == 2: - text = educateDashesOldSchool(text) - elif do_dashes == 3: - text = educateDashesOldSchoolInverted(text) - - if do_ellipses: - text = educateEllipses(text) - - # Note: backticks need to be processed before quotes. - if do_backticks: - text = educateBackticks(text, language) - - if do_backticks == 2: - text = educateSingleBackticks(text, language) - - if do_quotes: - # Replace plain quotes in context to prevent converstion to - # 2-character sequence in French. - context = prev_token_last_char.replace('"', ';').replace("'", ';') - text = educateQuotes(context+text, language)[1:] - - if do_stupefy: - text = stupefyEntities(text, language) - - # Remember last char as context for the next token - prev_token_last_char = last_char - - text = processEscapes(text, restore=True) - - yield text - - - -def educateQuotes(text, language='en'): - """ - Parameter: - text string (unicode or bytes). - - language (`BCP 47` language tag.) - Returns: The `text`, with "educated" curly quote characters. - - Example input: "Isn't this fun?" - Example output: “Isn’t this fun?“; - """ - - smart = smartchars(language) - - punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]""" - close_class = r"""[^\ \t\r\n\[\{\(\-]""" - open_class = u'[\u200B\u200C]' # ZWSP, ZWNJ - dec_dashes = r"""–|—""" - - # Special case if the very first character is a quote - # followed by punctuation at a non-word-break. - # Close the quotes by brute force: - text = re.sub(r"^'(?=%s\\B)" % (punct_class,), smart.csquote, text) - text = re.sub(r'^"(?=%s\\B)' % (punct_class,), smart.cpquote, text) - - # Special case for double sets of quotes, e.g.: - # <p>He said, "'Quoted' words in a larger quote."</p> - text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text) - text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text) - - # Special case for decade abbreviations (the '80s): - if language.startswith('en'): # TODO similar cases in other languages? - text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text) - - # Get most opening single quotes: - opening_single_quotes_regex = re.compile(u""" - (# ?<= # look behind fails: requires fixed-width pattern - \\s | # a whitespace char, or - %s | # another separating char, or - | # a non-breaking space entity, or - [\u2013 \u2014 ] | # literal dashes, or - -- | # dumb dashes, or - &[mn]dash; | # dash entities (named or - %s | # decimal or - &\\#x201[34]; # hex) - ) - ' # the quote - (?=\\w) # followed by a word character - """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) - - text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text) - - # In many locales, single closing quotes are different from apostrophe: - if smart.csquote != smart.apostrophe: - apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)", re.UNICODE) - text = apostrophe_regex.sub(smart.apostrophe, text) - # TODO: keep track of quoting level to recognize apostrophe in, e.g., - # "Ich fass' es nicht." - - closing_single_quotes_regex = re.compile(r""" - (?<=%s) - ' - """ % close_class, re.VERBOSE) - text = closing_single_quotes_regex.sub(smart.csquote, text) - - # Any remaining single quotes should be opening ones: - text = re.sub(r"""'""", smart.osquote, text) - - # Get most opening double quotes: - opening_double_quotes_regex = re.compile(u""" - ( - \\s | # a whitespace char, or - %s | # another separating char, or - | # a non-breaking space entity, or - [\u2013 \u2014 ] | # literal dashes, or - -- | # dumb dashes, or - &[mn]dash; | # dash entities (named or - %s | # decimal or - &\\#x201[34]; # hex) - ) - " # the quote - (?=\\w) # followed by a word character - """ % (open_class, dec_dashes), re.VERBOSE | re.UNICODE) - - text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text) - - # Double closing quotes: - closing_double_quotes_regex = re.compile(r""" - ( - (?<=%s)" | # char indicating the quote should be closing - "(?=\s) # whitespace behind - ) - """ % (close_class,), re.VERBOSE | re.UNICODE) - text = closing_double_quotes_regex.sub(smart.cpquote, text) - - # Any remaining quotes should be opening ones. - text = re.sub(r'"', smart.opquote, text) - - return text - - -def educateBackticks(text, language='en'): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with ``backticks'' -style double quotes - translated into HTML curly quote entities. - Example input: ``Isn't this fun?'' - Example output: “Isn't this fun?“; - """ - smart = smartchars(language) - - text = re.sub(r"""``""", smart.opquote, text) - text = re.sub(r"""''""", smart.cpquote, text) - return text - - -def educateSingleBackticks(text, language='en'): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with `backticks' -style single quotes - translated into HTML curly quote entities. - - Example input: `Isn't this fun?' - Example output: ‘Isn’t this fun?’ - """ - smart = smartchars(language) - - text = re.sub(r"""`""", smart.osquote, text) - text = re.sub(r"""'""", smart.csquote, text) - return text - - -def educateDashes(text): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with each instance of "--" translated to - an em-dash character. - """ - - text = re.sub(r"""---""", smartchars.endash, text) # en (yes, backwards) - text = re.sub(r"""--""", smartchars.emdash, text) # em (yes, backwards) - return text - - -def educateDashesOldSchool(text): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with each instance of "--" translated to - an en-dash character, and each "---" translated to - an em-dash character. - """ - - text = re.sub(r"""---""", smartchars.emdash, text) - text = re.sub(r"""--""", smartchars.endash, text) - return text - - -def educateDashesOldSchoolInverted(text): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with each instance of "--" translated to - an em-dash character, and each "---" translated to - an en-dash character. Two reasons why: First, unlike the - en- and em-dash syntax supported by - EducateDashesOldSchool(), it's compatible with existing - entries written before SmartyPants 1.1, back when "--" was - only used for em-dashes. Second, em-dashes are more - common than en-dashes, and so it sort of makes sense that - the shortcut should be shorter to type. (Thanks to Aaron - Swartz for the idea.) - """ - text = re.sub(r"""---""", smartchars.endash, text) # em - text = re.sub(r"""--""", smartchars.emdash, text) # en - return text - - - -def educateEllipses(text): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with each instance of "..." translated to - an ellipsis character. - - Example input: Huh...? - Example output: Huh…? - """ - - text = re.sub(r"""\.\.\.""", smartchars.ellipsis, text) - text = re.sub(r"""\. \. \.""", smartchars.ellipsis, text) - return text - - -def stupefyEntities(text, language='en'): - """ - Parameter: String (unicode or bytes). - Returns: The `text`, with each SmartyPants character translated to - its ASCII counterpart. - - Example input: “Hello — world.” - Example output: "Hello -- world." - """ - smart = smartchars(language) - - text = re.sub(smart.endash, "-", text) # en-dash - text = re.sub(smart.emdash, "--", text) # em-dash - - text = re.sub(smart.osquote, "'", text) # open single quote - text = re.sub(smart.csquote, "'", text) # close single quote - - text = re.sub(smart.opquote, '"', text) # open double quote - text = re.sub(smart.cpquote, '"', text) # close double quote - - text = re.sub(smart.ellipsis, '...', text)# ellipsis - - return text - - -def processEscapes(text, restore=False): - r""" - Parameter: String (unicode or bytes). - Returns: The `text`, with after processing the following backslash - escape sequences. This is useful if you want to force a "dumb" - quote or other character to appear. - - Escape Value - ------ ----- - \\ \ - \" " - \' ' - \. . - \- - - \` ` - """ - replacements = ((r'\\', r'\'), - (r'\"', r'"'), - (r"\'", r'''), - (r'\.', r'.'), - (r'\-', r'-'), - (r'\`', r'`')) - if restore: - for (ch, rep) in replacements: - text = text.replace(rep, ch[1]) - else: - for (ch, rep) in replacements: - text = text.replace(ch, rep) - - return text - - -def tokenize(text): - """ - Parameter: String containing HTML markup. - Returns: An iterator that yields the tokens comprising the input - string. Each token is either a tag (possibly with nested, - tags contained therein, such as <a href="<MTFoo>">, or a - run of text between tags. Each yielded element is a - two-element tuple; the first is either 'tag' or 'text'; - the second is the actual value. - - Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin. - <http://www.bradchoate.com/past/mtregex.php> - """ - - pos = 0 - length = len(text) - # tokens = [] - - depth = 6 - nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth) - #match = r"""(?: <! ( -- .*? -- \s* )+ > ) | # comments - # (?: <\? .*? \?> ) | # directives - # %s # nested tags """ % (nested_tags,) - tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") - - token_match = tag_soup.search(text) - - previous_end = 0 - while token_match is not None: - if token_match.group(1): - yield ('text', token_match.group(1)) - - yield ('tag', token_match.group(2)) - - previous_end = token_match.end() - token_match = tag_soup.search(text, token_match.end()) - - if previous_end < len(text): - yield ('text', text[previous_end:]) - - - -if __name__ == "__main__": - - import itertools - try: - import locale # module missing in Jython - locale.setlocale(locale.LC_ALL, '') # set to user defaults - defaultlanguage = locale.getdefaultlocale()[0] - except: - defaultlanguage = 'en' - - # Normalize and drop unsupported subtags: - defaultlanguage = defaultlanguage.lower().replace('-', '_') - # split (except singletons, which mark the following tag as non-standard): - defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage) - _subtags = [subtag for subtag in defaultlanguage.split('_')] - _basetag = _subtags.pop(0) - # find all combinations of subtags - for n in range(len(_subtags), 0, -1): - for tags in itertools.combinations(_subtags, n): - _tag = '-'.join((_basetag,)+tags) - if _tag in smartchars.quotes: - defaultlanguage = _tag - break - else: - if _basetag in smartchars.quotes: - defaultlanguage = _basetag - else: - defaultlanguage = 'en' - - - import argparse - parser = argparse.ArgumentParser( - description='Filter stdin making ASCII punctuation "smart".') - # parser.add_argument("text", help="text to be acted on") - parser.add_argument("-a", "--action", default="1", - help="what to do with the input (see --actionhelp)") - parser.add_argument("-e", "--encoding", default="utf8", - help="text encoding") - parser.add_argument("-l", "--language", default=defaultlanguage, - help="text language (BCP47 tag), " - "Default: %s"% defaultlanguage) - parser.add_argument("-q", "--alternative-quotes", action="store_true", - help="use alternative quote style") - parser.add_argument("--doc", action="store_true", - help="print documentation") - parser.add_argument("--actionhelp", action="store_true", - help="list available actions") - parser.add_argument("--stylehelp", action="store_true", - help="list available quote styles") - parser.add_argument("--test", action="store_true", - help="perform short self-test") - args = parser.parse_args() - - if args.doc: - print(__doc__) - elif args.actionhelp: - print(options) - elif args.stylehelp: - print() - print("Available styles (primary open/close, secondary open/close)") - print("language tag quotes") - print("============ ======") - for key in sorted(smartchars.quotes.keys()): - print("%-14s %s" % (key, smartchars.quotes[key])) - elif args.test: - # Unit test output goes to stderr. - import unittest - - class TestSmartypantsAllAttributes(unittest.TestCase): - # the default attribute is "1", which means "all". - def test_dates(self): - self.assertEqual(smartyPants("1440-80's"), u"1440-80’s") - self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s") - self.assertEqual(smartyPants("1440---'80s"), u"1440–’80s") - self.assertEqual(smartyPants("1960's"), u"1960’s") - self.assertEqual(smartyPants("one two '60s"), u"one two ’60s") - self.assertEqual(smartyPants("'60s"), u"’60s") - - def test_educated_quotes(self): - self.assertEqual(smartyPants('"Isn\'t this fun?"'), u'“Isn’t this fun?”') - - def test_html_tags(self): - text = '<a src="foo">more</a>' - self.assertEqual(smartyPants(text), text) - - suite = unittest.TestLoader().loadTestsFromTestCase( - TestSmartypantsAllAttributes) - unittest.TextTestRunner().run(suite) - - else: - if args.alternative_quotes: - if '-x-altquot' in args.language: - args.language = args.language.replace('-x-altquot', '') - else: - args.language += '-x-altquot' - text = sys.stdin.read().decode(args.encoding) - print(smartyPants(text, attr=args.action, - language=args.language).encode(args.encoding))