Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/docutils/utils/smartquotes.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 #!/usr/bin/python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # :Id: $Id: smartquotes.py 8376 2019-08-27 19:49:29Z milde $ | |
5 # :Copyright: © 2010 Günter Milde, | |
6 # original `SmartyPants`_: © 2003 John Gruber | |
7 # smartypants.py: © 2004, 2007 Chad Miller | |
8 # :Maintainer: docutils-develop@lists.sourceforge.net | |
9 # :License: Released under the terms of the `2-Clause BSD license`_, in short: | |
10 # | |
11 # Copying and distribution of this file, with or without modification, | |
12 # are permitted in any medium without royalty provided the copyright | |
13 # notices and this notice are preserved. | |
14 # This file is offered as-is, without any warranty. | |
15 # | |
16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
17 | |
18 | |
19 r""" | |
20 ========================= | |
21 Smart Quotes for Docutils | |
22 ========================= | |
23 | |
24 Synopsis | |
25 ======== | |
26 | |
27 "SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and | |
28 BBEdit that easily translates plain ASCII punctuation characters into "smart" | |
29 typographic punctuation characters. | |
30 | |
31 ``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_. | |
32 | |
33 * Using Unicode instead of HTML entities for typographic punctuation | |
34 characters, it works for any output format that supports Unicode. | |
35 * Supports `language specific quote characters`__. | |
36 | |
37 __ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks | |
38 | |
39 | |
40 Authors | |
41 ======= | |
42 | |
43 `John Gruber`_ did all of the hard work of writing this software in Perl for | |
44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ | |
45 ported it to Python to use with Pyblosxom_. | |
46 Adapted to Docutils_ by Günter Milde. | |
47 | |
48 Additional Credits | |
49 ================== | |
50 | |
51 Portions of the SmartyPants original work are based on Brad Choate's nifty | |
52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to | |
53 this plug-in. Brad Choate is a fine hacker indeed. | |
54 | |
55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta | |
56 testing of the original SmartyPants. | |
57 | |
58 `Rael Dornfest`_ ported SmartyPants to Blosxom. | |
59 | |
60 .. _Brad Choate: http://bradchoate.com/ | |
61 .. _Jeremy Hedley: http://antipixel.com/ | |
62 .. _Charles Wiltgen: http://playbacktime.com/ | |
63 .. _Rael Dornfest: http://raelity.org/ | |
64 | |
65 | |
66 Copyright and License | |
67 ===================== | |
68 | |
69 SmartyPants_ license (3-Clause BSD license): | |
70 | |
71 Copyright (c) 2003 John Gruber (http://daringfireball.net/) | |
72 All rights reserved. | |
73 | |
74 Redistribution and use in source and binary forms, with or without | |
75 modification, are permitted provided that the following conditions are | |
76 met: | |
77 | |
78 * Redistributions of source code must retain the above copyright | |
79 notice, this list of conditions and the following disclaimer. | |
80 | |
81 * Redistributions in binary form must reproduce the above copyright | |
82 notice, this list of conditions and the following disclaimer in | |
83 the documentation and/or other materials provided with the | |
84 distribution. | |
85 | |
86 * Neither the name "SmartyPants" nor the names of its contributors | |
87 may be used to endorse or promote products derived from this | |
88 software without specific prior written permission. | |
89 | |
90 This software is provided by the copyright holders and contributors | |
91 "as is" and any express or implied warranties, including, but not | |
92 limited to, the implied warranties of merchantability and fitness for | |
93 a particular purpose are disclaimed. In no event shall the copyright | |
94 owner or contributors be liable for any direct, indirect, incidental, | |
95 special, exemplary, or consequential damages (including, but not | |
96 limited to, procurement of substitute goods or services; loss of use, | |
97 data, or profits; or business interruption) however caused and on any | |
98 theory of liability, whether in contract, strict liability, or tort | |
99 (including negligence or otherwise) arising in any way out of the use | |
100 of this software, even if advised of the possibility of such damage. | |
101 | |
102 smartypants.py license (2-Clause BSD license): | |
103 | |
104 smartypants.py is a derivative work of SmartyPants. | |
105 | |
106 Redistribution and use in source and binary forms, with or without | |
107 modification, are permitted provided that the following conditions are | |
108 met: | |
109 | |
110 * Redistributions of source code must retain the above copyright | |
111 notice, this list of conditions and the following disclaimer. | |
112 | |
113 * Redistributions in binary form must reproduce the above copyright | |
114 notice, this list of conditions and the following disclaimer in | |
115 the documentation and/or other materials provided with the | |
116 distribution. | |
117 | |
118 This software is provided by the copyright holders and contributors | |
119 "as is" and any express or implied warranties, including, but not | |
120 limited to, the implied warranties of merchantability and fitness for | |
121 a particular purpose are disclaimed. In no event shall the copyright | |
122 owner or contributors be liable for any direct, indirect, incidental, | |
123 special, exemplary, or consequential damages (including, but not | |
124 limited to, procurement of substitute goods or services; loss of use, | |
125 data, or profits; or business interruption) however caused and on any | |
126 theory of liability, whether in contract, strict liability, or tort | |
127 (including negligence or otherwise) arising in any way out of the use | |
128 of this software, even if advised of the possibility of such damage. | |
129 | |
130 .. _John Gruber: http://daringfireball.net/ | |
131 .. _Chad Miller: http://web.chad.org/ | |
132 | |
133 .. _Pyblosxom: http://pyblosxom.bluesock.org/ | |
134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/ | |
135 .. _Movable Type: http://www.movabletype.org/ | |
136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause | |
137 .. _Docutils: http://docutils.sf.net/ | |
138 | |
139 Description | |
140 =========== | |
141 | |
142 SmartyPants can perform the following transformations: | |
143 | |
144 - Straight quotes ( " and ' ) into "curly" quote characters | |
145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters | |
146 - Dashes (``--`` and ``---``) into en- and em-dash entities | |
147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity | |
148 | |
149 This means you can write, edit, and save your posts using plain old | |
150 ASCII straight quotes, plain dashes, and plain dots, but your published | |
151 posts (and final HTML output) will appear with smart quotes, em-dashes, | |
152 and proper ellipses. | |
153 | |
154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, | |
155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to | |
156 display text where smart quotes and other "smart punctuation" would not be | |
157 appropriate, such as source code or example markup. | |
158 | |
159 | |
160 Backslash Escapes | |
161 ================= | |
162 | |
163 If you need to use literal straight quotes (or plain hyphens and periods), | |
164 `smartquotes` accepts the following backslash escape sequences to force | |
165 ASCII-punctuation. Mind, that you need two backslashes as Docutils expands it, | |
166 too. | |
167 | |
168 ======== ========= | |
169 Escape Character | |
170 ======== ========= | |
171 ``\\`` \\ | |
172 ``\\"`` \\" | |
173 ``\\'`` \\' | |
174 ``\\.`` \\. | |
175 ``\\-`` \\- | |
176 ``\\``` \\` | |
177 ======== ========= | |
178 | |
179 This is useful, for example, when you want to use straight quotes as | |
180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac. | |
181 | |
182 | |
183 Caveats | |
184 ======= | |
185 | |
186 Why You Might Not Want to Use Smart Quotes in Your Weblog | |
187 --------------------------------------------------------- | |
188 | |
189 For one thing, you might not care. | |
190 | |
191 Most normal, mentally stable individuals do not take notice of proper | |
192 typographic punctuation. Many design and typography nerds, however, break | |
193 out in a nasty rash when they encounter, say, a restaurant sign that uses | |
194 a straight apostrophe to spell "Joe's". | |
195 | |
196 If you're the sort of person who just doesn't care, you might well want to | |
197 continue not caring. Using straight quotes -- and sticking to the 7-bit | |
198 ASCII character set in general -- is certainly a simpler way to live. | |
199 | |
200 Even if you *do* care about accurate typography, you still might want to | |
201 think twice before educating the quote characters in your weblog. One side | |
202 effect of publishing curly quote characters is that it makes your | |
203 weblog a bit harder for others to quote from using copy-and-paste. What | |
204 happens is that when someone copies text from your blog, the copied text | |
205 contains the 8-bit curly quote characters (as well as the 8-bit characters | |
206 for em-dashes and ellipses, if you use these options). These characters | |
207 are not standard across different text encoding methods, which is why they | |
208 need to be encoded as characters. | |
209 | |
210 People copying text from your weblog, however, may not notice that you're | |
211 using curly quotes, and they'll go ahead and paste the unencoded 8-bit | |
212 characters copied from their browser into an email message or their own | |
213 weblog. When pasted as raw "smart quotes", these characters are likely to | |
214 get mangled beyond recognition. | |
215 | |
216 That said, my own opinion is that any decent text editor or email client | |
217 makes it easy to stupefy smart quote characters into their 7-bit | |
218 equivalents, and I don't consider it my problem if you're using an | |
219 indecent text editor or email client. | |
220 | |
221 | |
222 Algorithmic Shortcomings | |
223 ------------------------ | |
224 | |
225 One situation in which quotes will get curled the wrong way is when | |
226 apostrophes are used at the start of leading contractions. For example:: | |
227 | |
228 'Twas the night before Christmas. | |
229 | |
230 In the case above, SmartyPants will turn the apostrophe into an opening | |
231 single-quote, when in fact it should be the `right single quotation mark` | |
232 character which is also "the preferred character to use for apostrophe" | |
233 (Unicode). I don't think this problem can be solved in the general case -- | |
234 every word processor I've tried gets this wrong as well. In such cases, it's | |
235 best to use the proper character for closing single-quotes (’) by hand. | |
236 | |
237 In English, the same character is used for apostrophe and closing single | |
238 quote (both plain and "smart" ones). For other locales (French, Italian, | |
239 Swiss, ...) "smart" single closing quotes differ from the curly apostrophe. | |
240 | |
241 .. class:: language-fr | |
242 | |
243 Il dit : "C'est 'super' !" | |
244 | |
245 If the apostrophe is used at the end of a word, it cannot be distinguished | |
246 from a single quote by the algorithm. Therefore, a text like:: | |
247 | |
248 .. class:: language-de-CH | |
249 | |
250 "Er sagt: 'Ich fass' es nicht.'" | |
251 | |
252 will get a single closing guillemet instead of an apostrophe. | |
253 | |
254 This can be prevented by use of the curly apostrophe character (’) in | |
255 the source:: | |
256 | |
257 - "Er sagt: 'Ich fass' es nicht.'" | |
258 + "Er sagt: 'Ich fass’ es nicht.'" | |
259 | |
260 | |
261 Version History | |
262 =============== | |
263 | |
264 1.8.1 2017-10-25 | |
265 - Use open quote after Unicode whitespace, ZWSP, and ZWNJ. | |
266 - Code cleanup. | |
267 | |
268 1.8: 2017-04-24 | |
269 - Command line front-end. | |
270 | |
271 1.7.1: 2017-03-19 | |
272 - Update and extend language-dependent quotes. | |
273 - Differentiate apostrophe from single quote. | |
274 | |
275 1.7: 2012-11-19 | |
276 - Internationalization: language-dependent quotes. | |
277 | |
278 1.6.1: 2012-11-06 | |
279 - Refactor code, code cleanup, | |
280 - `educate_tokens()` generator as interface for Docutils. | |
281 | |
282 1.6: 2010-08-26 | |
283 - Adaption to Docutils: | |
284 - Use Unicode instead of HTML entities, | |
285 - Remove code special to pyblosxom. | |
286 | |
287 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 | |
288 - Fixed bug where blocks of precious unalterable text was instead | |
289 interpreted. Thanks to Le Roux and Dirk van Oosterbosch. | |
290 | |
291 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 | |
292 - Fix bogus magical quotation when there is no hint that the | |
293 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. | |
294 - Be smarter about quotes before terminating numbers in an en-dash'ed | |
295 range. | |
296 | |
297 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 | |
298 - Fix a date-processing bug, as reported by jacob childress. | |
299 - Begin a test-suite for ensuring correct output. | |
300 - Removed import of "string", since I didn't really need it. | |
301 (This was my first ever Python program. Sue me!) | |
302 | |
303 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 | |
304 - Abort processing if the flavour is in forbidden-list. Default of | |
305 [ "rss" ] (Idea of Wolfgang SCHNERRING.) | |
306 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. | |
307 | |
308 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 | |
309 - Some single quotes weren't replaced properly. Diff-tesuji played | |
310 by Benjamin GEIGER. | |
311 | |
312 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 | |
313 - Support upcoming pyblosxom 0.9 plugin verification feature. | |
314 | |
315 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 | |
316 - Initial release | |
317 """ | |
318 from __future__ import print_function | |
319 | |
320 options = r""" | |
321 Options | |
322 ======= | |
323 | |
324 Numeric values are the easiest way to configure SmartyPants' behavior: | |
325 | |
326 :0: Suppress all transformations. (Do nothing.) | |
327 | |
328 :1: Performs default SmartyPants transformations: quotes (including | |
329 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) | |
330 is used to signify an em-dash; there is no support for en-dashes | |
331 | |
332 :2: Same as smarty_pants="1", except that it uses the old-school typewriter | |
333 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" | |
334 (dash dash dash) | |
335 for em-dashes. | |
336 | |
337 :3: Same as smarty_pants="2", but inverts the shorthand for dashes: | |
338 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for | |
339 en-dashes. | |
340 | |
341 :-1: Stupefy mode. Reverses the SmartyPants transformation process, turning | |
342 the characters produced by SmartyPants into their ASCII equivalents. | |
343 E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple | |
344 double-quote (\"), "—" is turned into two dashes, etc. | |
345 | |
346 | |
347 The following single-character attribute values can be combined to toggle | |
348 individual transformations from within the smarty_pants attribute. For | |
349 example, ``"1"`` is equivalent to ``"qBde"``. | |
350 | |
351 :q: Educates normal quote characters: (") and ('). | |
352 | |
353 :b: Educates \`\`backticks'' -style double quotes. | |
354 | |
355 :B: Educates \`\`backticks'' -style double quotes and \`single' quotes. | |
356 | |
357 :d: Educates em-dashes. | |
358 | |
359 :D: Educates em-dashes and en-dashes, using old-school typewriter shorthand: | |
360 (dash dash) for en-dashes, (dash dash dash) for em-dashes. | |
361 | |
362 :i: Educates em-dashes and en-dashes, using inverted old-school typewriter | |
363 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. | |
364 | |
365 :e: Educates ellipses. | |
366 | |
367 :w: Translates any instance of ``&quot;`` into a normal double-quote character. | |
368 This should be of no interest to most people, but of particular interest | |
369 to anyone who writes their posts using Dreamweaver, as Dreamweaver | |
370 inexplicably uses this entity to represent a literal double-quote | |
371 character. SmartyPants only educates normal quotes, not entities (because | |
372 ordinarily, entities are used for the explicit purpose of representing the | |
373 specific character they represent). The "w" option must be used in | |
374 conjunction with one (or both) of the other quote options ("q" or "b"). | |
375 Thus, if you wish to apply all SmartyPants transformations (quotes, en- | |
376 and em-dashes, and ellipses) and also translate ``&quot;`` entities into | |
377 regular quotes so SmartyPants can educate them, you should pass the | |
378 following to the smarty_pants attribute: | |
379 """ | |
380 | |
381 | |
# Default for the `attr` argument of smartyPants() and educate_tokens():
# "1" switches on quotes, backticks, dashes and ellipses.
default_smartypants_attr = "1"
383 | |
384 | |
385 import re, sys | |
386 | |
class smartchars(object):
    """Smart quotes and dashes.

    The class attributes hold the language-independent replacement
    characters and the table of language-specific quote characters.
    An instance selects the four quote strings for one language:

    ``opquote``/``cpquote``
        opening/closing primary ("double") quote,
    ``osquote``/``csquote``
        opening/closing secondary ("single") quote.
    """

    # Written as escapes so the values survive editors and copy-and-paste.
    endash = u'\u2013'      # "–" EN DASH
    emdash = u'\u2014'      # "—" EM DASH
    ellipsis = u'\u2026'    # "…" HORIZONTAL ELLIPSIS
    apostrophe = u'\u2019'  # "’" RIGHT SINGLE QUOTATION MARK

    # quote characters (language-specific, set in __init__())
    # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    # [3] https://fr.wikipedia.org/wiki/Guillemet
    # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web
    # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html
    # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks
    # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf
    # [8] http://www.korrekturavdelingen.no/anforselstegn.htm
    # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530.
    # [10] http://www.typografi.org/sitat/sitatart.html
    #
    # Each value unpacks into (opquote, cpquote, osquote, csquote): either a
    # 4-character string or a 4-tuple when a quote consists of more than one
    # character.  Keys are lower-case language tags.
    #
    # See also configuration option "smartquote-locales".
    quotes = {'af':           u'“”‘’',
              'af-x-altquot': u'„”‚’',
              'bg':           u'„“‚‘',  # Bulgarian, https://bg.wikipedia.org/wiki/Кавички
              'ca':           u'«»“”',
              'ca-x-altquot': u'“”‘’',
              'cs':           u'„“‚‘',
              'cs-x-altquot': u'»«›‹',
              'da':           u'»«›‹',
              'da-x-altquot': u'„“‚‘',
              # 'da-x-altquot2': u'””’’',
              'de':           u'„“‚‘',
              'de-x-altquot': u'»«›‹',
              'de-ch':        u'«»‹›',
              'el':           u'«»“”',
              'en':           u'“”‘’',
              'en-uk-x-altquot': u'‘’“”',  # Attention: " → ‘ and ' → “ !
              'eo':           u'“”‘’',
              'es':           u'«»“”',
              'es-x-altquot': u'“”‘’',
              'et':           u'„“‚‘',  # no secondary quote listed in
              'et-x-altquot': u'«»‹›',  # the sources above (wikipedia.org)
              'eu':           u'«»‹›',
              'fi':           u'””’’',
              'fi-x-altquot': u'»»››',
              # The French quotes include (invisible) no-break spaces, which
              # are therefore spelled out as escapes:
              # \u00a0 = NO-BREAK SPACE, \u202f = NARROW NO-BREAK SPACE.
              'fr':           (u'«\u00a0', u'\u00a0»', u'“', u'”'),  # full no-break space
              'fr-x-altquot': (u'«\u202f', u'\u202f»', u'“', u'”'),  # narrow no-break space
              'fr-ch':        u'«»‹›',
              'fr-ch-x-altquot': (u'«\u202f', u'\u202f»', u'‹\u202f', u'\u202f›'),  # narrow no-break space, http://typoguide.ch/
              'gl':           u'«»“”',
              'he':           u'”“»«',  # Hebrew is RTL, test position:
              'he-x-altquot': u'„”‚’',  # low quotation marks are opening.
              # 'he-x-altquot': u'“„‘‚',  # RTL: low quotation marks opening
              'hr':           u'„”‘’',  # http://hrvatska-tipografija.com/polunavodnici/
              'hr-x-altquot': u'»«›‹',
              'hsb':          u'„“‚‘',
              'hsb-x-altquot': u'»«›‹',
              'hu':           u'„”«»',
              'is':           u'„“‚‘',
              'it':           u'«»“”',
              'it-ch':        u'«»‹›',
              'it-x-altquot': u'“”‘’',
              # 'it-x-altquot2': u'“„‘‚',  # [7] in headlines
              'ja':           u'「」『』',
              'ko':           u'《》〈〉',
              'lt':           u'„“‚‘',
              'lv':           u'„“‚‘',
              'mk':           u'„“‚‘',  # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик
              'nl':           u'“”‘’',
              'nl-x-altquot': u'„”‚’',
              # 'nl-x-altquot2': u'””’’',
              'nb':           u'«»’’',  # Norsk bokmål (canonical form 'no')
              'nn':           u'«»’’',  # Nynorsk [10]
              'nn-x-altquot': u'«»‘’',  # [8], [10]
              # 'nn-x-altquot2': u'«»«»',  # [9], [10
              # 'nn-x-altquot3': u'„“‚‘',  # [10]
              'no':           u'«»’’',  # Norsk bokmål [10]
              'no-x-altquot': u'«»‘’',  # [8], [10]
              # 'no-x-altquot2': u'«»«»',  # [9], [10
              # 'no-x-altquot3': u'„“‚‘',  # [10]
              'pl':           u'„”«»',
              'pl-x-altquot': u'«»‚’',
              # 'pl-x-altquot2': u'„”‚’',  # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w
              'pt':           u'«»“”',
              'pt-br':        u'“”‘’',
              'ro':           u'„”«»',
              'ru':           u'«»„“',
              'sh':           u'„”‚’',  # Serbo-Croatian
              'sh-x-altquot': u'»«›‹',
              'sk':           u'„“‚‘',  # Slovak
              'sk-x-altquot': u'»«›‹',
              'sl':           u'„“‚‘',  # Slovenian
              'sl-x-altquot': u'»«›‹',
              'sq':           u'«»‹›',  # Albanian
              'sq-x-altquot': u'“„‘‚',
              'sr':           u'„”’’',
              'sr-x-altquot': u'»«›‹',
              'sv':           u'””’’',
              'sv-x-altquot': u'»»››',
              'tr':           u'“”‘’',
              'tr-x-altquot': u'«»‹›',
              # 'tr-x-altquot2': u'“„‘‚',  # [7] antiquated?
              'uk':           u'«»„“',
              'uk-x-altquot': u'„“‚‘',
              'zh-cn':        u'“”‘’',
              'zh-tw':        u'「」『』',
              }

    def __init__(self, language='en'):
        """Select the quote characters for `language`.

        `language` is looked up case-insensitively in `quotes`.  If there
        is no entry for it, plain ASCII quotes (``"`` and ``'``) are used.
        """
        self.language = language
        try:
            (self.opquote, self.cpquote,
             self.osquote, self.csquote) = self.quotes[language.lower()]
        except KeyError:
            # Unknown language: the 4-character string unpacks to " " ' '.
            self.opquote, self.cpquote, self.osquote, self.csquote = u'""\'\''
503 | |
504 | |
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Tokenize `text`, "educate" the tokens according to `attr` and
    `language`, and return the pieces joined into one string.
    """
    tokens = tokenize(text)
    return "".join(educate_tokens(tokens, attr, language))
510 | |
511 | |
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    Parameters:
      - text_tokens: iterable of ``(ttype, text)`` pairs.  Tokens of type
        ``'tag'`` and empty tokens are passed on unchanged, tokens of type
        ``'literal'`` are passed on unchanged but serve as context for the
        following token; the text of all other tokens is transformed.
      - attr: string selecting the transformations:

        :"1": quotes, backtick quotes, dashes, ellipses
        :"2": as "1", with old school en- and em-dash shortcuts
        :"3": as "1", with inverted old school dash shortcuts
        :"-1": "stupefy" mode

        Any other value is searched for the option characters

        :q: quotes
        :b: backtick quotes (``double'' only)
        :B: backtick quotes (``double'' and `single')
        :d: dashes
        :D: old school dashes
        :i: inverted old school dashes
        :e: ellipses
        :w: convert &quot; entities to " for Dreamweaver users

        (a value without option characters, e.g. "0", sets no
        transformation).
      - language: language tag used to select the quote characters.

    Yields the (possibly transformed) text of every token.
    """

    convert_quot = False  # translate &quot; entities into normal quotes?
    do_dashes = False     # False or dash style 1, 2, 3
    do_backticks = False  # False, True (double only) or 2 (double and single)
    do_quotes = False
    do_ellipses = False
    do_stupefy = False

    if attr in ("1", "2", "3"):
        # Do everything; the number selects the dash shorthand.
        do_quotes = True
        do_backticks = True
        do_dashes = int(attr)
        do_ellipses = True
    elif attr == "-1":  # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Later tests override earlier ones ("B" beats "b", "i" beats "D"
        # beats "d").
        if "q" in attr: do_quotes = True
        if "b" in attr: do_backticks = True
        if "B" in attr: do_backticks = 2
        if "d" in attr: do_dashes = 1
        if "D" in attr: do_dashes = 2
        if "i" in attr: do_dashes = 3
        if "e" in attr: do_ellipses = True
        if "w" in attr: convert_quot = True

    prev_token_last_char = " "
    # Last character of the previous text token. Used as
    # context to curl leading quote characters correctly.

    for (ttype, text) in text_tokens:

        # skip HTML and/or XML tags as well as empty text tokens
        # without updating the last character
        if ttype == 'tag' or not text:
            yield text
            continue

        # skip literal text (math, literal, raw, ...)
        if ttype == 'literal':
            prev_token_last_char = text[-1:]
            yield text
            continue

        last_char = text[-1:]  # Remember last char before processing.

        text = processEscapes(text)

        if convert_quot:
            text = text.replace('&quot;', '"')

        if do_dashes == 1:
            text = educateDashes(text)
        elif do_dashes == 2:
            text = educateDashesOldSchool(text)
        elif do_dashes == 3:
            text = educateDashesOldSchoolInverted(text)

        if do_ellipses:
            text = educateEllipses(text)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            text = educateBackticks(text, language)

        if do_backticks == 2:
            text = educateSingleBackticks(text, language)

        if do_quotes:
            # Replace plain quotes in context to prevent conversion to
            # 2-character sequence in French.
            context = prev_token_last_char.replace('"', ';').replace("'", ';')
            # The context character is only needed for matching:
            # strip it from the result again.
            text = educateQuotes(context+text, language)[1:]

        if do_stupefy:
            text = stupefyEntities(text, language)

        # Remember last char as context for the next token
        prev_token_last_char = last_char

        text = processEscapes(text, restore=True)

        yield text
625 | |
626 | |
627 | |
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string.
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”

    The quote characters are taken from `smartchars` for the given
    `language`.  The character before a quote decides whether it is an
    opening or a closing one, so callers processing a text in pieces
    should prepend the preceding character as context.
    """

    smart = smartchars(language)

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    open_class = u'[\u200B\u200C]'  # ZWSP, ZWNJ
    # Decimal dash entities.  The "#" must be escaped because the pattern
    # is used inside VERBOSE regexps, where a bare "#" starts a comment.
    dec_dashes = r"""&\#8211;|&\#8212;"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break (``\B``).
    # Close the quotes by brute force:
    text = re.sub(r"^'(?=%s\B)" % (punct_class,), smart.csquote, text)
    text = re.sub(r'^"(?=%s\B)' % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text)

    # Characters (or entities) after which a quote is an opening one.
    # Captured as group 1 and re-inserted by the substitutions below
    # (a look-behind is impossible: it requires a fixed-width pattern).
    opening_context = u"""
                    (
                            \\s             |   # a whitespace char, or
                            %s              |   # another separating char, or
                            &nbsp;          |   # a non-breaking space entity, or
                            [\u2013\u2014]  |   # literal dashes, or
                            --              |   # dumb dashes, or
                            &[mn]dash;      |   # dash entities (named or
                            %s              |   # decimal or
                            &\\#x201[34];       # hex)
                    )
                    """ % (open_class, dec_dashes)

    # Get most opening single quotes
    # (the quote must be followed by a word character):
    opening_single_quotes_regex = re.compile(opening_context + u"'(?=\\w)",
                                             re.VERBOSE | re.UNICODE)
    text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != smart.apostrophe:
        apostrophe_regex = re.compile(r"(?<=\w)'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(smart.apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    closing_single_quotes_regex = re.compile(r"""
                    (?<=%s)
                    '
                    """ % close_class, re.VERBOSE)
    text = closing_single_quotes_regex.sub(smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"'", smart.osquote, text)

    # Get most opening double quotes
    # (the quote must be followed by a word character):
    opening_double_quotes_regex = re.compile(opening_context + u'"(?=\\w)',
                                             re.VERBOSE | re.UNICODE)
    text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)

    # Double closing quotes:
    closing_double_quotes_regex = re.compile(r"""
                    (
                    (?<=%s)" | # char indicating the quote should be closing
                    "(?=\s)    # whitespace behind
                    )
                    """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
725 | |
726 | |
def educateBackticks(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with ``backticks'' -style double quotes
                translated into the primary quote characters of `language`.
    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    chars = smartchars(language)
    # two backticks open, two straight single quotes close
    for ascii_quote, smart_quote in ((r"``", chars.opquote),
                                     (r"''", chars.cpquote)):
        text = re.sub(ascii_quote, smart_quote, text)
    return text
740 | |
741 | |
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with `backticks' -style single quotes
                translated into the secondary quote characters of `language`.
                Every backtick becomes an opening and every straight
                single quote a closing quote.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    chars = smartchars(language)
    for ascii_quote, smart_quote in ((r"`", chars.osquote),
                                     (r"'", chars.csquote)):
        text = re.sub(ascii_quote, smart_quote, text)
    return text
756 | |
757 | |
def educateDashes(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character (and each "---" to an en-dash
                character).
    """
    # The longer run must be replaced first, otherwise "---" would be
    # read as "--" followed by a hyphen.
    for dumb_dashes, smart_dash in ((r"---", smartchars.endash),   # en (yes, backwards)
                                    (r"--", smartchars.emdash)):   # em (yes, backwards)
        text = re.sub(dumb_dashes, smart_dash, text)
    return text
768 | |
769 | |
def educateDashesOldSchool(text):
    """
    Parameter:  String.
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # The longer run must be replaced first, otherwise "---" would be
    # read as "--" followed by a hyphen.
    for dumb_dashes, smart_dash in ((r"---", smartchars.emdash),
                                    (r"--", smartchars.endash)):
        text = re.sub(dumb_dashes, smart_dash, text)
    return text
781 | |
782 | |
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  text  the string to process.
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character.

    Two reasons for the inverted mapping: First, unlike the en- and
    em-dash syntax supported by educateDashesOldSchool(), it is
    compatible with existing entries written before SmartyPants 1.1,
    back when "--" was only used for em-dashes.  Second, em-dashes are
    more common than en-dashes, so it makes sense that the shortcut
    should be shorter to type.  (Thanks to Aaron Swartz for the idea.)
    """
    # Longest run first so that "---" is not split into "--" + "-".
    without_triples = re.sub(r"---", smartchars.endash, text)   # "---" -> en
    return re.sub(r"--", smartchars.emdash, without_triples)    # "--"  -> em
800 | |
801 | |
802 | |
def educateEllipses(text):
    """
    Parameter:  text  the string to process.
    Returns:    The `text`, with each instance of "..." (and of the
                spaced form ". . .") translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Compact form first, then the variant with single spaces
    # between the dots.
    compact_done = re.sub(r"\.\.\.", smartchars.ellipsis, text)
    return re.sub(r"\. \. \.", smartchars.ellipsis, compact_done)
816 | |
817 | |
def stupefyEntities(text, language='en'):
    """
    Parameter:  text      the string to process.
                language  language tag handed to `smartchars` to select
                          the quote characters (default 'en').
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    chars = smartchars(language)

    # (typographic character, ASCII replacement); applied in this order.
    ascii_equivalents = (
        (chars.endash, "-"),       # en-dash
        (chars.emdash, "--"),      # em-dash
        (chars.osquote, "'"),      # open single quote
        (chars.csquote, "'"),      # close single quote
        (chars.opquote, '"'),      # open double quote
        (chars.cpquote, '"'),      # close double quote
        (chars.ellipsis, '...'),   # ellipsis
    )
    for fancy, plain in ascii_equivalents:
        text = re.sub(fancy, plain, text)

    return text
841 | |
842 | |
def processEscapes(text, restore=False):
    r"""
    Parameter:  text     the string to process.
                restore  if true, perform the reverse step: turn the
                         numeric character references listed below back
                         into the plain character (without backslash).
    Returns:    The `text`, after processing the following backslash
                escape sequences. This is useful if you want to force
                a "dumb" quote or other character to appear.

                Escape  Value
                ------  -----
                \\      &#92;
                \"      &#34;
                \'      &#39;
                \.      &#46;
                \-      &#45;
                \`      &#96;
    """
    # The replacement values must be numeric character references, not the
    # characters themselves: they hide the escaped character from the
    # other substitution passes until `restore` brings it back.
    # The backslash pair comes first so that an escaped backslash is not
    # mistaken for the start of one of the other escape sequences.
    replacements = ((r'\\', r'&#92;'),
                    (r'\"', r'&#34;'),
                    (r"\'", r'&#39;'),
                    (r'\.', r'&#46;'),
                    (r'\-', r'&#45;'),
                    (r'\`', r'&#96;'))
    if restore:
        for escape, reference in replacements:
            # escape[1] is the bare character following the backslash.
            text = text.replace(reference, escape[1])
    else:
        for escape, reference in replacements:
            text = text.replace(escape, reference)

    return text
873 | |
874 | |
def tokenize(text):
    """
    Parameter:  text  string containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string. Each token is either a tag or a run of text
                between tags. Each yielded element is a two-element
                tuple; the first is either 'tag' or 'text'; the second
                is the actual value.

    A tag starts at a "<" and ends at the first following ">"; angle
    brackets nested inside a tag (e.g. <a href="<MTFoo>">) are not
    matched up. Empty text runs are not reported.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    <http://www.bradchoate.com/past/mtregex.php>
    """
    # group(1): text in front of the tag (may be empty), group(2): the tag.
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        leading_text = token_match.group(1)
        if leading_text:
            yield ('text', leading_text)

        yield ('tag', token_match.group(2))

        previous_end = token_match.end()

    # Whatever follows the last tag (or the whole input, if there is no
    # tag at all) is plain text.
    if previous_end < len(text):
        yield ('text', text[previous_end:])
914 | |
915 | |
916 | |
# Command line interface: filter stdin, making ASCII punctuation "smart".
if __name__ == "__main__":

    import itertools
    import sys
    try:
        import locale  # module missing in Jython
        locale.setlocale(locale.LC_ALL, '')  # set to user defaults
        defaultlanguage = locale.getdefaultlocale()[0]
    except Exception:
        # Locale detection is best effort only; fall back to English.
        defaultlanguage = None
    # getdefaultlocale() reports None when no language can be determined
    # (e.g. for the "C" locale), which would break the normalization below.
    defaultlanguage = defaultlanguage or 'en'

    # Normalize and drop unsupported subtags:
    defaultlanguage = defaultlanguage.lower().replace('-', '_')
    # split (except singletons, which mark the following tag as non-standard):
    defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage)
    _subtags = [subtag for subtag in defaultlanguage.split('_')]
    _basetag = _subtags.pop(0)
    # Try all combinations of subtags, most specific first, and stop at the
    # first one with a known quote set; otherwise use the base language
    # (or English if that is unknown, too).
    _candidates = ('-'.join((_basetag,) + tags)
                   for n in range(len(_subtags), 0, -1)
                   for tags in itertools.combinations(_subtags, n))
    for _tag in _candidates:
        if _tag in smartchars.quotes:
            defaultlanguage = _tag
            break
    else:
        if _basetag in smartchars.quotes:
            defaultlanguage = _basetag
        else:
            defaultlanguage = 'en'

    import argparse
    parser = argparse.ArgumentParser(
        description='Filter stdin making ASCII punctuation "smart".')
    # parser.add_argument("text", help="text to be acted on")
    parser.add_argument("-a", "--action", default="1",
                        help="what to do with the input (see --actionhelp)")
    parser.add_argument("-e", "--encoding", default="utf8",
                        help="text encoding")
    parser.add_argument("-l", "--language", default=defaultlanguage,
                        help="text language (BCP47 tag), "
                        "Default: %s"% defaultlanguage)
    parser.add_argument("-q", "--alternative-quotes", action="store_true",
                        help="use alternative quote style")
    parser.add_argument("--doc", action="store_true",
                        help="print documentation")
    parser.add_argument("--actionhelp", action="store_true",
                        help="list available actions")
    parser.add_argument("--stylehelp", action="store_true",
                        help="list available quote styles")
    parser.add_argument("--test", action="store_true",
                        help="perform short self-test")
    args = parser.parse_args()

    if args.doc:
        print(__doc__)
    elif args.actionhelp:
        print(options)
    elif args.stylehelp:
        print()
        print("Available styles (primary open/close, secondary open/close)")
        print("language tag quotes")
        print("============ ======")
        for key in sorted(smartchars.quotes.keys()):
            print("%-14s %s" % (key, smartchars.quotes[key]))
    elif args.test:
        # Unit test output goes to stderr.
        import unittest

        class TestSmartypantsAllAttributes(unittest.TestCase):
            # the default attribute is "1", which means "all".
            def test_dates(self):
                self.assertEqual(smartyPants("1440-80's"), u"1440-80’s")
                self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s")
                self.assertEqual(smartyPants("1440---'80s"), u"1440–’80s")
                self.assertEqual(smartyPants("1960's"), u"1960’s")
                self.assertEqual(smartyPants("one two '60s"), u"one two ’60s")
                self.assertEqual(smartyPants("'60s"), u"’60s")

            def test_educated_quotes(self):
                self.assertEqual(smartyPants('"Isn\'t this fun?"'), u'“Isn’t this fun?”')

            def test_html_tags(self):
                text = '<a src="foo">more</a>'
                self.assertEqual(smartyPants(text), text)

        suite = unittest.TestLoader().loadTestsFromTestCase(
            TestSmartypantsAllAttributes)
        unittest.TextTestRunner().run(suite)

    else:
        if args.alternative_quotes:
            if '-x-altquot' in args.language:
                args.language = args.language.replace('-x-altquot', '')
            else:
                args.language += '-x-altquot'
        # Read and write raw bytes so that --encoding is honoured: under
        # Python 3 the text streams carry a binary `buffer`, under
        # Python 2 the streams themselves are binary.
        _stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        _stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        text = _stdin.read().decode(args.encoding)
        _stdout.write(smartyPants(text, attr=args.action,
                                  language=args.language).encode(args.encoding))
        _stdout.write(b'\n')  # print() used to terminate the output