comparison env/lib/python3.7/site-packages/docutils/utils/smartquotes.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # :Id: $Id: smartquotes.py 8376 2019-08-27 19:49:29Z milde $
5 # :Copyright: © 2010 Günter Milde,
6 # original `SmartyPants`_: © 2003 John Gruber
7 # smartypants.py: © 2004, 2007 Chad Miller
8 # :Maintainer: docutils-develop@lists.sourceforge.net
9 # :License: Released under the terms of the `2-Clause BSD license`_, in short:
10 #
11 # Copying and distribution of this file, with or without modification,
12 # are permitted in any medium without royalty provided the copyright
13 # notices and this notice are preserved.
14 # This file is offered as-is, without any warranty.
15 #
16 # .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
17
18
19 r"""
20 =========================
21 Smart Quotes for Docutils
22 =========================
23
24 Synopsis
25 ========
26
27 "SmartyPants" is a free web publishing plug-in for Movable Type, Blosxom, and
28 BBEdit that easily translates plain ASCII punctuation characters into "smart"
29 typographic punctuation characters.
30
31 ``smartquotes.py`` is an adaption of "SmartyPants" to Docutils_.
32
33 * Using Unicode instead of HTML entities for typographic punctuation
34 characters, it works for any output format that supports Unicode.
35 * Supports `language specific quote characters`__.
36
37 __ http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
38
39
40 Authors
41 =======
42
43 `John Gruber`_ did all of the hard work of writing this software in Perl for
44 `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_
45 ported it to Python to use with Pyblosxom_.
46 Adapted to Docutils_ by Günter Milde.
47
48 Additional Credits
49 ==================
50
51 Portions of the SmartyPants original work are based on Brad Choate's nifty
52 MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to
53 this plug-in. Brad Choate is a fine hacker indeed.
54
55 `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta
56 testing of the original SmartyPants.
57
58 `Rael Dornfest`_ ported SmartyPants to Blosxom.
59
60 .. _Brad Choate: http://bradchoate.com/
61 .. _Jeremy Hedley: http://antipixel.com/
62 .. _Charles Wiltgen: http://playbacktime.com/
63 .. _Rael Dornfest: http://raelity.org/
64
65
66 Copyright and License
67 =====================
68
69 SmartyPants_ license (3-Clause BSD license):
70
71 Copyright (c) 2003 John Gruber (http://daringfireball.net/)
72 All rights reserved.
73
74 Redistribution and use in source and binary forms, with or without
75 modification, are permitted provided that the following conditions are
76 met:
77
78 * Redistributions of source code must retain the above copyright
79 notice, this list of conditions and the following disclaimer.
80
81 * Redistributions in binary form must reproduce the above copyright
82 notice, this list of conditions and the following disclaimer in
83 the documentation and/or other materials provided with the
84 distribution.
85
86 * Neither the name "SmartyPants" nor the names of its contributors
87 may be used to endorse or promote products derived from this
88 software without specific prior written permission.
89
90 This software is provided by the copyright holders and contributors
91 "as is" and any express or implied warranties, including, but not
92 limited to, the implied warranties of merchantability and fitness for
93 a particular purpose are disclaimed. In no event shall the copyright
94 owner or contributors be liable for any direct, indirect, incidental,
95 special, exemplary, or consequential damages (including, but not
96 limited to, procurement of substitute goods or services; loss of use,
97 data, or profits; or business interruption) however caused and on any
98 theory of liability, whether in contract, strict liability, or tort
99 (including negligence or otherwise) arising in any way out of the use
100 of this software, even if advised of the possibility of such damage.
101
102 smartypants.py license (2-Clause BSD license):
103
104 smartypants.py is a derivative work of SmartyPants.
105
106 Redistribution and use in source and binary forms, with or without
107 modification, are permitted provided that the following conditions are
108 met:
109
110 * Redistributions of source code must retain the above copyright
111 notice, this list of conditions and the following disclaimer.
112
113 * Redistributions in binary form must reproduce the above copyright
114 notice, this list of conditions and the following disclaimer in
115 the documentation and/or other materials provided with the
116 distribution.
117
118 This software is provided by the copyright holders and contributors
119 "as is" and any express or implied warranties, including, but not
120 limited to, the implied warranties of merchantability and fitness for
121 a particular purpose are disclaimed. In no event shall the copyright
122 owner or contributors be liable for any direct, indirect, incidental,
123 special, exemplary, or consequential damages (including, but not
124 limited to, procurement of substitute goods or services; loss of use,
125 data, or profits; or business interruption) however caused and on any
126 theory of liability, whether in contract, strict liability, or tort
127 (including negligence or otherwise) arising in any way out of the use
128 of this software, even if advised of the possibility of such damage.
129
130 .. _John Gruber: http://daringfireball.net/
131 .. _Chad Miller: http://web.chad.org/
132
133 .. _Pyblosxom: http://pyblosxom.bluesock.org/
134 .. _SmartyPants: http://daringfireball.net/projects/smartypants/
135 .. _Movable Type: http://www.movabletype.org/
136 .. _2-Clause BSD license: http://www.spdx.org/licenses/BSD-2-Clause
137 .. _Docutils: http://docutils.sf.net/
138
139 Description
140 ===========
141
142 SmartyPants can perform the following transformations:
143
144 - Straight quotes ( " and ' ) into "curly" quote characters
145 - Backticks-style quotes (\`\`like this'') into "curly" quote characters
146 - Dashes (``--`` and ``---``) into en- and em-dash entities
147 - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity
148
149 This means you can write, edit, and save your posts using plain old
150 ASCII straight quotes, plain dashes, and plain dots, but your published
151 posts (and final HTML output) will appear with smart quotes, em-dashes,
152 and proper ellipses.
153
154 SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``,
155 ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to
156 display text where smart quotes and other "smart punctuation" would not be
157 appropriate, such as source code or example markup.
158
159
160 Backslash Escapes
161 =================
162
163 If you need to use literal straight quotes (or plain hyphens and periods),
164 `smartquotes` accepts the following backslash escape sequences to force
165 ASCII-punctuation. Note that you need two backslashes, because Docutils
166 processes backslash escapes, too.
167
168 ======== =========
169 Escape Character
170 ======== =========
171 ``\\`` \\
172 ``\\"`` \\"
173 ``\\'`` \\'
174 ``\\.`` \\.
175 ``\\-`` \\-
176 ``\\``` \\`
177 ======== =========
178
179 This is useful, for example, when you want to use straight quotes as
180 foot and inch marks: 6\\'2\\" tall; a 17\\" iMac.
181
182
183 Caveats
184 =======
185
186 Why You Might Not Want to Use Smart Quotes in Your Weblog
187 ---------------------------------------------------------
188
189 For one thing, you might not care.
190
191 Most normal, mentally stable individuals do not take notice of proper
192 typographic punctuation. Many design and typography nerds, however, break
193 out in a nasty rash when they encounter, say, a restaurant sign that uses
194 a straight apostrophe to spell "Joe's".
195
196 If you're the sort of person who just doesn't care, you might well want to
197 continue not caring. Using straight quotes -- and sticking to the 7-bit
198 ASCII character set in general -- is certainly a simpler way to live.
199
200 Even if you *do* care about accurate typography, you still might want to
201 think twice before educating the quote characters in your weblog. One side
202 effect of publishing curly quote characters is that it makes your
203 weblog a bit harder for others to quote from using copy-and-paste. What
204 happens is that when someone copies text from your blog, the copied text
205 contains the 8-bit curly quote characters (as well as the 8-bit characters
206 for em-dashes and ellipses, if you use these options). These characters
207 are not standard across different text encoding methods, which is why they
208 need to be encoded as characters.
209
210 People copying text from your weblog, however, may not notice that you're
211 using curly quotes, and they'll go ahead and paste the unencoded 8-bit
212 characters copied from their browser into an email message or their own
213 weblog. When pasted as raw "smart quotes", these characters are likely to
214 get mangled beyond recognition.
215
216 That said, my own opinion is that any decent text editor or email client
217 makes it easy to stupefy smart quote characters into their 7-bit
218 equivalents, and I don't consider it my problem if you're using an
219 indecent text editor or email client.
220
221
222 Algorithmic Shortcomings
223 ------------------------
224
225 One situation in which quotes will get curled the wrong way is when
226 apostrophes are used at the start of leading contractions. For example::
227
228 'Twas the night before Christmas.
229
230 In the case above, SmartyPants will turn the apostrophe into an opening
231 single-quote, when in fact it should be the `right single quotation mark`
232 character which is also "the preferred character to use for apostrophe"
233 (Unicode). I don't think this problem can be solved in the general case --
234 every word processor I've tried gets this wrong as well. In such cases, it's
235 best to use the proper character for closing single-quotes (’) by hand.
236
237 In English, the same character is used for apostrophe and closing single
238 quote (both plain and "smart" ones). For other locales (French, Italian,
239 Swiss, ...) "smart" single closing quotes differ from the curly apostrophe.
240
241 .. class:: language-fr
242
243 Il dit : "C'est 'super' !"
244
245 If the apostrophe is used at the end of a word, it cannot be distinguished
246 from a single quote by the algorithm. Therefore, a text like::
247
248 .. class:: language-de-CH
249
250 "Er sagt: 'Ich fass' es nicht.'"
251
252 will get a single closing guillemet instead of an apostrophe.
253
254 This can be prevented by use of the curly apostrophe character (’) in
255 the source::
256
257 - "Er sagt: 'Ich fass' es nicht.'"
258 + "Er sagt: 'Ich fass’ es nicht.'"
259
260
261 Version History
262 ===============
263
264 1.8.1 2017-10-25
265 - Use open quote after Unicode whitespace, ZWSP, and ZWNJ.
266 - Code cleanup.
267
268 1.8: 2017-04-24
269 - Command line front-end.
270
271 1.7.1: 2017-03-19
272 - Update and extend language-dependent quotes.
273 - Differentiate apostrophe from single quote.
274
275 1.7: 2012-11-19
276 - Internationalization: language-dependent quotes.
277
278 1.6.1: 2012-11-06
279 - Refactor code, code cleanup,
280 - `educate_tokens()` generator as interface for Docutils.
281
282 1.6: 2010-08-26
283 - Adaption to Docutils:
284 - Use Unicode instead of HTML entities,
285 - Remove code special to pyblosxom.
286
287 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400
288 - Fixed bug where blocks of precious unalterable text were instead
289 interpreted. Thanks to Le Roux and Dirk van Oosterbosch.
290
291 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400
292 - Fix bogus magical quotation when there is no hint that the
293 user wants it, e.g., in "21st century". Thanks to Nathan Hamblen.
294 - Be smarter about quotes before terminating numbers in an en-dash'ed
295 range.
296
297 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500
298 - Fix a date-processing bug, as reported by jacob childress.
299 - Begin a test-suite for ensuring correct output.
300 - Removed import of "string", since I didn't really need it.
301 (This was my first ever Python program. Sue me!)
302
303 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400
304 - Abort processing if the flavour is in forbidden-list. Default of
305 [ "rss" ] (Idea of Wolfgang SCHNERRING.)
306 - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING.
307
308 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400
309 - Some single quotes weren't replaced properly. Diff-tesuji played
310 by Benjamin GEIGER.
311
312 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500
313 - Support upcoming pyblosxom 0.9 plugin verification feature.
314
315 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500
316 - Initial release
317 """
318 from __future__ import print_function
319
320 options = r"""
321 Options
322 =======
323
324 Numeric values are the easiest way to configure SmartyPants' behavior:
325
326 :0: Suppress all transformations. (Do nothing.)
327
328 :1: Performs default SmartyPants transformations: quotes (including
329 \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash)
330 is used to signify an em-dash; there is no support for en-dashes
331
332 :2: Same as smarty_pants="1", except that it uses the old-school typewriter
333 shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``"
334 (dash dash dash)
335 for em-dashes.
336
337 :3: Same as smarty_pants="2", but inverts the shorthand for dashes:
338 "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for
339 en-dashes.
340
341 :-1: Stupefy mode. Reverses the SmartyPants transformation process, turning
342 the characters produced by SmartyPants into their ASCII equivalents.
343 E.g. the LEFT DOUBLE QUOTATION MARK (“) is turned into a simple
344 double-quote (\"), "—" is turned into two dashes, etc.
345
346
347 The following single-character attribute values can be combined to toggle
348 individual transformations from within the smarty_pants attribute. For
349 example, ``"1"`` is equivalent to ``"qBde"``.
350
351 :q: Educates normal quote characters: (") and (').
352
353 :b: Educates \`\`backticks'' -style double quotes.
354
355 :B: Educates \`\`backticks'' -style double quotes and \`single' quotes.
356
357 :d: Educates em-dashes.
358
359 :D: Educates em-dashes and en-dashes, using old-school typewriter shorthand:
360 (dash dash) for en-dashes, (dash dash dash) for em-dashes.
361
362 :i: Educates em-dashes and en-dashes, using inverted old-school typewriter
363 shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes.
364
365 :e: Educates ellipses.
366
367 :w: Translates any instance of ``&quot;`` into a normal double-quote character.
368 This should be of no interest to most people, but of particular interest
369 to anyone who writes their posts using Dreamweaver, as Dreamweaver
370 inexplicably uses this entity to represent a literal double-quote
371 character. SmartyPants only educates normal quotes, not entities (because
372 ordinarily, entities are used for the explicit purpose of representing the
373 specific character they represent). The "w" option must be used in
374 conjunction with one (or both) of the other quote options ("q" or "b").
375 Thus, if you wish to apply all SmartyPants transformations (quotes, en-
376 and em-dashes, and ellipses) and also translate ``&quot;`` entities into
377 regular quotes so SmartyPants can educate them, you should pass the
378 following to the smarty_pants attribute:
379 """
380
381
# Attribute string used when a caller does not pass `attr`:
# "1" switches on quotes, backticks, dashes and ellipses.
default_smartypants_attr = "1"
383
384
385 import re, sys
386
class smartchars(object):
    """Smart quotes and dashes.

    The class attributes hold the language-independent replacement
    characters and the table of language-specific quotes.  An instance
    selects the four quote strings for one language and exposes them as
    `opquote`/`cpquote` (primary opening/closing quote) and
    `osquote`/`csquote` (secondary opening/closing quote).
    """

    endash = u'–'      # "&#8211;" EN DASH
    emdash = u'—'      # "&#8212;" EM DASH
    ellipsis = u'…'    # "&#8230;" HORIZONTAL ELLIPSIS
    apostrophe = u'’'  # "&#8217;" RIGHT SINGLE QUOTATION MARK

    # quote characters (language-specific, set in __init__())
    # [1] http://en.wikipedia.org/wiki/Non-English_usage_of_quotation_marks
    # [2] http://de.wikipedia.org/wiki/Anf%C3%BChrungszeichen#Andere_Sprachen
    # [3] https://fr.wikipedia.org/wiki/Guillemet
    # [4] http://typographisme.net/post/Les-espaces-typographiques-et-le-web
    # [5] http://www.btb.termiumplus.gc.ca/tpv2guides/guides/redac/index-fra.html
    # [6] https://en.wikipedia.org/wiki/Hebrew_punctuation#Quotation_marks
    # [7] http://www.tustep.uni-tuebingen.de/bi/bi00/bi001t1-anfuehrung.pdf
    # [8] http://www.korrekturavdelingen.no/anforselstegn.htm
    # [9] Typografisk håndbok. Oslo: Spartacus. 2000. s. 67. ISBN 8243001530.
    # [10] http://www.typografi.org/sitat/sitatart.html
    #
    # See also configuration option "smartquote-locales".
    #
    # Every value unpacks into exactly four items: primary opening, primary
    # closing, secondary opening, secondary closing.  Values are 4-character
    # strings, or 4-tuples where a quote consists of more than one character.
    # The no-break spaces of the French variants are written as escapes
    # (\u00a0 NO-BREAK SPACE, \u202f NARROW NO-BREAK SPACE) because the
    # literal characters cannot be told apart from ordinary spaces and are
    # easily lost when the file is edited or copied.
    quotes = {'af':           u'“”‘’',
              'af-x-altquot': u'„”‚’',
              'bg':           u'„“‚‘',  # Bulgarian, https://bg.wikipedia.org/wiki/Кавички
              'ca':           u'«»“”',
              'ca-x-altquot': u'“”‘’',
              'cs':           u'„“‚‘',
              'cs-x-altquot': u'»«›‹',
              'da':           u'»«›‹',
              'da-x-altquot': u'„“‚‘',
              # 'da-x-altquot2': u'””’’',
              'de':           u'„“‚‘',
              'de-x-altquot': u'»«›‹',
              'de-ch':        u'«»‹›',
              'el':           u'«»“”',
              'en':           u'“”‘’',
              'en-uk-x-altquot': u'‘’“”',  # Attention: " → ‘ and ' → “ !
              'eo':           u'“”‘’',
              'es':           u'«»“”',
              'es-x-altquot': u'“”‘’',
              'et':           u'„“‚‘',  # no secondary quote listed in
              'et-x-altquot': u'«»‹›',  # the sources above (wikipedia.org)
              'eu':           u'«»‹›',
              'fi':           u'””’’',
              'fi-x-altquot': u'»»››',
              # full no-break space
              'fr':           (u'«\u00a0', u'\u00a0»', u'“', u'”'),
              # narrow no-break space
              'fr-x-altquot': (u'«\u202f', u'\u202f»', u'“', u'”'),
              'fr-ch':        u'«»‹›',
              # narrow no-break space, http://typoguide.ch/
              'fr-ch-x-altquot': (u'«\u202f', u'\u202f»',
                                  u'‹\u202f', u'\u202f›'),
              'gl':           u'«»“”',
              'he':           u'”“»«',  # Hebrew is RTL, test position:
              'he-x-altquot': u'„”‚’',  # low quotation marks are opening.
              # 'he-x-altquot': u'“„‘‚',  # RTL: low quotation marks opening
              'hr':           u'„”‘’',  # http://hrvatska-tipografija.com/polunavodnici/
              'hr-x-altquot': u'»«›‹',
              'hsb':          u'„“‚‘',
              'hsb-x-altquot': u'»«›‹',
              'hu':           u'„”«»',
              'is':           u'„“‚‘',
              'it':           u'«»“”',
              'it-ch':        u'«»‹›',
              'it-x-altquot': u'“”‘’',
              # 'it-x-altquot2': u'“„‘‚',  # [7] in headlines
              'ja':           u'「」『』',
              'ko':           u'《》〈〉',
              'lt':           u'„“‚‘',
              'lv':           u'„“‚‘',
              'mk':           u'„“‚‘',  # Macedonian, https://mk.wikipedia.org/wiki/Правопис_и_правоговор_на_македонскиот_јазик
              'nl':           u'“”‘’',
              'nl-x-altquot': u'„”‚’',
              # 'nl-x-altquot2': u'””’’',
              'nb':           u'«»’’',  # Norsk bokmål (canonical form 'no')
              'nn':           u'«»’’',  # Nynorsk [10]
              'nn-x-altquot': u'«»‘’',  # [8], [10]
              # 'nn-x-altquot2': u'«»«»',  # [9], [10]
              # 'nn-x-altquot3': u'„“‚‘',  # [10]
              'no':           u'«»’’',  # Norsk bokmål [10]
              'no-x-altquot': u'«»‘’',  # [8], [10]
              # 'no-x-altquot2': u'«»«»',  # [9], [10]
              # 'no-x-altquot3': u'„“‚‘',  # [10]
              'pl':           u'„”«»',
              'pl-x-altquot': u'«»‚’',
              # 'pl-x-altquot2': u'„”‚’',  # https://pl.wikipedia.org/wiki/Cudzys%C5%82%C3%B3w
              'pt':           u'«»“”',
              'pt-br':        u'“”‘’',
              'ro':           u'„”«»',
              'ru':           u'«»„“',
              'sh':           u'„”‚’',  # Serbo-Croatian
              'sh-x-altquot': u'»«›‹',
              'sk':           u'„“‚‘',  # Slovak
              'sk-x-altquot': u'»«›‹',
              'sl':           u'„“‚‘',  # Slovenian
              'sl-x-altquot': u'»«›‹',
              'sq':           u'«»‹›',  # Albanian
              'sq-x-altquot': u'“„‘‚',
              'sr':           u'„”’’',
              'sr-x-altquot': u'»«›‹',
              'sv':           u'””’’',
              'sv-x-altquot': u'»»››',
              'tr':           u'“”‘’',
              'tr-x-altquot': u'«»‹›',
              # 'tr-x-altquot2': u'“„‘‚',  # [7] antiquated?
              'uk':           u'«»„“',
              'uk-x-altquot': u'„“‚‘',
              'zh-cn':        u'“”‘’',
              'zh-tw':        u'「」『』',
              }

    def __init__(self, language='en'):
        """Select the quote characters for `language`.

        `language` is looked up case-insensitively in `quotes`.  A tag
        without an entry falls back to the ASCII quotes ``"`` and ``'``
        for all four positions.
        """
        self.language = language
        try:
            marks = self.quotes[language.lower()]
        except KeyError:
            marks = u'""\'\''
        self.opquote, self.cpquote, self.osquote, self.csquote = marks
503
504
def smartyPants(text, attr=default_smartypants_attr, language='en'):
    """Main function for "traditional" use.

    Split `text` into tag and text tokens, educate the tokens according to
    `attr` and `language`, and return the pieces joined into one string.
    """
    educated = educate_tokens(tokenize(text), attr, language)
    return "".join(educated)
510
511
def educate_tokens(text_tokens, attr=default_smartypants_attr, language='en'):
    """Return iterator that "educates" the items of `text_tokens`.

    `text_tokens` is an iterable of ``(ttype, text)`` pairs.  Tokens of
    type 'tag' and empty tokens are passed through unchanged.  Tokens of
    type 'literal' are passed through unchanged as well, but their last
    character serves as context for the token that follows.  Every other
    token is transformed as selected by `attr`:

    :"0":  no transformation selected
    :"1":  set all
    :"2":  set all, using old school en- and em- dash shortcuts
    :"3":  set all, using inverted old school en and em- dash shortcuts
    :"-1": stupefy mode

    or a combination of the letters

    :q: quotes
    :b: backtick quotes (``double'' only)
    :B: backtick quotes (``double'' and `single')
    :d: dashes
    :D: old school dashes
    :i: inverted old school dashes
    :e: ellipses
    :w: convert &quot; entities to " for Dreamweaver users
    """

    # Switches.  `do_dashes` is False or 1, 2, 3 (dash style);
    # `do_backticks` is False, True (double only) or 2 (double and single).
    convert_quot = do_quotes = do_ellipses = do_stupefy = False
    do_dashes = do_backticks = False

    if attr in ("1", "2", "3"):
        # Numeric presets: everything on, the digit selects the dash style.
        do_quotes = do_backticks = do_ellipses = True
        do_dashes = int(attr)
    elif attr == "-1":
        # Special "stupefy" mode.
        do_stupefy = True
    else:
        # Single-letter flags; within a group the later letter wins.
        do_quotes = "q" in attr
        for flag, level in (("b", True), ("B", 2)):
            if flag in attr:
                do_backticks = level
        for flag, level in (("d", 1), ("D", 2), ("i", 3)):
            if flag in attr:
                do_dashes = level
        do_ellipses = "e" in attr
        convert_quot = "w" in attr

    # Dash educator selected by `do_dashes` (None if dashes are off).
    dash_educator = {1: educateDashes,
                     2: educateDashesOldSchool,
                     3: educateDashesOldSchoolInverted}.get(do_dashes)

    # Last character of the previous text token.  Used as
    # context to curl leading quote characters correctly.
    prev_token_last_char = " "

    for ttype, chunk in text_tokens:

        # HTML and/or XML tags as well as empty text tokens are skipped
        # without updating the last character.
        if ttype == 'tag' or not chunk:
            yield chunk
            continue

        # Literal text (math, literal, raw, ...) is skipped too,
        # but it does update the context.
        if ttype == 'literal':
            prev_token_last_char = chunk[-1:]
            yield chunk
            continue

        # Remember the last char before any processing.
        last_char = chunk[-1:]

        # Protect backslash-escaped characters from the educators.
        chunk = processEscapes(chunk)

        if convert_quot:
            chunk = re.sub('&quot;', '"', chunk)

        if dash_educator is not None:
            chunk = dash_educator(chunk)

        if do_ellipses:
            chunk = educateEllipses(chunk)

        # Note: backticks need to be processed before quotes.
        if do_backticks:
            chunk = educateBackticks(chunk, language)

        if do_backticks == 2:
            chunk = educateSingleBackticks(chunk, language)

        if do_quotes:
            # Replace plain quotes in the context character to prevent
            # its conversion to a 2-character sequence in French; the
            # context character is stripped again after educating.
            context = prev_token_last_char.replace('"', ';').replace("'", ';')
            chunk = educateQuotes(context+chunk, language)[1:]

        if do_stupefy:
            chunk = stupefyEntities(chunk, language)

        # The unprocessed last char is the context for the next token.
        prev_token_last_char = last_char

        # Turn the protected escapes into the plain characters.
        yield processEscapes(chunk, restore=True)
625
626
627
def educateQuotes(text, language='en'):
    """
    Parameter:  - text string.
                - language (`BCP 47` language tag.)
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”
    """

    smart = smartchars(language)

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""
    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    open_class = u'[\u200B\u200C]'  # ZWSP, ZWNJ
    # The "#" is escaped because this fragment is interpolated into
    # re.VERBOSE patterns, where a bare "#" starts a comment that would
    # swallow the rest of the line (and with it this alternative).
    dec_dashes = r"""&\#8211;|&\#8212;"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force.
    # (``\B`` must be a single backslash here: in a raw string ``\\B``
    # is a literal backslash followed by "B" and would never match.)
    text = re.sub(r"^'(?=%s\B)" % (punct_class,), smart.csquote, text)
    text = re.sub(r'^"(?=%s\B)' % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote+smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote+smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        text = re.sub(r"'(?=\d{2}s)", smart.apostrophe, text)

    # What may precede an opening quote (shared by the single and the
    # double quote pattern).  Captured as group 1 and re-inserted, because
    # a look-behind would require a fixed-width pattern.
    opening_context = u"""
                    (
                        \\s          |   # a whitespace char, or
                        %s           |   # another separating char, or
                        &nbsp;       |   # a non-breaking space entity, or
                        [\u2013 \u2014 ]      |   # literal dashes, or
                        --           |   # dumb dashes, or
                        &[mn]dash;   |   # dash entities (named or
                        %s           |   # decimal or
                        &\\#x201[34];    # hex)
                    )
                    """ % (open_class, dec_dashes)

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(opening_context + u"""
                    '                    # the quote
                    (?=\\w)              # followed by a word character
                    """, re.VERBOSE | re.UNICODE)

    text = opening_single_quotes_regex.sub(r'\1'+smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != smart.apostrophe:
        apostrophe_regex = re.compile(r"(?<=(\w|\d))'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(smart.apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    closing_single_quotes_regex = re.compile(r"""
                    (?<=%s)
                    '
                    """ % close_class, re.VERBOSE)
    text = closing_single_quotes_regex.sub(smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(opening_context + u"""
                    "                    # the quote
                    (?=\\w)              # followed by a word character
                    """, re.VERBOSE | re.UNICODE)

    text = opening_double_quotes_regex.sub(r'\1'+smart.opquote, text)

    # Double closing quotes:
    closing_double_quotes_regex = re.compile(r"""
                    (
                    (?<=%s)" | # char indicating the quote should be closing
                    "(?=\s)    # whitespace behind
                    )
                    """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
725
726
def educateBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with ``backticks'' -style double quotes
                translated into the primary quote characters of `language`.
    Example input:  ``Isn't this fun?''
    Example output: “Isn't this fun?”
    """
    quote_chars = smartchars(language)

    # Two backticks open, two straight single quotes close.
    for pattern, replacement in ((r"""``""", quote_chars.opquote),
                                 (r"""''""", quote_chars.cpquote)):
        text = re.sub(pattern, replacement, text)
    return text
740
741
def educateSingleBackticks(text, language='en'):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with `backticks' -style single quotes
                translated into the secondary quote characters of `language`.

    Example input:  `Isn't this fun?'
    Example output: ‘Isn’t this fun?’
    """
    quote_chars = smartchars(language)

    # Every backtick opens; every straight single quote closes.
    with_opening = re.sub(r"""`""", quote_chars.osquote, text)
    return re.sub(r"""'""", quote_chars.csquote, with_opening)
756
757
def educateDashes(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character (and each "---" to an en-dash).
    """
    # The triple dash is replaced first so that the double-dash rule
    # cannot consume part of it.  (Yes, "---" gives the shorter dash.)
    without_triples = re.sub(r"""---""", smartchars.endash, text)
    return re.sub(r"""--""", smartchars.emdash, without_triples)
768
769
def educateDashesOldSchool(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an en-dash character, and each "---" translated to
                an em-dash character.
    """
    # Longest shorthand first, so "---" is not read as "--" plus "-".
    for shorthand, dash in ((r"""---""", smartchars.emdash),
                            (r"""--""", smartchars.endash)):
        text = re.sub(shorthand, dash, text)
    return text
781
782
def educateDashesOldSchoolInverted(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "--" translated to
                an em-dash character, and each "---" translated to
                an en-dash character. Two reasons why: First, unlike the
                en- and em-dash syntax supported by
                EducateDashesOldSchool(), it's compatible with existing
                entries written before SmartyPants 1.1, back when "--" was
                only used for em-dashes.  Second, em-dashes are more
                common than en-dashes, and so it sort of makes sense that
                the shortcut should be shorter to type. (Thanks to Aaron
                Swartz for the idea.)
    """
    # Longest shorthand first: "---" -> en-dash, then "--" -> em-dash.
    for shorthand, dash in ((r"""---""", smartchars.endash),
                            (r"""--""", smartchars.emdash)):
        text = re.sub(shorthand, dash, text)
    return text
800
801
802
def educateEllipses(text):
    """
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, with each instance of "..." (or ". . .")
                translated to an ellipsis character.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Three consecutive dots first, then the spaced-out variant.
    for dots in (r"""\.\.\.""", r"""\. \. \."""):
        text = re.sub(dots, smartchars.ellipsis, text)
    return text
816
817
def stupefyEntities(text, language='en'):
    """
    Parameter:  String.
    Returns:    The `text`, with each SmartyPants character translated to
                its ASCII counterpart.  The quote characters reverted are
                those of `language`.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    smart = smartchars(language)

    # The characters are replaced literally (not used as regular
    # expression patterns).  Order matters where one quote string is part
    # of, or equal to, another one, so the original sequence is kept.
    replacements = (
        (smart.endash, "-"),       # en-dash
        (smart.emdash, "--"),      # em-dash
        (smart.osquote, "'"),      # open single quote
        (smart.csquote, "'"),      # close single quote
        # In many locales the closing single quote differs from the
        # apostrophe; revert the apostrophe too so that nothing curly
        # is left behind.
        (smart.apostrophe, "'"),   # apostrophe
        (smart.opquote, '"'),      # open double quote
        (smart.cpquote, '"'),      # close double quote
        (smart.ellipsis, '...'),   # ellipsis
    )
    for smart_char, ascii_equivalent in replacements:
        text = text.replace(smart_char, ascii_equivalent)

    return text
841
842
def processEscapes(text, restore=False):
    r"""
    Parameter:  String (unicode or bytes).
    Returns:    The `text`, after processing the following backslash
                escape sequences. This is useful if you want to force a "dumb"
                quote or other character to appear.

                With `restore` true, the direction is reversed: each
                entity in the table is replaced by the plain character
                (without the backslash).

                Escape  Value
                ------  -----
                \\      &#92;
                \"      &#34;
                \'      &#39;
                \.      &#46;
                \-      &#45;
                \`      &#96;
    """
    # (escape sequence, protecting entity); the backslash pair comes first
    # so that an escaped backslash is not taken for the start of another
    # escape sequence.
    escape_table = ((r'\\', r'&#92;'),
                    (r'\"', r'&#34;'),
                    (r"\'", r'&#39;'),
                    (r'\.', r'&#46;'),
                    (r'\-', r'&#45;'),
                    (r'\`', r'&#96;'))
    for sequence, entity in escape_table:
        if restore:
            # sequence[1] is the escaped character itself.
            text = text.replace(entity, sequence[1])
        else:
            text = text.replace(sequence, entity)

    return text
873
874
def tokenize(text):
    """
    Parameter:  String containing HTML markup.
    Returns:    An iterator that yields the tokens comprising the input
                string. Each token is either a tag or a run of text
                between tags. Each yielded element is a two-element
                tuple; the first is either 'tag' or 'text'; the second
                is the actual value.

                A tag extends from a "<" to the next ">"; nested tags
                (such as <a href="<MTFoo>">) are not recognized as one
                token. Text containing a "<" without a following ">" is
                returned as text.

    Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin.
        <http://www.bradchoate.com/past/mtregex.php>
    """
    # Optional run of text (group 1) followed by one tag (group 2).
    tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""")

    previous_end = 0
    for token_match in tag_soup.finditer(text):
        leading_text, tag = token_match.groups()
        if leading_text:
            yield ('text', leading_text)
        yield ('tag', tag)
        previous_end = token_match.end()

    # Whatever follows the last tag (or the whole input, if there is
    # no tag at all) is plain text.
    if previous_end < len(text):
        yield ('text', text[previous_end:])
914
915
916
if __name__ == "__main__":
    # Command line interface: filter stdin, making ASCII punctuation "smart".

    import argparse
    import itertools

    # Guess the default language from the user's locale (best effort).
    try:
        import locale  # module missing in Jython
        locale.setlocale(locale.LC_ALL, '')  # set to user defaults
        defaultlanguage = locale.getdefaultlocale()[0]
    except Exception:
        defaultlanguage = None
    # The language code may be None if it cannot be determined.
    if not defaultlanguage:
        defaultlanguage = 'en'

    # Normalize and drop unsupported subtags:
    defaultlanguage = defaultlanguage.lower().replace('-', '_')
    # split (except singletons, which mark the following tag as non-standard):
    defaultlanguage = re.sub(r'_([a-zA-Z0-9])_', r'_\1-', defaultlanguage)
    _subtags = defaultlanguage.split('_')
    _basetag = _subtags.pop(0)
    # Try all combinations of subtags, the most specific ones first, and
    # stop at the first supported tag. A single flat loop ensures that
    # a match is not overwritten by a later, less specific round.
    _candidates = ('-'.join((_basetag,) + tags)
                   for n in range(len(_subtags), 0, -1)
                   for tags in itertools.combinations(_subtags, n))
    for _tag in _candidates:
        if _tag in smartchars.quotes:
            defaultlanguage = _tag
            break
    else:
        # No combination is supported (or there are no subtags at all):
        # fall back to the base tag or, as last resort, to English.
        if _basetag in smartchars.quotes:
            defaultlanguage = _basetag
        else:
            defaultlanguage = 'en'

    parser = argparse.ArgumentParser(
                 description='Filter stdin making ASCII punctuation "smart".')
    # parser.add_argument("text", help="text to be acted on")
    parser.add_argument("-a", "--action", default="1",
                        help="what to do with the input (see --actionhelp)")
    parser.add_argument("-e", "--encoding", default="utf8",
                        help="text encoding")
    parser.add_argument("-l", "--language", default=defaultlanguage,
                        help="text language (BCP47 tag), "
                        "Default: %s"% defaultlanguage)
    parser.add_argument("-q", "--alternative-quotes", action="store_true",
                        help="use alternative quote style")
    parser.add_argument("--doc", action="store_true",
                        help="print documentation")
    parser.add_argument("--actionhelp", action="store_true",
                        help="list available actions")
    parser.add_argument("--stylehelp", action="store_true",
                        help="list available quote styles")
    parser.add_argument("--test", action="store_true",
                        help="perform short self-test")
    args = parser.parse_args()

    if args.doc:
        print(__doc__)
    elif args.actionhelp:
        print(options)
    elif args.stylehelp:
        print()
        print("Available styles (primary open/close, secondary open/close)")
        # Use the row format for the header, too, so the columns line up.
        _row = "%-14s %s"
        print(_row % ("language tag", "quotes"))
        print(_row % ("============", "======"))
        for key in sorted(smartchars.quotes.keys()):
            print(_row % (key, smartchars.quotes[key]))
    elif args.test:
        # Unit test output goes to stderr.
        import unittest

        class TestSmartypantsAllAttributes(unittest.TestCase):
            """Short self-test of `smartyPants()` with default settings."""
            # the default attribute is "1", which means "all".

            def test_dates(self):
                self.assertEqual(smartyPants("1440-80's"), u"1440-80’s")
                self.assertEqual(smartyPants("1440-'80s"), u"1440-’80s")
                self.assertEqual(smartyPants("1440---'80s"), u"1440–’80s")
                self.assertEqual(smartyPants("1960's"), u"1960’s")
                self.assertEqual(smartyPants("one two '60s"), u"one two ’60s")
                self.assertEqual(smartyPants("'60s"), u"’60s")

            def test_educated_quotes(self):
                self.assertEqual(smartyPants('"Isn\'t this fun?"'),
                                 u'“Isn’t this fun?”')

            def test_html_tags(self):
                # markup must pass unchanged
                text = '<a src="foo">more</a>'
                self.assertEqual(smartyPants(text), text)

        suite = unittest.TestLoader().loadTestsFromTestCase(
                                            TestSmartypantsAllAttributes)
        unittest.TextTestRunner().run(suite)

    else:
        if args.alternative_quotes:
            # toggle the private-use subtag selecting the alternative style
            if '-x-altquot' in args.language:
                args.language = args.language.replace('-x-altquot', '')
            else:
                args.language += '-x-altquot'
        # Use the underlying byte streams where available (Python 3), so
        # that the --encoding option is honoured: the text streams there
        # return and expect `str`, which has no decode() method.
        _stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        _stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        text = _stdin.read().decode(args.encoding)
        _result = smartyPants(text, attr=args.action, language=args.language)
        # trailing newline as written by print()
        _stdout.write((_result + u'\n').encode(args.encoding))
        _stdout.flush()