comparison env/lib/python3.7/site-packages/humanfriendly/text.py @ 2:6af9afd405e9 draft

"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
author shellac
date Thu, 14 May 2020 14:56:58 -0400
parents 26e78fe6e8c4
children
comparison
equal deleted inserted replaced
1:75ca89e9b81c 2:6af9afd405e9
1 # Human friendly input/output in Python.
2 #
3 # Author: Peter Odding <peter@peterodding.com>
4 # Last Change: March 1, 2020
5 # URL: https://humanfriendly.readthedocs.io
6
7 """
8 Simple text manipulation functions.
9
10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text:
11
12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to
13 generate human friendly output.
14
15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions
16 provide a clean and simple to use syntax for composing large text fragments
17 with interpolated variables.
18
19 - The :func:`tokenize()` function parses simple user input.
20 """
21
22 # Standard library modules.
23 import math
24 import numbers
25 import random
26 import re
27 import string
28 import textwrap
29
30 # Public identifiers that require documentation.
31 __all__ = (
32 'compact',
33 'compact_empty_lines',
34 'concatenate',
35 'dedent',
36 'format',
37 'generate_slug',
38 'is_empty_line',
39 'join_lines',
40 'pluralize',
41 'random_string',
42 'split',
43 'split_paragraphs',
44 'tokenize',
45 'trim_empty_lines',
46 )
47
48
def compact(text, *args, **kw):
    '''
    Collapse all whitespace in a string and interpolate any arguments.

    Leading and trailing whitespace is dropped, every run of whitespace
    characters (newlines included) is reduced to one space and the result is
    passed through :func:`format()` together with the given arguments.

    :param text: The text to compact (a string).
    :param args: Positional arguments, handed to :func:`format()`.
    :param kw: Keyword arguments, handed to :func:`format()`.
    :returns: The compacted text (a string).

    Combined with multi line string literals this makes it possible to write
    long, nicely indented messages with interpolated variables in the source
    code while still emitting a single line of text, for example::

     raise PortDiscoveryError(compact("""
         Failed to discover port(s) that Apache is listening on!
         Maybe I'm parsing the wrong configuration file? ({filename})
     """, filename=self.ports_config))
    '''
    # str.split() without arguments discards leading/trailing whitespace and
    # splits on runs of whitespace, so re-joining yields the compacted text.
    return format(' '.join(text.split()), *args, **kw)
78
79
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).

    A line counts as empty when :func:`is_empty_line()` says so. The first
    line of each run of empty lines is preserved (including its line ending),
    the remaining lines of the run are dropped.
    """
    kept_lines = []
    previous_was_empty = False
    # Single pass over the lines: building a new list avoids the repeated
    # list.pop(i) calls that made the scan quadratic on large inputs.
    for line in text.splitlines(True):
        current_is_empty = is_empty_line(line)
        if not (previous_was_empty and current_is_empty):
            kept_lines.append(line)
        previous_was_empty = current_is_empty
    return ''.join(kept_lines)
95
96
def concatenate(items):
    """
    Join a sequence of strings the way a person would write a list.

    :param items: A sequence of strings.
    :returns: A single string: all but the last item are separated by commas
              and the last item is attached with the word "and".

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    """
    # Materialize the input so that arbitrary iterables are supported.
    sequence = list(items)
    if not sequence:
        return ''
    if len(sequence) == 1:
        return sequence[0]
    *leading, final = sequence
    return ', '.join(leading) + ' and ' + final
115
116
def dedent(text, *args, **kw):
    """
    Strip common indentation and surrounding empty lines from a string.

    The text is processed in three steps: :func:`textwrap.dedent()` removes
    the leading whitespace shared by all lines, :func:`trim_empty_lines()`
    removes leading and trailing empty lines and :func:`format()`
    interpolates any arguments.

    :param text: The text to dedent (a string).
    :param args: Positional arguments, handed to :func:`format()`.
    :param kw: Keyword arguments, handed to :func:`format()`.
    :returns: The dedented text (a string).

    Where :func:`compact()` is meant for text presented to the user
    (whitespace is not significant), :func:`dedent()` is meant for generating
    data files and code (newlines and indentation are very significant).
    """
    return format(trim_empty_lines(textwrap.dedent(text)), *args, **kw)
141
142
def format(text, *args, **kw):
    """
    Interpolate values using the ``%`` operator and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Positional arguments are interpolated using the string
                 formatting operator (``%``). Without positional arguments
                 this step is skipped entirely.
    :param kw: Keyword arguments are interpolated using :meth:`str.format()`.
               Without keyword arguments this step is skipped entirely.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    The implementation is trivial, so here's why it exists anyway.

    **Compared to the string formatting operator**

    - Switching from one interpolated value to several requires wrapping the
      values in a tuple. Because :func:`format()` takes a `variable number of
      arguments`_ it always receives a tuple:

      >>> from humanfriendly.text import format
      >>> print('the magic number is %s' % 42)
      the magic number is 42
      >>> print('the magic numbers are %s and %s' % (12, 42))
      the magic numbers are 12 and 42
      >>> print(format('the magic number is %s', 42))
      the magic number is 42
      >>> print(format('the magic numbers are %s and %s', 12, 42))
      the magic numbers are 12 and 42

    - Interpolating a single value that happens to be a tuple raises
      :exc:`~exceptions.TypeError` with the operator, but not with
      :func:`format()`:

      >>> value = (12, 42)
      >>> print('the magic value is %s' % value)
      Traceback (most recent call last):
        File "<stdin>", line 1, in <module>
      TypeError: not all arguments converted during string formatting
      >>> print(format('the magic value is %s', value))
      the magic value is (12, 42)

    **Compared to the str.format() method**

    A method call binds more tightly than string concatenation, so composing
    a template from several pieces requires extra parentheses. A function
    call doesn't have this problem:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions combine
    :func:`format()` with whitespace manipulation.

    .. _variable number of arguments: https://docs.python.org/2/tutorial/controlflow.html#arbitrary-argument-lists
    """
    # Each interpolation style is only applied when matching arguments were
    # given, so literal '%' and '{}' characters survive when they aren't.
    interpolated = text % args if args else text
    return interpolated.format(**kw) if kw else interpolated
225
226
def generate_slug(text, delimiter="-"):
    """
    Turn arbitrary text into a lowercase "slug" without whitespace.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`~exceptions.ValueError` when the provided
             text is nonempty but results in an empty slug.
    """
    # Backslashes have a special meaning in re.sub() replacement strings,
    # so they are doubled to make the delimiter come out literally.
    replacement = delimiter.replace("\\", "\\\\")
    # Every run of characters other than a-z and 0-9 becomes one delimiter,
    # after which delimiter characters at either end are stripped.
    normalized = re.sub("[^a-z0-9]+", replacement, text.lower()).strip(delimiter)
    if normalized or not text:
        return normalized
    msg = "The provided text %r results in an empty slug!"
    raise ValueError(format(msg, text))
246
247
def is_empty_line(text):
    """
    Tell whether a text is empty or consists solely of whitespace.

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    # str.isspace() returns False for the empty string, hence the
    # explicit length check up front.
    if len(text) == 0:
        return True
    return text.isspace()
257
258
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    This function works by replacing a line break with a space when the last
    character before the line break and the first character after the line
    break are both non-whitespace characters. This means that common leading
    indentation will break :func:`join_lines()` (in that case you can use
    :func:`dedent()` before calling :func:`join_lines()`).

    >>> join_lines('a\\nb\\nc\\n\\nnext paragraph')
    'a b c\\n\\nnext paragraph'
    """
    # Lookarounds are used so the neighbouring characters are inspected but
    # not consumed. A consuming pattern such as (\S)\n(\S) swallows the first
    # character of the next line, which means the line break following a one
    # character line can no longer be matched and is left in place.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
273
274
def pluralize(count, singular, plural=None):
    """
    Combine a count with the singular or plural form of a word.

    If the plural form of the word is not provided it is obtained by
    concatenating the singular form of the word with the letter "s". Of course
    this will not always be correct, which is why you have the option to
    specify both forms.

    :param count: The count (a number, or anything :class:`float` accepts).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and singular/plural word concatenated (a string).

    The singular form is only selected when the count is exactly one, so
    fractional counts get the plural form:

    >>> pluralize(1, 'second')
    '1 second'
    >>> pluralize(1.5, 'second')
    '1.5 seconds'
    """
    if not plural:
        plural = singular + 's'
    # Compare the count itself (not its floor) against one: rounding down
    # first would turn 1.5 into 1 and produce "1.5 second".
    return '%s %s' % (count, singular if float(count) == 1 else plural)
292
293
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Produce a string of randomly selected characters.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). If this is a tuple then a random
                   number between the two numbers given in the tuple
                   (both inclusive) is used.
    :param characters: The characters to be used (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    This function is mostly useful in test suites.
    """
    if isinstance(length, numbers.Number):
        size = length
    else:
        # Anything that's not a number is treated as a (minimum, maximum) pair.
        minimum, maximum = length[0], length[1]
        size = random.randint(minimum, maximum)
    picked = [random.choice(characters) for _ in range(size)]
    return ''.join(picked)
312
313
def split(text, delimiter=','):
    """
    Split a delimited string into a list of stripped, nonempty tokens.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string).
    :returns: A list of zero or more nonempty strings.

    Python's built in :meth:`str.split()` keeps surrounding whitespace and
    empty tokens:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    The :func:`split()` function strips the former and discards the latter:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Nested calls can be used to parse simple data structures encoded in a
    string, for example a mapping of logging level names to one or more
    styles per level that is passed in an environment variable:

    >>> from pprint import pprint
    >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
    >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
    >>> pprint(parsed_data)
    {'debug': ['green'],
     'warning': ['yellow'],
     'error': ['red'],
     'critical': ['red', 'bold']}
    """
    tokens = []
    for raw_token in text.split(delimiter):
        stripped = raw_token.strip()
        # Tokens that are empty or consist only of whitespace
        # are empty after stripping and get discarded.
        if stripped:
            tokens.append(stripped)
    return tokens
348
349
def split_paragraphs(text):
    """
    Break a string up into paragraphs.

    Paragraphs are chunks of text separated by two consecutive newline
    characters. Each chunk is passed through :func:`trim_empty_lines()` and
    chunks that end up empty (or contain only whitespace) are omitted.

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
363
364
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']
    """
    tokenized_input = []
    # Splitting on a pattern with one capturing group yields a list in which
    # the text between matches sits at the even indexes and the captured
    # numbers sit at the odd indexes.
    for index, token in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if index % 2:
            # Classifying numbers by position (instead of str.isdigit())
            # prevents characters like superscript digits, which isdigit()
            # accepts but int() rejects, from raising ValueError.
            tokenized_input.append(float(token) if '.' in token else int(token))
        else:
            token = token.strip()
            if token:
                tokenized_input.append(token)
    return tokenized_input
398
399
def trim_empty_lines(text):
    """
    Trim leading and trailing empty lines from the given text.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).

    A line counts as empty when :func:`is_empty_line()` says so. Empty lines
    between non-empty lines are preserved, as are the line endings of the
    lines that remain.
    """
    lines = text.splitlines(True)
    first, last = 0, len(lines)
    # Locate the boundaries of the non-empty region and slice once, instead
    # of repeatedly popping from the front of the list (each list.pop(0)
    # shifts all remaining items).
    while first < last and is_empty_line(lines[first]):
        first += 1
    while last > first and is_empty_line(lines[last - 1]):
        last -= 1
    return ''.join(lines[first:last])