Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/humanfriendly/text.py @ 2:6af9afd405e9 draft
"planemo upload commit 0a63dd5f4d38a1f6944587f52a8cd79874177fc1"
author | shellac |
---|---|
date | Thu, 14 May 2020 14:56:58 -0400 |
parents | 26e78fe6e8c4 |
children |
comparison
equal
deleted
inserted
replaced
1:75ca89e9b81c | 2:6af9afd405e9 |
---|---|
1 # Human friendly input/output in Python. | |
2 # | |
3 # Author: Peter Odding <peter@peterodding.com> | |
4 # Last Change: March 1, 2020 | |
5 # URL: https://humanfriendly.readthedocs.io | |
6 | |
7 """ | |
8 Simple text manipulation functions. | |
9 | |
10 The :mod:`~humanfriendly.text` module contains simple functions to manipulate text: | |
11 | |
12 - The :func:`concatenate()` and :func:`pluralize()` functions make it easy to | |
13 generate human friendly output. | |
14 | |
15 - The :func:`format()`, :func:`compact()` and :func:`dedent()` functions | |
16 provide a clean and simple to use syntax for composing large text fragments | |
17 with interpolated variables. | |
18 | |
19 - The :func:`tokenize()` function parses simple user input. | |
20 """ | |
21 | |
22 # Standard library modules. | |
23 import math | |
24 import numbers | |
25 import random | |
26 import re | |
27 import string | |
28 import textwrap | |
29 | |
# Public identifiers that require documentation.
# Every name listed here is a function defined in this module; the tuple also
# determines what ``from humanfriendly.text import *`` exports.
__all__ = (
    'compact',
    'compact_empty_lines',
    'concatenate',
    'dedent',
    'format',
    'generate_slug',
    'is_empty_line',
    'join_lines',
    'pluralize',
    'random_string',
    'split',
    'split_paragraphs',
    'tokenize',
    'trim_empty_lines',
)
47 | |
48 | |
def compact(text, *args, **kw):
    '''
    Collapse the whitespace in a string and interpolate any arguments.

    Leading and trailing whitespace is dropped, every run of whitespace
    characters is reduced to a single space and the result is passed on to
    :func:`format()` together with the given arguments.

    :param text: The text to compact (a string).
    :param args: Any positional arguments are interpolated using :func:`format()`.
    :param kw: Any keyword arguments are interpolated using :func:`format()`.
    :returns: The compacted text (a string).

    A typical use is composing a long, user facing message from a multi line
    string literal without worrying about the indentation of the literal::

      raise PortDiscoveryError(compact("""
          Failed to discover port(s) that Apache is listening on!
          Maybe I'm parsing the wrong configuration file? ({filename})
      """, filename=self.ports_config))

    Combining :func:`compact()` with Python's multi line strings makes long
    text fragments with interpolated variables easy to write and easy to
    read, while still playing nicely with Python's whitespace sensitivity.
    '''
    # str.split() without arguments splits on runs of whitespace and ignores
    # leading/trailing whitespace, so joining the words compacts the text.
    return format(' '.join(text.split()), *args, **kw)
78 | |
79 | |
def compact_empty_lines(text):
    """
    Replace repeating empty lines with a single empty line (similar to ``cat -s``).

    A line counts as empty when :func:`is_empty_line()` says so, which means
    lines that contain only whitespace are treated as empty as well. The first
    line of each run of empty lines is kept verbatim, the rest are dropped.

    :param text: The text in which to compact empty lines (a string).
    :returns: The text with empty lines compacted (a string).
    """
    compacted = []
    previous_was_empty = False
    # A single pass that builds a new list avoids repeatedly popping from the
    # middle of a list, which is quadratic for long runs of empty lines.
    for line in text.splitlines(True):
        current_is_empty = is_empty_line(line)
        if not (current_is_empty and previous_was_empty):
            compacted.append(line)
        previous_was_empty = current_is_empty
    return ''.join(compacted)
95 | |
96 | |
def concatenate(items):
    """
    Join a sequence of strings the way a person would write a list.

    All items except the last are separated by commas, the last item is
    attached with the word "and". A single item is returned as is and an
    empty sequence results in an empty string.

    :param items: A sequence of strings.
    :returns: A single string.

    >>> from humanfriendly.text import concatenate
    >>> concatenate(["eggs", "milk", "bread"])
    'eggs, milk and bread'
    """
    # Materialize the input so that generators and other iterables work.
    items = list(items)
    if not items:
        return ''
    if len(items) == 1:
        return items[0]
    leading = items[:-1]
    final = items[-1]
    return ', '.join(leading) + ' and ' + final
115 | |
116 | |
def dedent(text, *args, **kw):
    """
    Remove common leading whitespace from all lines in a string.

    The text is processed in three steps: :func:`textwrap.dedent()` strips
    the indentation shared by all lines, :func:`trim_empty_lines()` removes
    leading and trailing empty lines and :func:`format()` interpolates any
    arguments that were given.

    :param text: The text to dedent (a string).
    :param args: Any positional arguments are interpolated using :func:`format()`.
    :param kw: Any keyword arguments are interpolated using :func:`format()`.
    :returns: The dedented text (a string).

    The documentation of :func:`compact()` contains an example of how the
    :func:`compact()` and :func:`dedent()` functions are meant to be used. The
    main difference is that :func:`compact()` suits text that will be
    presented to the user (where whitespace is not so significant) while
    :func:`dedent()` suits data file and code generation tasks (where
    newlines and indentation are very significant).
    """
    without_indent = textwrap.dedent(text)
    return format(trim_empty_lines(without_indent), *args, **kw)
141 | |
142 | |
def format(text, *args, **kw):
    """
    Format a string using the string formatting operator and/or :meth:`str.format()`.

    :param text: The text to format (a string).
    :param args: Any positional arguments are interpolated into the text using
                 the string formatting operator (``%``). If no positional
                 arguments are given no interpolation is done.
    :param kw: Any keyword arguments are interpolated into the text using the
               :meth:`str.format()` function. If no keyword arguments are given
               no interpolation is done.
    :returns: The text with any positional and/or keyword arguments
              interpolated (a string).

    When both positional and keyword arguments are given the ``%`` operator is
    applied first and :meth:`str.format()` is applied to its result.

    The implementation of this function is trivial, so its existence deserves
    some justification.

    **Why format() instead of the string formatting operator?**

    For really simple string interpolation Python's string formatting operator
    is ideal, but it does have some strange quirks:

    - When you switch from interpolating a single value to interpolating
      multiple values you have to wrap them in tuple syntax. Because
      :func:`format()` takes a `variable number of arguments`_ it always
      receives a tuple. Here's an example:

      >>> from humanfriendly.text import format
      >>> # The string formatting operator.
      >>> print('the magic number is %s' % 42)
      the magic number is 42
      >>> print('the magic numbers are %s and %s' % (12, 42))
      the magic numbers are 12 and 42
      >>> # The format() function.
      >>> print(format('the magic number is %s', 42))
      the magic number is 42
      >>> print(format('the magic numbers are %s and %s', 12, 42))
      the magic numbers are 12 and 42

    - When you interpolate a single value and someone accidentally passes in a
      tuple your code raises a :exc:`TypeError`. Because :func:`format()`
      takes a `variable number of arguments`_ it always receives a tuple so
      this can never happen. Here's an example:

      >>> # How expecting to interpolate a single value can fail.
      >>> value = (12, 42)
      >>> print('the magic value is %s' % value)
      Traceback (most recent call last):
        File "<stdin>", line 1, in <module>
      TypeError: not all arguments converted during string formatting
      >>> # The following line works as intended, no surprises here!
      >>> print(format('the magic value is %s', value))
      the magic value is (12, 42)

    **Why format() instead of the str.format() method?**

    When you're doing complex string interpolation the :meth:`str.format()`
    function results in more readable code, however it frequently requires
    extra parentheses to force evaluation order. The :func:`format()` function
    avoids this because of the relative priority between the comma and dot
    operators. Here's an example:

    >>> "{adjective} example" + " " + "(can't think of anything less {adjective})".format(adjective='silly')
    "{adjective} example (can't think of anything less silly)"
    >>> ("{adjective} example" + " " + "(can't think of anything less {adjective})").format(adjective='silly')
    "silly example (can't think of anything less silly)"
    >>> format("{adjective} example" + " " + "(can't think of anything less {adjective})", adjective='silly')
    "silly example (can't think of anything less silly)"

    The :func:`compact()` and :func:`dedent()` functions are wrappers that
    combine :func:`format()` with whitespace manipulation to make it easy to
    write nice to read Python code.

    .. _variable number of arguments: https://docs.python.org/3/tutorial/controlflow.html#arbitrary-argument-lists
    """
    # Each interpolation style is skipped entirely when it has no arguments,
    # so literal "%" and "{}" characters survive unless arguments are given.
    interpolated = text % args if args else text
    return interpolated.format(**kw) if kw else interpolated
225 | |
226 | |
def generate_slug(text, delimiter="-"):
    """
    Convert text to a normalized "slug" without whitespace.

    The text is lowercased, every run of characters other than ``a-z`` and
    ``0-9`` is replaced by the delimiter and delimiter characters are stripped
    from both ends of the result.

    :param text: The original text, for example ``Some Random Text!``.
    :param delimiter: The delimiter used to separate words
                      (defaults to the ``-`` character).
    :returns: The slug text, for example ``some-random-text``.
    :raises: :exc:`ValueError` when the provided text is nonempty but
             results in an empty slug.
    """
    lowered = text.lower()
    # A callable replacement is inserted verbatim, so backslashes in the
    # delimiter need no escaping to survive re.sub()'s template handling.
    delimited = re.sub("[^a-z0-9]+", lambda match: delimiter, lowered)
    slug = delimited.strip(delimiter)
    if slug or not text:
        return slug
    msg = "The provided text %r results in an empty slug!"
    raise ValueError(format(msg, text))
246 | |
247 | |
def is_empty_line(text):
    """
    Tell whether a text is empty or consists of whitespace only.

    :param text: The text to check for "emptiness" (a string).
    :returns: :data:`True` if the text is empty or contains only whitespace,
              :data:`False` otherwise.
    """
    # str.isspace() returns False for the empty string, so the zero length
    # case has to be handled separately.
    if len(text) == 0:
        return True
    return text.isspace()
257 | |
258 | |
def join_lines(text):
    """
    Remove "hard wrapping" from the paragraphs in a string.

    :param text: The text to reformat (a string).
    :returns: The text without hard wrapping (a string).

    This function works by replacing a line break with a space when the last
    character before the line break and the first character after the line
    break are both non-whitespace characters. This means that common leading
    indentation will break :func:`join_lines()` (in that case you can use
    :func:`dedent()` before calling :func:`join_lines()`).

    >>> from humanfriendly.text import join_lines
    >>> join_lines("a\\nb\\nc")
    'a b c'
    """
    # Lookbehind/lookahead assertions inspect the neighbouring characters
    # without consuming them. A pattern that captures the neighbours instead
    # would skip every other line break between single-character lines,
    # because the character following one break is also the character
    # preceding the next.
    return re.sub(r'(?<=\S)\n(?=\S)', ' ', text)
273 | |
274 | |
def pluralize(count, singular, plural=None):
    """
    Combine a count with the singular or plural form of a word.

    When no plural form is given it is derived by appending the letter "s" to
    the singular form. That is obviously not always correct, which is why both
    forms can be specified explicitly.

    The singular form is used when the count, converted to a float and rounded
    down, equals one; in all other cases the plural form is used.

    :param count: The count (a number).
    :param singular: The singular form of the word (a string).
    :param plural: The plural form of the word (a string or :data:`None`).
    :returns: The count and singular/plural word concatenated (a string).
    """
    plural_form = plural or singular + 's'
    is_single = math.floor(float(count)) == 1
    if is_single:
        word = singular
    else:
        word = plural_form
    return '%s %s' % (count, word)
292 | |
293 | |
def random_string(length=(25, 100), characters=string.ascii_letters):
    """random_string(length=(25, 100), characters=string.ascii_letters)
    Generate a random string.

    :param length: The length of the string to be generated (a number or a
                   tuple with two numbers). If this is a tuple then a random
                   number between the two numbers given in the tuple
                   (both inclusive) is used.
    :param characters: The characters to be used (a string, defaults
                       to :data:`string.ascii_letters`).
    :returns: A random string.

    The :func:`random_string()` function is mostly useful in test suites. It
    is built on the :mod:`random` module, so the generated strings are not
    suitable for security sensitive purposes.
    """
    if isinstance(length, numbers.Number):
        count = length
    else:
        # A (minimum, maximum) pair was given: pick a length in that range.
        minimum, maximum = length[0], length[1]
        count = random.randint(minimum, maximum)
    picked = [random.choice(characters) for _ in range(count)]
    return ''.join(picked)
312 | |
313 | |
def split(text, delimiter=','):
    """
    Split a delimited list of strings (comma-separated by default).

    Surrounding whitespace is stripped from every token, and tokens that are
    empty or consist of whitespace only are left out of the result.

    :param text: The text to split (a string).
    :param delimiter: The delimiter to split on (a string).
    :returns: A list of zero or more nonempty strings.

    Here's the default behavior of Python's built in :meth:`str.split()`
    function:

    >>> 'foo,bar, baz,'.split(',')
    ['foo', 'bar', ' baz', '']

    In contrast here's the default behavior of the :func:`split()` function:

    >>> from humanfriendly.text import split
    >>> split('foo,bar, baz,')
    ['foo', 'bar', 'baz']

    Here is an example that parses a nested data structure (a mapping of
    logging level names to one or more styles per level) that's encoded in a
    string so it can be set as an environment variable:

    >>> from pprint import pprint
    >>> encoded_data = 'debug=green;warning=yellow;error=red;critical=red,bold'
    >>> parsed_data = dict((k, split(v, ',')) for k, v in (split(kv, '=') for kv in split(encoded_data, ';')))
    >>> pprint(parsed_data)
    {'debug': ['green'],
     'warning': ['yellow'],
     'error': ['red'],
     'critical': ['red', 'bold']}
    """
    # A token that is empty or whitespace-only strips down to the empty
    # string, so filtering on the stripped value drops both kinds at once.
    stripped_tokens = (token.strip() for token in text.split(delimiter))
    return [token for token in stripped_tokens if token]
348 | |
349 | |
def split_paragraphs(text):
    """
    Split a string into paragraphs (one or more lines delimited by an empty line).

    The text is split on two consecutive newline characters, leading and
    trailing empty lines are removed from each chunk using
    :func:`trim_empty_lines()` and chunks without any content are discarded.

    :param text: The text to split into paragraphs (a string).
    :returns: A list of strings.
    """
    trimmed_chunks = (trim_empty_lines(chunk) for chunk in text.split('\n\n'))
    return [chunk for chunk in trimmed_chunks if chunk and not chunk.isspace()]
363 | |
364 | |
def tokenize(text):
    """
    Tokenize a text into numbers and strings.

    :param text: The text to tokenize (a string).
    :returns: A list of strings and/or numbers.

    This function is used to implement robust tokenization of user input in
    functions like :func:`.parse_size()` and :func:`.parse_timespan()`. It
    automatically coerces integer and floating point numbers, ignores
    whitespace and knows how to separate numbers from strings even without
    whitespace. Some examples to make this more concrete:

    >>> from humanfriendly.text import tokenize
    >>> tokenize('42')
    [42]
    >>> tokenize('42MB')
    [42, 'MB']
    >>> tokenize('42.5MB')
    [42.5, 'MB']
    >>> tokenize('42.5 MB')
    [42.5, 'MB']

    Only text matched by the number pattern is converted to a number; all
    other text is returned as a string, including characters such as ``²``
    that :meth:`str.isdigit()` accepts but :func:`int()` rejects.
    """
    tokenized_input = []
    # re.split() with a single capturing group returns alternating pieces:
    # even indexes hold the text between numbers and odd indexes hold the
    # captured numbers. Relying on that structure avoids guessing which
    # tokens are numeric based on str.isdigit().
    for index, token in enumerate(re.split(r'(\d+(?:\.\d+)?)', text)):
        if index % 2:
            # A captured number contains a dot only when it has a fraction.
            tokenized_input.append(float(token) if '.' in token else int(token))
        else:
            token = token.strip()
            if token:
                tokenized_input.append(token)
    return tokenized_input
398 | |
399 | |
def trim_empty_lines(text):
    """
    Trim leading and trailing empty lines from the given text.

    A line counts as empty when :func:`is_empty_line()` says so, which means
    lines that contain only whitespace are trimmed as well. Empty lines
    between non-empty lines are left alone.

    :param text: The text to trim (a string).
    :returns: The trimmed text (a string).
    """
    lines = text.splitlines(True)
    start, end = 0, len(lines)
    # Move the boundaries inwards and slice once, instead of popping lines
    # off the front of the list (each such pop shifts all remaining items).
    while start < end and is_empty_line(lines[start]):
        start += 1
    while end > start and is_empty_line(lines[end - 1]):
        end -= 1
    return ''.join(lines[start:end])