comparison env/lib/python3.7/site-packages/docutils/io.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400 (2020-05-02)
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # $Id: io.py 8394 2019-09-18 10:13:17Z milde $
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
4
5 """
6 I/O classes provide a uniform API for low-level input and output. Subclasses
7 exist for a variety of input/output mechanisms.
8 """
9 from __future__ import print_function
10
11 __docformat__ = 'reStructuredText'
12
13 import sys
14 import os
15 import re
16 import codecs
17 from docutils import TransformSpec
18 from docutils.utils.error_reporting import locale_encoding, ErrorString, ErrorOutput
19
# Python 3 has no builtin named `unicode`; bind the name to `str` so the
# `isinstance(..., unicode)` checks and `unicode(data, enc, handler)` calls
# further down in this module work on both major versions.
if sys.version_info >= (3, 0):
    unicode = str  # noqa
22
23
class InputError(IOError):
    """Error raised when an input source cannot be opened."""
class OutputError(IOError):
    """Error raised when an output destination cannot be opened."""
26
def check_encoding(stream, encoding):
    """Tell whether `stream.encoding` and `encoding` name the same codec.

    Both names are resolved with `codecs.lookup()` and the resulting codec
    records are compared, so different spellings of one codec match.

    Returns

    :None:  if either name cannot be resolved (unknown codec, ``None``,
            wrong type) or `stream` has no ``encoding`` attribute,
    :True:  if both names resolve to the same codec,
    :False: if they resolve to different codecs.
    """
    try:
        stream_codec = codecs.lookup(stream.encoding)
        wanted_codec = codecs.lookup(encoding)
    except (LookupError, AttributeError, TypeError):
        # Comparison is impossible: report "unknown" rather than a mismatch.
        return None
    return stream_codec == wanted_codec
41
42
class Input(TransformSpec):

    """
    Common base for input wrappers.

    Holds the source object, a textual reference to it and the decoding
    settings.  `read()` is left to subclasses; `decode()` turns raw data
    into a Unicode string.
    """

    component_type = 'input'

    default_source_path = None

    def __init__(self, source=None, source_path=None, encoding=None,
                 error_handler='strict'):
        """
        Store the source and decoding settings.

        :Parameters:
            - `source`: the source of input data.
            - `source_path`: a text reference to the source; falls back to
              `default_source_path` when empty.
            - `encoding`: text encoding for the input source.
            - `error_handler`: text decoding error handler.
        """
        # The source of input data.
        self.source = source

        # A text reference to the source (class default if none is given).
        self.source_path = source_path or self.default_source_path

        # Text encoding for the input source.
        self.encoding = encoding

        # Text decoding error handler.
        self.error_handler = error_handler

        # The encoding that successfully decoded the source data.
        self.successful_encoding = None

    def __repr__(self):
        details = (self.__class__, self.source, self.source_path)
        return '%s: source=%r, source_path=%r' % details

    def read(self):
        """Not implemented here; subclasses provide the actual reading."""
        raise NotImplementedError

    def _candidate_encodings(self, data):
        """Return the list of encodings `decode()` should try for `data`."""
        if self.encoding:
            # An explicitly given encoding is trusted as-is.
            return [self.encoding]
        declared = self.determine_encoding_from_data(data)
        if declared:
            # The data names its own encoding (declaration or BOM).
            return [declared]
        # Nothing given and nothing declared: guess.  UTF-8 goes first
        # because it only matches data that really is UTF-8.
        guesses = ['utf-8', 'latin-1']
        if locale_encoding:
            guesses.insert(1, locale_encoding)
        return guesses

    def decode(self, data):
        """
        Turn `data` into a Unicode string, guessing the encoding if needed.

        Unicode input is returned unchanged.  Otherwise the candidate
        encodings are tried in order; the first one that works is recorded
        in `self.successful_encoding` and the decoded text is returned with
        all U+FEFF characters removed.

        Raise UnicodeError if no candidate encoding works.

        The client application should call ``locale.setlocale`` at the
        beginning of processing::

            locale.setlocale(locale.LC_ALL, '')
        """
        if self.encoding and self.encoding.lower() == 'unicode':
            assert isinstance(data, unicode), (
                'input encoding is "unicode" '
                'but input is not a unicode object')
        if isinstance(data, unicode):
            # Already text: pass through, whatever self.encoding says.
            return data

        candidates = self._candidate_encodings(data)
        for candidate in candidates:
            try:
                text = unicode(data, candidate, self.error_handler)
            except (UnicodeError, LookupError) as err:
                # Keep a reference: in Python 3 `err` is unbound again
                # when the except clause ends.
                failure = err
                continue
            self.successful_encoding = candidate
            # Strip BOMs from the decoded text.
            return text.replace(u'\ufeff', u'')
        tried = ', '.join(map(repr, candidates))
        raise UnicodeError(
            'Unable to decode input data. Tried the following encodings: '
            '%s.\n(%s)' % (tried, ErrorString(failure)))

    # Encoding declaration pattern.
    coding_slug = re.compile(br"coding[:=]\s*([-\w.]+)")

    # Sequence of (start_bytes, encoding) tuples for encoding detection.
    # The first bytes of input data are checked against the start_bytes
    # strings.  A match indicates the given encoding.
    byte_order_marks = ((codecs.BOM_UTF8, 'utf-8'),
                        (codecs.BOM_UTF16_BE, 'utf-16-be'),
                        (codecs.BOM_UTF16_LE, 'utf-16-le'),)

    def determine_encoding_from_data(self, data):
        """
        Look inside `data` for a hint about its encoding.

        A leading byte order mark wins; otherwise the first two lines are
        searched for an encoding declaration.  Return the encoding name,
        or None if neither is found.
        """
        # A byte order mark at the very start?
        for bom, bom_encoding in self.byte_order_marks:
            if data.startswith(bom):
                return bom_encoding
        # An encoding declaration within the first two lines?
        for head_line in data.splitlines()[:2]:
            found = self.coding_slug.search(head_line)
            if found:
                return found.group(1).decode('ascii')
        return None
153
154
class Output(TransformSpec):

    """
    Common base for output wrappers.

    Holds the destination object, a textual reference to it and the
    encoding settings.  `write()` is left to subclasses; `encode()` turns
    a Unicode string into encoded output.
    """

    component_type = 'output'

    default_destination_path = None

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict'):
        """
        Store the destination and encoding settings.

        :Parameters:
            - `destination`: the destination for output data.
            - `destination_path`: a text reference to the destination;
              falls back to `default_destination_path` when empty.
            - `encoding`: text encoding for the output destination.
            - `error_handler`: text encoding error handler; an empty value
              is replaced by 'strict'.
        """
        # The destination for output data.
        self.destination = destination

        # A text reference to the destination (class default if none given).
        self.destination_path = (destination_path
                                 or self.default_destination_path)

        # Text encoding for the output destination.
        self.encoding = encoding

        # Text encoding error handler.
        self.error_handler = error_handler or 'strict'

    def __repr__(self):
        details = (self.__class__, self.destination, self.destination_path)
        return '%s: destination=%r, destination_path=%r' % details

    def write(self, data):
        """`data` is a Unicode string, to be encoded by `self.encode`."""
        raise NotImplementedError

    def encode(self, data):
        """
        Return `data` encoded with `self.encoding` and `self.error_handler`.

        With the pseudo-encoding "unicode", and for any `data` that is not
        a Unicode string (e.g. bytes), `data` is returned unchanged.
        """
        wants_unicode = self.encoding and self.encoding.lower() == 'unicode'
        if wants_unicode:
            assert isinstance(data, unicode), (
                'the encoding given is "unicode" but the output is not '
                'a Unicode string')
            return data
        if isinstance(data, unicode):
            return data.encode(self.encoding, self.error_handler)
        # Non-unicode (e.g. bytes) output is passed through untouched.
        return data
201
202
class FileInput(Input):

    """
    Input for single, simple file-like objects.
    """
    def __init__(self, source=None, source_path=None,
                 encoding=None, error_handler='strict',
                 autoclose=True,
                 mode='r' if sys.version_info >= (3, 0) else 'rU'):
        """
        :Parameters:
            - `source`: either a file-like object (which is read directly), or
              `None` (which implies `sys.stdin` if no `source_path` given).
            - `source_path`: a path to a file, which is opened and then read.
            - `encoding`: the expected text encoding of the input file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after read (except when
              `sys.stdin` is the source).
            - `mode`: how the file is to be opened (see standard function
              `open`). The default 'rU' provides universal newline support
              for text files with Python 2.x.

        Raises `InputError` if `source_path` cannot be opened, and
        `UnicodeError` (Python 3 only) if a given `source` object reports
        an encoding that differs from `encoding`.
        """
        Input.__init__(self, source, source_path, encoding, error_handler)
        self.autoclose = autoclose
        self._stderr = ErrorOutput()

        if source is None:
            if source_path:
                # Specify encoding in Python 3 -- but only for text modes:
                # `open()` raises ValueError if `encoding`/`errors` are
                # passed together with a binary mode.
                if sys.version_info >= (3, 0) and 'b' not in mode:
                    kwargs = {'encoding': self.encoding,
                              'errors': self.error_handler}
                else:
                    kwargs = {}
                try:
                    self.source = open(source_path, mode, **kwargs)
                except IOError as error:
                    raise InputError(error.errno, error.strerror, source_path)
            else:
                self.source = sys.stdin
        elif (sys.version_info >= (3, 0) and
              check_encoding(self.source, self.encoding) is False):
            # TODO: re-open, warn or raise error?
            raise UnicodeError('Encoding clash: encoding given is "%s" '
                               'but source is opened with encoding "%s".' %
                               (self.encoding, self.source.encoding))
        if not source_path:
            # Fall back to the name the file-like object reports, if any.
            try:
                self.source_path = self.source.name
            except AttributeError:
                pass

    def read(self):
        """
        Read and decode a single file and return the data (Unicode string).

        If reading fails with a decoding error, no encoding was specified
        and a source path is known, the file is read again in binary mode
        and decoded with the heuristics of `decode()`.  The source is
        closed afterwards if `self.autoclose` is true.
        """
        try:
            if self.source is sys.stdin and sys.version_info >= (3, 0):
                # read as binary data to circumvent auto-decoding
                data = self.source.buffer.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                data = self.source.read()
        except (UnicodeError, LookupError):  # (in Py3k read() decodes)
            if not self.encoding and self.source_path:
                # re-read in binary mode and decode with heuristics;
                # the context manager closes the file even if read() fails
                with open(self.source_path, 'rb') as b_source:
                    data = b_source.read()
                # normalize newlines
                data = b'\n'.join(data.splitlines()) + b'\n'
            else:
                raise
        finally:
            if self.autoclose:
                self.close()
        return self.decode(data)

    def readlines(self):
        """
        Return lines of a single file as list of Unicode strings.
        """
        return self.read().splitlines(True)

    def close(self):
        """Close the source, unless it is `sys.stdin`."""
        if self.source is not sys.stdin:
            self.source.close()
291
292
class FileOutput(Output):

    """
    Output for single, simple file-like objects.
    """

    mode = 'w'
    # The mode argument for `open()`.
    # 'wb' for binary (e.g. OpenOffice) files (see also `BinaryFileOutput`).
    # (Do not use binary mode ('wb') for text files, as this prevents the
    # conversion of newlines to the system specific default.)

    def __init__(self, destination=None, destination_path=None,
                 encoding=None, error_handler='strict', autoclose=True,
                 handle_io_errors=None, mode=None):
        """
        :Parameters:
            - `destination`: either a file-like object (which is written
              directly) or `None` (which implies `sys.stdout` if no
              `destination_path` given).
            - `destination_path`: a path to a file, which is opened and then
              written.
            - `encoding`: the text encoding of the output file.
            - `error_handler`: the encoding error handler to use.
            - `autoclose`: close automatically after write (except when
              `sys.stdout` or `sys.stderr` is the destination).
            - `handle_io_errors`: ignored, deprecated, will be removed.
            - `mode`: how the file is to be opened (see standard function
              `open`). The default is 'w', providing universal newline
              support for text files.
        """
        Output.__init__(self, destination, destination_path,
                        encoding, error_handler)
        self.opened = True
        self.autoclose = autoclose
        if mode is not None:
            self.mode = mode
        self._stderr = ErrorOutput()
        if destination is None:
            if destination_path:
                # Opening is deferred until the first `write()`.
                self.opened = False
            else:
                self.destination = sys.stdout
        elif (  # destination is file-type object -> check mode:
              mode and hasattr(self.destination, 'mode')
              and mode != self.destination.mode):
            print('Warning: Destination mode "%s" differs from specified '
                  'mode "%s"' % (self.destination.mode, mode),
                  file=self._stderr)
        if not destination_path:
            # Fall back to the name the file-like object reports, if any.
            try:
                self.destination_path = self.destination.name
            except AttributeError:
                pass

    def open(self):
        """
        Open `self.destination_path` with `self.mode`.

        Raises `OutputError` if the file cannot be opened.
        """
        # Specify encoding in Python 3.
        if sys.version_info >= (3, 0) and 'b' not in self.mode:
            kwargs = {'encoding': self.encoding,
                      'errors': self.error_handler}
        else:
            kwargs = {}
        try:
            self.destination = open(self.destination_path, self.mode, **kwargs)
        except IOError as error:
            raise OutputError(error.errno, error.strerror,
                              self.destination_path)
        self.opened = True

    def write(self, data):
        """Encode `data`, write it to a single file, and return it.

        With Python 3 or binary output mode, `data` is returned unchanged,
        except when specified encoding and output encoding differ.

        Raises `ValueError` if encoded bytes cannot be written because the
        destination has a different encoding and no binary buffer,
        `UnicodeError` if the data cannot be encoded, and `TypeError` if
        the destination rejects the type of `data`.
        """
        if not self.opened:
            self.open()
        if ('b' not in self.mode and sys.version_info < (3, 0)
            or check_encoding(self.destination, self.encoding) is False
           ):
            data = self.encode(data)
            if sys.version_info >= (3, 0) and os.linesep != '\n':
                # fix endings
                data = data.replace(b'\n', bytes(os.linesep, 'ascii'))

        try:
            self.destination.write(data)
        except TypeError as e:
            if not (sys.version_info >= (3, 0) and isinstance(data, bytes)):
                # Nothing was written and there is no fallback for this
                # case: report the failure instead of silently dropping
                # the output.
                raise
            # Bytes handed to a text stream: try its binary buffer instead.
            try:
                self.destination.buffer.write(data)
            except AttributeError:
                if check_encoding(self.destination,
                                  self.encoding) is False:
                    raise ValueError('Encoding of %s (%s) differs \n'
                                     ' from specified encoding (%s)' %
                                     (self.destination_path or 'destination',
                                      self.destination.encoding,
                                      self.encoding))
                else:
                    raise e
        except (UnicodeError, LookupError) as err:
            raise UnicodeError(
                'Unable to encode output data. output-encoding is: '
                '%s.\n(%s)' % (self.encoding, ErrorString(err)))
        finally:
            if self.autoclose:
                self.close()
        return data

    def close(self):
        """Close the destination, unless it is `sys.stdout`/`sys.stderr`."""
        if self.destination not in (sys.stdout, sys.stderr):
            self.destination.close()
            self.opened = False
405
406
class BinaryFileOutput(FileOutput):
    """
    `FileOutput` variant whose default `open()` mode is binary ('wb').
    """
    # Used by core.publish_cmdline_to_binary() which in turn is used by
    # rst2odt (OpenOffice writer)
    mode = 'wb'
414
415
class StringInput(Input):

    """
    Input wrapper whose source is a string held in memory.
    """

    default_source_path = '<string>'

    def read(self):
        """Return `self.source` after passing it through `self.decode()`."""
        text = self.decode(self.source)
        return text
427
428
class StringOutput(Output):

    """
    Output wrapper that keeps the result in memory.
    """

    default_destination_path = '<string>'

    def write(self, data):
        """Run `data` through `self.encode()`, keep the result in
        `self.destination`, and return it."""
        encoded = self.encode(data)
        self.destination = encoded
        return encoded
441
442
class NullInput(Input):

    """
    Placeholder input that never yields any data.
    """

    default_source_path = 'null input'

    def read(self):
        """Return an empty Unicode string; the source is not consulted."""
        return u''
454
455
class NullOutput(Output):

    """
    Placeholder output that discards everything.
    """

    default_destination_path = 'null output'

    def write(self, data):
        """Ignore `data` entirely; nothing is stored or written."""
        return None
467
468
class DocTreeInput(Input):

    """
    Input wrapper around an existing document tree.

    Pass the document tree as the ``source`` parameter.
    """

    default_source_path = 'doctree input'

    def read(self):
        """Hand back `self.source` (the document tree) unchanged."""
        doctree = self.source
        return doctree