comparison env/lib/python3.7/site-packages/boltons/jsonutils.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # -*- coding: utf-8 -*-
2 """``jsonutils`` aims to provide various helpers for working with
3 JSON. Currently it focuses on providing a reliable and intuitive means
4 of working with `JSON Lines`_-formatted files.
5
6 .. _JSON Lines: http://jsonlines.org/
7
8 """
9
10 from __future__ import print_function
11
12 import os
13 import json
14
15
16 DEFAULT_BLOCKSIZE = 4096
17
18 # reverse iter lines algorithm:
19 #
20 # - if it ends in a newline, add an empty string to the line list
21 # - if there's one item, then prepend it to the buffer, continue
22 # - if there's more than one item, pop the last item and prepend it
23 # to the buffer, yielding it
24 # - yield all remaining items in reverse, except for the first
25 # - first item becomes the new buffer
26 #
27 # - when the outer loop completes, yield the buffer
28
29
30 __all__ = ['JSONLIterator', 'reverse_iter_lines']
31
32
def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True):
    """Returns an iterator over the lines from a file object, in
    reverse order, i.e., last line first, first line last. Uses the
    :meth:`file.seek` method of file objects. Works with objects whose
    :meth:`file.read` returns text (e.g., :class:`io.StringIO`) as well
    as objects whose reads return bytes (e.g., a file opened in binary
    mode); yielded lines have the same type as the data that was read.

    Lines are yielded without their line endings. If the data ends
    with a newline, the first item yielded is an empty line (the
    "line" following the final newline). An empty first line of the
    file is not yielded.

    Args:
        file_obj (file): An open file object. Note that ``reverse_iter_lines``
            mutably reads from the file and other functions should not mutably
            interact with the file object.
        blocksize (int): The block size to pass to :meth:`file.read()`.
            Must be at least 1.
        preseek (bool): Tells the function whether or not to automatically
            seek to the end of the file. Defaults to ``True``.
            ``preseek=False`` is useful in cases when the
            file cursor is already in position, either at the end of
            the file or in the middle for relative reverse line
            generation.

    Raises:
        ValueError: If *blocksize* is less than 1 (raised when
            iteration starts, since this is a generator).
    """
    if blocksize < 1:
        # A non-positive block size can never make progress backwards.
        raise ValueError('blocksize expected a positive integer, not %r'
                         % (blocksize,))
    if preseek:
        file_obj.seek(0, os.SEEK_END)
    cur_pos = file_obj.tell()
    # Partial line carried over from the previously read (later) block.
    # Its type is only known once data has been read, so start untyped.
    buff = None
    while 0 < cur_pos:
        read_size = min(blocksize, cur_pos)
        cur_pos -= read_size
        file_obj.seek(cur_pos, os.SEEK_SET)
        cur = file_obj.read(read_size)
        if not cur:
            # Nothing was read (e.g., the cursor started beyond the end
            # of the data); keep stepping backwards.
            continue
        empty = cur[:0]  # '' or b'', matching whatever read() returned
        newline = b'\n' if isinstance(cur, bytes) else '\n'
        if buff is None:
            buff = empty
        lines = cur.splitlines()
        if cur.endswith(newline):
            # The block ends exactly on a line boundary, so the carried
            # buffer is a complete line on its own.
            lines.append(empty)
        if len(lines) == 1:
            # No line boundary in this block; keep accumulating.
            buff = lines[0] + buff
            continue
        last = lines.pop()
        yield last + buff
        # Everything between the first and last pieces is a whole line.
        for line in lines[:0:-1]:
            yield line
        # The first piece may continue into the block before this one.
        buff = lines[0]
    if buff:
        # NOTE: an empty leftover buffer is not yielded, so an empty
        # first line of the file is dropped.
        yield buff
74
75
76 """
77 TODO: allow passthroughs for:
78
79 json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]])
80 """
81
82
class JSONLIterator(object):
    """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
    stored in the `JSON Lines format`_ (one object per line).

    Most notably it has the ability to efficiently read from the
    bottom of files, making it very effective for reading in simple
    append-only JSONL use cases. It also has the ability to start from
    anywhere in the file and ignore corrupted lines.

    Args:
        file_obj (file): An open file object.
        ignore_errors (bool): Whether to skip over lines that raise an error on
            deserialization (:func:`json.loads`).
        reverse (bool): Controls the direction of the iteration.
            Defaults to ``False``. If set to ``True`` and *rel_seek*
            is unset, seeks to the end of the file before iteration
            begins.
        rel_seek (float): Used to preseek the start position of
            iteration. Set to 0.0 for the start of the file, 1.0 for the
            end, and anything in between. Negative values are measured
            back from the end of the file (e.g., ``-0.25`` is the same
            as ``0.75``). For values strictly between the start and the
            end, the position is advanced to the next newline, so a
            partially skipped line is never parsed.

    Raises:
        ValueError: If *rel_seek* is outside the range -1.0 to 1.0.

    .. _JSON Lines format: http://jsonlines.org/
    """
    def __init__(self, file_obj,
                 ignore_errors=False, reverse=False, rel_seek=None):
        self._reverse = bool(reverse)
        self._file_obj = file_obj
        self.ignore_errors = ignore_errors

        if rel_seek is None:
            if reverse:
                rel_seek = 1.0
        elif not -1.0 <= rel_seek <= 1.0:
            # Inclusive bounds: 0.0 and 1.0 are the documented start/end.
            raise ValueError("'rel_seek' expected a float between"
                             " -1.0 and 1.0, not %r" % rel_seek)
        elif rel_seek < 0:
            # Negative fractions count back from the end of the file.
            rel_seek = 1.0 + rel_seek
        self._rel_seek = rel_seek
        # Same value as the module-level DEFAULT_BLOCKSIZE.
        self._blocksize = 4096
        if rel_seek is not None:
            self._init_rel_seek()
        if self._reverse:
            # The cursor has already been positioned above, so the
            # reverse line reader must not seek to the end again.
            self._line_iter = reverse_iter_lines(self._file_obj,
                                                 blocksize=self._blocksize,
                                                 preseek=False)
        else:
            self._line_iter = iter(self._file_obj)

    @property
    def cur_byte_pos(self):
        "A property representing where in the file the iterator is reading."
        return self._file_obj.tell()

    def _align_to_newline(self):
        """Aligns the file object's position to the next newline.

        Reads forward in blocks from the current position and seeks to
        the first newline found. If there is no newline before the end
        of the file, the position is left at the end of the file.
        """
        # NOTE(review): the final seek assumes tell() offsets advance by
        # one per item returned by read(); this holds for binary files
        # and in-memory streams -- confirm for text files containing
        # multi-byte characters.
        fo, bsize = self._file_obj, self._blocksize
        start_pos = fo.tell()
        consumed = 0
        while True:
            chunk = fo.read(bsize)
            if not chunk:
                # End of file without a newline: every read so far has
                # already moved the cursor to the end, so stop here
                # rather than reading empty chunks forever.
                return
            newline = b'\n' if isinstance(chunk, bytes) else '\n'
            index = chunk.find(newline)
            if index >= 0:
                fo.seek(start_pos + consumed + index)
                return
            consumed += len(chunk)

    def _init_rel_seek(self):
        "Sets the file object's position to the relative location set above."
        rs, fo = self._rel_seek, self._file_obj
        if rs == 0.0:
            fo.seek(0, os.SEEK_SET)
        else:
            fo.seek(0, os.SEEK_END)
            size = fo.tell()
            if rs == 1.0:
                # Already at the end of the file after measuring its size.
                self._cur_pos = size
            else:
                target = int(size * rs)
                fo.seek(target, os.SEEK_SET)
                # The target most likely lands mid-line; move to a line
                # boundary so iteration starts on a complete line.
                self._align_to_newline()
                self._cur_pos = fo.tell()

    def __iter__(self):
        return self

    def next(self):
        """Yields one :class:`dict` loaded with :func:`json.loads`, advancing
        the file object by one line. Raises :exc:`StopIteration` upon reaching
        the end of the file (or beginning, if ``reverse`` was set to ``True``).

        Blank lines are skipped. Lines that fail to deserialize are
        skipped when ``ignore_errors`` is set; otherwise the error is
        re-raised.
        """
        while True:
            # StopIteration from the underlying line iterator propagates
            # and ends this iterator as well.
            line = next(self._line_iter).lstrip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                if not self.ignore_errors:
                    raise
                continue
            return obj

    __next__ = next
187
188
if __name__ == '__main__':
    def _main():
        """Loads each JSON Lines file named on the command line and
        reports how many objects were read.

        Recognized flags are ``-h``/``--help`` (print a usage line and
        exit) and ``-v``/``--verbose`` (print progress and a summary).
        Checking stops at the first object that fails to load: the
        error is reported and the remaining files are not read.
        """
        import sys
        if '-h' in sys.argv or '--help' in sys.argv:
            print('loads one or more JSON Line files for basic validation.')
            return
        verbose = '-v' in sys.argv or '--verbose' in sys.argv
        file_count, obj_count = 0, 0
        filenames = sys.argv[1:]
        for filename in filenames:
            # Flags share argv with the filenames; skip them here.
            if filename in ('-h', '--help', '-v', '--verbose'):
                continue
            file_count += 1
            with open(filename, 'rb') as file_obj:
                iterator = JSONLIterator(file_obj)
                cur_obj_count = 0
                while True:
                    try:
                        next(iterator)
                    except ValueError:
                        print('error reading object #%s around byte %s in %s'
                              % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
                        return
                    except StopIteration:
                        break
                    obj_count += 1
                    cur_obj_count += 1
                    if verbose and obj_count and obj_count % 100 == 0:
                        # One dot per 100 objects, and the running total
                        # at the end of each run of 10000.
                        sys.stdout.write('.')
                        if obj_count % 10000 == 0:
                            sys.stdout.write('%s\n' % obj_count)
        if verbose:
            print('files checked: %s' % file_count)
            print('objects loaded: %s' % obj_count)

    _main()