Mercurial > repos > shellac > guppy_basecaller
comparison env/lib/python3.7/site-packages/boltons/jsonutils.py @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac |
---|---|
date | Sat, 02 May 2020 07:14:21 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26e78fe6e8c4 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """``jsonutils`` aims to provide various helpers for working with | |
3 JSON. Currently it focuses on providing a reliable and intuitive means | |
4 of working with `JSON Lines`_-formatted files. | |
5 | |
6 .. _JSON Lines: http://jsonlines.org/ | |
7 | |
8 """ | |
9 | |
10 from __future__ import print_function | |
11 | |
12 import os | |
13 import json | |
14 | |
15 | |
# Default ``blocksize`` for reverse_iter_lines(): how much is requested
# from the file object per read() call.
DEFAULT_BLOCKSIZE = 4096

# reverse iter lines algorithm:
#
# Blocks are read from the current position back toward the start of
# the file. For each block, split it into a list of lines, then:
#
# - if the block ends in a newline, add an empty string to the line list
# - if there's one item, then prepend it to the buffer, continue
# - if there's more than one item, pop the last item, append the
#   buffer to it, and yield the result
# - yield all remaining items in reverse, except for the first
# - first item becomes the new buffer
#
# - when the outer loop completes, yield the buffer (if it is non-empty)


# Names exported by ``from jsonutils import *``.
__all__ = ['JSONLIterator', 'reverse_iter_lines']
31 | |
32 | |
def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True):
    """Returns an iterator over the lines from a file object, in
    reverse order, i.e., last line first, first line last. Uses the
    :meth:`file.seek` method of file objects, and works with file
    objects that return either text or bytes from :meth:`file.read`.

    Lines are yielded without their line terminator and have the same
    type (text or bytes) as the data read from *file_obj*. If the data
    ends in a newline, the first item yielded is an empty line. An
    empty line at the very start of the data is not yielded.

    Args:
        file_obj (file): An open file object. Note that ``reverse_iter_lines``
            mutably reads from the file and other functions should not mutably
            interact with the file object.
        blocksize (int): The block size to pass to :meth:`file.read()`.
            Must be at least 1.
        preseek (bool): Tells the function whether or not to automatically
            seek to the end of the file. Defaults to ``True``.
            ``preseek=False`` is useful in cases when the
            file cursor is already in position, either at the end of
            the file or in the middle for relative reverse line
            generation.

    Raises:
        ValueError: If *blocksize* is less than 1 (raised when iteration
            starts, since this is a generator).
    """
    if blocksize < 1:
        # A non-positive block size would never move the cursor backwards.
        raise ValueError('blocksize must be a positive integer, not %r'
                         % (blocksize,))
    if preseek:
        file_obj.seek(0, os.SEEK_END)
    cur_pos = file_obj.tell()
    # The carry-over buffer is created lazily from the first chunk read, so
    # that it has the same type (text or bytes) as the file's contents.
    buff = None
    while 0 < cur_pos:
        read_size = min(blocksize, cur_pos)
        cur_pos -= read_size
        file_obj.seek(cur_pos, os.SEEK_SET)
        cur = file_obj.read(read_size)
        if not cur:
            # Nothing readable here (e.g., the cursor started beyond the
            # end of the data); keep stepping back toward the start.
            continue
        empty = cur[:0]
        newline = b'\n' if isinstance(cur, bytes) else u'\n'
        if buff is None:
            buff = empty
        lines = cur.splitlines()
        if cur.endswith(newline):
            # The block ends exactly on a line boundary: the carried-over
            # buffer is a complete line by itself.
            lines.append(empty)
        if len(lines) == 1:
            # No line boundary in this block; it is all part of one line.
            buff = lines[0] + buff
            continue
        last = lines.pop()
        yield last + buff
        for line in lines[:0:-1]:
            yield line
        # The first line may continue into the previous (earlier) block.
        buff = lines[0]
    if buff:
        yield buff
74 | |
75 | |
76 """ | |
77 TODO: allow passthroughs for: | |
78 | |
79 json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]]) | |
80 """ | |
81 | |
82 | |
class JSONLIterator(object):
    """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
    stored in the `JSON Lines format`_ (one object per line).

    Most notably it has the ability to efficiently read from the
    bottom of files, making it very effective for reading in simple
    append-only JSONL use cases. It also has the ability to start from
    anywhere in the file and ignore corrupted lines.

    Args:
        file_obj (file): An open file object.
        ignore_errors (bool): Whether to skip over lines that raise an error on
            deserialization (:func:`json.loads`).
        reverse (bool): Controls the direction of the iteration.
            Defaults to ``False``. If set to ``True`` and *rel_seek*
            is unset, seeks to the end of the file before iteration
            begins.
        rel_seek (float): Used to preseek the start position of
            iteration. Set to 0.0 for the start of the file, 1.0 for the
            end, and anything in between. Negative values are measured
            back from the end of the file, e.g., ``-0.25`` is equivalent
            to ``0.75``.

    Raises:
        ValueError: If *rel_seek* is not between -1.0 and 1.0 (inclusive).

    .. _JSON Lines format: http://jsonlines.org/
    """
    def __init__(self, file_obj,
                 ignore_errors=False, reverse=False, rel_seek=None):
        self._reverse = bool(reverse)
        self._file_obj = file_obj
        self.ignore_errors = ignore_errors

        if rel_seek is None:
            if reverse:
                rel_seek = 1.0
        elif not -1.0 <= rel_seek <= 1.0:
            raise ValueError("'rel_seek' expected a float between"
                             " -1.0 and 1.0, not %r" % rel_seek)
        elif rel_seek < 0:
            # Negative values count back from the end of the file.
            rel_seek = 1.0 + rel_seek
        self._rel_seek = rel_seek
        self._blocksize = 4096
        if rel_seek is not None:
            self._init_rel_seek()
        if self._reverse:
            # The cursor is already positioned, so no preseek.
            self._line_iter = reverse_iter_lines(self._file_obj,
                                                 blocksize=self._blocksize,
                                                 preseek=False)
        else:
            self._line_iter = iter(self._file_obj)

    @property
    def cur_byte_pos(self):
        "A property representing where in the file the iterator is reading."
        return self._file_obj.tell()

    def _align_to_newline(self):
        """Aligns the file object's position to the next newline.

        If there is no newline between the current position and the end
        of the file, the position is left at the end of the file.
        """
        fo, bsize = self._file_obj, self._blocksize
        start_pos = fo.tell()
        total_read = 0
        while True:
            cur = fo.read(bsize)
            if not cur:
                # Reached the end of the file without finding a newline.
                return
            # Match the separator type to the data (text or bytes).
            newline = b'\n' if isinstance(cur, bytes) else u'\n'
            newline_offset = cur.find(newline)
            if newline_offset >= 0:
                fo.seek(start_pos + total_read + newline_offset)
                return
            total_read += len(cur)

    def _init_rel_seek(self):
        "Sets the file object's position to the relative location set above."
        rs, fo = self._rel_seek, self._file_obj
        if rs == 0.0:
            fo.seek(0, os.SEEK_SET)
            self._cur_pos = 0
        else:
            fo.seek(0, os.SEEK_END)
            size = fo.tell()
            if rs == 1.0:
                self._cur_pos = size
            else:
                target = int(size * rs)
                fo.seek(target, os.SEEK_SET)
                self._align_to_newline()
                self._cur_pos = fo.tell()

    def __iter__(self):
        return self

    def next(self):
        """Returns the next object loaded with :func:`json.loads`, advancing
        the file object by one line. Blank lines are skipped, as are
        lines that fail to deserialize when ``ignore_errors`` is set.
        Raises :exc:`StopIteration` upon reaching the end of the file
        (or beginning, if ``reverse`` was set to ``True``).
        """
        while 1:
            line = next(self._line_iter).lstrip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                if not self.ignore_errors:
                    raise
                continue
            return obj

    __next__ = next
187 | |
188 | |
if __name__ == '__main__':
    def _main():
        """Loads every JSON Lines file named on the command line and
        reports how many files and objects were read.

        Recognized flags: ``-h``/``--help`` prints a usage line and
        exits; ``-v``/``--verbose`` prints progress and a summary.
        Stops at the first line that fails to parse, after printing
        its approximate location.
        """
        import sys
        if '-h' in sys.argv or '--help' in sys.argv:
            print('loads one or more JSON Line files for basic validation.')
            return
        verbose = False
        if '-v' in sys.argv or '--verbose' in sys.argv:
            verbose = True
        file_count, obj_count = 0, 0
        filenames = sys.argv[1:]
        for filename in filenames:
            if filename in ('-h', '--help', '-v', '--verbose'):
                continue
            file_count += 1
            with open(filename, 'rb') as file_obj:
                iterator = JSONLIterator(file_obj)
                cur_obj_count = 0
                while 1:
                    try:
                        next(iterator)
                    except ValueError:
                        print('error reading object #%s around byte %s in %s'
                              % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
                        return
                    except StopIteration:
                        break
                    obj_count += 1
                    cur_obj_count += 1
                    if verbose and obj_count and obj_count % 100 == 0:
                        # One dot per 100 objects...
                        sys.stdout.write('.')
                        # ...and the running total every 10000 objects.
                        if obj_count % 10000 == 0:
                            sys.stdout.write('%s\n' % obj_count)
        if verbose:
            print('files checked: %s' % file_count)
            print('objects loaded: %s' % obj_count)
        return

    _main()