diff planemo/lib/python3.7/site-packages/boltons/jsonutils.py @ 0:d30785e31577 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:18:57 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/boltons/jsonutils.py	Fri Jul 31 00:18:57 2020 -0400
@@ -0,0 +1,227 @@
+# -*- coding: utf-8 -*-
+"""``jsonutils`` aims to provide various helpers for working with
+JSON. Currently it focuses on providing a reliable and intuitive means
+of working with `JSON Lines`_-formatted files.
+
+.. _JSON Lines: http://jsonlines.org/
+
+"""
+
+from __future__ import print_function
+
+import os
+import json
+
+
+# Default number of characters/bytes requested per read by
+# ``reverse_iter_lines`` (its ``blocksize`` argument defaults to this).
+DEFAULT_BLOCKSIZE = 4096
+
+# reverse iter lines algorithm:
+#
+# Starting at the end position, read the file one block at a time,
+# moving toward the start. ``buffer`` holds the (possibly partial)
+# first line of the previously processed block. For each block, split
+# it into a list of lines, then:
+#
+#  - if the block ends in a newline, add an empty string to the line list
+#  - if there's only one item, the block contains no line break:
+#    prepend the item to the buffer and continue with the next block
+#  - if there's more than one item, pop the last item, append the
+#    buffer to it, and yield the result (a now-complete line)
+#  - yield all remaining items in reverse, except for the first
+#  - the first item becomes the new buffer, since it may continue
+#    into the preceding block
+#
+#  - when the outer loop completes, yield the buffer (if non-empty)
+
+
+# Names exported by ``from jsonutils import *``.
+__all__ = ['JSONLIterator', 'reverse_iter_lines']
+
+
+def reverse_iter_lines(file_obj, blocksize=DEFAULT_BLOCKSIZE, preseek=True):
+    """Returns an iterator over the lines from a file object, in
+    reverse order, i.e., last line first, first line last. Uses the
+    :meth:`file.seek` method of file objects, and works with both
+    text-mode objects (e.g., :class:`io.StringIO`) and binary-mode
+    objects (e.g., :class:`io.BytesIO` or a file opened with ``'rb'``).
+
+    Lines are yielded without their line terminator and have the same
+    type (text or bytes) as the data returned by ``file_obj.read()``.
+    If the data ends with a newline, the first item yielded is an
+    empty string.
+
+    Args:
+        file_obj (file): An open file object. Note that ``reverse_iter_lines``
+            mutably reads from the file and other functions should not mutably
+            interact with the file object.
+        blocksize (int): The block size to pass to :meth:`file.read()`
+        preseek (bool): Tells the function whether or not to automatically
+            seek to the end of the file. Defaults to ``True``.
+            ``preseek=False`` is useful in cases when the
+            file cursor is already in position, either at the end of
+            the file or in the middle for relative reverse line
+            generation.
+    """
+    if preseek:
+        file_obj.seek(0, os.SEEK_END)
+    cur_pos = file_obj.tell()
+    # The buffer's type (text vs. bytes) is only known once data has
+    # been read, so it is created lazily from the first block.
+    buff = None
+    while 0 < cur_pos:
+        read_size = min(blocksize, cur_pos)
+        cur_pos -= read_size
+        file_obj.seek(cur_pos, os.SEEK_SET)
+        cur = file_obj.read(read_size)
+        if not cur:
+            # Nothing readable at this position (e.g., the cursor
+            # started past the end of the data); try the block before it.
+            continue
+        empty = cur[:0]  # '' or b'', matching the type of the data
+        newline = b'\n' if isinstance(cur, bytes) else '\n'
+        if buff is None:
+            buff = empty
+        lines = cur.splitlines()
+        # Slice rather than index: indexing bytes yields an int on Python 3.
+        if cur[-1:] == newline:
+            lines.append(empty)
+        if len(lines) == 1:
+            # No line break in this block; it is all part of one line
+            # that continues into the preceding block.
+            buff = lines[0] + buff
+            continue
+        last = lines.pop()
+        yield last + buff
+        for line in lines[:0:-1]:
+            yield line
+        # The first item may be the tail of a line that starts in the
+        # preceding block, so hold it back.
+        buff = lines[0]
+    if buff:
+        yield buff
+
+
+"""
+TODO: allow passthroughs for:
+
+json.load(fp[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]])
+"""
+
+
+class JSONLIterator(object):
+    """The ``JSONLIterator`` is used to iterate over JSON-encoded objects
+    stored in the `JSON Lines format`_ (one object per line).
+
+    Most notably it has the ability to efficiently read from the
+    bottom of files, making it very effective for reading in simple
+    append-only JSONL use cases. It also has the ability to start from
+    anywhere in the file and ignore corrupted lines.
+
+    Args:
+        file_obj (file): An open file object.
+        ignore_errors (bool): Whether to skip over lines that raise an error on
+            deserialization (:func:`json.loads`).
+        reverse (bool): Controls the direction of the iteration.
+            Defaults to ``False``. If set to ``True`` and *rel_seek*
+            is unset, seeks to the end of the file before iteration
+            begins.
+        rel_seek (float): Used to preseek the start position of
+            iteration. Set to 0.0 for the start of the file, 1.0 for the
+            end, and anything in between. Negative values count back
+            from the end of the file, e.g., ``-0.25`` is equivalent to
+            ``0.75``.
+
+    Raises:
+        ValueError: If *rel_seek* is outside the range -1.0 to 1.0.
+
+    .. _JSON Lines format: http://jsonlines.org/
+    """
+    def __init__(self, file_obj,
+                 ignore_errors=False, reverse=False, rel_seek=None):
+        self._reverse = bool(reverse)
+        self._file_obj = file_obj
+        self.ignore_errors = ignore_errors
+
+        if rel_seek is None:
+            if reverse:
+                rel_seek = 1.0
+        elif not -1.0 <= rel_seek <= 1.0:
+            # Bounds are inclusive: 1.0 (the end of the file) is a
+            # documented, valid value.
+            raise ValueError("'rel_seek' expected a float between"
+                             " -1.0 and 1.0, not %r" % rel_seek)
+        elif rel_seek < 0:
+            # Negative values are measured back from the end of the file.
+            rel_seek = 1.0 + rel_seek
+        self._rel_seek = rel_seek
+        self._blocksize = 4096
+        if rel_seek is not None:
+            self._init_rel_seek()
+        if self._reverse:
+            # The cursor has already been positioned above, so the
+            # reverse line reader must not seek to the end again.
+            self._line_iter = reverse_iter_lines(self._file_obj,
+                                                 blocksize=self._blocksize,
+                                                 preseek=False)
+        else:
+            self._line_iter = iter(self._file_obj)
+
+    @property
+    def cur_byte_pos(self):
+        "A property representing where in the file the iterator is reading."
+        return self._file_obj.tell()
+
+    def _align_to_newline(self):
+        """Aligns the file object's position to the next newline.
+
+        Scans forward from the current position one block at a time and
+        leaves the cursor on the first newline found. If there is no
+        newline before the end of the file, the cursor is left at the
+        end of the file.
+        """
+        fo, bsize = self._file_obj, self._blocksize
+        start_pos = fo.tell()
+        consumed = 0  # amount of data scanned so far without a newline
+        while True:
+            chunk = fo.read(bsize)
+            if not chunk:
+                # Hit the end of the file without finding a newline.
+                fo.seek(0, os.SEEK_END)
+                return
+            newline = b'\n' if isinstance(chunk, bytes) else '\n'
+            offset = chunk.find(newline)
+            if offset >= 0:
+                # NOTE(review): computed offsets are exact for binary
+                # files; for text-mode files with multi-byte encodings
+                # the position is only approximate -- confirm if needed.
+                fo.seek(start_pos + consumed + offset)
+                return
+            # Count what was actually read; a read may return fewer
+            # items than requested.
+            consumed += len(chunk)
+
+    def _init_rel_seek(self):
+        "Sets the file object's position to the relative location set above."
+        rs, fo = self._rel_seek, self._file_obj
+        if rs == 0.0:
+            fo.seek(0, os.SEEK_SET)
+        else:
+            fo.seek(0, os.SEEK_END)
+            size = fo.tell()
+            if rs != 1.0:
+                fo.seek(int(size * rs), os.SEEK_SET)
+                # The target most likely falls mid-line; move forward
+                # to a line boundary.
+                self._align_to_newline()
+        # Record the starting position for every branch.
+        self._cur_pos = fo.tell()
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+        """Yields one :class:`dict` loaded with :func:`json.loads`, advancing
+        the file object by one line. Raises :exc:`StopIteration` upon reaching
+        the end of the file (or beginning, if ``reverse`` was set to ``True``).
+        Blank lines are skipped. Lines that fail to deserialize are skipped
+        when ``ignore_errors`` is set, otherwise the error is raised.
+        """
+        while True:
+            # StopIteration from the underlying line iterator propagates
+            # to the caller and ends the iteration.
+            line = next(self._line_iter).lstrip()
+            if not line:
+                continue
+            try:
+                obj = json.loads(line)
+            except Exception:
+                if not self.ignore_errors:
+                    raise
+                continue
+            return obj
+
+    __next__ = next
+
+
+if __name__ == '__main__':
+    def _main():
+        """Basic command-line validator for JSON Lines files.
+
+        Every command-line argument other than the recognized flags is
+        treated as a filename and each file is loaded object by object.
+        Stops at the first object that fails to load and reports its
+        approximate location. With ``-v``/``--verbose``, prints a
+        progress dot every 100 objects, the running object count every
+        10000 objects, and a summary at the end.
+        """
+        import sys
+        if '-h' in sys.argv or '--help' in sys.argv:
+            print('loads one or more JSON Line files for basic validation.')
+            return
+        verbose = '-v' in sys.argv or '--verbose' in sys.argv
+        file_count, obj_count = 0, 0
+        filenames = sys.argv[1:]
+        for filename in filenames:
+            if filename in ('-h', '--help', '-v', '--verbose'):
+                continue
+            file_count += 1
+            with open(filename, 'rb') as file_obj:
+                iterator = JSONLIterator(file_obj)
+                cur_obj_count = 0
+                while True:
+                    try:
+                        next(iterator)
+                    except ValueError:
+                        print('error reading object #%s around byte %s in %s'
+                              % (cur_obj_count + 1, iterator.cur_byte_pos, filename))
+                        return
+                    except StopIteration:
+                        break
+                    obj_count += 1
+                    cur_obj_count += 1
+                    if verbose and obj_count and obj_count % 100 == 0:
+                        sys.stdout.write('.')
+                        # Terminate the row of dots with the running
+                        # total once every 10000 objects.
+                        if obj_count % 10000 == 0:
+                            sys.stdout.write('%s\n' % obj_count)
+        if verbose:
+            print('files checked: %s' % file_count)
+            print('objects loaded: %s' % obj_count)
+        return
+
+    _main()