Mercurial > repos > shellac > guppy_basecaller
annotate env/bin/dynamodb_load @ 4:79f47841a781 draft
"planemo upload commit 2a0fe2cc28b09e101d37293e53e82f61762262ec"
| author | shellac | 
|---|---|
| date | Thu, 14 May 2020 16:47:39 -0400 | 
| parents | 26e78fe6e8c4 | 
| children | 
| rev | line source | 
|---|---|
| 0 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 1 #!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 2 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 3 import argparse | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 4 import os | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 5 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 6 import boto | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 7 from boto.compat import json | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 8 from boto.compat import six | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 9 from boto.dynamodb.schema import Schema | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 10 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 11 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 12 DESCRIPTION = """Load data into one or more DynamoDB tables. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 13 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 14 For each table, data is read from two files: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 15 - {table_name}.metadata for the table's name, schema and provisioned | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 16 throughput (only required if creating the table). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 17 - {table_name}.data for the table's actual contents. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 18 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 19 Both files are searched for in the current directory. To read them from | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 20 somewhere else, use the --in-dir parameter. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 21 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 22 This program does not wipe the tables prior to loading data. However, any | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 23 items present in the data files will overwrite the table's contents. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 24 """ | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 25 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 26 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 27 def _json_iterload(fd): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 28 """Lazily load newline-separated JSON objects from a file-like object.""" | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 29 buffer = "" | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 30 eof = False | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 31 while not eof: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 32 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 33 # Add a line to the buffer | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 34 buffer += fd.next() | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 35 except StopIteration: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 36 # We can't let that exception bubble up, otherwise the last | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 37 # object in the file will never be decoded. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 38 eof = True | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 39 try: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 40 # Try to decode a JSON object. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 41 json_object = json.loads(buffer.strip()) | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 42 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 43 # Success: clear the buffer (everything was decoded). | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 44 buffer = "" | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 45 except ValueError: | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 46 if eof and buffer.strip(): | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 47 # No more lines to load and the buffer contains something other | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 48 # than whitespace: the file is, in fact, malformed. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 49 raise | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 50 # We couldn't decode a complete JSON object: load more lines. | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 51 continue | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 52 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 53 yield json_object | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 54 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 55 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def create_table(metadata_fd):
    """Create a table from a metadata file-like object.

    NOTE: this is currently an unimplemented placeholder. The
    ``metadata_fd`` argument is ignored and ``None`` is returned.
    """
    return None
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 58 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 59 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def load_table(table, in_fd):
    """Load items into a table from a file-like object.

    Every JSON object read from ``in_fd`` becomes one item: its attributes
    are passed to ``table.new_item`` and the resulting item is immediately
    stored with ``put()``.

    :param table: object providing ``new_item(attrs=...)``.
    :param in_fd: file-like object holding newline-separated JSON objects.
    """
    for record in _json_iterload(in_fd):
        # Convert lists back to sets; every other value is kept as-is.
        attrs = {
            name: set(value) if isinstance(value, list) else value
            for name, value in six.iteritems(record)
        }
        table.new_item(attrs=attrs).put()
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 71 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 72 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
def dynamodb_load(tables, in_dir, create_tables):
    """Load the data files found in ``in_dir`` into the given tables.

    :param tables: iterable of table names to load.
    :param in_dir: directory holding ``<name>.metadata`` and ``<name>.data``.
    :param create_tables: when true, each table is first created from its
        metadata file (keys ``schema``, ``read_units``, ``write_units``);
        otherwise the table is looked up with ``get_table``.
    """
    connection = boto.connect_dynamodb()
    for name in tables:
        metadata_path = os.path.join(in_dir, "%s.metadata" % name)
        data_path = os.path.join(in_dir, "%s.data" % name)

        if not create_tables:
            table = connection.get_table(name)
        else:
            with open(metadata_path) as metadata_fd:
                spec = json.load(metadata_fd)
            table = connection.create_table(
                name=name,
                schema=Schema(spec["schema"]),
                read_units=spec["read_units"],
                write_units=spec["write_units"],
            )
            # Refresh with wait_for_active=True before loading any items.
            table.refresh(wait_for_active=True)

        with open(data_path) as data_fd:
            load_table(table, data_fd)
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 93 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
changeset | 94 | 
| 
26e78fe6e8c4
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
 shellac parents: diff
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and load every table
    # named on the command line.
    parser = argparse.ArgumentParser(
        prog="dynamodb_load",
        description=DESCRIPTION,
        # DESCRIPTION contains several paragraphs and a bulleted list; the
        # default formatter would reflow it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--create-tables",
        action="store_true",
        help="Create the tables if they don't exist already (without this flag, attempts to load data into non-existing tables fail)."
    )
    parser.add_argument(
        "--in-dir",
        default=".",
        help="Directory to read the .metadata and .data files from (defaults to the current directory).",
    )
    parser.add_argument(
        "tables",
        metavar="TABLES",
        nargs="+",
        help="Names of the tables to load.",
    )

    args = parser.parse_args()

    dynamodb_load(args.tables, args.in_dir, args.create_tables)
