# HG changeset patch # User greg # Date 1539956963 14400 # Node ID f56e20e010e2f7f2d5603b3d4cb09955182c048c # Parent cb101ec1a0dd81daf7f65437c99a7cc6d50e449a Uploaded diff -r cb101ec1a0dd -r f56e20e010e2 gene_family_scaffold_loader.py --- a/gene_family_scaffold_loader.py Mon Oct 01 13:36:56 2018 -0400 +++ b/gene_family_scaffold_loader.py Fri Oct 19 09:49:23 2018 -0400 @@ -10,8 +10,15 @@ import sys import psycopg2 +from sqlalchemy import create_engine, MetaData, Table from sqlalchemy.engine.url import make_url +BLACKLIST_STRINGS = ['NULL', + 'Unknown protein', + 'No TAIR description', + 'Representative annotation below 0' + 'Representative AHRD below 0'] + class ScaffoldLoader(object): def __init__(self): @@ -27,6 +34,8 @@ self.parse_args() self.fh = open(self.args.output, "w") self.connect_db() + self.engine = create_engine(self.args.database_connection_string) + self.metadata = MetaData(self.engine) def parse_args(self): parser = argparse.ArgumentParser() @@ -103,6 +112,7 @@ 1. Parse all of the *.list files in the same directory to populate self.scaffold_genes_dict. """ + self.pto_table = Table('plant_tribes_orthogroup', self.metadata, autoload=True) scaffold_id = os.path.basename(self.args.scaffold_path) file_dir = os.path.join(self.args.scaffold_path, 'annot') # The scaffold naming convention must follow this pattern: @@ -151,16 +161,53 @@ # The species has at least 1 gene num_species += 1 num_genes += j_int - # Insert a row into the plant_tribes_orthogroup table. - args = [orthogroup_id, scaffold_id_db, num_species, num_genes] - for k in range(super_ortho_start_index, len(items)): - args.append('%s' % str(items[k])) - sql = """ - INSERT INTO plant_tribes_orthogroup - VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); - """ - cur = self.update(sql, tuple(args)) - self.flush() + # Get the auto-incremented row id to insert a row inot + # the plant_tribes_orthogroup table. + sql = "SELECT nextval('plant_tribes_orthogroup_id_seq');" + cur = self.conn.cursor() + cur.execute(sql) + plant_tribes_orthogroup_id = cur.fetchone()[0] + args = [plant_tribes_orthogroup_id, orthogroup_id, scaffold_id_db, num_species, num_genes] + last_item = len(items) + for k in range(super_ortho_start_index, last_item): + bs_found = False + # The last 7 items in this range are as follows. + # items[last_item-6]: AHRD Descriptions + # items[last_item-5]: TAIR Gene(s) Descriptions + # items[last_item-4]: Pfam Domains + # items[last_item-3]: InterProScan Descriptions + # items[last_item-2]: GO Molecular Functions + # items[last_item-1]: GO Biological Processes + # items[last_item]: GO Cellular Components + # We'll translate each of these items into a JSON + # dictionary for inserting into the table. + if k >= (last_item-7) and k <= last_item: + json_str = str(items[k]) + # Here is an example string: + # Phosphate transporter PHO1 [0.327] | Phosphate + for bs in BLACKLIST_STRINGS: + if json_str.find(bs) >= 0: + bs_found = True + args.append(None) + break + if not bs_found: + # We'll split the string on " | " to create each value. + # The keys will be zero-padded integers to enable sorting. + json_dict = dict() + json_vals = json_str.split(' | ') + for key_index, json_val in enumerate(json_vals): + # The zero-padded key is 1 based. + json_key = '%04d' % key_index + json_dict[json_key] = json_val + args.append(json_dict) + else: + args.append('%s' % str(items[k])) + sql = self.pto_table.insert().values(args) + try: + self.engine.execute(sql) + except Exception as e: + msg = "Caught exception executing SQL:\n%s\nvalues:\n%s\nException:\n%s\n" % (str(sql), str(args), e) + self.stop_err(msg) i += 1 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) for file_name in glob.glob(os.path.join(file_dir, "*list")):