plant_tribes_gene_family_scaffold_loader: gene_family_scaffold

comparison gene_family_scaffold_loader.py @ 0:6282484a52bc draft

Uploaded

author	greg
date	Tue, 21 Aug 2018 12:57:00 -0400
parents
children	cb101ec1a0dd

comparison

equal deleted inserted replaced

--1:000000000000
+:6282484a52bc
+#!/usr/bin/env python
+"""
+add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes
+database efficiently by bypassing the Galaxy model and operating directly on the database.
+PostgreSQL 9.1 or greater is required.
+"""
+import argparse
+import glob
+import os
+import sys
+import psycopg2
+from sqlalchemy.engine.url import make_url
+class ScaffoldLoader(object):
+def __init__(self):
+self.args = None
+self.clustering_methods = []
+self.conn = None
+self.gene_sequences_dict = {}
+self.scaffold_genes_dict = {}
+self.scaffold_recs = []
+self.species_genes_dict = {}
+self.species_ids_dict = {}
+self.taxa_lineage_config = None
+self.parse_args()
+self.fh = open(self.args.output, "w")
+self.connect_db()
+def parse_args(self):
+parser = argparse.ArgumentParser()
+parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'),
+parser.add_argument('--output', dest='output', help='Output dataset'),
+parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory')
+self.args = parser.parse_args()
+def connect_db(self):
+url = make_url(self.args.database_connection_string)
+self.log('Connecting to database with URL: %s' % url)
+args = url.translate_connect_args(username='user')
+args.update(url.query)
+assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.'
+self.conn = psycopg2.connect(**args)
+def flush(self):
+self.conn.commit()
+def shutdown(self):
+self.conn.close()
+def update(self, sql, args):
+try:
+cur = self.conn.cursor()
+cur.execute(sql, args)
+except Exception as e:
+msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e)
+self.stop_err(msg)
+return cur
+def stop_err(self, msg):
+sys.stderr.write(msg)
+self.fh.flush()
+self.fh.close()
+sys.exit(1)
+def log(self, msg):
+self.fh.write("%s\n" % msg)
+self.fh.flush()
+@property
+def can_add_scaffold(self):
+"""
+Make sure the scaffold has not already been added.
+"""
+scaffold_id = os.path.basename(self.args.scaffold_path)
+sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id
+cur = self.conn.cursor()
+cur.execute(sql)
+try:
+cur.fetchone()[0]
+# The scaffold has been added to the database.
+return False
+except:
+# The scaffold has not yet been added.
+return True
+def run(self):
+if self.can_add_scaffold:
+self.process_annot_dir()
+self.process_scaffold_config_files()
+self.process_orthogroup_fasta_files()
+self.fh.flush()
+self.fh.close()
+else:
+self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path))
+def process_annot_dir(self):
+"""
+1. Parse all of the *.min_evalue.summary files in the
+~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate
+both the plant_tribes_scaffold and the plant_tribes_orthogroup tables.
+1. Parse all of the *.list files in the same directory to populate
+self.scaffold_genes_dict.
+"""
+scaffold_id = os.path.basename(self.args.scaffold_path)
+file_dir = os.path.join(self.args.scaffold_path, 'annot')
+# The scaffol naming convention must follow this pattern:
+# <integer1>Gv<integer2>.<integer3>
+# where integer 1 is the number of genomes in the scaffold_id.  For example:
+# 22Gv1.1 -> 22 genomes
+# 12Gv1.0 -> 12 genomes
+# 26Gv2.0 -> 26 genomes, etc.
+num_genomes = int(scaffold_id.split("Gv")[0])
+super_ortho_start_index = num_genomes + 1
+for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")):
+items = os.path.basename(file_name).split(".")
+clustering_method = items[0]
+# Save all clustering methods for later processing.
+if clustering_method not in self.clustering_methods:
+self.clustering_methods.append(clustering_method)
+# Insert a row in to the plant_tribes_scaffold table.
+self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method))
+args = [scaffold_id, clustering_method]
+sql = """
+INSERT INTO plant_tribes_scaffold
+VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s)
+RETURNING id;
+"""
+cur = self.update(sql, tuple(args))
+self.flush()
+scaffold_id_db = cur.fetchone()[0]
+self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method])
+with open(file_name, "r") as fh:
+i = 0
+for i2, line in enumerate(fh):
+if i2 == 0:
+# Skip first line.
+continue
+num_genes = 0
+num_species = 0
+items = line.split("\t")
+orthogroup_id = int(items[0])
+# Zero based items 1 to num_genomes consists of the
+# number of species classified in the orthogroup (i.e.,
+# species with at least 1 gene in the orthogroup).
+for j in range(1, num_genomes):
+j_int = int(items[j])
+if j_int > 0:
+# The  species has at least 1 gene
+num_species += 1
+num_genes += j_int
+# Insert a row into the plant_tribes_orthogroup table.
+args = [orthogroup_id, scaffold_id_db, num_species, num_genes]
+for k in range(super_ortho_start_index, len(items)):
+args.append('%s' % str(items[k]))
+sql = """
+INSERT INTO plant_tribes_orthogroup
+VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+"""
+cur = self.update(sql, tuple(args))
+self.flush()
+i += 1
+self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method))
+for file_name in glob.glob(os.path.join(file_dir, "*list")):
+items = os.path.basename(file_name).split(".")
+clustering_method = items[0]
+with open(file_name, "r") as fh:
+for i, line in enumerate(fh):
+items = line.split("\t")
+# The key will be a combination of clustering_method and
+# orthogroup_id separated by "^^" for easy splitting later.
+key = "%s^^%s" % (clustering_method, items[0])
+# The value is the gen_id with all white space replaced by "_".
+val = items[1].replace("|", "_")
+if key in self.scaffold_genes_dict:
+self.scaffold_genes_dict[key].append(val)
+else:
+self.scaffold_genes_dict[key] = [val]
+def process_scaffold_config_files(self):
+"""
+1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config
+(e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate.
+2. Calculate the number of genes found
+for each species and add the number to self.species_genes_dict.
+3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to
+populate the plant_tribes_taxon table.
+"""
+scaffold_id = os.path.basename(self.args.scaffold_path)
+file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id)
+self.log("Processing rooting order config: %s" % str(file_name))
+# Populate self.species_ids_dict.
+with open(file_name, "r") as fh:
+for i, line in enumerate(fh):
+line = line.strip()
+if len(line) == 0 or line.startswith("#") or line.startswith("["):
+# Skip blank lines, comments and section headers.
+continue
+# Example line:
+# Physcomitrella patens=Phypa
+items = line.split("=")
+self.species_ids_dict[items[1]] = items[0]
+# Get lineage information for orthogrpoup taxa.
+for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()):
+# The format of key is <clustering_method>^^<orthogroup_id>.
+# For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001"
+scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^")
+clustering_method = scaffold_genes_dict_key_items[0]
+# Get the list of genes for the current scaffold_genes_dict_key.
+gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key]
+for gene_id in gene_list:
+# Example species_code: Musac1.0, where
+# species_name is Musac and version is 1.0.
+species_code = gene_id.split("_")[1]
+# Strip the version from the species_code.
+species_code = species_code[0:5]
+# Get the species_name from self.species_ids_dict.
+species_name = self.species_ids_dict[species_code]
+# Create a key for self.species_genes_dict, with the format:
+# <clustering_method>^^<species_name>
+species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
+# Add an entry to self.species_genes_dict, where the value
+# is a list containing species_name and num_genes.
+if species_genes_dict_key in self.species_genes_dict:
+tup = self.species_genes_dict[species_genes_dict_key]
+tup[1] += 1
+self.species_genes_dict[species_genes_dict_key] = tup
+else:
+self.species_genes_dict[species_genes_dict_key] = [species_name, 1]
+# Populate the plant_tribes_taxon table.
+file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id)
+self.log("Processing taxa lineage config: %s" % str(file_name))
+with open(file_name, "r") as fh:
+for line in fh:
+line = line.strip()
+if len(line) == 0 or line.startswith("#") or line.startswith("Species"):
+# Skip blank lines, comments and section headers.
+continue
+# Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots
+items = line.split("\t")
+species_name = items[0]
+i = 0
+for clustering_method in self.clustering_methods:
+# The format of species_genes_dict_key is <clustering_method>^^<species_name>.
+species_genes_dict_key = "%s^^%s" % (clustering_method, species_name)
+# Get the scaffold_rec for the current scaffold_id and clustering_method.
+# The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
+for scaffold_rec in self.scaffold_recs:
+if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
+scaffold_id_db = scaffold_rec[0]
+# The value is a list containing species_name and num_genes.
+val = self.species_genes_dict[species_genes_dict_key]
+if species_name == val[0]:
+num_genes = val[1]
+else:
+num_genes = 0
+# Insert a row in to the plant_tribes_scaffold table.
+args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]]
+sql = """
+INSERT INTO plant_tribes_taxon
+VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s);
+"""
+self.update(sql, tuple(args))
+self.flush()
+i += 1
+self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name)))
+def process_orthogroup_fasta_files(self):
+"""
+1. Analyze all of the scaffold .fna and .faa files for each clustering
+method to populate the aa_dict and dna_dict sequence dictionaries.
+2. Use the populated sequence dictionaries to populate the plant_tribes_gene
+and gene_scaffold_orthogroup_taxon_association tables.
+"""
+scaffold_id = os.path.basename(self.args.scaffold_path)
+aa_dict = {}
+dna_dict = {}
+# Populate aa_dict and dna_dict.
+for clustering_method in self.clustering_methods:
+file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method)
+for file_name in os.listdir(file_dir):
+items = file_name.split(".")
+orthogroup_id = items[0]
+file_extension = items[1]
+if file_extension == "fna":
+adict = dna_dict
+else:
+adict = aa_dict
+file_path = os.path.join(file_dir, file_name)
+with open(file_path, "r") as fh:
+for i, line in enumerate(fh):
+line = line.strip()
+if len(line) == 0:
+# Skip blank lines (shoudn't happen).
+continue
+if line.startswith(">"):
+# Example line:
+# >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110
+gene_id = line.lstrip(">")
+# The dictionary keys will combine the orthogroup_id,
+# clustering method and gene id using the format
+# ,orthogroup_id>^^<clustering_method>^^<gene_id>.
+combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id)
+if combined_id not in adict:
+# The value will be the dna sequence string..
+adict[combined_id] = ""
+else:
+# Example line:
+# ATGGAGAAGGACTTT
+# Here combined_id is set because the fasta format specifies
+# that all lines following the gene id defined in the if block
+# above will be the sequence associated with that gene until
+# the next gene id line is encountered.
+sequence = adict[combined_id]
+sequence = "%s%s" % (sequence, line)
+adict[combined_id] = sequence
+# Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables
+# from the contents of aa_dict and dna_dict.
+self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.")
+gi = 0
+for gsoai, combined_id in enumerate(sorted(dna_dict.keys())):
+# The dictionary keys combine the orthogroup_id, clustering method and
+# gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>.
+items = combined_id.split("^^")
+orthogroup_id = items[0]
+clustering_method = items[1]
+gene_id = items[2]
+# The value will be a list containing both
+# clustering_method and the dna string.
+dna_sequence = dna_dict[combined_id]
+aa_sequence = aa_dict[combined_id]
+# Get the species_code from the gene_id.
+species_code = gene_id.split("_")[1]
+# Strip the version from the species_code.
+species_code = species_code[0:5]
+# Get the species_name from self.species_ids_dict.
+species_name = self.species_ids_dict[species_code]
+# Get the plant_tribes_orthogroup primary key id  for
+# the orthogroup_id from the plant_tribes_orthogroup table.
+sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id
+cur = self.conn.cursor()
+cur.execute(sql)
+orthogroup_id_db = cur.fetchone()[0]
+# If the plant_tribes_gene table contains a row that has the gene_id,
+# then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
+# Get the taxon_id  for the species_name from the plant_tribes_taxon table.
+sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name
+cur = self.conn.cursor()
+cur.execute(sql)
+taxon_id_db = cur.fetchone()[0]
+# If the plant_tribes_gene table contains a row that has the gene_id,
+# then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table.
+sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id
+cur = self.conn.cursor()
+cur.execute(sql)
+try:
+gene_id_db = cur.fetchone()[0]
+except:
+# Insert a row into the plant_tribes_gene table.
+args = [gene_id, dna_sequence, aa_sequence]
+sql = """
+INSERT INTO plant_tribes_gene
+VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s)
+RETURNING id;
+"""
+cur = self.update(sql, tuple(args))
+self.flush()
+gene_id_db = cur.fetchone()[0]
+gi += 1
+if gi % 1000 == 0:
+self.log("Inserted 1000 more rows into the plant_tribes_gene table.")
+# Insert a row into the gene_scaffold_orthogroup_taxon_association table.
+# Get the scaffold_rec for the current scaffold_id and clustering_method.
+# The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>]
+for scaffold_rec in self.scaffold_recs:
+if scaffold_id in scaffold_rec and clustering_method in scaffold_rec:
+scaffold_id_db = scaffold_rec[0]
+args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db]
+sql = """
+INSERT INTO gene_scaffold_orthogroup_taxon_association
+VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s);
+"""
+cur = self.update(sql, tuple(args))
+self.flush()
+if gsoai % 1000 == 0:
+self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.")
+self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi)
+self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai)
+if __name__ == '__main__':
+scaffold_loader = ScaffoldLoader()
+scaffold_loader.run()
+scaffold_loader.shutdown()

Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader

comparison gene_family_scaffold_loader.py @ 0:6282484a52bc draft