Mercurial > repos > greg > plant_tribes_gene_family_scaffold_loader
comparison gene_family_scaffold_loader.py @ 0:6282484a52bc draft
Uploaded
author | greg |
---|---|
date | Tue, 21 Aug 2018 12:57:00 -0400 |
parents | |
children | cb101ec1a0dd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6282484a52bc |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 add_plant_tribes_scaffold.py - A script for adding a scaffold to the Galaxy PlantTribes | |
4 database efficiently by bypassing the Galaxy model and operating directly on the database. | |
5 PostgreSQL 9.1 or greater is required. | |
6 """ | |
7 import argparse | |
8 import glob | |
9 import os | |
10 import sys | |
11 | |
12 import psycopg2 | |
13 from sqlalchemy.engine.url import make_url | |
14 | |
15 | |
16 class ScaffoldLoader(object): | |
17 def __init__(self): | |
18 self.args = None | |
19 self.clustering_methods = [] | |
20 self.conn = None | |
21 self.gene_sequences_dict = {} | |
22 self.scaffold_genes_dict = {} | |
23 self.scaffold_recs = [] | |
24 self.species_genes_dict = {} | |
25 self.species_ids_dict = {} | |
26 self.taxa_lineage_config = None | |
27 self.parse_args() | |
28 self.fh = open(self.args.output, "w") | |
29 self.connect_db() | |
30 | |
31 def parse_args(self): | |
32 parser = argparse.ArgumentParser() | |
33 parser.add_argument('--database_connection_string', dest='database_connection_string', help='Postgres database connection string'), | |
34 parser.add_argument('--output', dest='output', help='Output dataset'), | |
35 parser.add_argument('--scaffold_path', dest='scaffold_path', help='Full path to PlantTribes scaffold directory') | |
36 self.args = parser.parse_args() | |
37 | |
38 def connect_db(self): | |
39 url = make_url(self.args.database_connection_string) | |
40 self.log('Connecting to database with URL: %s' % url) | |
41 args = url.translate_connect_args(username='user') | |
42 args.update(url.query) | |
43 assert url.get_dialect().name == 'postgresql', 'This script can only be used with PostgreSQL.' | |
44 self.conn = psycopg2.connect(**args) | |
45 | |
46 def flush(self): | |
47 self.conn.commit() | |
48 | |
49 def shutdown(self): | |
50 self.conn.close() | |
51 | |
52 def update(self, sql, args): | |
53 try: | |
54 cur = self.conn.cursor() | |
55 cur.execute(sql, args) | |
56 except Exception as e: | |
57 msg = "Caught exception executing SQL:\n%s\nException:\n%s\n" % (sql.format(args), e) | |
58 self.stop_err(msg) | |
59 return cur | |
60 | |
61 def stop_err(self, msg): | |
62 sys.stderr.write(msg) | |
63 self.fh.flush() | |
64 self.fh.close() | |
65 sys.exit(1) | |
66 | |
67 def log(self, msg): | |
68 self.fh.write("%s\n" % msg) | |
69 self.fh.flush() | |
70 | |
71 @property | |
72 def can_add_scaffold(self): | |
73 """ | |
74 Make sure the scaffold has not already been added. | |
75 """ | |
76 scaffold_id = os.path.basename(self.args.scaffold_path) | |
77 sql = "SELECT id FROM plant_tribes_scaffold WHERE scaffold_id = '%s';" % scaffold_id | |
78 cur = self.conn.cursor() | |
79 cur.execute(sql) | |
80 try: | |
81 cur.fetchone()[0] | |
82 # The scaffold has been added to the database. | |
83 return False | |
84 except: | |
85 # The scaffold has not yet been added. | |
86 return True | |
87 | |
88 def run(self): | |
89 if self.can_add_scaffold: | |
90 self.process_annot_dir() | |
91 self.process_scaffold_config_files() | |
92 self.process_orthogroup_fasta_files() | |
93 self.fh.flush() | |
94 self.fh.close() | |
95 else: | |
96 self.stop_err("The scaffold %s has already been added to the database." % os.path.basename(self.args.scaffold_path)) | |
97 | |
98 def process_annot_dir(self): | |
99 """ | |
100 1. Parse all of the *.min_evalue.summary files in the | |
101 ~/<scaffold_id>/annot directory (e.g., ~/22Gv1.1/annot) to populate | |
102 both the plant_tribes_scaffold and the plant_tribes_orthogroup tables. | |
103 1. Parse all of the *.list files in the same directory to populate | |
104 self.scaffold_genes_dict. | |
105 """ | |
106 scaffold_id = os.path.basename(self.args.scaffold_path) | |
107 file_dir = os.path.join(self.args.scaffold_path, 'annot') | |
108 # The scaffol naming convention must follow this pattern: | |
109 # <integer1>Gv<integer2>.<integer3> | |
110 # where integer 1 is the number of genomes in the scaffold_id. For example: | |
111 # 22Gv1.1 -> 22 genomes | |
112 # 12Gv1.0 -> 12 genomes | |
113 # 26Gv2.0 -> 26 genomes, etc. | |
114 num_genomes = int(scaffold_id.split("Gv")[0]) | |
115 super_ortho_start_index = num_genomes + 1 | |
116 for file_name in glob.glob(os.path.join(file_dir, "*min_evalue.summary")): | |
117 items = os.path.basename(file_name).split(".") | |
118 clustering_method = items[0] | |
119 # Save all clustering methods for later processing. | |
120 if clustering_method not in self.clustering_methods: | |
121 self.clustering_methods.append(clustering_method) | |
122 # Insert a row in to the plant_tribes_scaffold table. | |
123 self.log("Inserting a row into the plant_tribes_scaffold table for scaffold %s and clustering method %s." % (scaffold_id, clustering_method)) | |
124 args = [scaffold_id, clustering_method] | |
125 sql = """ | |
126 INSERT INTO plant_tribes_scaffold | |
127 VALUES (nextval('plant_tribes_scaffold_id_seq'), %s, %s) | |
128 RETURNING id; | |
129 """ | |
130 cur = self.update(sql, tuple(args)) | |
131 self.flush() | |
132 scaffold_id_db = cur.fetchone()[0] | |
133 self.scaffold_recs.append([scaffold_id_db, scaffold_id, clustering_method]) | |
134 with open(file_name, "r") as fh: | |
135 i = 0 | |
136 for i2, line in enumerate(fh): | |
137 if i2 == 0: | |
138 # Skip first line. | |
139 continue | |
140 num_genes = 0 | |
141 num_species = 0 | |
142 items = line.split("\t") | |
143 orthogroup_id = int(items[0]) | |
144 # Zero based items 1 to num_genomes consists of the | |
145 # number of species classified in the orthogroup (i.e., | |
146 # species with at least 1 gene in the orthogroup). | |
147 for j in range(1, num_genomes): | |
148 j_int = int(items[j]) | |
149 if j_int > 0: | |
150 # The species has at least 1 gene | |
151 num_species += 1 | |
152 num_genes += j_int | |
153 # Insert a row into the plant_tribes_orthogroup table. | |
154 args = [orthogroup_id, scaffold_id_db, num_species, num_genes] | |
155 for k in range(super_ortho_start_index, len(items)): | |
156 args.append('%s' % str(items[k])) | |
157 sql = """ | |
158 INSERT INTO plant_tribes_orthogroup | |
159 VALUES (nextval('plant_tribes_orthogroup_id_seq'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); | |
160 """ | |
161 cur = self.update(sql, tuple(args)) | |
162 self.flush() | |
163 i += 1 | |
164 self.log("Inserted %d rows into the plant_tribes_orthogroup table for scaffold %s and clustering method %s." % (i, scaffold_id, clustering_method)) | |
165 for file_name in glob.glob(os.path.join(file_dir, "*list")): | |
166 items = os.path.basename(file_name).split(".") | |
167 clustering_method = items[0] | |
168 with open(file_name, "r") as fh: | |
169 for i, line in enumerate(fh): | |
170 items = line.split("\t") | |
171 # The key will be a combination of clustering_method and | |
172 # orthogroup_id separated by "^^" for easy splitting later. | |
173 key = "%s^^%s" % (clustering_method, items[0]) | |
174 # The value is the gen_id with all white space replaced by "_". | |
175 val = items[1].replace("|", "_") | |
176 if key in self.scaffold_genes_dict: | |
177 self.scaffold_genes_dict[key].append(val) | |
178 else: | |
179 self.scaffold_genes_dict[key] = [val] | |
180 | |
181 def process_scaffold_config_files(self): | |
182 """ | |
183 1. Parse ~/<scaffold_id>/<scaffold_id>/.rootingOrder.config | |
184 (e.g., ~/22Gv1.1/22Gv1.1..rootingOrder.config) to populate. | |
185 2. Calculate the number of genes found | |
186 for each species and add the number to self.species_genes_dict. | |
187 3. Parse ~/<scaffold_id>/<scaffold_id>.taxaLineage.config to | |
188 populate the plant_tribes_taxon table. | |
189 """ | |
190 scaffold_id = os.path.basename(self.args.scaffold_path) | |
191 file_name = os.path.join(self.args.scaffold_path, '%s.rootingOrder.config' % scaffold_id) | |
192 self.log("Processing rooting order config: %s" % str(file_name)) | |
193 # Populate self.species_ids_dict. | |
194 with open(file_name, "r") as fh: | |
195 for i, line in enumerate(fh): | |
196 line = line.strip() | |
197 if len(line) == 0 or line.startswith("#") or line.startswith("["): | |
198 # Skip blank lines, comments and section headers. | |
199 continue | |
200 # Example line: | |
201 # Physcomitrella patens=Phypa | |
202 items = line.split("=") | |
203 self.species_ids_dict[items[1]] = items[0] | |
204 # Get lineage information for orthogrpoup taxa. | |
205 for scaffold_genes_dict_key in sorted(self.scaffold_genes_dict.keys()): | |
206 # The format of key is <clustering_method>^^<orthogroup_id>. | |
207 # For example: {"gfam^^1" : "gnl_Musac1.0_GSMUA_Achr1T11000_001" | |
208 scaffold_genes_dict_key_items = scaffold_genes_dict_key.split("^^") | |
209 clustering_method = scaffold_genes_dict_key_items[0] | |
210 # Get the list of genes for the current scaffold_genes_dict_key. | |
211 gene_list = self.scaffold_genes_dict[scaffold_genes_dict_key] | |
212 for gene_id in gene_list: | |
213 # Example species_code: Musac1.0, where | |
214 # species_name is Musac and version is 1.0. | |
215 species_code = gene_id.split("_")[1] | |
216 # Strip the version from the species_code. | |
217 species_code = species_code[0:5] | |
218 # Get the species_name from self.species_ids_dict. | |
219 species_name = self.species_ids_dict[species_code] | |
220 # Create a key for self.species_genes_dict, with the format: | |
221 # <clustering_method>^^<species_name> | |
222 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) | |
223 # Add an entry to self.species_genes_dict, where the value | |
224 # is a list containing species_name and num_genes. | |
225 if species_genes_dict_key in self.species_genes_dict: | |
226 tup = self.species_genes_dict[species_genes_dict_key] | |
227 tup[1] += 1 | |
228 self.species_genes_dict[species_genes_dict_key] = tup | |
229 else: | |
230 self.species_genes_dict[species_genes_dict_key] = [species_name, 1] | |
231 # Populate the plant_tribes_taxon table. | |
232 file_name = os.path.join(self.args.scaffold_path, '%s.taxaLineage.config' % scaffold_id) | |
233 self.log("Processing taxa lineage config: %s" % str(file_name)) | |
234 with open(file_name, "r") as fh: | |
235 for line in fh: | |
236 line = line.strip() | |
237 if len(line) == 0 or line.startswith("#") or line.startswith("Species"): | |
238 # Skip blank lines, comments and section headers. | |
239 continue | |
240 # Example line: Populus trichocarpa\tSalicaceae\tMalpighiales\tRosids\tCore Eudicots | |
241 items = line.split("\t") | |
242 species_name = items[0] | |
243 i = 0 | |
244 for clustering_method in self.clustering_methods: | |
245 # The format of species_genes_dict_key is <clustering_method>^^<species_name>. | |
246 species_genes_dict_key = "%s^^%s" % (clustering_method, species_name) | |
247 # Get the scaffold_rec for the current scaffold_id and clustering_method. | |
248 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | |
249 for scaffold_rec in self.scaffold_recs: | |
250 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | |
251 scaffold_id_db = scaffold_rec[0] | |
252 # The value is a list containing species_name and num_genes. | |
253 val = self.species_genes_dict[species_genes_dict_key] | |
254 if species_name == val[0]: | |
255 num_genes = val[1] | |
256 else: | |
257 num_genes = 0 | |
258 # Insert a row in to the plant_tribes_scaffold table. | |
259 args = [species_name, scaffold_id_db, num_genes, items[1], items[2], items[3], items[4]] | |
260 sql = """ | |
261 INSERT INTO plant_tribes_taxon | |
262 VALUES (nextval('plant_tribes_taxon_id_seq'), %s, %s, %s, %s, %s, %s, %s); | |
263 """ | |
264 self.update(sql, tuple(args)) | |
265 self.flush() | |
266 i += 1 | |
267 self.log("Inserted %d rows into the plant_tribes_taxon table for species name: %s." % (i, str(species_name))) | |
268 | |
269 def process_orthogroup_fasta_files(self): | |
270 """ | |
271 1. Analyze all of the scaffold .fna and .faa files for each clustering | |
272 method to populate the aa_dict and dna_dict sequence dictionaries. | |
273 2. Use the populated sequence dictionaries to populate the plant_tribes_gene | |
274 and gene_scaffold_orthogroup_taxon_association tables. | |
275 """ | |
276 scaffold_id = os.path.basename(self.args.scaffold_path) | |
277 aa_dict = {} | |
278 dna_dict = {} | |
279 # Populate aa_dict and dna_dict. | |
280 for clustering_method in self.clustering_methods: | |
281 file_dir = os.path.join(self.args.scaffold_path, 'fasta', clustering_method) | |
282 for file_name in os.listdir(file_dir): | |
283 items = file_name.split(".") | |
284 orthogroup_id = items[0] | |
285 file_extension = items[1] | |
286 if file_extension == "fna": | |
287 adict = dna_dict | |
288 else: | |
289 adict = aa_dict | |
290 file_path = os.path.join(file_dir, file_name) | |
291 with open(file_path, "r") as fh: | |
292 for i, line in enumerate(fh): | |
293 line = line.strip() | |
294 if len(line) == 0: | |
295 # Skip blank lines (shoudn't happen). | |
296 continue | |
297 if line.startswith(">"): | |
298 # Example line: | |
299 # >gnl_Ambtr1.0.27_AmTr_v1.0_scaffold00001.110 | |
300 gene_id = line.lstrip(">") | |
301 # The dictionary keys will combine the orthogroup_id, | |
302 # clustering method and gene id using the format | |
303 # ,orthogroup_id>^^<clustering_method>^^<gene_id>. | |
304 combined_id = "%s^^%s^^%s" % (orthogroup_id, clustering_method, gene_id) | |
305 if combined_id not in adict: | |
306 # The value will be the dna sequence string.. | |
307 adict[combined_id] = "" | |
308 else: | |
309 # Example line: | |
310 # ATGGAGAAGGACTTT | |
311 # Here combined_id is set because the fasta format specifies | |
312 # that all lines following the gene id defined in the if block | |
313 # above will be the sequence associated with that gene until | |
314 # the next gene id line is encountered. | |
315 sequence = adict[combined_id] | |
316 sequence = "%s%s" % (sequence, line) | |
317 adict[combined_id] = sequence | |
318 # Populate the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables | |
319 # from the contents of aa_dict and dna_dict. | |
320 self.log("Populating the plant_tribes_gene and gene_scaffold_orthogroup_taxon_association tables.") | |
321 gi = 0 | |
322 for gsoai, combined_id in enumerate(sorted(dna_dict.keys())): | |
323 # The dictionary keys combine the orthogroup_id, clustering method and | |
324 # gene id using the format <orthogroup_id>^^<clustering_method>^^<gene_id>. | |
325 items = combined_id.split("^^") | |
326 orthogroup_id = items[0] | |
327 clustering_method = items[1] | |
328 gene_id = items[2] | |
329 # The value will be a list containing both | |
330 # clustering_method and the dna string. | |
331 dna_sequence = dna_dict[combined_id] | |
332 aa_sequence = aa_dict[combined_id] | |
333 # Get the species_code from the gene_id. | |
334 species_code = gene_id.split("_")[1] | |
335 # Strip the version from the species_code. | |
336 species_code = species_code[0:5] | |
337 # Get the species_name from self.species_ids_dict. | |
338 species_name = self.species_ids_dict[species_code] | |
339 # Get the plant_tribes_orthogroup primary key id for | |
340 # the orthogroup_id from the plant_tribes_orthogroup table. | |
341 sql = "SELECT id FROM plant_tribes_orthogroup WHERE orthogroup_id = '%s';" % orthogroup_id | |
342 cur = self.conn.cursor() | |
343 cur.execute(sql) | |
344 orthogroup_id_db = cur.fetchone()[0] | |
345 # If the plant_tribes_gene table contains a row that has the gene_id, | |
346 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. | |
347 # Get the taxon_id for the species_name from the plant_tribes_taxon table. | |
348 sql = "SELECT id FROM plant_tribes_taxon WHERE species_name = '%s';" % species_name | |
349 cur = self.conn.cursor() | |
350 cur.execute(sql) | |
351 taxon_id_db = cur.fetchone()[0] | |
352 # If the plant_tribes_gene table contains a row that has the gene_id, | |
353 # then we'll add a row only to the gene_scaffold_orthogroup_taxon_association table. | |
354 sql = "SELECT id FROM plant_tribes_gene WHERE gene_id = '%s';" % gene_id | |
355 cur = self.conn.cursor() | |
356 cur.execute(sql) | |
357 try: | |
358 gene_id_db = cur.fetchone()[0] | |
359 except: | |
360 # Insert a row into the plant_tribes_gene table. | |
361 args = [gene_id, dna_sequence, aa_sequence] | |
362 sql = """ | |
363 INSERT INTO plant_tribes_gene | |
364 VALUES (nextval('plant_tribes_gene_id_seq'), %s, %s, %s) | |
365 RETURNING id; | |
366 """ | |
367 cur = self.update(sql, tuple(args)) | |
368 self.flush() | |
369 gene_id_db = cur.fetchone()[0] | |
370 gi += 1 | |
371 if gi % 1000 == 0: | |
372 self.log("Inserted 1000 more rows into the plant_tribes_gene table.") | |
373 # Insert a row into the gene_scaffold_orthogroup_taxon_association table. | |
374 # Get the scaffold_rec for the current scaffold_id and clustering_method. | |
375 # The list is [<scaffold_id_db>, <scaffold_id>, <clustering_method>] | |
376 for scaffold_rec in self.scaffold_recs: | |
377 if scaffold_id in scaffold_rec and clustering_method in scaffold_rec: | |
378 scaffold_id_db = scaffold_rec[0] | |
379 args = [gene_id_db, scaffold_id_db, orthogroup_id_db, taxon_id_db] | |
380 sql = """ | |
381 INSERT INTO gene_scaffold_orthogroup_taxon_association | |
382 VALUES (nextval('gene_scaffold_orthogroup_taxon_association_id_seq'), %s, %s, %s, %s); | |
383 """ | |
384 cur = self.update(sql, tuple(args)) | |
385 self.flush() | |
386 if gsoai % 1000 == 0: | |
387 self.log("Inserted 1000 more rows into the gene_scaffold_orthogroup_taxon_association table.") | |
388 self.log("Inserted a total of %d rows into the plant_tribes_gene table." % gi) | |
389 self.log("Inserted a total of %d rows into the gene_scaffold_orthogroup_taxon_association table." % gsoai) | |
390 | |
391 | |
392 if __name__ == '__main__': | |
393 scaffold_loader = ScaffoldLoader() | |
394 scaffold_loader.run() | |
395 scaffold_loader.shutdown() |